diff --git a/src/core/__tests__/trigger-auth.test.ts b/src/core/__tests__/trigger-auth.test.ts index 39f89ff..c3db785 100644 --- a/src/core/__tests__/trigger-auth.test.ts +++ b/src/core/__tests__/trigger-auth.test.ts @@ -19,7 +19,7 @@ describe("/trigger endpoint auth", () => { let server: ReturnType; let baseUrl: string; - beforeAll(() => { + beforeAll(async () => { // Back up the existing mcp.yaml so we can restore it after tests if (existsSync(mcpConfigPath)) { originalMcpYaml = readFileSync(mcpConfigPath, "utf-8"); @@ -38,11 +38,8 @@ describe("/trigger endpoint auth", () => { mkdirSync("config", { recursive: true }); writeFileSync(mcpConfigPath, YAML.stringify(mcpConfig), "utf-8"); - // Start server with a random port - server = startServer({ name: "test", port: 0, role: "base" } as never, Date.now()); - baseUrl = `http://localhost:${server.port}`; - - // Wire trigger deps with a mock runtime + // Wire trigger deps before starting the server so the /trigger + // handler is ready on the first request. setTriggerDeps({ runtime: { handleMessage: async () => ({ @@ -52,6 +49,15 @@ describe("/trigger endpoint auth", () => { }), } as never, }); + + // Start server after deps are wired. Use server.url (Bun guarantees + // it is populated once serve() returns) instead of manually building + // the URL from server.port, which can race in CI environments. + server = startServer({ name: "test", port: 0, role: "base" } as never, Date.now()); + baseUrl = server.url.origin; + + // Ensure the server is accepting connections before tests run. + await fetch(`${baseUrl}/health`); }); afterAll(() => { diff --git a/src/db/__tests__/migrate.test.ts b/src/db/__tests__/migrate.test.ts index cc06d02..4662419 100644 --- a/src/db/__tests__/migrate.test.ts +++ b/src/db/__tests__/migrate.test.ts @@ -36,7 +36,7 @@ describe("runMigrations", () => { runMigrations(db); const migrationCount = db.query("SELECT COUNT(*) as count FROM _migrations").get() as { count: number }; - expect(migrationCount.count).toBe(12); + expect(migrationCount.count).toBe(13); }); test("tracks applied migration indices", () => { @@ -48,6 +48,6 @@ describe("runMigrations", () => { .all() .map((r) => (r as { index_num: number }).index_num); - expect(indices).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + expect(indices).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); }); }); diff --git a/src/db/schema.ts b/src/db/schema.ts index 4fb3302..060f5c6 100644 --- a/src/db/schema.ts +++ b/src/db/schema.ts @@ -126,4 +126,6 @@ export const MIGRATIONS: string[] = [ // Appended, never inserted mid-array: existing deployments have already // applied migrations 0–10, so the new column must land at index 11. "ALTER TABLE loops ADD COLUMN trigger_message_ts TEXT", + + "ALTER TABLE loops ADD COLUMN checkpoint_interval INTEGER", ]; diff --git a/src/evolution/judges/client.ts b/src/evolution/judges/client.ts index 6254e99..2a5e331 100644 --- a/src/evolution/judges/client.ts +++ b/src/evolution/judges/client.ts @@ -10,6 +10,19 @@ import { type VotingStrategy, } from "./types.ts"; +/** Thrown when the API call succeeds but structured output parsing fails. Carries token usage so cost can still be tracked. */ +export class JudgeParseError extends Error { + constructor( + message: string, + public readonly inputTokens: number, + public readonly outputTokens: number, + public readonly costUsd: number, + ) { + super(message); + this.name = "JudgeParseError"; + } +} + let _client: Anthropic | null = null; function getClient(): Anthropic { @@ -58,14 +71,19 @@ export async function callJudge(options: { }); const parsed = message.parsed_output; - if (!parsed) { - throw new Error(`Judge returned no structured output (stop_reason: ${message.stop_reason})`); - } - const inputTokens = message.usage.input_tokens; const outputTokens = message.usage.output_tokens; const costUsd = estimateCost(options.model, inputTokens, outputTokens); + if (!parsed) { + throw new JudgeParseError( + `Judge returned no structured output (stop_reason: ${message.stop_reason})`, + inputTokens, + outputTokens, + costUsd, + ); + } + // Extract verdict and confidence from the parsed data if present const data = parsed as Record; const verdict = (data.verdict as "pass" | "fail") ?? "pass"; diff --git a/src/index.ts b/src/index.ts index bd9986e..8315421 100644 --- a/src/index.ts +++ b/src/index.ts @@ -116,8 +116,9 @@ async function main(): Promise { runtime.setRoleTemplate(activeRole); } + let contextBuilder: MemoryContextBuilder | undefined; if (memory.isReady()) { - const contextBuilder = new MemoryContextBuilder(memory, memoryConfig); + contextBuilder = new MemoryContextBuilder(memory, memoryConfig); runtime.setMemoryContextBuilder(contextBuilder); } @@ -160,7 +161,17 @@ async function main(): Promise { let mcpServer: PhantomMcpServer | null = null; let scheduler: Scheduler | null = null; - const loopRunner = new LoopRunner({ db, runtime }); + const postLoopDeps = + evolution || memory.isReady() + ? { + evolution: evolution ?? undefined, + memory: memory.isReady() ? memory : undefined, + onEvolvedConfigUpdate: evolution + ? (config: ReturnType) => runtime.setEvolvedConfig(config) + : undefined, + } + : undefined; + const loopRunner = new LoopRunner({ db, runtime, memoryContextBuilder: contextBuilder, postLoopDeps }); try { mcpServer = new PhantomMcpServer({ config, diff --git a/src/loop/__tests__/evolution-integration.test.ts b/src/loop/__tests__/evolution-integration.test.ts new file mode 100644 index 0000000..074b297 --- /dev/null +++ b/src/loop/__tests__/evolution-integration.test.ts @@ -0,0 +1,364 @@ +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { runMigrations } from "../../db/migrate.ts"; +import { LoopRunner } from "../runner.ts"; +import { LoopStartInputSchema } from "../types.ts"; + +type HandleMessageImpl = ( + channel: string, + conversationId: string, + text: string, +) => Promise<{ + text: string; + sessionId: string; + cost: { totalUsd: number; inputTokens: number; outputTokens: number; modelUsage: Record }; + durationMs: number; +}>; + +function createMockRuntime(impl?: HandleMessageImpl) { + const defaultImpl: HandleMessageImpl = async () => ({ + text: "ok", + sessionId: "s", + cost: { totalUsd: 0.01, inputTokens: 10, outputTokens: 10, modelUsage: {} }, + durationMs: 10, + }); + return { handleMessage: mock(impl ?? defaultImpl) }; +} + +function agentFinishes(stateFile: string, loopId: string): HandleMessageImpl { + return async () => { + writeFileSync(stateFile, `---\nloop_id: ${loopId}\nstatus: done\niteration: 1\n---\n\nDone.\n`, "utf-8"); + return { + text: "done", + sessionId: "s", + cost: { totalUsd: 0.01, inputTokens: 1, outputTokens: 1, modelUsage: {} }, + durationMs: 1, + }; + }; +} + +describe("LoopRunner evolution integration", () => { + let db: Database; + let dataDir: string; + + beforeEach(() => { + db = new Database(":memory:"); + db.run("PRAGMA journal_mode = WAL"); + db.run("PRAGMA foreign_keys = ON"); + runMigrations(db); + dataDir = mkdtempSync(join(tmpdir(), "phantom-loop-evo-")); + }); + + afterEach(() => { + db.close(); + rmSync(dataDir, { recursive: true, force: true }); + }); + + test("memory context is cached once at start and reused across ticks", async () => { + const buildMock = mock(async () => "## Known Facts\n- User prefers TS"); + const mockContextBuilder = { build: buildMock } as never; + + const runtime = createMockRuntime(); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + memoryContextBuilder: mockContextBuilder, + }); + const loop = runner.start({ goal: "Test memory caching", maxIterations: 5 }); + + // Allow the async cache to resolve + await Bun.sleep(10); + + // buildMock called once at start + expect(buildMock).toHaveBeenCalledTimes(1); + expect(buildMock).toHaveBeenCalledWith("Test memory caching"); + + // Tick multiple times - build should still only be called once + await runner.tick(loop.id); + await runner.tick(loop.id); + expect(buildMock).toHaveBeenCalledTimes(1); + + // Verify the prompt contains memory context + const promptArg = runtime.handleMessage.mock.calls[0][2] as string; + expect(promptArg).toContain("RECALLED MEMORIES"); + expect(promptArg).toContain("User prefers TS"); + }); + + test("memory context is cleared on finalize", async () => { + const buildMock = mock(async () => "some context"); + const mockContextBuilder = { build: buildMock } as never; + + const runtime = createMockRuntime(); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + memoryContextBuilder: mockContextBuilder, + }); + const loop = runner.start({ goal: "clean up", maxIterations: 1 }); + await Bun.sleep(10); + + runtime.handleMessage.mockImplementation(agentFinishes(loop.stateFile, loop.id)); + await runner.tick(loop.id); + + expect(runner.getLoop(loop.id)?.status).toBe("done"); + + // Start another loop - build should be called again (cache was cleared) + runner.start({ goal: "another" }); + await Bun.sleep(10); + expect(buildMock).toHaveBeenCalledTimes(2); + }); + + test("post-loop evolution is called on finalize with correct session data", async () => { + const afterSessionMock = mock(async () => ({ version: 1, changes_applied: [], changes_rejected: [] })); + const mockEvolution = { + afterSession: afterSessionMock, + getConfig: () => ({ userProfile: "", domainKnowledge: "" }), + }; + const mockMemory = { isReady: () => false }; + const onUpdate = mock(() => {}); + + const runtime = createMockRuntime(); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + postLoopDeps: { + evolution: mockEvolution as never, + memory: mockMemory as never, + onEvolvedConfigUpdate: onUpdate, + }, + }); + const loop = runner.start({ goal: "evolve this" }); + runtime.handleMessage.mockImplementation(agentFinishes(loop.stateFile, loop.id)); + await runner.tick(loop.id); + + // Allow fire-and-forget to complete + await Bun.sleep(50); + + expect(afterSessionMock).toHaveBeenCalledTimes(1); + const summary = (afterSessionMock.mock.calls as unknown[][])[0][0] as Record; + expect(summary.session_id).toBe(loop.id); + expect(summary.outcome).toBe("success"); + }); + + test("post-loop evolution failure does not affect loop status", async () => { + const mockEvolution = { + afterSession: mock(async () => { + throw new Error("evolution broke"); + }), + getConfig: () => ({ userProfile: "", domainKnowledge: "" }), + }; + const mockMemory = { isReady: () => false }; + + const runtime = createMockRuntime(); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + postLoopDeps: { + evolution: mockEvolution as never, + memory: mockMemory as never, + onEvolvedConfigUpdate: () => {}, + }, + }); + const loop = runner.start({ goal: "survive evolution failure" }); + runtime.handleMessage.mockImplementation(agentFinishes(loop.stateFile, loop.id)); + await runner.tick(loop.id); + + await Bun.sleep(50); + + // Loop should still be done, not failed + expect(runner.getLoop(loop.id)?.status).toBe("done"); + }); + + test("critique does NOT fire when postLoopDeps is absent", async () => { + const runtime = createMockRuntime(); + // No postLoopDeps = no evolution engine = critique should never fire + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + }); + const loop = runner.start({ goal: "no evolution", checkpointInterval: 1 }); + + await runner.tick(loop.id); + + // The prompt should NOT contain reviewer feedback (critique skipped) + const promptArg = runtime.handleMessage.mock.calls[0][2] as string; + expect(promptArg).not.toContain("REVIEWER FEEDBACK"); + }); + + test("critique does NOT fire when usesLLMJudges returns false", async () => { + const mockEvolution = { + afterSession: mock(async () => ({ version: 1, changes_applied: [], changes_rejected: [] })), + getConfig: () => ({ userProfile: "", domainKnowledge: "" }), + usesLLMJudges: () => false, + isWithinCostCap: () => true, + trackExternalJudgeCost: mock(() => {}), + }; + + const runtime = createMockRuntime(); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + postLoopDeps: { + evolution: mockEvolution as never, + memory: { isReady: () => false } as never, + onEvolvedConfigUpdate: () => {}, + }, + }); + const loop = runner.start({ goal: "judges disabled", checkpointInterval: 1 }); + + await runner.tick(loop.id); + await runner.tick(loop.id); + + // Second tick prompt should NOT have critique (judges disabled) + const secondPrompt = runtime.handleMessage.mock.calls[1][2] as string; + expect(secondPrompt).not.toContain("REVIEWER FEEDBACK"); + expect(mockEvolution.trackExternalJudgeCost).not.toHaveBeenCalled(); + }); + + test("critique does NOT fire when cost cap is exceeded", async () => { + const mockEvolution = { + afterSession: mock(async () => ({ version: 1, changes_applied: [], changes_rejected: [] })), + getConfig: () => ({ userProfile: "", domainKnowledge: "" }), + usesLLMJudges: () => true, + isWithinCostCap: () => false, + trackExternalJudgeCost: mock(() => {}), + }; + + const runtime = createMockRuntime(); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + postLoopDeps: { + evolution: mockEvolution as never, + memory: { isReady: () => false } as never, + onEvolvedConfigUpdate: () => {}, + }, + }); + const loop = runner.start({ goal: "over budget", checkpointInterval: 1 }); + + await runner.tick(loop.id); + await runner.tick(loop.id); + + const secondPrompt = runtime.handleMessage.mock.calls[1][2] as string; + expect(secondPrompt).not.toContain("REVIEWER FEEDBACK"); + expect(mockEvolution.trackExternalJudgeCost).not.toHaveBeenCalled(); + }); + + test("tick 1 summary is recorded in transcript", async () => { + const runtime = createMockRuntime(); + const afterSessionMock = mock(async () => ({ version: 1, changes_applied: [], changes_rejected: [] })); + const runner = new LoopRunner({ + db, + runtime, + dataDir, + autoSchedule: false, + postLoopDeps: { + evolution: { + afterSession: afterSessionMock, + getConfig: () => ({ userProfile: "", domainKnowledge: "" }), + usesLLMJudges: () => false, + isWithinCostCap: () => true, + trackExternalJudgeCost: mock(() => {}), + } as never, + memory: { isReady: () => false } as never, + onEvolvedConfigUpdate: () => {}, + }, + }); + const loop = runner.start({ goal: "check tick 1 summary" }); + + runtime.handleMessage.mockImplementation(agentFinishes(loop.stateFile, loop.id)); + await runner.tick(loop.id); + + // Allow fire-and-forget to complete + await Bun.sleep(50); + + // The session data should include a Tick 1 summary + const summary = (afterSessionMock.mock.calls as unknown[][])[0][0] as Record; + const userMsgs = summary.user_messages as string[]; + const hasTick1Summary = userMsgs.some((m) => m.includes("Tick 1:")); + expect(hasTick1Summary).toBe(true); + }); + + test("checkpoint_interval round-trips through start/store", () => { + const runtime = createMockRuntime(); + const runner = new LoopRunner({ db, runtime, dataDir, autoSchedule: false }); + + const loop = runner.start({ goal: "with checkpoint", checkpointInterval: 5 }); + expect(loop.checkpointInterval).toBe(5); + + const reloaded = runner.getLoop(loop.id); + expect(reloaded?.checkpointInterval).toBe(5); + }); + + test("checkpoint_interval defaults to null when omitted", () => { + const runtime = createMockRuntime(); + const runner = new LoopRunner({ db, runtime, dataDir, autoSchedule: false }); + + const loop = runner.start({ goal: "no checkpoint" }); + expect(loop.checkpointInterval).toBeNull(); + }); +}); + +describe("LoopStartInputSchema checkpoint_interval validation", () => { + test("accepts valid checkpoint_interval", () => { + const result = LoopStartInputSchema.safeParse({ + goal: "test", + checkpoint_interval: 5, + }); + expect(result.success).toBe(true); + }); + + test("accepts 0 (disabled)", () => { + const result = LoopStartInputSchema.safeParse({ + goal: "test", + checkpoint_interval: 0, + }); + expect(result.success).toBe(true); + }); + + test("rejects negative values", () => { + const result = LoopStartInputSchema.safeParse({ + goal: "test", + checkpoint_interval: -1, + }); + expect(result.success).toBe(false); + }); + + test("rejects values above ceiling", () => { + const result = LoopStartInputSchema.safeParse({ + goal: "test", + checkpoint_interval: 201, + }); + expect(result.success).toBe(false); + }); + + test("rejects non-integer values", () => { + const result = LoopStartInputSchema.safeParse({ + goal: "test", + checkpoint_interval: 5.5, + }); + expect(result.success).toBe(false); + }); + + test("accepts omitted (optional)", () => { + const result = LoopStartInputSchema.safeParse({ goal: "test" }); + expect(result.success).toBe(true); + }); +}); diff --git a/src/loop/__tests__/notifications.test.ts b/src/loop/__tests__/notifications.test.ts index 1443125..39840d4 100644 --- a/src/loop/__tests__/notifications.test.ts +++ b/src/loop/__tests__/notifications.test.ts @@ -41,6 +41,7 @@ function makeLoop(overrides: Partial = {}): Loop { successCommand: null, maxIterations: 10, maxCostUsd: 5, + checkpointInterval: null, status: "running", iterationCount: 0, totalCostUsd: 0, diff --git a/src/loop/__tests__/post-loop.test.ts b/src/loop/__tests__/post-loop.test.ts new file mode 100644 index 0000000..ae12196 --- /dev/null +++ b/src/loop/__tests__/post-loop.test.ts @@ -0,0 +1,128 @@ +import { describe, expect, test } from "bun:test"; +import { type LoopTranscript, synthesizeSessionData } from "../post-loop.ts"; +import type { Loop } from "../types.ts"; + +function makeLoop(overrides: Partial = {}): Loop { + return { + id: "loop-123", + goal: "Refactor the auth module", + workspaceDir: "/tmp/ws", + stateFile: "/tmp/ws/state.md", + successCommand: null, + maxIterations: 20, + maxCostUsd: 5, + checkpointInterval: null, + status: "running", + iterationCount: 5, + totalCostUsd: 1.23, + channelId: null, + conversationId: null, + statusMessageTs: null, + triggerMessageTs: null, + interruptRequested: false, + lastError: null, + startedAt: "2024-01-01T00:00:00Z", + lastTickAt: "2024-01-01T00:05:00Z", + finishedAt: "2024-01-01T00:06:00Z", + ...overrides, + }; +} + +function makeTranscript(overrides: Partial = {}): LoopTranscript { + return { + firstPrompt: "First tick prompt", + firstResponse: "First tick response", + summaries: ["Tick 2: in-progress", "Tick 3: in-progress"], + lastPrompt: "Last tick prompt", + lastResponse: "Last tick response", + ...overrides, + }; +} + +describe("synthesizeSessionData", () => { + test("maps done status to success outcome", () => { + const data = synthesizeSessionData(makeLoop(), "done", makeTranscript()); + expect(data.outcome).toBe("success"); + }); + + test("maps stopped status to abandoned outcome", () => { + const data = synthesizeSessionData(makeLoop(), "stopped", makeTranscript()); + expect(data.outcome).toBe("abandoned"); + }); + + test("maps budget_exceeded status to failure outcome", () => { + const data = synthesizeSessionData(makeLoop(), "budget_exceeded", makeTranscript()); + expect(data.outcome).toBe("failure"); + }); + + test("maps failed status to failure outcome", () => { + const data = synthesizeSessionData(makeLoop(), "failed", makeTranscript()); + expect(data.outcome).toBe("failure"); + }); + + test("includes context header with tick count and goal", () => { + const data = synthesizeSessionData(makeLoop(), "done", makeTranscript()); + expect(data.userMessages[0]).toContain("[Loop: 5 ticks"); + expect(data.userMessages[0]).toContain("Refactor the auth module"); + expect(data.userMessages[0]).toContain("outcome: success"); + }); + + test("includes first tick prompt, rolling summaries, and last tick prompt", () => { + const data = synthesizeSessionData(makeLoop(), "done", makeTranscript()); + expect(data.userMessages[0]).toContain("First tick prompt"); + expect(data.userMessages).toContain("Tick 2: in-progress"); + expect(data.userMessages).toContain("Tick 3: in-progress"); + expect(data.userMessages[data.userMessages.length - 1]).toContain("Last tick prompt"); + }); + + test("includes first and last assistant responses", () => { + const data = synthesizeSessionData(makeLoop(), "done", makeTranscript()); + expect(data.assistantMessages).toHaveLength(2); + expect(data.assistantMessages[0]).toContain("First tick response"); + expect(data.assistantMessages[1]).toContain("Last tick response"); + }); + + test("uses channel:channelId for Slack-originated loops", () => { + const loop = makeLoop({ channelId: "C100" }); + const data = synthesizeSessionData(loop, "done", makeTranscript()); + expect(data.userId).toBe("channel:C100"); + }); + + test("uses 'autonomous' for headless loops", () => { + const loop = makeLoop({ channelId: null }); + const data = synthesizeSessionData(loop, "done", makeTranscript()); + expect(data.userId).toBe("autonomous"); + }); + + test("session key uses channel:conversation for Slack loops", () => { + const loop = makeLoop({ channelId: "C100", conversationId: "1700000.000" }); + const data = synthesizeSessionData(loop, "done", makeTranscript()); + expect(data.sessionKey).toBe("C100:1700000.000"); + }); + + test("session key uses loop:id for headless loops", () => { + const loop = makeLoop({ channelId: null }); + const data = synthesizeSessionData(loop, "done", makeTranscript()); + expect(data.sessionKey).toBe("loop:loop-123"); + }); + + test("passes through cost and timestamps", () => { + const data = synthesizeSessionData(makeLoop(), "done", makeTranscript()); + expect(data.costUsd).toBe(1.23); + expect(data.startedAt).toBe("2024-01-01T00:00:00Z"); + expect(data.endedAt).toBe("2024-01-01T00:06:00Z"); + }); + + test("uses empty arrays for toolsUsed and filesTracked", () => { + const data = synthesizeSessionData(makeLoop(), "done", makeTranscript()); + expect(data.toolsUsed).toEqual([]); + expect(data.filesTracked).toEqual([]); + }); + + test("handles empty transcript (no-tick loop)", () => { + const transcript = makeTranscript({ summaries: [] }); + const data = synthesizeSessionData(makeLoop({ iterationCount: 0 }), "stopped", transcript); + expect(data.userMessages.length).toBeGreaterThan(0); + expect(data.outcome).toBe("abandoned"); + }); +}); diff --git a/src/loop/__tests__/prompt.test.ts b/src/loop/__tests__/prompt.test.ts new file mode 100644 index 0000000..eae3dc1 --- /dev/null +++ b/src/loop/__tests__/prompt.test.ts @@ -0,0 +1,85 @@ +import { describe, expect, test } from "bun:test"; +import { buildTickPrompt } from "../prompt.ts"; +import type { Loop } from "../types.ts"; + +function makeLoop(overrides: Partial = {}): Loop { + return { + id: "loop-1", + goal: "Write a haiku", + workspaceDir: "/tmp/ws", + stateFile: "/tmp/ws/state.md", + successCommand: null, + maxIterations: 20, + maxCostUsd: 5, + checkpointInterval: null, + status: "running", + iterationCount: 3, + totalCostUsd: 0.5, + channelId: null, + conversationId: null, + statusMessageTs: null, + triggerMessageTs: null, + interruptRequested: false, + lastError: null, + startedAt: "2024-01-01T00:00:00Z", + lastTickAt: null, + finishedAt: null, + ...overrides, + }; +} + +describe("buildTickPrompt", () => { + test("returns base prompt without optional sections when no options provided", () => { + const prompt = buildTickPrompt(makeLoop(), "state contents"); + expect(prompt).toContain("Write a haiku"); + expect(prompt).toContain("state contents"); + expect(prompt).not.toContain("RECALLED MEMORIES"); + expect(prompt).not.toContain("REVIEWER FEEDBACK"); + }); + + test("injects memory context before state file section", () => { + const memoryContext = "## Known Facts\n- User prefers TypeScript"; + const prompt = buildTickPrompt(makeLoop(), "state contents", { memoryContext }); + + expect(prompt).toContain("RECALLED MEMORIES (from previous sessions)"); + expect(prompt).toContain("User prefers TypeScript"); + + // Memory should appear before state file contents + const memoryIdx = prompt.indexOf("RECALLED MEMORIES"); + const stateIdx = prompt.indexOf("CURRENT STATE FILE CONTENTS"); + expect(memoryIdx).toBeLessThan(stateIdx); + }); + + test("injects critique section", () => { + const critique = "The loop appears stuck in a pattern."; + const prompt = buildTickPrompt(makeLoop(), "state contents", { critique }); + + expect(prompt).toContain("REVIEWER FEEDBACK (from your last checkpoint)"); + expect(prompt).toContain("stuck in a pattern"); + }); + + test("injects both memory and critique when both provided", () => { + const prompt = buildTickPrompt(makeLoop(), "state contents", { + memoryContext: "Some facts", + critique: "Some feedback", + }); + + expect(prompt).toContain("RECALLED MEMORIES"); + expect(prompt).toContain("REVIEWER FEEDBACK"); + + // Memory should come before critique + const memoryIdx = prompt.indexOf("RECALLED MEMORIES"); + const critiqueIdx = prompt.indexOf("REVIEWER FEEDBACK"); + expect(memoryIdx).toBeLessThan(critiqueIdx); + }); + + test("skips empty memory context", () => { + const prompt = buildTickPrompt(makeLoop(), "state contents", { memoryContext: "" }); + expect(prompt).not.toContain("RECALLED MEMORIES"); + }); + + test("skips empty critique", () => { + const prompt = buildTickPrompt(makeLoop(), "state contents", { critique: "" }); + expect(prompt).not.toContain("REVIEWER FEEDBACK"); + }); +}); diff --git a/src/loop/critique.ts b/src/loop/critique.ts new file mode 100644 index 0000000..a777010 --- /dev/null +++ b/src/loop/critique.ts @@ -0,0 +1,65 @@ +import { z } from "zod/v4"; +import { callJudge } from "../evolution/judges/client.ts"; +import { JUDGE_MODEL_SONNET, type JudgeCostEntry } from "../evolution/judges/types.ts"; +import type { LoopTranscript } from "./post-loop.ts"; + +export type CritiqueResult = { + assessment: string; + cost: JudgeCostEntry; +}; + +const CritiqueSchema = z.object({ assessment: z.string() }); + +/** + * Mid-loop critique: Sonnet 4.6 reviews the loop's progress every N ticks + * to detect drift, stuck patterns, or wasted budget before the loop runs out. + * Same cross-model pattern as interactive sessions (Sonnet judging Opus). + */ +export async function runCritiqueJudge( + goal: string, + stateFileContents: string, + transcript: LoopTranscript, + iteration: number, + maxIterations: number, +): Promise { + const system = `You are a reviewer assessing an autonomous agent loop mid-flight. +The agent (Opus 4.6) is running inside a tight iteration loop toward a goal. +Your job is to assess whether the loop is making meaningful progress or if +it is stuck, drifting, or wasting budget. Be direct and actionable.`; + + const summaryBlock = transcript.summaries.length > 0 ? `\nTick summaries:\n${transcript.summaries.join("\n")}` : ""; + + const user = `Goal: ${goal} + +Progress (${iteration}/${maxIterations} ticks used): +${summaryBlock} + +Current state file: +${stateFileContents.slice(0, 3000)} + +Last response (truncated): +${transcript.lastResponse.slice(0, 1000)} + +Is this loop making meaningful progress toward the goal? Is the agent stuck +in a pattern? Should it change approach? Give a brief (2-3 sentence) assessment +and one concrete suggestion if applicable.`; + + const result = await callJudge({ + model: JUDGE_MODEL_SONNET, + systemPrompt: system, + userMessage: user, + schema: CritiqueSchema, + schemaName: "LoopCritique", + maxTokens: 500, + }); + + return { + assessment: result.data.assessment, + cost: { + calls: 1, + totalUsd: result.costUsd, + totalInputTokens: result.inputTokens, + totalOutputTokens: result.outputTokens, + }, + }; +} diff --git a/src/loop/post-loop.ts b/src/loop/post-loop.ts new file mode 100644 index 0000000..093d831 --- /dev/null +++ b/src/loop/post-loop.ts @@ -0,0 +1,146 @@ +import type { EvolutionEngine } from "../evolution/engine.ts"; +import type { SessionData } from "../memory/consolidation.ts"; +import type { MemorySystem } from "../memory/system.ts"; +import type { Loop, LoopStatus } from "./types.ts"; + +export type LoopTranscript = { + firstPrompt: string; + firstResponse: string; + summaries: string[]; + lastPrompt: string; + lastResponse: string; +}; + +export type PostLoopDeps = { + evolution?: EvolutionEngine; + memory?: MemorySystem; + /** Callback to update runtime's evolved config after evolution applies changes. */ + onEvolvedConfigUpdate?: (config: ReturnType) => void; +}; + +function loopStatusToOutcome(status: LoopStatus): SessionData["outcome"] { + switch (status) { + case "done": + return "success"; + case "stopped": + return "abandoned"; + default: + return "failure"; + } +} + +const MAX_ROLLING_SUMMARIES = 10; + +export function recordTranscript( + transcripts: Map, + loopId: string, + iteration: number, + prompt: string, + response: string, + stateStatus: string | undefined, +): void { + let transcript = transcripts.get(loopId); + if (!transcript) { + transcript = { + firstPrompt: prompt, + firstResponse: response, + summaries: [], + lastPrompt: prompt, + lastResponse: response, + }; + transcripts.set(loopId, transcript); + } else { + transcript.lastPrompt = prompt; + transcript.lastResponse = response; + } + const summary = `Tick ${iteration}: ${stateStatus ?? "in-progress"}`; + transcript.summaries.push(summary); + if (transcript.summaries.length > MAX_ROLLING_SUMMARIES) transcript.summaries.shift(); +} + +export function clamp(value: number, min: number, max: number): number { + return Math.min(Math.max(value, min), max); +} + +export function synthesizeSessionData(loop: Loop, status: LoopStatus, transcript: LoopTranscript): SessionData { + const outcome = loopStatusToOutcome(status); + const header = `[Loop: ${loop.iterationCount} ticks, goal: ${loop.goal.slice(0, 200)}, outcome: ${outcome}]`; + + const userMessages = [ + `${header} Tick 1: ${transcript.firstPrompt.slice(0, 500)}`, + ...transcript.summaries, + `Final tick: ${transcript.lastPrompt.slice(0, 500)}`, + ]; + + const assistantMessages = [transcript.firstResponse.slice(0, 1000), transcript.lastResponse.slice(0, 1000)]; + + // userId sentinel: channel-originated loops use channel ID, headless use "autonomous" + const userId = loop.channelId ? `channel:${loop.channelId}` : "autonomous"; + + return { + sessionId: loop.id, + sessionKey: loop.channelId && loop.conversationId ? `${loop.channelId}:${loop.conversationId}` : `loop:${loop.id}`, + userId, + userMessages, + assistantMessages, + toolsUsed: [], + filesTracked: [], + startedAt: loop.startedAt, + endedAt: loop.finishedAt ?? new Date().toISOString(), + costUsd: loop.totalCostUsd, + outcome, + }; +} + +/** + * Run evolution and memory consolidation after a loop finishes. + * Fire-and-forget from the runner's perspective - errors are logged, + * never propagated to affect loop status. + */ +export async function runPostLoopPipeline(deps: PostLoopDeps, sessionData: SessionData): Promise { + const { evolution, memory, onEvolvedConfigUpdate } = deps; + const { consolidateSessionWithLLM, consolidateSession, sessionDataToSummary } = await import( + "../memory/consolidation.ts" + ); + + // Evolution pipeline - runs independently of memory state + if (evolution) { + const summary = sessionDataToSummary(sessionData); + try { + const result = await evolution.afterSession(summary); + if (result.changes_applied.length > 0 && onEvolvedConfigUpdate) { + onEvolvedConfigUpdate(evolution.getConfig()); + } + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`[loop] Post-loop evolution failed: ${msg}`); + } + } + + // Memory consolidation - runs independently of evolution state + if (!memory?.isReady()) return; + try { + const useLLM = evolution?.usesLLMJudges() && evolution?.isWithinCostCap(); + if (useLLM && evolution) { + const evolvedConfig = evolution.getConfig(); + const existingFacts = `${evolvedConfig.userProfile}\n${evolvedConfig.domainKnowledge}`; + const { result, judgeCost } = await consolidateSessionWithLLM(memory, sessionData, existingFacts); + if (judgeCost) evolution.trackExternalJudgeCost(judgeCost); + if (result.episodesCreated > 0 || result.factsExtracted > 0) { + console.log( + `[loop] Consolidated (LLM): ${result.episodesCreated} episodes, ${result.factsExtracted} facts (${result.durationMs}ms)`, + ); + } + } else { + const result = await consolidateSession(memory, sessionData); + if (result.episodesCreated > 0 || result.factsExtracted > 0) { + console.log( + `[loop] Consolidated: ${result.episodesCreated} episodes, ${result.factsExtracted} facts (${result.durationMs}ms)`, + ); + } + } + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`[loop] Post-loop memory consolidation failed: ${msg}`); + } +} diff --git a/src/loop/prompt.ts b/src/loop/prompt.ts index 259ee5b..7862af6 100644 --- a/src/loop/prompt.ts +++ b/src/loop/prompt.ts @@ -1,12 +1,29 @@ import type { Loop } from "./types.ts"; +export type TickPromptOptions = { + memoryContext?: string; + critique?: string; +}; + /** * Per-tick prompt. Each tick is a fresh SDK session with no prior context * (rotating conversation ids in the runner guarantee this), so the prompt * must carry everything the agent needs: the goal, the state file contract, * the current state file contents, and the workspace path. */ -export function buildTickPrompt(loop: Loop, stateFileContents: string): string { +export function buildTickPrompt(loop: Loop, stateFileContents: string, options?: TickPromptOptions): string { + const memorySections: string[] = []; + + if (options?.memoryContext) { + memorySections.push(`RECALLED MEMORIES (from previous sessions)\n\n${options.memoryContext}`); + } + + if (options?.critique) { + memorySections.push(`REVIEWER FEEDBACK (from your last checkpoint)\n\n${options.critique}`); + } + + const injected = memorySections.length > 0 ? `\n\n${memorySections.join("\n\n")}\n` : ""; + return `You are running inside a "ralph loop" - a tight iteration primitive where a fresh agent session is invoked once per tick. You have no memory from previous ticks. All shared memory lives in the state file at: @@ -45,7 +62,7 @@ is one short paragraph telling the next tick exactly what to do first. THE GOAL -${loop.goal} +${loop.goal}${injected} CURRENT STATE FILE CONTENTS diff --git a/src/loop/runner.ts b/src/loop/runner.ts index d2ced99..b6352c0 100644 --- a/src/loop/runner.ts +++ b/src/loop/runner.ts @@ -4,7 +4,17 @@ import { join, relative, resolve } from "node:path"; import type { AgentRuntime } from "../agent/runtime.ts"; import type { SlackChannel } from "../channels/slack.ts"; import { buildSafeEnv } from "../mcp/dynamic-handlers.ts"; +import type { MemoryContextBuilder } from "../memory/context-builder.ts"; +import { runCritiqueJudge } from "./critique.ts"; import { LoopNotifier } from "./notifications.ts"; +import { + type LoopTranscript, + type PostLoopDeps, + clamp, + recordTranscript, + runPostLoopPipeline, + synthesizeSessionData, +} from "./post-loop.ts"; import { buildTickPrompt } from "./prompt.ts"; import { initStateFile, parseFrontmatter, readStateFile } from "./state-file.ts"; import { LoopStore } from "./store.ts"; @@ -18,11 +28,7 @@ import { type LoopStatus, } from "./types.ts"; -/** - * The runner only needs handleMessage from the AgentRuntime - narrowing the - * dependency keeps the runner honest (SRP) and lets tests pass a minimal mock - * without `as never` casts. A real AgentRuntime is assignable to this. - */ +/** Narrowed runtime interface for testability. */ type LoopRuntime = Pick; type RunnerDeps = { @@ -30,31 +36,15 @@ type RunnerDeps = { runtime: LoopRuntime; slackChannel?: SlackChannel; dataDir?: string; - /** - * When true (default), start() and resumeRunning() schedule ticks on the - * event loop automatically. Tests set this to false so they can drive - * ticks deterministically with explicit `await runner.tick(id)` calls. - */ + memoryContextBuilder?: MemoryContextBuilder; + postLoopDeps?: PostLoopDeps; + /** Tests set to false to drive ticks deterministically. */ autoSchedule?: boolean; }; const SUCCESS_COMMAND_TIMEOUT_MS = 5 * 60 * 1000; -/** - * LoopRunner owns the lifecycle of ralph loops: - * start -> tick (N times) -> finalize - * - * Each tick is a fresh SDK session. We achieve that by passing a unique - * conversation id per iteration to runtime.handleMessage, which keys - * sessions on `${channelId}:${conversationId}`. No runtime changes needed. - * - * State lives in a markdown file. The runner only reads the YAML frontmatter - * to decide termination - the body is the agent's working memory. - * - * Budgets (max_iterations, max_cost_usd) are enforced here, never trusted to - * the agent. The agent can self-declare done (status: done) to stop early, - * but cannot extend the loop past its budget. - */ +/** start -> tick (N times) -> finalize. State file is the agent's memory across ticks. */ export class LoopRunner { private store: LoopStore; private runtime: LoopRuntime; @@ -63,6 +53,11 @@ export class LoopRunner { private autoSchedule: boolean; private inFlight = new Set(); private notifier: LoopNotifier; + private memoryContextBuilder: MemoryContextBuilder | undefined; + private postLoopDeps: PostLoopDeps | undefined; + private memoryCache = new Map(); + private transcripts = new Map(); + private pendingCritique = new Map(); constructor(deps: RunnerDeps) { this.store = new LoopStore(deps.db); @@ -71,6 +66,8 @@ export class LoopRunner { this.dataDir = deps.dataDir ?? resolve(process.cwd(), "data"); this.autoSchedule = deps.autoSchedule ?? true; this.notifier = new LoopNotifier(this.slackChannel ?? null, this.store); + this.memoryContextBuilder = deps.memoryContextBuilder; + this.postLoopDeps = deps.postLoopDeps; } setSlackChannel(channel: SlackChannel): void { @@ -78,11 +75,6 @@ export class LoopRunner { this.notifier = new LoopNotifier(channel, this.store); } - /** - * Reject operator-supplied workspace paths that escape the data dir. The - * agent invokes start() via MCP, and a stray `..` could point the state - * file outside the sandbox. Mirrors the isPathSafe idiom in src/ui/serve.ts. - */ private assertWorkspaceInsideDataDir(workspace: string): string { const base = resolve(this.dataDir); const target = resolve(base, workspace); @@ -113,6 +105,7 @@ export class LoopRunner { successCommand: input.successCommand ?? null, maxIterations, maxCostUsd, + checkpointInterval: input.checkpointInterval ?? null, channelId: input.channelId ?? null, conversationId: input.conversationId ?? null, triggerMessageTs: input.triggerMessageTs ?? null, @@ -123,6 +116,9 @@ export class LoopRunner { console.warn(`[loop] Failed to post start notice for ${id}: ${msg}`); }); + // Cache memory context once for the entire loop (goal is constant) + this.cacheMemoryContext(id, input.goal); + this.scheduleTick(id); return loop; } @@ -139,11 +135,11 @@ export class LoopRunner { return this.store.requestStop(id); } - /** On startup, re-queue any loops still marked running. State file is the source of truth. */ resumeRunning(): number { const running = this.store.listByStatus("running"); for (const loop of running) { console.log(`[loop] Resuming ${loop.id} (iteration ${loop.iterationCount})`); + this.cacheMemoryContext(loop.id, loop.goal); this.scheduleTick(loop.id); } return running.length; @@ -151,10 +147,9 @@ export class LoopRunner { private scheduleTick(id: string): void { if (!this.autoSchedule) return; - // setImmediate yields to the event loop so we never recurse on the same stack. setImmediate(() => { - this.tick(id).catch((err: unknown) => { - const msg = err instanceof Error ? err.message : String(err); + this.tick(id).catch((e: unknown) => { + const msg = e instanceof Error ? e.message : String(e); console.error(`[loop] Tick ${id} threw: ${msg}`); this.finalize(id, "failed", msg); }); @@ -179,7 +174,13 @@ export class LoopRunner { } const stateFileContents = readStateFile(loop.stateFile); - const prompt = buildTickPrompt(loop, stateFileContents); + const memoryContext = this.memoryCache.get(id); + const critique = this.pendingCritique.get(id); + if (critique) this.pendingCritique.delete(id); + const prompt = buildTickPrompt(loop, stateFileContents, { + memoryContext, + critique, + }); const conversationId = `${loop.id}:${loop.iterationCount}`; const response = await this.runtime.handleMessage("loop", conversationId, prompt); @@ -192,6 +193,9 @@ export class LoopRunner { const updatedContents = readStateFile(loop.stateFile); const frontmatter = parseFrontmatter(updatedContents); + // Track bounded transcript for post-loop evolution + recordTranscript(this.transcripts, id, nextIteration, prompt, response.text, frontmatter?.status); + if (frontmatter?.status === "done") { this.finalize(id, "done", null); return; @@ -205,6 +209,12 @@ export class LoopRunner { } } + // Mid-loop critique checkpoint (Sonnet reviewing Opus mid-flight). + // Runs after terminal checks so we don't waste a judge call on the final tick. + if (loop.checkpointInterval && loop.checkpointInterval > 0 && nextIteration % loop.checkpointInterval === 0) { + await this.runCritique(id, loop, updatedContents, nextIteration); + } + // Await the tick update so its Slack write finishes before the next // tick can start (and potentially finalize). Without this, a stop on // tick N+1 can race: postFinalNotice strips the Stop button, then the @@ -241,15 +251,49 @@ export class LoopRunner { } private finalize(id: string, status: LoopStatus, error: string | null): void { + const transcript = this.transcripts.get(id); + this.memoryCache.delete(id); + this.transcripts.delete(id); + this.pendingCritique.delete(id); const loop = this.store.finalize(id, status, error); if (!loop) return; this.notifier.postFinalNotice(loop, status).catch((err: unknown) => { const msg = err instanceof Error ? err.message : String(err); console.warn(`[loop] Failed to post final notice for ${id}: ${msg}`); }); + + // Post-loop evolution and consolidation (fire-and-forget, never affects loop status) + if (this.postLoopDeps && transcript) { + runPostLoopPipeline(this.postLoopDeps, synthesizeSessionData(loop, status, transcript)).catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`[loop] Post-loop evolution failed for ${id}: ${msg}`); + }); + } } -} -function clamp(value: number, min: number, max: number): number { - return Math.min(Math.max(value, min), max); + private async runCritique(loopId: string, loop: Loop, stateContents: string, iteration: number): Promise { + const transcript = this.transcripts.get(loopId); + if (!transcript) return; + const evo = this.postLoopDeps?.evolution; + if (!evo || !evo.usesLLMJudges() || !evo.isWithinCostCap()) return; + try { + const r = await runCritiqueJudge(loop.goal, stateContents, transcript, iteration, loop.maxIterations); + this.pendingCritique.set(loopId, r.assessment); + evo.trackExternalJudgeCost(r.cost); + } catch (e: unknown) { + console.warn(`[loop] Critique failed for ${loopId}: ${e instanceof Error ? e.message : e}`); + } + } + + private cacheMemoryContext(loopId: string, goal: string): void { + if (!this.memoryContextBuilder) return; + this.memoryContextBuilder + .build(goal) + .then((ctx) => { + if (ctx) this.memoryCache.set(loopId, ctx); + }) + .catch((e: unknown) => { + console.warn(`[loop] Memory context failed for ${loopId}: ${e instanceof Error ? e.message : e}`); + }); + } } diff --git a/src/loop/store.ts b/src/loop/store.ts index 9f53000..1824605 100644 --- a/src/loop/store.ts +++ b/src/loop/store.ts @@ -9,6 +9,7 @@ export type LoopInsertInput = { successCommand: string | null; maxIterations: number; maxCostUsd: number; + checkpointInterval?: number | null; channelId: string | null; conversationId: string | null; triggerMessageTs: string | null; @@ -23,8 +24,8 @@ export class LoopStore { insert(input: LoopInsertInput): Loop { this.db.run( - `INSERT INTO loops (id, goal, workspace_dir, state_file, success_command, max_iterations, max_cost_usd, status, channel_id, conversation_id, trigger_message_ts) - VALUES (?, ?, ?, ?, ?, ?, ?, 'running', ?, ?, ?)`, + `INSERT INTO loops (id, goal, workspace_dir, state_file, success_command, max_iterations, max_cost_usd, checkpoint_interval, status, channel_id, conversation_id, trigger_message_ts) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'running', ?, ?, ?)`, [ input.id, input.goal, @@ -33,6 +34,7 @@ export class LoopStore { input.successCommand, input.maxIterations, input.maxCostUsd, + input.checkpointInterval ?? null, input.channelId, input.conversationId, input.triggerMessageTs, diff --git a/src/loop/tool.ts b/src/loop/tool.ts index 9c86437..d404cfd 100644 --- a/src/loop/tool.ts +++ b/src/loop/tool.ts @@ -55,6 +55,7 @@ ACTIONS: workspace (defaults to data/loops//), max_iterations (default 20, hard ceiling 200), max_cost_usd (default 5, hard ceiling 50), + checkpoint_interval (run a Sonnet critique every N ticks, 0 or omitted = off), success_command (shell command run after each tick; exit 0 = goal achieved. Runs under bash -c with a 5 minute timeout in a sanitized env containing only PATH, HOME, LANG, TERM, loop_id, and workspace), @@ -72,6 +73,13 @@ regression". Each iteration is fresh - all context must live in the state file.` workspace: z.string().optional(), max_iterations: z.number().int().positive().max(200).optional(), max_cost_usd: z.number().positive().max(50).optional(), + checkpoint_interval: z + .number() + .int() + .min(0) + .max(200) + .optional() + .describe("Run a Sonnet review every N ticks. 0 or omitted = no critique."), success_command: z.string().optional(), channel_id: z.string().optional(), conversation_id: z.string().optional(), @@ -93,6 +101,7 @@ regression". Each iteration is fresh - all context must live in the state file.` workspace: input.workspace, maxIterations: input.max_iterations, maxCostUsd: input.max_cost_usd, + checkpointInterval: input.checkpoint_interval, successCommand: input.success_command, channelId: input.channel_id ?? ctx?.slackChannelId, conversationId: input.conversation_id ?? ctx?.slackThreadTs, diff --git a/src/loop/types.ts b/src/loop/types.ts index d30249e..207fa87 100644 --- a/src/loop/types.ts +++ b/src/loop/types.ts @@ -10,6 +10,7 @@ export type Loop = { successCommand: string | null; maxIterations: number; maxCostUsd: number; + checkpointInterval: number | null; status: LoopStatus; iterationCount: number; totalCostUsd: number; @@ -32,6 +33,7 @@ export type LoopRow = { success_command: string | null; max_iterations: number; max_cost_usd: number; + checkpoint_interval: number | null; status: string; iteration_count: number; total_cost_usd: number; @@ -57,6 +59,7 @@ export type LoopStartInput = { workspace?: string; maxIterations?: number; maxCostUsd?: number; + checkpointInterval?: number; successCommand?: string; channelId?: string; conversationId?: string; @@ -74,6 +77,7 @@ export const LoopStartInputSchema = z.object({ workspace: z.string().optional(), max_iterations: z.number().int().positive().max(LOOP_MAX_ITERATIONS_CEILING).optional(), max_cost_usd: z.number().positive().max(LOOP_MAX_COST_CEILING_USD).optional(), + checkpoint_interval: z.number().int().min(0).max(LOOP_MAX_ITERATIONS_CEILING).optional(), success_command: z.string().optional(), channel_id: z.string().optional(), conversation_id: z.string().optional(), @@ -93,6 +97,7 @@ export function rowToLoop(row: LoopRow): Loop { successCommand: row.success_command, maxIterations: row.max_iterations, maxCostUsd: row.max_cost_usd, + checkpointInterval: row.checkpoint_interval, status: row.status as LoopStatus, iterationCount: row.iteration_count, totalCostUsd: row.total_cost_usd, diff --git a/src/memory/consolidation.ts b/src/memory/consolidation.ts index 35868d7..3cd58f2 100644 --- a/src/memory/consolidation.ts +++ b/src/memory/consolidation.ts @@ -1,3 +1,4 @@ +import { JudgeParseError } from "../evolution/judges/client.ts"; import { runConsolidationJudge } from "../evolution/judges/consolidation-judge.ts"; import type { JudgeCostEntry } from "../evolution/judges/types.ts"; import type { SessionSummary } from "../evolution/types.ts"; @@ -66,11 +67,21 @@ export async function consolidateSessionWithLLM( const msg = error instanceof Error ? error.message : String(error); console.warn(`[memory] Consolidation judge failed, falling back to heuristic: ${msg}`); const result = await consolidateSession(memory, sessionData); - return { result, judgeCost: null }; + // Track cost from successful API calls that failed parsing (tokens were consumed) + const judgeCost = + error instanceof JudgeParseError + ? { + calls: 1, + totalUsd: error.costUsd, + totalInputTokens: error.inputTokens, + totalOutputTokens: error.outputTokens, + } + : null; + return { result, judgeCost }; } } -function sessionDataToSummary(data: SessionData): SessionSummary { +export function sessionDataToSummary(data: SessionData): SessionSummary { return { session_id: data.sessionId, session_key: data.sessionKey,