Skip to content

Commit b04201c

Browse files
committed
feat(runner-pool): add promotion mode
- Allow roles to be configured as a runner list and promote to the next runner based on the previous attempt's failure kind
- Record err.kind and out.last_runner in SQLite KV for promotion decisions
- Add codexMedium runner and make it the default executor
- Add regression tests and mock agents for missing-result and task-failure promotion
1 parent 954746d commit b04201c

6 files changed

Lines changed: 358 additions & 3 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
],
2828
"repository": {
2929
"type": "git",
30-
"url": "https://github.com/knot0-com/dagain.git"
30+
"url": "git+https://github.com/knot0-com/dagain.git"
3131
},
3232
"bugs": {
3333
"url": "https://github.com/knot0-com/dagain/issues"

scripts/mock-agent-fail-marker.js

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import fs from "node:fs/promises";
2+
3+
/**
 * Write a result payload to stdout as a single line of the form
 * `<result>{...json...}</result>`.
 *
 * @param {object} obj - JSON-serializable payload to emit.
 */
function result(obj) {
  process.stdout.write(`<result>${JSON.stringify(obj)}</result>\n`);
}
6+
7+
// Mock agent body: record which runner was invoked, then report a failed executor result.
// The marker comes from the first CLI argument and defaults to "fail" when absent or blank.
const marker = String(process.argv[2] || "").trim() || "fail";

// runner_marker.txt is overwritten with the latest marker; invocations.log accumulates
// one line per invocation. Both paths are relative, so they land in the current
// working directory.
await fs.writeFile("runner_marker.txt", `${marker}\n`, "utf8");
await fs.appendFile("invocations.log", `${marker}\n`, "utf8");

// Emit a well-formed result whose status is "fail".
result({
  version: 1,
  role: "executor",
  status: "fail",
  summary: `fail marker=${marker}`,
  next: { addNodes: [], setStatus: [] },
  checkpoint: null,
  errors: [`fail marker=${marker}`],
  confidence: 0,
});

scripts/mock-agent-noresult.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import fs from "node:fs/promises";
2+
3+
// Mock agent body: record which runner was invoked, then exit without emitting any
// <result> payload. The marker comes from the first CLI argument and defaults to
// "noresult" when absent or blank.
const marker = String(process.argv[2] || "").trim() || "noresult";

// runner_marker.txt is overwritten with the latest marker; invocations.log accumulates
// one line per invocation. Both paths are relative, so they land in the current
// working directory.
await fs.writeFile("runner_marker.txt", `${marker}\n`, "utf8");
await fs.appendFile("invocations.log", `${marker}\n`, "utf8");

// Plain diagnostic line only; deliberately contains no <result> tag.
process.stdout.write(`mock-agent-noresult marker=${marker}\n`);

src/cli.js

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,61 @@ import {
3535
unlockNode as unlockNodeDb,
3636
} from "./lib/db/nodes.js";
3737

38+
/**
 * Normalize a runner-pool mode setting to a lowercase, trimmed string.
 *
 * @param {unknown} mode - Raw mode value; falsy values are treated as unset.
 * @returns {string} The normalized mode, or "off" when the value is unset or blank.
 */
function normalizeRunnerPoolMode(mode) {
  const normalized = String(mode || "").toLowerCase().trim();
  return normalized || "off";
}
42+
43+
/**
 * Normalize a failure-kind label to a lowercase, trimmed string.
 *
 * @param {unknown} kind - Raw failure kind; falsy values are treated as unset.
 * @returns {string} The normalized kind, or "" when the value is unset or blank.
 */
function normalizeFailureKind(kind) {
  return String(kind || "").toLowerCase().trim();
}
46+
47+
/**
 * Normalize a `promoteOn` setting into a de-duplicated list of failure kinds.
 *
 * Each entry is passed through `normalizeFailureKind`; entries that normalize to an
 * empty string are dropped. First-occurrence order is preserved.
 *
 * @param {unknown} value - Raw setting; anything that is not an array yields [].
 * @returns {string[]} Unique, normalized failure kinds.
 */
function normalizePromoteOnList(value) {
  if (!Array.isArray(value)) return [];
  const kinds = value.map((entry) => normalizeFailureKind(entry)).filter(Boolean);
  return [...new Set(kinds)];
}
57+
58+
/**
 * Normalize a `promoteAfterAttempts` setting to a non-negative integer.
 *
 * The value is coerced with `Number()`; non-finite or negative results become 0,
 * and fractional values are floored.
 *
 * @param {unknown} value - Raw setting.
 * @returns {number} A non-negative integer (0 when the value is unusable).
 */
function normalizePromoteAfterAttempts(value) {
  const count = Number(value);
  const usable = Number.isFinite(count) && count >= 0;
  return usable ? Math.floor(count) : 0;
}
63+
64+
/**
 * Choose the runner name for a node attempt from the runner list configured for its role.
 *
 * The list comes from `config.roles[role]`, falling back to `config.roles.main`.
 * - Empty list: returns "".
 * - Single entry: returns that entry.
 * - `config.supervisor.runnerPool.mode` other than "promotion": defers to
 *   `resolveRoleRunnerPick`.
 * - Promotion mode: starts at the first runner and moves one step down the list when
 *   the previous attempt's recorded failure kind qualifies (see inline comments).
 *
 * @param {object} args
 * @param {string} args.dbPath - Passed through to `kvGet` as `dbPath`.
 * @param {string} args.role - Role key looked up in `config.roles`.
 * @param {string} args.seed - Used as the `nodeId` for KV lookups and as the pick seed.
 * @param {number} args.attempts - Number of attempts already made.
 * @param {object} args.config - Configuration object (`roles`, `supervisor.runnerPool`).
 * @returns {Promise<string>} The selected runner name, or "" when none is configured.
 */
async function resolveRunnerNameFromPool({ dbPath, role, seed, attempts, config }) {
  const list = normalizeRunnerList(config?.roles?.[role] ?? config?.roles?.main);
  if (list.length === 0) return "";
  if (list.length === 1) return list[0];

  const pool = config?.supervisor?.runnerPool;
  const mode = normalizeRunnerPoolMode(pool?.mode);
  if (mode !== "promotion") return resolveRoleRunnerPick(role, config, { seed, attempt: attempts });

  // No prior attempt: always start with the first runner in the list.
  if (!(Number.isFinite(attempts) && attempts > 0)) return list[0];

  // Locate the previously used runner. KV read failures are deliberately treated as
  // "nothing recorded", and an unknown or missing runner counts as the first entry.
  const lastRunnerRow = await kvGet({ dbPath, nodeId: seed, key: "out.last_runner" }).catch(() => null);
  const lastRunner = typeof lastRunnerRow?.value_text === "string" ? lastRunnerRow.value_text.trim() : "";
  const lastIdx = lastRunner ? list.indexOf(lastRunner) : -1;
  const currentIdx = lastIdx >= 0 ? lastIdx : 0;

  // Already on the last runner: nothing left to promote to.
  if (currentIdx >= list.length - 1) return list[currentIdx];

  const lastKindRow = await kvGet({ dbPath, nodeId: seed, key: "err.kind" }).catch(() => null);
  const lastKind = normalizeFailureKind(lastKindRow?.value_text);

  const promoteOn = normalizePromoteOnList(pool?.promoteOn);
  const promoteAfterAttempts = normalizePromoteAfterAttempts(pool?.promoteAfterAttempts);

  // Promote immediately when the last failure kind is listed in promoteOn.
  const shouldPromoteImmediate = lastKind && promoteOn.includes(lastKind);
  // Promote on "task_failure" once the attempt count reaches the threshold.
  // NOTE(review): `attempts` looks like the node-wide count, not attempts on the
  // current runner, so after the threshold each further task_failure would promote
  // again — confirm this is intended.
  const shouldPromoteAfterK = lastKind === "task_failure" && promoteAfterAttempts > 0 && attempts >= promoteAfterAttempts;
  if (!shouldPromoteImmediate && !shouldPromoteAfterK) return list[currentIdx];

  return list[currentIdx + 1];
}
92+
3893
function usage() {
3994
return `dagain (aliases: taskgraph, choreo)
4095
@@ -2767,7 +2822,9 @@ async function executeNode({
27672822
const role = resolveNodeRole(node);
27682823
let runnerName = typeof node?.runner === "string" ? node.runner.trim() : "";
27692824
if (!runnerName) {
2770-
runnerName = resolveRoleRunnerPick(role, config, { seed: node.id, attempt: Number(node.attempts || 0) });
2825+
const attemptsRaw = Number(node.attempts || 0);
2826+
const attempts = Number.isFinite(attemptsRaw) && attemptsRaw >= 0 ? Math.floor(attemptsRaw) : 0;
2827+
runnerName = await resolveRunnerNameFromPool({ dbPath: paths.dbPath, role, seed: node.id, attempts, config });
27712828
}
27722829

27732830
const claudeSensitiveFallback = String(config.supervisor?.claudeSensitiveFallbackRunner || "codex").trim() || "codex";
@@ -2915,17 +2972,20 @@ async function executeNode({
29152972
}
29162973

29172974
let result = await safeReadResult(resultPath);
2975+
let hadValidResult = Boolean(result);
29182976
if (!result) {
29192977
const stdoutText = await readTextTruncated(stdoutPath, 200_000);
29202978
const extracted = extractResultJson(stdoutText);
29212979
if (extracted) {
29222980
result = extracted;
2981+
hadValidResult = true;
29232982
await writeJsonAtomic(resultPath, result);
29242983
}
29252984
}
29262985
if (!result) {
29272986
await appendLine(errorsPath, `[${nowIso()}] missing/invalid result.json node=${node.id} run=${run} cmd=${execRes.cmd}`);
29282987
result = { status: "fail", summary: "Missing/invalid result.json", next: { addNodes: [], setStatus: [] }, checkpoint: null, errors: [] };
2988+
hadValidResult = false;
29292989
}
29302990

29312991
if (String(result?.status || "").toLowerCase() === "checkpoint") {
@@ -2968,7 +3028,19 @@ async function executeNode({
29683028
? String(result.errors[0] || "").trim()
29693029
: "");
29703030

3031+
const errKind =
3032+
finalStatus === "success" || finalStatus === "checkpoint"
3033+
? ""
3034+
: execRes?.timedOut
3035+
? "timeout"
3036+
: execRes?.error
3037+
? "spawn_error"
3038+
: hadValidResult
3039+
? "task_failure"
3040+
: "missing_result";
3041+
29713042
await kvPut({ dbPath: paths.dbPath, nodeId: node.id, key: "out.summary", valueText: summary, runId: run, attempt, nowIso: nowIso() });
3043+
await kvPut({ dbPath: paths.dbPath, nodeId: node.id, key: "out.last_runner", valueText: runnerName, runId: run, attempt, nowIso: nowIso() });
29723044
await kvPut({
29733045
dbPath: paths.dbPath,
29743046
nodeId: node.id,
@@ -2990,6 +3062,9 @@ async function executeNode({
29903062
if (errSummary) {
29913063
await kvPut({ dbPath: paths.dbPath, nodeId: node.id, key: "err.summary", valueText: errSummary, runId: run, attempt, nowIso: nowIso() });
29923064
}
3065+
if (errKind) {
3066+
await kvPut({ dbPath: paths.dbPath, nodeId: node.id, key: "err.kind", valueText: errKind, runId: run, attempt, nowIso: nowIso() });
3067+
}
29933068

29943069
const defaultRetryPolicy = resolveDefaultRetryPolicy(config);
29953070
await applyResultDb({

src/lib/config.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ export function defaultConfig() {
3333
shellVerify: { cmd: 'node "$CHOREO_SHELL_VERIFIER"' },
3434
shellMerge: { cmd: 'node "$CHOREO_SHELL_MERGE"' },
3535
codex: { cmd: "codex exec --yolo --skip-git-repo-check -" },
36+
codexMedium: { cmd: "codex exec --yolo --skip-git-repo-check -m gpt-5.2-codex -c model_reasoning_effort=medium -" },
3637
// Note: Claude forbids --dangerously-skip-permissions when running as root/sudo.
3738
// choreo strips that flag automatically in those contexts.
3839
claude: {
@@ -44,7 +45,7 @@ export function defaultConfig() {
4445
roles: {
4546
main: "codex",
4647
planner: "codex",
47-
executor: "codex",
48+
executor: "codexMedium",
4849
verifier: "codex",
4950
integrator: "codex",
5051
finalVerifier: "codex",
@@ -55,6 +56,11 @@ export function defaultConfig() {
5556
idleSleepMs: 2000,
5657
staleLockSeconds: 3600,
5758
autoResetFailedMax: 1,
59+
runnerPool: {
60+
mode: "off",
61+
promoteOn: ["timeout", "missing_result", "spawn_error"],
62+
promoteAfterAttempts: 2,
63+
},
5864
claudeSensitiveFallbackRunner: "codex",
5965
multiVerifier: "one",
6066
worktrees: { mode: "off", dir: ".dagain/worktrees" },

0 commit comments

Comments
 (0)