Skip to content

Commit cad21e8

Browse files
committed
fix: enforce hard wall-clock timeout on runTrackedJob
Wrap the runner in Promise.race against a hard wall-clock timeout (30 minutes by default). On expiry the job transitions to status "failed" with phase "failed", so zombie 'running' rows can't accumulate when a runner hangs. The default can be overridden per call via options.timeoutMs or globally via OPENCODE_COMPANION_JOB_TIMEOUT_MS. Closes #41. Port of openai/codex-plugin-cc#184.
1 parent 260d84b commit cad21e8

File tree

2 files changed

+127
-2
lines changed

2 files changed

+127
-2
lines changed

plugins/opencode/scripts/lib/tracked-jobs.mjs

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,22 @@ import { generateJobId, upsertJob, jobLogPath, jobDataPath } from "./state.mjs";
77

88
const SESSION_ID_ENV = "OPENCODE_COMPANION_SESSION_ID";
99

10+
// Hard ceiling for any single tracked job. 30 minutes is generous enough for
11+
// long OpenCode turns but bounded so a hung runner cannot keep the companion
12+
// process alive forever. Override via OPENCODE_COMPANION_JOB_TIMEOUT_MS.
13+
const DEFAULT_JOB_TIMEOUT_MS = 30 * 60 * 1000;
14+
15+
/**
 * Resolve the hard wall-clock timeout, in milliseconds, for a tracked job.
 *
 * Precedence: a finite, positive `options.timeoutMs`; then a finite, positive
 * OPENCODE_COMPANION_JOB_TIMEOUT_MS environment value; then
 * DEFAULT_JOB_TIMEOUT_MS. Caller-supplied values are capped at the largest
 * delay `setTimeout` can honor.
 *
 * @param {{ timeoutMs?: number } | null} [options]
 * @returns {number} timeout in milliseconds
 */
function resolveJobTimeoutMs(options = {}) {
  // Timer delays are stored as a signed 32-bit integer. Node.js resets any
  // larger delay to 1 ms, which would fail the job almost immediately instead
  // of granting the very long timeout that was requested, so clamp instead.
  const MAX_TIMER_DELAY_MS = 2 ** 31 - 1;
  const isUsable = (value) => Number.isFinite(value) && value > 0;

  // The default parameter only covers `undefined`; `?.` also tolerates an
  // explicit `null` options argument.
  const fromOptions = options?.timeoutMs;
  if (isUsable(fromOptions)) {
    return Math.min(fromOptions, MAX_TIMER_DELAY_MS);
  }

  // Number(undefined) is NaN and Number("") is 0; both are rejected by
  // isUsable, so an unset or empty variable falls through to the default.
  const fromEnv = Number(process.env.OPENCODE_COMPANION_JOB_TIMEOUT_MS);
  if (isUsable(fromEnv)) {
    return Math.min(fromEnv, MAX_TIMER_DELAY_MS);
  }

  return DEFAULT_JOB_TIMEOUT_MS;
}
25+
1026
/**
1127
* Get the current Claude session ID from environment.
1228
* @returns {string|undefined}
@@ -41,9 +57,10 @@ export function createJobRecord(workspacePath, type, meta = {}) {
4157
* @param {string} workspacePath
4258
* @param {object} job
4359
* @param {(ctx: { report: Function, log: Function }) => Promise<object>} runner
60+
* @param {{ timeoutMs?: number }} [options]
4461
* @returns {Promise<object>} the job result
4562
*/
46-
export async function runTrackedJob(workspacePath, job, runner) {
63+
export async function runTrackedJob(workspacePath, job, runner, options = {}) {
4764
// Mark as running
4865
upsertJob(workspacePath, { id: job.id, status: "running", pid: process.pid });
4966

@@ -61,9 +78,35 @@ export async function runTrackedJob(workspacePath, job, runner) {
6178
appendLine(logFile, `[${new Date().toISOString()}] ${message}`);
6279
};
6380

81+
// Race the runner against a hard wall-clock timeout so a hung runner
82+
// (dropped SSE stream, wedged post-response handler, unresolved downstream
83+
// fetch) cannot leave the job in `running` forever. See issue #41.
84+
const timeoutMs = resolveJobTimeoutMs(options);
85+
let timeoutHandle = null;
86+
const timeoutPromise = new Promise((_resolve, reject) => {
87+
timeoutHandle = setTimeout(() => {
88+
reject(
89+
new Error(
90+
`Tracked job ${job.id} exceeded the ${Math.round(timeoutMs / 1000)}s hard timeout. ` +
91+
"The runner did not produce a terminal status. " +
92+
"Set OPENCODE_COMPANION_JOB_TIMEOUT_MS to adjust."
93+
)
94+
);
95+
}, timeoutMs);
96+
timeoutHandle.unref?.();
97+
});
98+
99+
const clearTimer = () => {
100+
if (timeoutHandle) {
101+
clearTimeout(timeoutHandle);
102+
timeoutHandle = null;
103+
}
104+
};
105+
64106
try {
65107
report("starting", `Job ${job.id} started`);
66-
const result = await runner({ report, log });
108+
const result = await Promise.race([runner({ report, log }), timeoutPromise]);
109+
clearTimer();
67110

68111
// Mark as completed
69112
upsertJob(workspacePath, {
@@ -81,9 +124,11 @@ export async function runTrackedJob(workspacePath, job, runner) {
81124
report("completed", `Job ${job.id} completed`);
82125
return result;
83126
} catch (err) {
127+
clearTimer();
84128
upsertJob(workspacePath, {
85129
id: job.id,
86130
status: "failed",
131+
phase: "failed",
87132
completedAt: new Date().toISOString(),
88133
errorMessage: err.message,
89134
});
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import { describe, it, before, after } from "node:test";
2+
import assert from "node:assert/strict";
3+
import fs from "node:fs";
4+
import os from "node:os";
5+
import path from "node:path";
6+
import { runTrackedJob } from "../plugins/opencode/scripts/lib/tracked-jobs.mjs";
7+
import { loadState } from "../plugins/opencode/scripts/lib/state.mjs";
8+
9+
// Exercises the hard wall-clock timeout of runTrackedJob: a hung runner must
// reject with the timeout error and leave the job row in a failed state, while
// a fast runner must be unaffected.
describe("runTrackedJob timeout", () => {
  let workspace;
  let pluginDataDir;
  let previousPluginData;

  before(() => {
    // Fresh temp dirs so the suite never touches real workspace state.
    // NOTE(review): presumably state.mjs persists job state under
    // CLAUDE_PLUGIN_DATA — confirm against state.mjs.
    workspace = fs.mkdtempSync(path.join(os.tmpdir(), "opencode-timeout-"));
    pluginDataDir = fs.mkdtempSync(
      path.join(os.tmpdir(), "opencode-timeout-data-")
    );
    previousPluginData = process.env.CLAUDE_PLUGIN_DATA;
    process.env.CLAUDE_PLUGIN_DATA = pluginDataDir;
  });

  after(() => {
    // Restore the caller's environment exactly (unset vs. set matters).
    if (previousPluginData == null) {
      delete process.env.CLAUDE_PLUGIN_DATA;
    } else {
      process.env.CLAUDE_PLUGIN_DATA = previousPluginData;
    }

    // Remove the temp directories created in `before` so repeated runs do not
    // accumulate stale directories under os.tmpdir().
    for (const dir of [workspace, pluginDataDir]) {
      if (dir) {
        fs.rmSync(dir, { recursive: true, force: true });
      }
    }
  });

  it("aborts a runner that never resolves and marks the job failed", async () => {
    const job = { id: "timeout-never-1" };

    await assert.rejects(
      runTrackedJob(
        workspace,
        job,
        // A promise that never settles stands in for a hung runner.
        () => new Promise(() => {}),
        { timeoutMs: 50 }
      ),
      /hard timeout/i
    );

    const state = loadState(workspace);
    const stored = state.jobs.find((j) => j.id === job.id);
    assert.ok(stored);
    assert.equal(stored.status, "failed");
    assert.equal(stored.phase, "failed");
    assert.match(stored.errorMessage, /hard timeout/i);
  });

  it("does not fire for runners that resolve quickly", async () => {
    const job = { id: "timeout-quick-1" };

    const result = await runTrackedJob(
      workspace,
      job,
      async () => ({ rendered: "ok" }),
      { timeoutMs: 60_000 }
    );

    assert.equal(result.rendered, "ok");
    const state = loadState(workspace);
    const stored = state.jobs.find((j) => j.id === job.id);
    assert.equal(stored.status, "completed");
  });

  it("honors OPENCODE_COMPANION_JOB_TIMEOUT_MS env override", async () => {
    const previous = process.env.OPENCODE_COMPANION_JOB_TIMEOUT_MS;
    process.env.OPENCODE_COMPANION_JOB_TIMEOUT_MS = "40";
    try {
      const job = { id: "timeout-env-1" };
      // No options.timeoutMs here, so only the env value can trigger the timeout.
      await assert.rejects(
        runTrackedJob(workspace, job, () => new Promise(() => {})),
        /hard timeout/i
      );

      // The env-driven timeout must persist the same failed state as the
      // option-driven one.
      const state = loadState(workspace);
      const stored = state.jobs.find((j) => j.id === job.id);
      assert.ok(stored);
      assert.equal(stored.status, "failed");
    } finally {
      if (previous == null) delete process.env.OPENCODE_COMPANION_JOB_TIMEOUT_MS;
      else process.env.OPENCODE_COMPANION_JOB_TIMEOUT_MS = previous;
    }
  });
});

0 commit comments

Comments
 (0)