Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions agents/iron-loop/iron-loop-executor.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,46 @@ if [ -f "plans/.stop-after-current" ]; then
fi
```

## API Overload (529) Handling

When the Anthropic API returns HTTP 529 (overloaded), your response depends on whether
any tool writes have been made **in the current step**:

### Pre-write overload — no writes made yet in the current step

1. Write `status: "overload-retry"` to the plan's `.status` file:
```json
{
"agent": "iron-loop-executor",
"status": "overload-retry",
"message": "API overloaded (529) — no writes made, safe to retry",
"retry_at": "<ISO timestamp: now + overloadIntervalSeconds from .ctoc/settings.json, default 600s>"
}
```
2. If `ScheduleWakeup` is available in your context, call it with the same interval.
3. Exit cleanly. The dashboard will display `⏳ retry in Xm — <plan>`. The operator
can restart the agent via the menu when ready.

### Mid-write overload — at least one file was written in the current step

1. Write `status: "overload-partial"` to the plan's `.status` file:
```json
{
"agent": "iron-loop-executor",
"status": "overload-partial",
"message": "API overloaded (529) after partial writes — human review required before resuming"
}
```
2. Exit cleanly. The dashboard will display `⚠ partial write — review: <plan>`.
3. **Do NOT auto-retry.** A human must inspect what was written and decide whether
to continue or roll back before restarting the agent.

### Tracking writes within a step

Before making any tool write in a step, note the last completed checkpoint (the most
recent `[x]` checkbox), and from then on keep track of whether you have written any file
in the current step. If a 529 occurs before you have written anything in the current
step, use `overload-retry`. If a 529 occurs after at least one write in the current
step, use `overload-partial`.

## Error Handling

If a step fails:
Expand Down
55 changes: 48 additions & 7 deletions src/lib/actions.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const fs = require('fs');
const path = require('path');
const { parseMetadata } = require('./state');
const { refineLoop, appendDeferredQuestions } = require('./iron-loop');
const { writeStatus, clearStatus } = require('./background');
const { writeStatus, clearStatus, readStatus } = require('./background');
const { findProjectRoot } = require('./project-root');
const { validateForReview, validateTransition, formatValidationResult } = require('./plan-validator');
const { logTransition } = require('./transition-log');
Expand Down Expand Up @@ -495,11 +495,45 @@ function startAgent(projectPath) {
// 2. Clear any leftover stop flag
clearStop(root);

// 3. Clean up stale in-progress plans (D2)
// 3. Clean up stale in-progress plans (skips overload-retry / overload-partial)
const cleanedUp = cleanupStaleInProgress(root);

// 4. Get next plan from todo queue
// 4. Check for an overload-retry plan that is ready to resume.
// These stay in in-progress across agent restarts — do not pick a new todo plan.
const plansDir = getPlansDir(root);
const inProgressPlans = readPlans(path.join(plansDir, 'in-progress'));
const retryPlan = inProgressPlans.find(p => p.bgStatus === 'overload-retry');
if (retryPlan) {
// Clear overload-retry status so the executor resumes normally
clearStatus(retryPlan.path);
updateLockPlan(root, retryPlan.name);
setAgentStatus(root, {
active: true,
plan: retryPlan.name,
step: 8,
phase: 'TEST',
task: 'Resuming after API overload'
});
return {
started: true,
resumed: true,
plan: { name: retryPlan.name, path: retryPlan.path },
cleanedUp,
remainingTodo: readPlans(path.join(plansDir, 'todo')).length
};
}

// 5. Block if an overload-partial plan is in-progress — requires human gate.
const partialPlan = inProgressPlans.find(p => p.bgStatus === 'overload-partial');
if (partialPlan) {
releaseLock(root);
return {
started: false,
error: `Plan "${partialPlan.name}" has a partial write from an API overload. Review the in-progress plan and clear the .status file before restarting the agent.`
};
}

// 6. Get next plan from todo queue
const todoPlans = readPlans(path.join(plansDir, 'todo'));

if (todoPlans.length === 0) {
Expand All @@ -511,16 +545,16 @@ function startAgent(projectPath) {
};
}

// 5. Pick oldest plan (FIFO — already sorted by readPlans)
// 7. Pick oldest plan (FIFO — already sorted by readPlans)
const nextPlan = todoPlans[0];

// 6. Update lock with actual plan name
// 8. Update lock with actual plan name
updateLockPlan(root, nextPlan.name);

// 7. Move plan to in-progress
// 9. Move plan to in-progress
const newPath = startExecution(nextPlan.path, root);

// 8. Update agent status for dashboard display
// 10. Update agent status for dashboard display
setAgentStatus(root, {
active: true,
plan: nextPlan.name,
Expand Down Expand Up @@ -704,6 +738,13 @@ function cleanupStaleInProgress(projectPath) {
const cleanedUp = [];

for (const plan of plans) {
// Skip plans in overload states — they need special handling, not cleanup.
// overload-retry: executor will resume; overload-partial: human gate required.
const planStatus = readStatus(plan.path);
if (planStatus.status === 'overload-retry' || planStatus.status === 'overload-partial') {
continue;
}

// Log cleanup event to .ctoc/logs/cleanup.json
const logDir = path.join(root, '.ctoc', 'logs');
fs.mkdirSync(logDir, { recursive: true });
Expand Down
47 changes: 45 additions & 2 deletions src/lib/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ function getStatusPath(planPath) {
* @param {string} planPath - Path to the plan file
* @param {Object} status - Status object
* @param {string} status.agent - Agent type (research-assistant, implementation-planner, etc.)
* @param {string} status.status - Current status (working, complete, needs-input, timeout)
* @param {string} status.status - Current status (working, complete, needs-input, timeout, overload-retry, overload-partial)
* @param {string} [status.message] - Optional status message
* @param {string} [status.retry_at] - ISO timestamp for next retry (overload-retry only)
*/
function writeStatus(planPath, status) {
const statusPath = getStatusPath(planPath);
Expand All @@ -34,14 +35,18 @@ function writeStatus(planPath, status) {
updatedAt: new Date().toISOString()
};

if (status.retry_at) {
statusObj.retry_at = status.retry_at;
}

fs.writeFileSync(statusPath, JSON.stringify(statusObj, null, 2));
return statusObj;
}

/**
* Read background agent status
* @param {string} planPath - Path to the plan file
* @returns {Object} Status object with status field (none, working, complete, needs-input, timeout)
* @returns {Object} Status object with status field (none, working, complete, needs-input, timeout, overload-retry, overload-partial)
*/
function readStatus(planPath) {
const statusPath = getStatusPath(planPath);
Expand Down Expand Up @@ -87,6 +92,10 @@ function getStatusIcon(status) {
return '⚠'; // Background agent needs input
case 'timeout':
return '✗'; // Timed out
case 'overload-retry':
return '⏳'; // API overload — scheduled retry
case 'overload-partial':
return '⚠'; // API overload mid-write — human review needed
default:
return '○';
}
Expand Down Expand Up @@ -152,6 +161,38 @@ function markTimeout(planPath) {
});
}

/**
 * Record an `overload-retry` status for a plan.
 *
 * Used when the API answered 529 before any write happened in the current step,
 * so the step can be retried automatically once the configured interval elapses.
 * The agent name and start time already stored for the plan are carried over;
 * when no agent is recorded, `iron-loop-executor` is used.
 *
 * @param {string} planPath - Path to the plan file
 * @param {string} retryAt - ISO timestamp at which the retry should happen
 */
function markOverloadRetry(planPath, retryAt) {
  const { agent, started } = readStatus(planPath);

  const retryStatus = {
    agent: agent || 'iron-loop-executor',
    status: 'overload-retry',
    started,
    message: 'API overloaded (529) — no writes made, safe to retry',
    retry_at: retryAt
  };

  writeStatus(planPath, retryStatus);
}

/**
 * Record an `overload-partial` status for a plan.
 *
 * Used when the API answered 529 after some writes were already made in the
 * current step. A human has to review the plan before it is resumed, so that
 * work is not duplicated or left in an inconsistent state.
 * The agent name and start time already stored for the plan are carried over;
 * when no agent is recorded, `iron-loop-executor` is used.
 *
 * @param {string} planPath - Path to the plan file
 */
function markOverloadPartial(planPath) {
  const previous = readStatus(planPath);
  const agentName = previous.agent || 'iron-loop-executor';

  writeStatus(planPath, {
    agent: agentName,
    status: 'overload-partial',
    started: previous.started,
    message: 'API overloaded (529) after partial writes — human review required before resuming'
  });
}

/**
* Get all status files in a directory
* @param {string} dirPath - Directory to scan
Expand Down Expand Up @@ -206,6 +247,8 @@ module.exports = {
markComplete,
markNeedsInput,
markTimeout,
markOverloadRetry,
markOverloadPartial,
getAllStatuses,
cleanupStale
};
11 changes: 11 additions & 0 deletions src/lib/menu-screens.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,17 @@ function buildDashboardTable(projectPath) {
out += '\n';
} else if (agent.stale) {
out += ` ⚠ Stale lock: ${agent.stalePlan || 'unknown'} (process died)\n`;
} else if (agent.overloadRetry) {
const retryLabel = agent.retryAt
? (() => {
const diffMs = new Date(agent.retryAt).getTime() - Date.now();
const diffMin = Math.ceil(diffMs / 60000);
return diffMin > 0 ? `retry in ${diffMin}m` : 'ready to retry';
})()
: 'retry pending';
out += ` ⏳ ${retryLabel} — ${agent.plan}\n`;
} else if (agent.overloadPartial) {
out += ` ⚠ partial write — review: ${agent.plan}\n`;
} else {
out += ` ○ Idle\n`;
}
Expand Down
9 changes: 8 additions & 1 deletion src/lib/settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ const SETTINGS_TABS = [
{ id: 'learning', name: 'Learning' },
{ id: 'git', name: 'Git' },
{ id: 'privacy', name: 'Privacy' },
{ id: 'deployment', name: 'Deployment' }
{ id: 'deployment', name: 'Deployment' },
{ id: 'retry', name: 'Retry' }
];

const SETTINGS_SCHEMA = {
Expand Down Expand Up @@ -84,6 +85,12 @@ const SETTINGS_SCHEMA = {
{ key: 'productionApproval', label: 'Production approval', type: 'select', options: ['auto', 'manual'], default: 'manual' },
{ key: 'autoRollback', label: 'Auto-rollback on failure', type: 'toggle', default: true }
]
},
retry: {
label: 'Retry Settings',
settings: [
{ key: 'overloadIntervalSeconds', label: 'API overload retry interval (seconds)', type: 'number', default: 600 }
]
}
};

Expand Down
27 changes: 26 additions & 1 deletion src/lib/state.js
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,32 @@ function getAgentStatus(projectPath) {
};
}

// No lock file — agent is idle
// No lock file — check in-progress plans for overload statuses
const plansDir = getPlansDir(root);
const inProgressDir = path.join(plansDir, 'in-progress');
if (fs.existsSync(inProgressDir)) {
const mdFiles = fs.readdirSync(inProgressDir).filter(f => f.endsWith('.md'));
for (const f of mdFiles) {
const planPath = path.join(inProgressDir, f);
const status = readStatus(planPath);
if (status.status === 'overload-retry') {
return {
active: false,
overloadRetry: true,
plan: f.replace('.md', ''),
retryAt: status.retry_at || null
};
}
if (status.status === 'overload-partial') {
return {
active: false,
overloadPartial: true,
plan: f.replace('.md', '')
};
}
}
}

return { active: false };
}

Expand Down
Loading