Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion test/helpers/eval-store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ export interface EvalTestEntry {
last_tool_call?: string; // e.g. "Write(review-output.md)"

// Model + timing diagnostics (added for Sonnet/Opus split)
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-7'
first_response_ms?: number; // time from spawn to first NDJSON line
max_inter_turn_ms?: number; // peak latency between consecutive tool calls

Expand Down
4 changes: 2 additions & 2 deletions test/skill-e2e-design.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
timeout: 360_000,
testName: 'design-consultation-core',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/design-consultation core', result);
Expand Down Expand Up @@ -227,7 +227,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
timeout: 360_000,
testName: 'design-consultation-existing',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/design-consultation existing', result);
Expand Down
12 changes: 6 additions & 6 deletions test/skill-e2e-plan.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
timeout: 360_000,
testName: 'plan-ceo-review',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/plan-ceo-review', result);
Expand Down Expand Up @@ -167,7 +167,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
timeout: 360_000,
testName: 'plan-ceo-review-selective',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/plan-ceo-review (SELECTIVE)', result);
Expand Down Expand Up @@ -233,7 +233,7 @@ Write your expansion proposals to ${planDir}/proposals.md with ONLY the proposal
timeout: 360_000,
testName: 'plan-ceo-review-expansion-energy',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/plan-ceo-review (EXPANSION ENERGY)', result);
Expand Down Expand Up @@ -333,7 +333,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
timeout: 360_000,
testName: 'plan-eng-review',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/plan-eng-review', result);
Expand Down Expand Up @@ -459,7 +459,7 @@ Write your review to ${planDir}/review-output.md`,
timeout: 360_000,
testName: 'plan-eng-review-artifact',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/plan-eng-review artifact', result);
Expand Down Expand Up @@ -679,7 +679,7 @@ This review report at the bottom of the plan is the MOST IMPORTANT deliverable o
timeout: 360_000,
testName: 'plan-review-report',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/plan-eng-review report', result);
Expand Down
2 changes: 1 addition & 1 deletion test/skill-e2e-qa-bugs.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ CRITICAL RULES:
timeout: 300_000,
testName: `qa-${label}`,
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost(`/qa ${label}`, result);
Expand Down
2 changes: 1 addition & 1 deletion test/skill-e2e-review.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ Analyze the git history and produce the narrative report as described in the SKI
timeout: 300_000,
testName: 'retro',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/retro', result);
Expand Down
2 changes: 1 addition & 1 deletion test/skill-e2e-workflow.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
timeout: 300_000,
testName: 'codex-review',
runId,
model: 'claude-opus-4-6',
model: 'claude-opus-4-7',
});

logCost('/codex review', result);
Expand Down