Skip to content

Commit a7e37b2

Browse files
christso and claude committed
refactor(core): extract PASS_THRESHOLD constant, clean up scoring module
- Extract PASS_THRESHOLD = 0.8 as single source of truth in scoring.ts
- Replace magic 0.8 in evaluate.ts and orchestrator.ts with the constant
- Add file header to scoring.ts explaining the scoring model
- Use data-driven NEGATED_VERDICT map instead of ternary chain
- Remove dead isNonEmptyString import from composite.ts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1f10cb1 commit a7e37b2

5 files changed

Lines changed: 36 additions & 24 deletions

File tree

packages/core/src/evaluation/evaluate.ts

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@ import path from 'node:path';
6161
import { buildDirectoryChain, findGitRoot } from './file-utils.js';
6262

6363
import type { AssertFn } from './assertions.js';
64+
import { PASS_THRESHOLD } from './evaluators/scoring.js';
6465
import { runEvaluation } from './orchestrator.js';
6566
import { createFunctionProvider } from './providers/function-provider.js';
6667
import { readTargetDefinitions } from './providers/targets-file.js';
@@ -165,9 +166,9 @@ export interface EvalConfig {
165166
export interface EvalSummary {
166167
/** Total number of test cases */
167168
readonly total: number;
168-
/** Number of passing test cases (score >= 0.8) */
169+
/** Number of passing test cases (score >= PASS_THRESHOLD) */
169170
readonly passed: number;
170-
/** Number of failing test cases (score < 0.8) */
171+
/** Number of failing test cases (score < PASS_THRESHOLD) */
171172
readonly failed: number;
172173
/** Total duration in milliseconds */
173174
readonly durationMs: number;
@@ -375,7 +376,7 @@ function computeSummary(results: readonly EvaluationResult[], durationMs: number
375376

376377
for (const r of results) {
377378
scoreSum += r.score;
378-
if (r.score >= 0.8) {
379+
if (r.score >= PASS_THRESHOLD) {
379380
passed++;
380381
}
381382
}

packages/core/src/evaluation/evaluators/composite.ts

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,13 +9,7 @@ import type {
99
} from '../types.js';
1010
import { executeScript } from './code-evaluator.js';
1111
import { buildOutputSchema, freeformEvaluationSchema } from './llm-grader.js';
12-
import {
13-
clampScore,
14-
isNonEmptyString,
15-
parseJsonFromText,
16-
parseJsonSafe,
17-
scoreToVerdict,
18-
} from './scoring.js';
12+
import { clampScore, parseJsonFromText, parseJsonSafe, scoreToVerdict } from './scoring.js';
1913
import type {
2014
ChildEvaluatorResult,
2115
EvaluationContext,

packages/core/src/evaluation/evaluators/index.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ export type {
1010

1111
// Scoring utilities
1212
export {
13+
PASS_THRESHOLD,
1314
clampScore,
1415
deepEqual,
1516
extractJsonBlob,

packages/core/src/evaluation/evaluators/scoring.ts

Lines changed: 28 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,26 @@
1+
/**
2+
* Scoring primitives for the evaluation engine.
3+
*
4+
* Scoring model:
5+
* score ∈ [0, 1] — continuous quality signal
6+
* verdict — binary classification derived from score via PASS_THRESHOLD
7+
*
8+
* score >= PASS_THRESHOLD → 'pass'
9+
* score < PASS_THRESHOLD → 'fail'
10+
* (infrastructure skip) → 'skip'
11+
*
12+
* To change the pass/fail boundary, update PASS_THRESHOLD.
13+
* All verdict derivation flows through scoreToVerdict().
14+
*/
15+
116
import type { EvaluationVerdict } from '../types.js';
217
import type { EvaluationScore } from './types.js';
318

19+
/** Score threshold for pass verdict. Scores below this are fail. */
20+
export const PASS_THRESHOLD = 0.8;
21+
422
export function scoreToVerdict(score: number): EvaluationVerdict {
5-
if (score >= 0.8) {
6-
return 'pass';
7-
}
8-
return 'fail';
23+
return score >= PASS_THRESHOLD ? 'pass' : 'fail';
924
}
1025

1126
export function clampScore(value: number): number {
@@ -81,18 +96,22 @@ export function deepEqual(a: unknown, b: unknown): boolean {
8196
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
8297
}
8398

99+
/** Verdict inversion map: pass↔fail, skip stays skip. */
100+
const NEGATED_VERDICT: Record<EvaluationVerdict, EvaluationVerdict> = {
101+
pass: 'fail',
102+
fail: 'pass',
103+
skip: 'skip',
104+
};
105+
84106
/**
85107
* Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
86108
* and flips passed on each assertion.
87109
*/
88110
export function negateScore(score: EvaluationScore): EvaluationScore {
89-
const negatedScore = clampScore(1 - score.score);
90-
const negatedVerdict: EvaluationVerdict =
91-
score.verdict === 'pass' ? 'fail' : score.verdict === 'fail' ? 'pass' : 'skip';
92111
return {
93112
...score,
94-
score: negatedScore,
95-
verdict: negatedVerdict,
113+
score: clampScore(1 - score.score),
114+
verdict: NEGATED_VERDICT[score.verdict],
96115
assertions: score.assertions.map((a) => ({
97116
...a,
98117
passed: !a.passed,

packages/core/src/evaluation/orchestrator.ts

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ import {
1010
type EvaluationScore,
1111
type Evaluator,
1212
LlmGraderEvaluator,
13+
PASS_THRESHOLD,
1314
negateScore,
1415
scoreToVerdict,
1516
} from './evaluators.js';
@@ -76,11 +77,8 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j
7677

7778
type MaybePromise<T> = T | Promise<T>;
7879

79-
/** Threshold for classifying ok vs quality_failure (score >= threshold → ok). */
80-
const QUALITY_PASS_THRESHOLD = 0.8;
81-
8280
function classifyQualityStatus(score: number): ExecutionStatus {
83-
return score >= QUALITY_PASS_THRESHOLD ? 'ok' : 'quality_failure';
81+
return score >= PASS_THRESHOLD ? 'ok' : 'quality_failure';
8482
}
8583

8684
function buildSkippedEvaluatorError(
@@ -2423,7 +2421,6 @@ async function runEvaluatorList(options: {
24232421
}
24242422

24252423
// Required gate: if any evaluator with `required` flag fails its threshold, aggregate becomes 0
2426-
const PASS_THRESHOLD = 0.8;
24272424
const hasRequiredFailure = scored.some((entry) => {
24282425
if (!entry.required) return false;
24292426
const minScore = typeof entry.required === 'number' ? entry.required : PASS_THRESHOLD;

0 commit comments

Comments
 (0)