416 changes: 347 additions & 69 deletions packages/sdk/server-ai/__tests__/Judge.test.ts

Large diffs are not rendered by default.

137 changes: 135 additions & 2 deletions packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts
@@ -133,7 +133,7 @@ describe('config evaluation', () => {
evaluateSpy.mockRestore();
});

it('evaluates judge config successfully', async () => {
it('evaluates judge config successfully with evaluationMetricKeys (legacy)', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
@@ -159,7 +159,140 @@ describe('config evaluation', () => {
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
expect(result.evaluationMetricKeys).toEqual(['relevance', 'accuracy']);
// Should use first value from evaluationMetricKeys
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('evaluates judge config successfully with evaluationMetricKey', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKey: 'relevance',
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKey: 'helpfulness',
evaluationMetricKeys: ['relevance', 'accuracy'],
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
expect(result.evaluationMetricKey).toBe('helpfulness');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('treats empty string evaluationMetricKey as invalid and falls back to evaluationMetricKeys', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKey: '',
evaluationMetricKeys: ['relevance', 'accuracy'],
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
// Empty string should be treated as invalid, so should fall back to first value in evaluationMetricKeys
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'],
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
// Should skip empty and whitespace strings, use first valid value
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
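The behavior these tests pin down looks roughly like this at a call site — a minimal sketch, not code from this PR; `aiClient` and `context` are placeholder names.

```typescript
// Call-site sketch of the behavior covered by the tests above.
// `aiClient` and `context` are placeholders, not part of this PR.
const judge = await aiClient.judgeConfig('test-judge', context, { enabled: false });

if (judge.enabled && judge.evaluationMetricKey) {
  // evaluationMetricKey comes from the flag's evaluationMetricKey field when it
  // is non-blank, otherwise from the first non-blank entry of the legacy
  // evaluationMetricKeys array; blank and whitespace-only values are skipped.
  console.log(judge.evaluationMetricKey); // e.g. 'relevance'
}
```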
67 changes: 67 additions & 0 deletions packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts
@@ -813,3 +813,70 @@ describe('trackMetricsOf', () => {
);
});
});

describe('trackJudgeResponse', () => {
it('tracks evaluation metric key with score', () => {
const tracker = new LDAIConfigTrackerImpl(
mockLdClient,
configKey,
variationKey,
version,
modelName,
providerName,
testContext,
);

const judgeResponse = {
judgeConfigKey: 'test-judge',
evals: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
success: true,
};

tracker.trackJudgeResponse(judgeResponse);

expect(mockTrack).toHaveBeenCalledWith(
'relevance',
testContext,
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
0.8,
);
});

it('tracks multiple evaluation metrics when present', () => {
const tracker = new LDAIConfigTrackerImpl(
mockLdClient,
configKey,
variationKey,
version,
modelName,
providerName,
testContext,
);

const judgeResponse = {
judgeConfigKey: 'test-judge',
evals: {
relevance: { score: 0.8, reasoning: 'Relevant' },
accuracy: { score: 0.9, reasoning: 'Accurate' },
},
success: true,
};

tracker.trackJudgeResponse(judgeResponse);

expect(mockTrack).toHaveBeenCalledWith(
'relevance',
testContext,
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
0.8,
);
expect(mockTrack).toHaveBeenCalledWith(
'accuracy',
testContext,
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
0.9,
);
});
});
4 changes: 2 additions & 2 deletions packages/sdk/server-ai/src/api/LDAIClient.ts
@@ -156,7 +156,7 @@ export interface LDAIClient {
* enabled: true,
* model: { name: 'gpt-4' },
* provider: { name: 'openai' },
* evaluationMetricKeys: ['$ld:ai:judge:relevance'],
* evaluationMetricKey: '$ld:ai:judge:relevance',
* messages: [{ role: 'system', content: 'You are a relevance judge.' }]
* }, variables);
*
@@ -303,7 +303,7 @@ export interface LDAIClient {
* enabled: true,
* model: { name: "gpt-4" },
* provider: { name: "openai" },
* evaluationMetricKeys: ['$ld:ai:judge:relevance'],
* evaluationMetricKey: '$ld:ai:judge:relevance',
* messages: [{ role: 'system', content: 'You are a relevance judge.' }]
* },
* { metric: "relevance" }
18 changes: 16 additions & 2 deletions packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts
@@ -29,6 +29,7 @@ export interface LDAIConfigFlagValue {
messages?: LDMessage[];
provider?: LDProviderConfig;
instructions?: string;
evaluationMetricKey?: string;
evaluationMetricKeys?: string[];
judgeConfiguration?: LDJudgeConfiguration;
}
@@ -65,6 +66,9 @@ export class LDAIConfigUtils {
if ('instructions' in config && config.instructions !== undefined) {
flagValue.instructions = config.instructions;
}
if ('evaluationMetricKey' in config && config.evaluationMetricKey !== undefined) {
flagValue.evaluationMetricKey = config.evaluationMetricKey;
}
if ('evaluationMetricKeys' in config && config.evaluationMetricKeys !== undefined) {
flagValue.evaluationMetricKeys = config.evaluationMetricKeys;
}
Expand Down Expand Up @@ -121,7 +125,6 @@ export class LDAIConfigUtils {
key,
enabled: false,
tracker: undefined,
evaluationMetricKeys: [],
} as LDAIJudgeConfig;
case 'completion':
default:
@@ -202,11 +205,22 @@
flagValue: LDAIConfigFlagValue,
tracker: LDAIConfigTracker,
): LDAIJudgeConfig {
// Prioritize evaluationMetricKey, fallback to first valid (non-empty, non-whitespace) value in evaluationMetricKeys
let evaluationMetricKey: string | undefined;
if (flagValue.evaluationMetricKey && flagValue.evaluationMetricKey.trim().length > 0) {
evaluationMetricKey = flagValue.evaluationMetricKey.trim();
} else if (flagValue.evaluationMetricKeys && flagValue.evaluationMetricKeys.length > 0) {
const validKey = flagValue.evaluationMetricKeys.find(
(metricKey) => metricKey && metricKey.trim().length > 0,
);
evaluationMetricKey = validKey ? validKey.trim() : undefined;
}

return {
...this._toBaseConfig(key, flagValue),
tracker,
messages: flagValue.messages,
evaluationMetricKeys: flagValue.evaluationMetricKeys || [],
evaluationMetricKey,
};
}
}
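Read on its own, the metric-key resolution introduced above amounts to the following rule — a standalone restatement for illustration only; `resolveEvaluationMetricKey` is a hypothetical name, not an SDK export.

```typescript
// Hypothetical restatement of the resolution rule in the judge-config conversion above.
function resolveEvaluationMetricKey(
  evaluationMetricKey?: string,
  evaluationMetricKeys?: string[],
): string | undefined {
  // Prefer the new single-key field when it is non-blank.
  if (evaluationMetricKey && evaluationMetricKey.trim().length > 0) {
    return evaluationMetricKey.trim();
  }
  // Otherwise fall back to the first non-blank entry of the legacy array.
  const fallback = evaluationMetricKeys?.find((key) => key && key.trim().length > 0);
  return fallback?.trim();
}

// Mirrors the test expectations:
// resolveEvaluationMetricKey('helpfulness', ['relevance', 'accuracy']) -> 'helpfulness'
// resolveEvaluationMetricKey('', ['', ' ', 'relevance'])               -> 'relevance'
// resolveEvaluationMetricKey(undefined, undefined)                     -> undefined
```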
18 changes: 15 additions & 3 deletions packages/sdk/server-ai/src/api/config/types.ts
@@ -154,8 +154,14 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault {
*/
messages?: LDMessage[];
/**
* Evaluation metric keys for judge configurations.
* Evaluation metric key for judge configurations.
* The key of the metric that this judge can evaluate.
*/
evaluationMetricKey?: string;
/**
* Evaluation metric keys for judge configurations (legacy).
* The keys of the metrics that this judge can evaluate.
* @deprecated Use evaluationMetricKey instead. This field is kept for legacy support.
*/
evaluationMetricKeys?: string[];
}
@@ -211,10 +217,16 @@ export interface LDAIJudgeConfig extends LDAIConfig {
*/
messages?: LDMessage[];
/**
* Evaluation metric keys for judge configurations.
* Evaluation metric key for judge configurations.
* The key of the metric that this judge can evaluate.
*/
evaluationMetricKey?: string;
/**
* Evaluation metric keys for judge configurations (legacy).
* The keys of the metrics that this judge can evaluate.
* @deprecated Use evaluationMetricKey instead. This field is kept for legacy support.
*/
evaluationMetricKeys: string[];
evaluationMetricKeys?: string[];
}

// ============================================================================
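To make the deprecation concrete, a judge config default can be written in either shape; the resolution logic above keeps only a single key. This is a hedged sketch: the import path and variable names are assumptions, and the field values follow the JSDoc examples earlier in this diff.

```typescript
// Assumed import path; adjust to however the SDK is consumed in your project.
import type { LDAIJudgeConfigDefault } from '@launchdarkly/server-sdk-ai';

// Preferred: a single metric key per judge.
const preferredDefault: LDAIJudgeConfigDefault = {
  enabled: true,
  model: { name: 'gpt-4' },
  provider: { name: 'openai' },
  evaluationMetricKey: '$ld:ai:judge:relevance',
  messages: [{ role: 'system', content: 'You are a relevance judge.' }],
};

// Legacy: still accepted but deprecated; only the first non-blank key is used.
const legacyDefault: LDAIJudgeConfigDefault = {
  enabled: true,
  model: { name: 'gpt-4' },
  provider: { name: 'openai' },
  evaluationMetricKeys: ['$ld:ai:judge:relevance'],
  messages: [{ role: 'system', content: 'You are a relevance judge.' }],
};
```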
23 changes: 9 additions & 14 deletions packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts
@@ -3,15 +3,20 @@
* Not exported - only used internally by TrackedJudge.
*/
class EvaluationSchemaBuilder {
static build(evaluationMetricKeys: string[]): Record<string, unknown> {
static build(evaluationMetricKey?: string): Record<string, unknown> {
if (!evaluationMetricKey) {
return {};
}
return {
type: 'object',
properties: {
evaluations: {
type: 'object',
description: `Object containing evaluation results for ${evaluationMetricKeys.join(', ')} metrics`,
properties: this._buildKeyProperties(evaluationMetricKeys),
required: evaluationMetricKeys,
description: `Object containing evaluation results for ${evaluationMetricKey} metric`,
properties: {
[evaluationMetricKey]: this._buildKeySchema(evaluationMetricKey),
},
required: [evaluationMetricKey],
additionalProperties: false,
},
},
@@ -20,16 +25,6 @@ class EvaluationSchemaBuilder {
} as const;
}

private static _buildKeyProperties(evaluationMetricKeys: string[]) {
return evaluationMetricKeys.reduce(
(acc, key) => {
acc[key] = this._buildKeySchema(key);
return acc;
},
{} as Record<string, unknown>,
);
}

private static _buildKeySchema(key: string) {
return {
type: 'object',
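For reference, `EvaluationSchemaBuilder.build('relevance')` now yields a schema of roughly the following shape (and `{}` when no key is given). This is a hedged sketch: the per-metric `score`/`reasoning` properties are an assumption inferred from the judge-response shape used in the tracker tests, since `_buildKeySchema` is truncated in this diff, and the outer `required`/`additionalProperties` settings are omitted for the same reason.

```typescript
// Approximate output of EvaluationSchemaBuilder.build('relevance').
// The per-key score/reasoning schema is assumed; _buildKeySchema is truncated above.
const exampleSchema = {
  type: 'object',
  properties: {
    evaluations: {
      type: 'object',
      description: 'Object containing evaluation results for relevance metric',
      properties: {
        relevance: {
          type: 'object',
          properties: {
            score: { type: 'number' },
            reasoning: { type: 'string' },
          },
          required: ['score', 'reasoning'],
        },
      },
      required: ['relevance'],
      additionalProperties: false,
    },
  },
  // Outer required/additionalProperties omitted: not visible in this hunk.
};
```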