diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index 2485095742..6c8985b914 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -19,25 +19,21 @@ describe('Judge', () => { }; beforeEach(() => { - // Mock the AIProvider - only mock what's actually used mockProvider = { invokeStructuredModel: jest.fn(), } as any; - // Mock the LDAIConfigTracker - only mock what's actually used mockTracker = { trackMetricsOf: jest.fn(), getTrackData: jest.fn().mockReturnValue(mockTrackData), } as any; - // Mock the logger - only mock what's actually used mockLogger = { debug: jest.fn(), warn: jest.fn(), error: jest.fn(), } as any; - // Create a basic judge config judgeConfig = { key: 'test-judge', enabled: true, @@ -52,7 +48,7 @@ describe('Judge', () => { model: { name: 'gpt-4' }, provider: { name: 'openai' }, tracker: mockTracker, - evaluationMetricKeys: ['relevance', 'accuracy', 'helpfulness'], + evaluationMetricKey: 'relevance', }; }); @@ -76,15 +72,11 @@ describe('Judge', () => { data: { evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }, rawResponse: JSON.stringify({ evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }), metrics: { @@ -111,14 +103,6 @@ describe('Judge', () => { score: 0.8, reasoning: 'The response is relevant to the question', }, - accuracy: { - score: 0.9, - reasoning: 'The response is factually accurate', - }, - helpfulness: { - score: 0.7, - reasoning: 'The response provides helpful information', - }, }, success: true, judgeConfigKey: 'test-judge', @@ -140,25 +124,51 @@ describe('Judge', () => { ); }); + it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => { + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.85, reasoning: 'Highly relevant response' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.85, reasoning: 'Highly relevant response' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judge.evaluate('test input', 'test output'); + + expect(result).toBeDefined(); + expect(result?.evals).toHaveProperty('relevance'); + expect(result?.evals.relevance.score).toBe(0.85); + expect(result?.judgeConfigKey).toBe('test-judge'); + expect(result?.success).toBe(true); + // Verify the evaluationMetricKey from config is used in the result + expect(Object.keys(result?.evals || {})).toContain(judgeConfig.evaluationMetricKey); + }); + it('handles sampling rate correctly', async () => { - // Mock Math.random to return 0.3 (should be sampled with rate 0.5 since 0.3 <= 0.5) const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.3); - // Mock the structured response const mockStructuredResponse: StructuredResponse = { data: { evaluations: { relevance: { score: 
0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, }, rawResponse: JSON.stringify({ evaluations: { relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, }), metrics: { @@ -179,7 +189,6 @@ describe('Judge', () => { }); it('returns undefined when not sampled', async () => { - // Mock Math.random to return 0.8 (should not be sampled with rate 0.5 since 0.8 > 0.5) const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.8); @@ -194,9 +203,10 @@ describe('Judge', () => { Math.random = originalRandom; }); - it('returns undefined when evaluationMetricKeys is empty', async () => { + it('returns undefined when evaluationMetricKey and evaluationMetricKeys are both missing', async () => { const configWithoutMetrics: LDAIJudgeConfig = { ...judgeConfig, + evaluationMetricKey: undefined, evaluationMetricKeys: [], }; const judgeWithoutMetrics = new Judge( @@ -210,11 +220,183 @@ describe('Judge', () => { expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( - 'Judge configuration is missing required evaluationMetricKeys', + 'Judge configuration is missing required evaluation metric key', mockTrackData, ); }); + it('uses evaluationMetricKey when provided', async () => { + const configWithSingleKey: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: 'relevance', + evaluationMetricKeys: undefined, + }; + const judgeWithSingleKey = new Judge( + configWithSingleKey, + mockTracker, + mockProvider, + mockLogger, + ); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithSingleKey.evaluate('test input', 'test output'); + + expect(result).toEqual({ + evals: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + + it('falls back to first value in evaluationMetricKeys when evaluationMetricKey is not provided', async () => { + const configWithLegacyKeys: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: undefined, + evaluationMetricKeys: ['relevance', 'accuracy'], + }; + const judgeWithLegacyKeys = new Judge( + configWithLegacyKeys, + mockTracker, + mockProvider, + mockLogger, + ); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithLegacyKeys.evaluate('test input', 'test output'); + + expect(result).toEqual({ + evals: { + relevance: { score: 0.8, reasoning: 'The 
response is relevant' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + + it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => { + const configWithInvalidKeys: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: undefined, + evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'], + }; + const judgeWithInvalidKeys = new Judge( + configWithInvalidKeys, + mockTracker, + mockProvider, + mockLogger, + ); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithInvalidKeys.evaluate('test input', 'test output'); + + // Should skip empty and whitespace strings, use first valid value + expect(result).toEqual({ + evals: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + + it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => { + const configWithBoth: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: 'helpfulness', + evaluationMetricKeys: ['relevance', 'accuracy'], + }; + const judgeWithBoth = new Judge(configWithBoth, mockTracker, mockProvider, mockLogger); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithBoth.evaluate('test input', 'test output'); + + expect(result).toEqual({ + evals: { + helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + it('returns undefined when messages are missing', async () => { const configWithoutMessages: LDAIJudgeConfig = { ...judgeConfig, @@ -236,19 +418,16 @@ describe('Judge', () => { ); }); - it('returns partial evaluations when some metrics are missing', async () => { + it('returns empty evaluations with success false when expected metric is missing', async () => { const mockStructuredResponse: StructuredResponse = { data: { evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - // accuracy is missing - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, }, }, rawResponse: JSON.stringify({ evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, }, }), metrics: { @@ -262,12 +441,8 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); - // When one metric is missing, it returns the partial evals it has with success: false 
expect(result).toEqual({ - evals: { - relevance: { score: 0.8, reasoning: 'Good' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, - }, + evals: {}, success: false, judgeConfigKey: 'test-judge', }); @@ -276,7 +451,6 @@ describe('Judge', () => { it('returns empty evaluations when response structure is malformed', async () => { const mockStructuredResponse: StructuredResponse = { data: { - // Missing 'evaluations' wrapper - malformed structure relevance: { score: 0.8, reasoning: 'Good' }, accuracy: { score: 0.9, reasoning: 'Accurate' }, helpfulness: { score: 0.7, reasoning: 'Helpful' }, @@ -297,7 +471,6 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); - // When the structure is completely wrong, returns empty evals with success: false expect(result).toEqual({ evals: {}, success: false, @@ -355,15 +528,11 @@ describe('Judge', () => { data: { evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }, rawResponse: JSON.stringify({ evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }), metrics: { @@ -383,14 +552,6 @@ describe('Judge', () => { score: 0.8, reasoning: 'The response is relevant to the question', }, - accuracy: { - score: 0.9, - reasoning: 'The response is factually accurate', - }, - helpfulness: { - score: 0.7, - reasoning: 'The response provides helpful information', - }, }, success: true, judgeConfigKey: 'test-judge', @@ -419,7 +580,6 @@ describe('Judge', () => { metrics: { success: true }, }; - // Mock Math.random to return 0.8 (should not be sampled with rate 0.5 since 0.8 > 0.5) const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.8); @@ -440,7 +600,6 @@ describe('Judge', () => { }); it('constructs evaluation messages correctly', () => { - // Access private method for testing // eslint-disable-next-line no-underscore-dangle const constructMessages = (judge as any)._constructEvaluationMessages.bind(judge); const messages = constructMessages('test input', 'test output'); @@ -471,17 +630,13 @@ describe('Judge', () => { const responseData = { evaluations: { relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, }; - const result = parseResponse(responseData); + const result = parseResponse(responseData, 'relevance'); expect(result).toEqual({ relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }); }); @@ -490,12 +645,10 @@ describe('Judge', () => { const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); const responseData = { relevance: { score: 0.8, reasoning: 'Good' }, - // Missing evaluations wrapper - invalid structure }; - const result = parseResponse(responseData); + const result = parseResponse(responseData, 'relevance'); - // Returns empty object when evaluations structure is missing expect(result).toEqual({}); }); @@ -504,18 +657,143 @@ describe('Judge', () => { const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); const responseData = { evaluations: { - 
relevance: { score: 0.8 }, // Missing reasoning - accuracy: { reasoning: 'Accurate' }, // Missing score - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + relevance: { score: 0.8 }, }, }; - const result = parseResponse(responseData); + const result = parseResponse(responseData, 'relevance'); - // Only helpfulness passes validation, relevance and accuracy are skipped - expect(result).toEqual({ - helpfulness: { score: 0.7, reasoning: 'Helpful' }, - }); + expect(result).toEqual({}); + }); + + it('handles invalid score values out of range', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: { score: 1.5, reasoning: 'Good' }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Invalid score evaluated for relevance: 1.5'), + mockTrackData, + ); + }); + + it('handles negative score values', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: { score: -0.1, reasoning: 'Good' }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Invalid score evaluated for relevance: -0.1'), + mockTrackData, + ); + }); + + it('handles invalid reasoning type', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: { score: 0.8, reasoning: 123 }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Invalid reasoning evaluated for relevance: 123'), + mockTrackData, + ); + }); + + it('handles missing evaluation when key does not exist in response', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + accuracy: { score: 0.9, reasoning: 'Accurate' }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Missing evaluation for metric key: relevance', + mockTrackData, + ); + }); + + it('handles empty evaluationMetricKeys array fallback', async () => { + const configWithEmptyKeys: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: undefined, + evaluationMetricKeys: [], + }; + const judgeWithEmptyKeys = new Judge( + configWithEmptyKeys, + mockTracker, + mockProvider, + mockLogger, + ); + + const result = await judgeWithEmptyKeys.evaluate('test input', 'test output'); + + expect(result).toBeUndefined(); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Judge configuration is missing required evaluation metric key', + mockTrackData, + ); + }); + + it('handles evaluation value that is not an object', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: 'not an object', + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + 
expect(mockLogger.warn).toHaveBeenCalledWith( + 'Missing evaluation for metric key: relevance', + mockTrackData, + ); + }); + + it('handles null evaluation value', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: null, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Missing evaluation for metric key: relevance', + mockTrackData, + ); }); }); }); diff --git a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts index bfb5e13ff0..3c64f5234b 100644 --- a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts @@ -133,7 +133,7 @@ describe('config evaluation', () => { evaluateSpy.mockRestore(); }); - it('evaluates judge config successfully', async () => { + it('evaluates judge config successfully with evaluationMetricKeys (legacy)', async () => { const client = new LDAIClientImpl(mockLdClient); const key = 'test-judge'; const defaultValue: LDAIJudgeConfigDefault = { @@ -159,7 +159,140 @@ describe('config evaluation', () => { const result = await client.judgeConfig(key, testContext, defaultValue); expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); - expect(result.evaluationMetricKeys).toEqual(['relevance', 'accuracy']); + // Should use first value from evaluationMetricKeys + expect(result.evaluationMetricKey).toBe('relevance'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('evaluates judge config successfully with evaluationMetricKey', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: 'relevance', + messages: [{ role: 'system', content: 'You are a judge.' }], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(result.evaluationMetricKey).toBe('relevance'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: 'helpfulness', + evaluationMetricKeys: ['relevance', 'accuracy'], + messages: [{ role: 'system', content: 'You are a judge.' 
}], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(result.evaluationMetricKey).toBe('helpfulness'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('treats empty string evaluationMetricKey as invalid and falls back to evaluationMetricKeys', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: '', + evaluationMetricKeys: ['relevance', 'accuracy'], + messages: [{ role: 'system', content: 'You are a judge.' }], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + // Empty string should be treated as invalid, so should fall back to first value in evaluationMetricKeys + expect(result.evaluationMetricKey).toBe('relevance'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'], + messages: [{ role: 'system', content: 'You are a judge.' 
}], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + // Should skip empty and whitespace strings, use first valid value + expect(result.evaluationMetricKey).toBe('relevance'); expect(result.tracker).toBeDefined(); expect(result.enabled).toBe(true); evaluateSpy.mockRestore(); diff --git a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts index 02965a5e5f..f126cc4cda 100644 --- a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts @@ -813,3 +813,70 @@ describe('trackMetricsOf', () => { ); }); }); + +describe('trackJudgeResponse', () => { + it('tracks evaluation metric key with score', () => { + const tracker = new LDAIConfigTrackerImpl( + mockLdClient, + configKey, + variationKey, + version, + modelName, + providerName, + testContext, + ); + + const judgeResponse = { + judgeConfigKey: 'test-judge', + evals: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + success: true, + }; + + tracker.trackJudgeResponse(judgeResponse); + + expect(mockTrack).toHaveBeenCalledWith( + 'relevance', + testContext, + { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, + 0.8, + ); + }); + + it('tracks multiple evaluation metrics when present', () => { + const tracker = new LDAIConfigTrackerImpl( + mockLdClient, + configKey, + variationKey, + version, + modelName, + providerName, + testContext, + ); + + const judgeResponse = { + judgeConfigKey: 'test-judge', + evals: { + relevance: { score: 0.8, reasoning: 'Relevant' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, + }, + success: true, + }; + + tracker.trackJudgeResponse(judgeResponse); + + expect(mockTrack).toHaveBeenCalledWith( + 'relevance', + testContext, + { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, + 0.8, + ); + expect(mockTrack).toHaveBeenCalledWith( + 'accuracy', + testContext, + { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, + 0.9, + ); + }); +}); diff --git a/packages/sdk/server-ai/src/api/LDAIClient.ts b/packages/sdk/server-ai/src/api/LDAIClient.ts index 22319d76e0..86433e8c36 100644 --- a/packages/sdk/server-ai/src/api/LDAIClient.ts +++ b/packages/sdk/server-ai/src/api/LDAIClient.ts @@ -156,7 +156,7 @@ export interface LDAIClient { * enabled: true, * model: { name: 'gpt-4' }, * provider: { name: 'openai' }, - * evaluationMetricKeys: ['$ld:ai:judge:relevance'], + * evaluationMetricKey: '$ld:ai:judge:relevance', * messages: [{ role: 'system', content: 'You are a relevance judge.' }] * }, variables); * @@ -303,7 +303,7 @@ export interface LDAIClient { * enabled: true, * model: { name: "gpt-4" }, * provider: { name: "openai" }, - * evaluationMetricKeys: ['$ld:ai:judge:relevance'], + * evaluationMetricKey: '$ld:ai:judge:relevance', * messages: [{ role: 'system', content: 'You are a relevance judge.' 
}] * }, * { metric: "relevance" } diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts index cd943be6c6..2a926f1c87 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts @@ -29,6 +29,7 @@ export interface LDAIConfigFlagValue { messages?: LDMessage[]; provider?: LDProviderConfig; instructions?: string; + evaluationMetricKey?: string; evaluationMetricKeys?: string[]; judgeConfiguration?: LDJudgeConfiguration; } @@ -65,6 +66,9 @@ export class LDAIConfigUtils { if ('instructions' in config && config.instructions !== undefined) { flagValue.instructions = config.instructions; } + if ('evaluationMetricKey' in config && config.evaluationMetricKey !== undefined) { + flagValue.evaluationMetricKey = config.evaluationMetricKey; + } if ('evaluationMetricKeys' in config && config.evaluationMetricKeys !== undefined) { flagValue.evaluationMetricKeys = config.evaluationMetricKeys; } @@ -121,7 +125,6 @@ export class LDAIConfigUtils { key, enabled: false, tracker: undefined, - evaluationMetricKeys: [], } as LDAIJudgeConfig; case 'completion': default: @@ -202,11 +205,22 @@ export class LDAIConfigUtils { flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAIJudgeConfig { + // Prioritize evaluationMetricKey, fallback to first valid (non-empty, non-whitespace) value in evaluationMetricKeys + let evaluationMetricKey: string | undefined; + if (flagValue.evaluationMetricKey && flagValue.evaluationMetricKey.trim().length > 0) { + evaluationMetricKey = flagValue.evaluationMetricKey.trim(); + } else if (flagValue.evaluationMetricKeys && flagValue.evaluationMetricKeys.length > 0) { + const validKey = flagValue.evaluationMetricKeys.find( + (metricKey) => metricKey && metricKey.trim().length > 0, + ); + evaluationMetricKey = validKey ? validKey.trim() : undefined; + } + return { ...this._toBaseConfig(key, flagValue), tracker, messages: flagValue.messages, - evaluationMetricKeys: flagValue.evaluationMetricKeys || [], + evaluationMetricKey, }; } } diff --git a/packages/sdk/server-ai/src/api/config/types.ts b/packages/sdk/server-ai/src/api/config/types.ts index ade099037b..15f3766851 100644 --- a/packages/sdk/server-ai/src/api/config/types.ts +++ b/packages/sdk/server-ai/src/api/config/types.ts @@ -154,8 +154,14 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault { */ messages?: LDMessage[]; /** - * Evaluation metric keys for judge configurations. + * Evaluation metric key for judge configurations. + * The key of the metric that this judge can evaluate. + */ + evaluationMetricKey?: string; + /** + * Evaluation metric keys for judge configurations (legacy). * The keys of the metrics that this judge can evaluate. + * @deprecated Use evaluationMetricKey instead. This field is kept for legacy support. */ evaluationMetricKeys?: string[]; } @@ -211,10 +217,16 @@ export interface LDAIJudgeConfig extends LDAIConfig { */ messages?: LDMessage[]; /** - * Evaluation metric keys for judge configurations. + * Evaluation metric key for judge configurations. + * The key of the metric that this judge can evaluate. + */ + evaluationMetricKey?: string; + /** + * Evaluation metric keys for judge configurations (legacy). * The keys of the metrics that this judge can evaluate. + * @deprecated Use evaluationMetricKey instead. This field is kept for legacy support. 
*/ - evaluationMetricKeys: string[]; + evaluationMetricKeys?: string[]; } // ============================================================================ diff --git a/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts b/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts index 16d9ce651d..06f745a418 100644 --- a/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts +++ b/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts @@ -3,15 +3,20 @@ * Not exported - only used internally by TrackedJudge. */ class EvaluationSchemaBuilder { - static build(evaluationMetricKeys: string[]): Record { + static build(evaluationMetricKey?: string): Record { + if (!evaluationMetricKey) { + return {}; + } return { type: 'object', properties: { evaluations: { type: 'object', - description: `Object containing evaluation results for ${evaluationMetricKeys.join(', ')} metrics`, - properties: this._buildKeyProperties(evaluationMetricKeys), - required: evaluationMetricKeys, + description: `Object containing evaluation results for ${evaluationMetricKey} metric`, + properties: { + [evaluationMetricKey]: this._buildKeySchema(evaluationMetricKey), + }, + required: [evaluationMetricKey], additionalProperties: false, }, }, @@ -20,16 +25,6 @@ class EvaluationSchemaBuilder { } as const; } - private static _buildKeyProperties(evaluationMetricKeys: string[]) { - return evaluationMetricKeys.reduce( - (acc, key) => { - acc[key] = this._buildKeySchema(key); - return acc; - }, - {} as Record, - ); - } - private static _buildKeySchema(key: string) { return { type: 'object', diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index e71a43bdd3..382addc632 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -26,9 +26,30 @@ export class Judge { logger?: LDLogger, ) { this._logger = logger; - this._evaluationResponseStructure = EvaluationSchemaBuilder.build( - this._aiConfig.evaluationMetricKeys, - ); + const evaluationMetricKey = this._getEvaluationMetricKey(); + this._evaluationResponseStructure = EvaluationSchemaBuilder.build(evaluationMetricKey); + } + + /** + * Gets the evaluation metric key, prioritizing evaluationMetricKey over evaluationMetricKeys. + * Falls back to the first valid (non-empty, non-whitespace) value in evaluationMetricKeys if evaluationMetricKey is not provided. + * Treats empty strings and whitespace-only strings as invalid. + * @returns The evaluation metric key, or undefined if not available + */ + private _getEvaluationMetricKey(): string | undefined { + if ( + this._aiConfig.evaluationMetricKey && + this._aiConfig.evaluationMetricKey.trim().length > 0 + ) { + return this._aiConfig.evaluationMetricKey.trim(); + } + if (this._aiConfig.evaluationMetricKeys && this._aiConfig.evaluationMetricKeys.length > 0) { + const validKey = this._aiConfig.evaluationMetricKeys.find( + (key) => key && key.trim().length > 0, + ); + return validKey ? 
validKey.trim() : undefined; + } + return undefined; } /** @@ -45,12 +66,10 @@ export class Judge { samplingRate: number = 1, ): Promise { try { - if ( - !this._aiConfig.evaluationMetricKeys || - this._aiConfig.evaluationMetricKeys.length === 0 - ) { + const evaluationMetricKey = this._getEvaluationMetricKey(); + if (!evaluationMetricKey) { this._logger?.warn( - 'Judge configuration is missing required evaluationMetricKeys', + 'Judge configuration is missing required evaluation metric key', this._aiConfigTracker.getTrackData(), ); return undefined; @@ -78,11 +97,11 @@ export class Judge { let { success } = response.metrics; - const evals = this._parseEvaluationResponse(response.data); + const evals = this._parseEvaluationResponse(response.data, evaluationMetricKey); - if (Object.keys(evals).length !== this._aiConfig.evaluationMetricKeys.length) { + if (!evals[evaluationMetricKey]) { this._logger?.warn( - 'Judge evaluation did not return all evaluations', + 'Judge evaluation did not return the expected evaluation', this._aiConfigTracker.getTrackData(), ); success = false; @@ -169,7 +188,10 @@ export class Judge { /** * Parses the structured evaluation response from the AI provider. */ - private _parseEvaluationResponse(data: Record): Record { + private _parseEvaluationResponse( + data: Record, + evaluationMetricKey: string, + ): Record { const evaluations = data.evaluations as Record; const results: Record = {}; @@ -178,40 +200,38 @@ export class Judge { return results; } - this._aiConfig.evaluationMetricKeys.forEach((metricKey) => { - const evaluation = evaluations[metricKey]; + const evaluation = evaluations[evaluationMetricKey]; - if (!evaluation || typeof evaluation !== 'object') { - this._logger?.warn( - `Missing evaluation for metric key: ${metricKey}`, - this._aiConfigTracker.getTrackData(), - ); - return; - } + if (!evaluation || typeof evaluation !== 'object') { + this._logger?.warn( + `Missing evaluation for metric key: ${evaluationMetricKey}`, + this._aiConfigTracker.getTrackData(), + ); + return results; + } - const evalData = evaluation as Record; + const evalData = evaluation as Record; - if (typeof evalData.score !== 'number' || evalData.score < 0 || evalData.score > 1) { - this._logger?.warn( - `Invalid score evaluated for ${metricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`, - this._aiConfigTracker.getTrackData(), - ); - return; - } + if (typeof evalData.score !== 'number' || evalData.score < 0 || evalData.score > 1) { + this._logger?.warn( + `Invalid score evaluated for ${evaluationMetricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`, + this._aiConfigTracker.getTrackData(), + ); + return results; + } - if (typeof evalData.reasoning !== 'string') { - this._logger?.warn( - `Invalid reasoning evaluated for ${metricKey}: ${evalData.reasoning}. Reasoning must be a string`, - this._aiConfigTracker.getTrackData(), - ); - return; - } + if (typeof evalData.reasoning !== 'string') { + this._logger?.warn( + `Invalid reasoning evaluated for ${evaluationMetricKey}: ${evalData.reasoning}. Reasoning must be a string`, + this._aiConfigTracker.getTrackData(), + ); + return results; + } - results[metricKey] = { - score: evalData.score, - reasoning: evalData.reasoning, - }; - }); + results[evaluationMetricKey] = { + score: evalData.score, + reasoning: evalData.reasoning, + }; return results; }
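
The change set above converges on a single resolution rule: prefer evaluationMetricKey when it is a non-blank string, otherwise fall back to the first non-empty, non-whitespace entry of the deprecated evaluationMetricKeys array, and treat anything else as missing. A minimal standalone sketch of that rule is shown below; JudgeKeyFields and resolveEvaluationMetricKey are illustrative names only and are not exports of the SDK, which keeps this logic private inside Judge and LDAIConfigUtils.

```ts
// Illustrative sketch of the metric-key resolution rule introduced in this change.
// `JudgeKeyFields` and `resolveEvaluationMetricKey` are hypothetical names, not SDK API.
interface JudgeKeyFields {
  evaluationMetricKey?: string;
  /** @deprecated Legacy multi-key field; only the first valid entry is used. */
  evaluationMetricKeys?: string[];
}

function resolveEvaluationMetricKey(config: JudgeKeyFields): string | undefined {
  // 1. Prefer the new single-key field when it is a non-blank string.
  if (config.evaluationMetricKey && config.evaluationMetricKey.trim().length > 0) {
    return config.evaluationMetricKey.trim();
  }
  // 2. Otherwise fall back to the first non-empty, non-whitespace legacy entry.
  const legacy = config.evaluationMetricKeys?.find((key) => key && key.trim().length > 0);
  return legacy ? legacy.trim() : undefined;
}

// The same cases the tests above exercise:
resolveEvaluationMetricKey({
  evaluationMetricKey: 'helpfulness',
  evaluationMetricKeys: ['relevance', 'accuracy'],
}); // 'helpfulness' (single key wins when both are present)

resolveEvaluationMetricKey({
  evaluationMetricKey: '',
  evaluationMetricKeys: ['', '  ', 'relevance', 'accuracy'],
}); // 'relevance' (blank strings are skipped)

resolveEvaluationMetricKey({ evaluationMetricKeys: [] }); // undefined
```

When the rule yields undefined, Judge.evaluate logs the "Judge configuration is missing required evaluation metric key" warning and returns undefined, which is the behavior the empty-key tests assert.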