416 changes: 347 additions & 69 deletions packages/sdk/server-ai/__tests__/Judge.test.ts

Large diffs are not rendered by default.

137 changes: 135 additions & 2 deletions packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts
@@ -133,7 +133,7 @@ describe('config evaluation', () => {
evaluateSpy.mockRestore();
});

it('evaluates judge config successfully', async () => {
it('evaluates judge config successfully with evaluationMetricKeys (legacy)', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
@@ -159,7 +159,140 @@ describe('config evaluation', () => {
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
expect(result.evaluationMetricKeys).toEqual(['relevance', 'accuracy']);
// Should use first value from evaluationMetricKeys
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('evaluates judge config successfully with evaluationMetricKey', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKey: 'relevance',
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKey: 'helpfulness',
evaluationMetricKeys: ['relevance', 'accuracy'],
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
expect(result.evaluationMetricKey).toBe('helpfulness');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('treats empty string evaluationMetricKey as invalid and falls back to evaluationMetricKeys', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKey: '',
evaluationMetricKeys: ['relevance', 'accuracy'],
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
// Empty string should be treated as invalid, so should fall back to first value in evaluationMetricKeys
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
});

it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => {
const client = new LDAIClientImpl(mockLdClient);
const key = 'test-judge';
const defaultValue: LDAIJudgeConfigDefault = {
enabled: false,
};

const mockVariation = {
enabled: true,
model: { name: 'gpt-4' },
provider: { name: 'openai' },
evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'],
messages: [{ role: 'system', content: 'You are a judge.' }],
_ldMeta: {
variationKey: 'v1',
enabled: true,
mode: 'judge',
},
};

mockLdClient.variation.mockResolvedValue(mockVariation);

const evaluateSpy = jest.spyOn(client as any, '_evaluate');
const result = await client.judgeConfig(key, testContext, defaultValue);

expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
// Should skip empty and whitespace strings, use first valid value
expect(result.evaluationMetricKey).toBe('relevance');
expect(result.tracker).toBeDefined();
expect(result.enabled).toBe(true);
evaluateSpy.mockRestore();
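The behavior these tests pin down looks roughly like this at a call site — a minimal sketch, not code from this PR; `aiClient` and `context` are placeholder names.

```typescript
// Call-site sketch of the behavior covered by the tests above.
// `aiClient` and `context` are placeholders, not part of this PR.
const judge = await aiClient.judgeConfig('test-judge', context, { enabled: false });

if (judge.enabled && judge.evaluationMetricKey) {
  // evaluationMetricKey comes from the flag's evaluationMetricKey field when it
  // is non-blank, otherwise from the first non-blank entry of the legacy
  // evaluationMetricKeys array; blank and whitespace-only values are skipped.
  console.log(judge.evaluationMetricKey); // e.g. 'relevance'
}
```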
67 changes: 67 additions & 0 deletions packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts
@@ -813,3 +813,70 @@ describe('trackMetricsOf', () => {
);
});
});

describe('trackJudgeResponse', () => {
it('tracks evaluation metric key with score', () => {
const tracker = new LDAIConfigTrackerImpl(
mockLdClient,
configKey,
variationKey,
version,
modelName,
providerName,
testContext,
);

const judgeResponse = {
judgeConfigKey: 'test-judge',
evals: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
success: true,
};

tracker.trackJudgeResponse(judgeResponse);

expect(mockTrack).toHaveBeenCalledWith(
'relevance',
testContext,
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
0.8,
);
});

it('tracks multiple evaluation metrics when present', () => {
const tracker = new LDAIConfigTrackerImpl(
mockLdClient,
configKey,
variationKey,
version,
modelName,
providerName,
testContext,
);

const judgeResponse = {
judgeConfigKey: 'test-judge',
evals: {
relevance: { score: 0.8, reasoning: 'Relevant' },
accuracy: { score: 0.9, reasoning: 'Accurate' },
},
success: true,
};

tracker.trackJudgeResponse(judgeResponse);

expect(mockTrack).toHaveBeenCalledWith(
'relevance',
testContext,
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
0.8,
);
expect(mockTrack).toHaveBeenCalledWith(
'accuracy',
testContext,
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
0.9,
);
});
});
4 changes: 2 additions & 2 deletions packages/sdk/server-ai/src/api/LDAIClient.ts
@@ -156,7 +156,7 @@ export interface LDAIClient {
* enabled: true,
* model: { name: 'gpt-4' },
* provider: { name: 'openai' },
* evaluationMetricKeys: ['$ld:ai:judge:relevance'],
* evaluationMetricKey: '$ld:ai:judge:relevance',
* messages: [{ role: 'system', content: 'You are a relevance judge.' }]
* }, variables);
*
@@ -303,7 +303,7 @@ export interface LDAIClient {
* enabled: true,
* model: { name: "gpt-4" },
* provider: { name: "openai" },
* evaluationMetricKeys: ['$ld:ai:judge:relevance'],
* evaluationMetricKey: '$ld:ai:judge:relevance',
* messages: [{ role: 'system', content: 'You are a relevance judge.' }]
* },
* { metric: "relevance" }
18 changes: 16 additions & 2 deletions packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts
@@ -29,6 +29,7 @@ export interface LDAIConfigFlagValue {
messages?: LDMessage[];
provider?: LDProviderConfig;
instructions?: string;
evaluationMetricKey?: string;
evaluationMetricKeys?: string[];
judgeConfiguration?: LDJudgeConfiguration;
}
@@ -65,6 +66,9 @@ export class LDAIConfigUtils {
if ('instructions' in config && config.instructions !== undefined) {
flagValue.instructions = config.instructions;
}
if ('evaluationMetricKey' in config && config.evaluationMetricKey !== undefined) {
flagValue.evaluationMetricKey = config.evaluationMetricKey;
}
if ('evaluationMetricKeys' in config && config.evaluationMetricKeys !== undefined) {
flagValue.evaluationMetricKeys = config.evaluationMetricKeys;
}
Expand Down Expand Up @@ -121,7 +125,6 @@ export class LDAIConfigUtils {
key,
enabled: false,
tracker: undefined,
evaluationMetricKeys: [],
} as LDAIJudgeConfig;
case 'completion':
default:
@@ -202,11 +205,22 @@
flagValue: LDAIConfigFlagValue,
tracker: LDAIConfigTracker,
): LDAIJudgeConfig {
// Prioritize evaluationMetricKey, fallback to first valid (non-empty, non-whitespace) value in evaluationMetricKeys
let evaluationMetricKey: string | undefined;
if (flagValue.evaluationMetricKey && flagValue.evaluationMetricKey.trim().length > 0) {
evaluationMetricKey = flagValue.evaluationMetricKey.trim();
} else if (flagValue.evaluationMetricKeys && flagValue.evaluationMetricKeys.length > 0) {
const validKey = flagValue.evaluationMetricKeys.find(
(metricKey) => metricKey && metricKey.trim().length > 0,
);
evaluationMetricKey = validKey ? validKey.trim() : undefined;
}

return {
...this._toBaseConfig(key, flagValue),
tracker,
messages: flagValue.messages,
evaluationMetricKeys: flagValue.evaluationMetricKeys || [],
evaluationMetricKey,
};
}
}
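Read on its own, the metric-key resolution introduced above amounts to the following rule — a standalone restatement for illustration only; `resolveEvaluationMetricKey` is a hypothetical name, not an SDK export.

```typescript
// Hypothetical restatement of the resolution rule in the judge-config conversion above.
function resolveEvaluationMetricKey(
  evaluationMetricKey?: string,
  evaluationMetricKeys?: string[],
): string | undefined {
  // Prefer the new single-key field when it is non-blank.
  if (evaluationMetricKey && evaluationMetricKey.trim().length > 0) {
    return evaluationMetricKey.trim();
  }
  // Otherwise fall back to the first non-blank entry of the legacy array.
  const fallback = evaluationMetricKeys?.find((key) => key && key.trim().length > 0);
  return fallback?.trim();
}

// Mirrors the test expectations:
// resolveEvaluationMetricKey('helpfulness', ['relevance', 'accuracy']) -> 'helpfulness'
// resolveEvaluationMetricKey('', ['', ' ', 'relevance'])               -> 'relevance'
// resolveEvaluationMetricKey(undefined, undefined)                     -> undefined
```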
18 changes: 15 additions & 3 deletions packages/sdk/server-ai/src/api/config/types.ts
@@ -154,8 +154,14 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault {
*/
messages?: LDMessage[];
/**
* Evaluation metric keys for judge configurations.
* Evaluation metric key for judge configurations.
* The key of the metric that this judge can evaluate.
*/
evaluationMetricKey?: string;
/**
* Evaluation metric keys for judge configurations (legacy).
* The keys of the metrics that this judge can evaluate.
* @deprecated Use evaluationMetricKey instead. This field is kept for legacy support.
*/
evaluationMetricKeys?: string[];
}
@@ -211,10 +217,16 @@ export interface LDAIJudgeConfig extends LDAIConfig {
*/
messages?: LDMessage[];
/**
* Evaluation metric keys for judge configurations.
* Evaluation metric key for judge configurations.
* The key of the metric that this judge can evaluate.
*/
evaluationMetricKey?: string;
/**
* Evaluation metric keys for judge configurations (legacy).
* The keys of the metrics that this judge can evaluate.
* @deprecated Use evaluationMetricKey instead. This field is kept for legacy support.
*/
evaluationMetricKeys: string[];
evaluationMetricKeys?: string[];
}

// ============================================================================
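To make the deprecation concrete, a judge config default can be written in either shape; the resolution logic above keeps only a single key. This is a hedged sketch: the import path and variable names are assumptions, and the field values follow the JSDoc examples earlier in this diff.

```typescript
// Assumed import path; adjust to however the SDK is consumed in your project.
import type { LDAIJudgeConfigDefault } from '@launchdarkly/server-sdk-ai';

// Preferred: a single metric key per judge.
const preferredDefault: LDAIJudgeConfigDefault = {
  enabled: true,
  model: { name: 'gpt-4' },
  provider: { name: 'openai' },
  evaluationMetricKey: '$ld:ai:judge:relevance',
  messages: [{ role: 'system', content: 'You are a relevance judge.' }],
};

// Legacy: still accepted but deprecated; only the first non-blank key is used.
const legacyDefault: LDAIJudgeConfigDefault = {
  enabled: true,
  model: { name: 'gpt-4' },
  provider: { name: 'openai' },
  evaluationMetricKeys: ['$ld:ai:judge:relevance'],
  messages: [{ role: 'system', content: 'You are a relevance judge.' }],
};
```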
23 changes: 9 additions & 14 deletions packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts
@@ -3,15 +3,20 @@
* Not exported - only used internally by TrackedJudge.
*/
class EvaluationSchemaBuilder {
static build(evaluationMetricKeys: string[]): Record<string, unknown> {
static build(evaluationMetricKey?: string): Record<string, unknown> {
if (!evaluationMetricKey) {
return {};
}
return {
type: 'object',
properties: {
evaluations: {
type: 'object',
description: `Object containing evaluation results for ${evaluationMetricKeys.join(', ')} metrics`,
properties: this._buildKeyProperties(evaluationMetricKeys),
required: evaluationMetricKeys,
description: `Object containing evaluation results for ${evaluationMetricKey} metric`,
properties: {
[evaluationMetricKey]: this._buildKeySchema(evaluationMetricKey),
},
required: [evaluationMetricKey],
additionalProperties: false,
},
},
@@ -20,16 +25,6 @@ class EvaluationSchemaBuilder {
} as const;
}

private static _buildKeyProperties(evaluationMetricKeys: string[]) {
return evaluationMetricKeys.reduce(
(acc, key) => {
acc[key] = this._buildKeySchema(key);
return acc;
},
{} as Record<string, unknown>,
);
}

private static _buildKeySchema(key: string) {
return {
type: 'object',
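For reference, `EvaluationSchemaBuilder.build('relevance')` now yields a schema of roughly the following shape (and `{}` when no key is given). This is a hedged sketch: the per-metric `score`/`reasoning` properties are an assumption inferred from the judge-response shape used in the tracker tests, since `_buildKeySchema` is truncated in this diff, and the outer `required`/`additionalProperties` settings are omitted for the same reason.

```typescript
// Approximate output of EvaluationSchemaBuilder.build('relevance').
// The per-key score/reasoning schema is assumed; _buildKeySchema is truncated above.
const exampleSchema = {
  type: 'object',
  properties: {
    evaluations: {
      type: 'object',
      description: 'Object containing evaluation results for relevance metric',
      properties: {
        relevance: {
          type: 'object',
          properties: {
            score: { type: 'number' },
            reasoning: { type: 'string' },
          },
          required: ['score', 'reasoning'],
        },
      },
      required: ['relevance'],
      additionalProperties: false,
    },
  },
  // Outer required/additionalProperties omitted: not visible in this hunk.
};
```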