diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index 2485095742..6c8985b914 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -19,25 +19,21 @@ describe('Judge', () => { }; beforeEach(() => { - // Mock the AIProvider - only mock what's actually used mockProvider = { invokeStructuredModel: jest.fn(), } as any; - // Mock the LDAIConfigTracker - only mock what's actually used mockTracker = { trackMetricsOf: jest.fn(), getTrackData: jest.fn().mockReturnValue(mockTrackData), } as any; - // Mock the logger - only mock what's actually used mockLogger = { debug: jest.fn(), warn: jest.fn(), error: jest.fn(), } as any; - // Create a basic judge config judgeConfig = { key: 'test-judge', enabled: true, @@ -52,7 +48,7 @@ describe('Judge', () => { model: { name: 'gpt-4' }, provider: { name: 'openai' }, tracker: mockTracker, - evaluationMetricKeys: ['relevance', 'accuracy', 'helpfulness'], + evaluationMetricKey: 'relevance', }; }); @@ -76,15 +72,11 @@ describe('Judge', () => { data: { evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }, rawResponse: JSON.stringify({ evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }), metrics: { @@ -111,14 +103,6 @@ describe('Judge', () => { score: 0.8, reasoning: 'The response is relevant to the question', }, - accuracy: { - score: 0.9, - reasoning: 'The response is factually accurate', - }, - helpfulness: { - score: 0.7, - reasoning: 'The response provides helpful information', - }, }, success: true, judgeConfigKey: 'test-judge', @@ -140,25 +124,51 @@ describe('Judge', () => { ); }); + it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => { + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.85, reasoning: 'Highly relevant response' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.85, reasoning: 'Highly relevant response' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judge.evaluate('test input', 'test output'); + + expect(result).toBeDefined(); + expect(result?.evals).toHaveProperty('relevance'); + expect(result?.evals.relevance.score).toBe(0.85); + expect(result?.judgeConfigKey).toBe('test-judge'); + expect(result?.success).toBe(true); + // Verify the evaluationMetricKey from config is used in the result + expect(Object.keys(result?.evals || {})).toContain(judgeConfig.evaluationMetricKey); + }); + it('handles sampling rate correctly', async () => { - // Mock Math.random to return 0.3 (should be sampled with rate 0.5 since 0.3 <= 0.5) const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.3); - // Mock the structured response const mockStructuredResponse: StructuredResponse = { data: { evaluations: { relevance: { score: 
0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, }, rawResponse: JSON.stringify({ evaluations: { relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, }), metrics: { @@ -179,7 +189,6 @@ describe('Judge', () => { }); it('returns undefined when not sampled', async () => { - // Mock Math.random to return 0.8 (should not be sampled with rate 0.5 since 0.8 > 0.5) const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.8); @@ -194,9 +203,10 @@ describe('Judge', () => { Math.random = originalRandom; }); - it('returns undefined when evaluationMetricKeys is empty', async () => { + it('returns undefined when evaluationMetricKey and evaluationMetricKeys are both missing', async () => { const configWithoutMetrics: LDAIJudgeConfig = { ...judgeConfig, + evaluationMetricKey: undefined, evaluationMetricKeys: [], }; const judgeWithoutMetrics = new Judge( @@ -210,11 +220,183 @@ describe('Judge', () => { expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( - 'Judge configuration is missing required evaluationMetricKeys', + 'Judge configuration is missing required evaluation metric key', mockTrackData, ); }); + it('uses evaluationMetricKey when provided', async () => { + const configWithSingleKey: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: 'relevance', + evaluationMetricKeys: undefined, + }; + const judgeWithSingleKey = new Judge( + configWithSingleKey, + mockTracker, + mockProvider, + mockLogger, + ); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithSingleKey.evaluate('test input', 'test output'); + + expect(result).toEqual({ + evals: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + + it('falls back to first value in evaluationMetricKeys when evaluationMetricKey is not provided', async () => { + const configWithLegacyKeys: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: undefined, + evaluationMetricKeys: ['relevance', 'accuracy'], + }; + const judgeWithLegacyKeys = new Judge( + configWithLegacyKeys, + mockTracker, + mockProvider, + mockLogger, + ); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithLegacyKeys.evaluate('test input', 'test output'); + + expect(result).toEqual({ + evals: { + relevance: { score: 0.8, reasoning: 'The 
response is relevant' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + + it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => { + const configWithInvalidKeys: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: undefined, + evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'], + }; + const judgeWithInvalidKeys = new Judge( + configWithInvalidKeys, + mockTracker, + mockProvider, + mockLogger, + ); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithInvalidKeys.evaluate('test input', 'test output'); + + // Should skip empty and whitespace strings, use first valid value + expect(result).toEqual({ + evals: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + + it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => { + const configWithBoth: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: 'helpfulness', + evaluationMetricKeys: ['relevance', 'accuracy'], + }; + const judgeWithBoth = new Judge(configWithBoth, mockTracker, mockProvider, mockLogger); + + const mockStructuredResponse: StructuredResponse = { + data: { + evaluations: { + helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, + }, + }, + rawResponse: JSON.stringify({ + evaluations: { + helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, + }, + }), + metrics: { + success: true, + usage: { total: 100, input: 50, output: 50 }, + }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockProvider.invokeStructuredModel.mockResolvedValue(mockStructuredResponse); + + const result = await judgeWithBoth.evaluate('test input', 'test output'); + + expect(result).toEqual({ + evals: { + helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, + }, + success: true, + judgeConfigKey: 'test-judge', + }); + }); + it('returns undefined when messages are missing', async () => { const configWithoutMessages: LDAIJudgeConfig = { ...judgeConfig, @@ -236,19 +418,16 @@ describe('Judge', () => { ); }); - it('returns partial evaluations when some metrics are missing', async () => { + it('returns empty evaluations with success false when expected metric is missing', async () => { const mockStructuredResponse: StructuredResponse = { data: { evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - // accuracy is missing - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, }, }, rawResponse: JSON.stringify({ evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, }, }), metrics: { @@ -262,12 +441,8 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); - // When one metric is missing, it returns the partial evals it has with success: false 
expect(result).toEqual({ - evals: { - relevance: { score: 0.8, reasoning: 'Good' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, - }, + evals: {}, success: false, judgeConfigKey: 'test-judge', }); @@ -276,7 +451,6 @@ describe('Judge', () => { it('returns empty evaluations when response structure is malformed', async () => { const mockStructuredResponse: StructuredResponse = { data: { - // Missing 'evaluations' wrapper - malformed structure relevance: { score: 0.8, reasoning: 'Good' }, accuracy: { score: 0.9, reasoning: 'Accurate' }, helpfulness: { score: 0.7, reasoning: 'Helpful' }, @@ -297,7 +471,6 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); - // When the structure is completely wrong, returns empty evals with success: false expect(result).toEqual({ evals: {}, success: false, @@ -355,15 +528,11 @@ describe('Judge', () => { data: { evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }, rawResponse: JSON.stringify({ evaluations: { relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, }, }), metrics: { @@ -383,14 +552,6 @@ describe('Judge', () => { score: 0.8, reasoning: 'The response is relevant to the question', }, - accuracy: { - score: 0.9, - reasoning: 'The response is factually accurate', - }, - helpfulness: { - score: 0.7, - reasoning: 'The response provides helpful information', - }, }, success: true, judgeConfigKey: 'test-judge', @@ -419,7 +580,6 @@ describe('Judge', () => { metrics: { success: true }, }; - // Mock Math.random to return 0.8 (should not be sampled with rate 0.5 since 0.8 > 0.5) const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.8); @@ -440,7 +600,6 @@ describe('Judge', () => { }); it('constructs evaluation messages correctly', () => { - // Access private method for testing // eslint-disable-next-line no-underscore-dangle const constructMessages = (judge as any)._constructEvaluationMessages.bind(judge); const messages = constructMessages('test input', 'test output'); @@ -471,17 +630,13 @@ describe('Judge', () => { const responseData = { evaluations: { relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, }; - const result = parseResponse(responseData); + const result = parseResponse(responseData, 'relevance'); expect(result).toEqual({ relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, }); }); @@ -490,12 +645,10 @@ describe('Judge', () => { const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); const responseData = { relevance: { score: 0.8, reasoning: 'Good' }, - // Missing evaluations wrapper - invalid structure }; - const result = parseResponse(responseData); + const result = parseResponse(responseData, 'relevance'); - // Returns empty object when evaluations structure is missing expect(result).toEqual({}); }); @@ -504,18 +657,143 @@ describe('Judge', () => { const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); const responseData = { evaluations: { - 
relevance: { score: 0.8 }, // Missing reasoning - accuracy: { reasoning: 'Accurate' }, // Missing score - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + relevance: { score: 0.8 }, }, }; - const result = parseResponse(responseData); + const result = parseResponse(responseData, 'relevance'); - // Only helpfulness passes validation, relevance and accuracy are skipped - expect(result).toEqual({ - helpfulness: { score: 0.7, reasoning: 'Helpful' }, - }); + expect(result).toEqual({}); + }); + + it('handles invalid score values out of range', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: { score: 1.5, reasoning: 'Good' }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Invalid score evaluated for relevance: 1.5'), + mockTrackData, + ); + }); + + it('handles negative score values', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: { score: -0.1, reasoning: 'Good' }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Invalid score evaluated for relevance: -0.1'), + mockTrackData, + ); + }); + + it('handles invalid reasoning type', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: { score: 0.8, reasoning: 123 }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Invalid reasoning evaluated for relevance: 123'), + mockTrackData, + ); + }); + + it('handles missing evaluation when key does not exist in response', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + accuracy: { score: 0.9, reasoning: 'Accurate' }, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Missing evaluation for metric key: relevance', + mockTrackData, + ); + }); + + it('handles empty evaluationMetricKeys array fallback', async () => { + const configWithEmptyKeys: LDAIJudgeConfig = { + ...judgeConfig, + evaluationMetricKey: undefined, + evaluationMetricKeys: [], + }; + const judgeWithEmptyKeys = new Judge( + configWithEmptyKeys, + mockTracker, + mockProvider, + mockLogger, + ); + + const result = await judgeWithEmptyKeys.evaluate('test input', 'test output'); + + expect(result).toBeUndefined(); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Judge configuration is missing required evaluation metric key', + mockTrackData, + ); + }); + + it('handles evaluation value that is not an object', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: 'not an object', + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + 
expect(mockLogger.warn).toHaveBeenCalledWith( + 'Missing evaluation for metric key: relevance', + mockTrackData, + ); + }); + + it('handles null evaluation value', () => { + // eslint-disable-next-line no-underscore-dangle + const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); + const responseData = { + evaluations: { + relevance: null, + }, + }; + + const result = parseResponse(responseData, 'relevance'); + + expect(result).toEqual({}); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Missing evaluation for metric key: relevance', + mockTrackData, + ); }); }); }); diff --git a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts index bfb5e13ff0..3c64f5234b 100644 --- a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts @@ -133,7 +133,7 @@ describe('config evaluation', () => { evaluateSpy.mockRestore(); }); - it('evaluates judge config successfully', async () => { + it('evaluates judge config successfully with evaluationMetricKeys (legacy)', async () => { const client = new LDAIClientImpl(mockLdClient); const key = 'test-judge'; const defaultValue: LDAIJudgeConfigDefault = { @@ -159,7 +159,140 @@ describe('config evaluation', () => { const result = await client.judgeConfig(key, testContext, defaultValue); expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); - expect(result.evaluationMetricKeys).toEqual(['relevance', 'accuracy']); + // Should use first value from evaluationMetricKeys + expect(result.evaluationMetricKey).toBe('relevance'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('evaluates judge config successfully with evaluationMetricKey', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: 'relevance', + messages: [{ role: 'system', content: 'You are a judge.' }], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(result.evaluationMetricKey).toBe('relevance'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: 'helpfulness', + evaluationMetricKeys: ['relevance', 'accuracy'], + messages: [{ role: 'system', content: 'You are a judge.' 
}], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(result.evaluationMetricKey).toBe('helpfulness'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('treats empty string evaluationMetricKey as invalid and falls back to evaluationMetricKeys', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: '', + evaluationMetricKeys: ['relevance', 'accuracy'], + messages: [{ role: 'system', content: 'You are a judge.' }], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + // Empty string should be treated as invalid, so should fall back to first value in evaluationMetricKeys + expect(result.evaluationMetricKey).toBe('relevance'); + expect(result.tracker).toBeDefined(); + expect(result.enabled).toBe(true); + evaluateSpy.mockRestore(); + }); + + it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockVariation = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'], + messages: [{ role: 'system', content: 'You are a judge.' 
}], + _ldMeta: { + variationKey: 'v1', + enabled: true, + mode: 'judge', + }, + }; + + mockLdClient.variation.mockResolvedValue(mockVariation); + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + // Should skip empty and whitespace strings, use first valid value + expect(result.evaluationMetricKey).toBe('relevance'); expect(result.tracker).toBeDefined(); expect(result.enabled).toBe(true); evaluateSpy.mockRestore(); diff --git a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts index 02965a5e5f..f126cc4cda 100644 --- a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts @@ -813,3 +813,70 @@ describe('trackMetricsOf', () => { ); }); }); + +describe('trackJudgeResponse', () => { + it('tracks evaluation metric key with score', () => { + const tracker = new LDAIConfigTrackerImpl( + mockLdClient, + configKey, + variationKey, + version, + modelName, + providerName, + testContext, + ); + + const judgeResponse = { + judgeConfigKey: 'test-judge', + evals: { + relevance: { score: 0.8, reasoning: 'The response is relevant' }, + }, + success: true, + }; + + tracker.trackJudgeResponse(judgeResponse); + + expect(mockTrack).toHaveBeenCalledWith( + 'relevance', + testContext, + { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, + 0.8, + ); + }); + + it('tracks multiple evaluation metrics when present', () => { + const tracker = new LDAIConfigTrackerImpl( + mockLdClient, + configKey, + variationKey, + version, + modelName, + providerName, + testContext, + ); + + const judgeResponse = { + judgeConfigKey: 'test-judge', + evals: { + relevance: { score: 0.8, reasoning: 'Relevant' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, + }, + success: true, + }; + + tracker.trackJudgeResponse(judgeResponse); + + expect(mockTrack).toHaveBeenCalledWith( + 'relevance', + testContext, + { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, + 0.8, + ); + expect(mockTrack).toHaveBeenCalledWith( + 'accuracy', + testContext, + { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, + 0.9, + ); + }); +}); diff --git a/packages/sdk/server-ai/src/api/LDAIClient.ts b/packages/sdk/server-ai/src/api/LDAIClient.ts index 22319d76e0..86433e8c36 100644 --- a/packages/sdk/server-ai/src/api/LDAIClient.ts +++ b/packages/sdk/server-ai/src/api/LDAIClient.ts @@ -156,7 +156,7 @@ export interface LDAIClient { * enabled: true, * model: { name: 'gpt-4' }, * provider: { name: 'openai' }, - * evaluationMetricKeys: ['$ld:ai:judge:relevance'], + * evaluationMetricKey: '$ld:ai:judge:relevance', * messages: [{ role: 'system', content: 'You are a relevance judge.' }] * }, variables); * @@ -303,7 +303,7 @@ export interface LDAIClient { * enabled: true, * model: { name: "gpt-4" }, * provider: { name: "openai" }, - * evaluationMetricKeys: ['$ld:ai:judge:relevance'], + * evaluationMetricKey: '$ld:ai:judge:relevance', * messages: [{ role: 'system', content: 'You are a relevance judge.' 
}] * }, * { metric: "relevance" } diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts index cd943be6c6..2a926f1c87 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts @@ -29,6 +29,7 @@ export interface LDAIConfigFlagValue { messages?: LDMessage[]; provider?: LDProviderConfig; instructions?: string; + evaluationMetricKey?: string; evaluationMetricKeys?: string[]; judgeConfiguration?: LDJudgeConfiguration; } @@ -65,6 +66,9 @@ export class LDAIConfigUtils { if ('instructions' in config && config.instructions !== undefined) { flagValue.instructions = config.instructions; } + if ('evaluationMetricKey' in config && config.evaluationMetricKey !== undefined) { + flagValue.evaluationMetricKey = config.evaluationMetricKey; + } if ('evaluationMetricKeys' in config && config.evaluationMetricKeys !== undefined) { flagValue.evaluationMetricKeys = config.evaluationMetricKeys; } @@ -121,7 +125,6 @@ export class LDAIConfigUtils { key, enabled: false, tracker: undefined, - evaluationMetricKeys: [], } as LDAIJudgeConfig; case 'completion': default: @@ -202,11 +205,22 @@ export class LDAIConfigUtils { flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAIJudgeConfig { + // Prioritize evaluationMetricKey, fallback to first valid (non-empty, non-whitespace) value in evaluationMetricKeys + let evaluationMetricKey: string | undefined; + if (flagValue.evaluationMetricKey && flagValue.evaluationMetricKey.trim().length > 0) { + evaluationMetricKey = flagValue.evaluationMetricKey.trim(); + } else if (flagValue.evaluationMetricKeys && flagValue.evaluationMetricKeys.length > 0) { + const validKey = flagValue.evaluationMetricKeys.find( + (metricKey) => metricKey && metricKey.trim().length > 0, + ); + evaluationMetricKey = validKey ? validKey.trim() : undefined; + } + return { ...this._toBaseConfig(key, flagValue), tracker, messages: flagValue.messages, - evaluationMetricKeys: flagValue.evaluationMetricKeys || [], + evaluationMetricKey, }; } } diff --git a/packages/sdk/server-ai/src/api/config/types.ts b/packages/sdk/server-ai/src/api/config/types.ts index ade099037b..15f3766851 100644 --- a/packages/sdk/server-ai/src/api/config/types.ts +++ b/packages/sdk/server-ai/src/api/config/types.ts @@ -154,8 +154,14 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault { */ messages?: LDMessage[]; /** - * Evaluation metric keys for judge configurations. + * Evaluation metric key for judge configurations. + * The key of the metric that this judge can evaluate. + */ + evaluationMetricKey?: string; + /** + * Evaluation metric keys for judge configurations (legacy). * The keys of the metrics that this judge can evaluate. + * @deprecated Use evaluationMetricKey instead. This field is kept for legacy support. */ evaluationMetricKeys?: string[]; } @@ -211,10 +217,16 @@ export interface LDAIJudgeConfig extends LDAIConfig { */ messages?: LDMessage[]; /** - * Evaluation metric keys for judge configurations. + * Evaluation metric key for judge configurations. + * The key of the metric that this judge can evaluate. + */ + evaluationMetricKey?: string; + /** + * Evaluation metric keys for judge configurations (legacy). * The keys of the metrics that this judge can evaluate. + * @deprecated Use evaluationMetricKey instead. This field is kept for legacy support. 
*/ - evaluationMetricKeys: string[]; + evaluationMetricKeys?: string[]; } // ============================================================================ diff --git a/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts b/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts index 16d9ce651d..06f745a418 100644 --- a/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts +++ b/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts @@ -3,15 +3,20 @@ * Not exported - only used internally by TrackedJudge. */ class EvaluationSchemaBuilder { - static build(evaluationMetricKeys: string[]): Record { + static build(evaluationMetricKey?: string): Record { + if (!evaluationMetricKey) { + return {}; + } return { type: 'object', properties: { evaluations: { type: 'object', - description: `Object containing evaluation results for ${evaluationMetricKeys.join(', ')} metrics`, - properties: this._buildKeyProperties(evaluationMetricKeys), - required: evaluationMetricKeys, + description: `Object containing evaluation results for ${evaluationMetricKey} metric`, + properties: { + [evaluationMetricKey]: this._buildKeySchema(evaluationMetricKey), + }, + required: [evaluationMetricKey], additionalProperties: false, }, }, @@ -20,16 +25,6 @@ class EvaluationSchemaBuilder { } as const; } - private static _buildKeyProperties(evaluationMetricKeys: string[]) { - return evaluationMetricKeys.reduce( - (acc, key) => { - acc[key] = this._buildKeySchema(key); - return acc; - }, - {} as Record, - ); - } - private static _buildKeySchema(key: string) { return { type: 'object', diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index e71a43bdd3..382addc632 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -26,9 +26,30 @@ export class Judge { logger?: LDLogger, ) { this._logger = logger; - this._evaluationResponseStructure = EvaluationSchemaBuilder.build( - this._aiConfig.evaluationMetricKeys, - ); + const evaluationMetricKey = this._getEvaluationMetricKey(); + this._evaluationResponseStructure = EvaluationSchemaBuilder.build(evaluationMetricKey); + } + + /** + * Gets the evaluation metric key, prioritizing evaluationMetricKey over evaluationMetricKeys. + * Falls back to the first valid (non-empty, non-whitespace) value in evaluationMetricKeys if evaluationMetricKey is not provided. + * Treats empty strings and whitespace-only strings as invalid. + * @returns The evaluation metric key, or undefined if not available + */ + private _getEvaluationMetricKey(): string | undefined { + if ( + this._aiConfig.evaluationMetricKey && + this._aiConfig.evaluationMetricKey.trim().length > 0 + ) { + return this._aiConfig.evaluationMetricKey.trim(); + } + if (this._aiConfig.evaluationMetricKeys && this._aiConfig.evaluationMetricKeys.length > 0) { + const validKey = this._aiConfig.evaluationMetricKeys.find( + (key) => key && key.trim().length > 0, + ); + return validKey ? 
validKey.trim() : undefined; + } + return undefined; } /** @@ -45,12 +66,10 @@ export class Judge { samplingRate: number = 1, ): Promise { try { - if ( - !this._aiConfig.evaluationMetricKeys || - this._aiConfig.evaluationMetricKeys.length === 0 - ) { + const evaluationMetricKey = this._getEvaluationMetricKey(); + if (!evaluationMetricKey) { this._logger?.warn( - 'Judge configuration is missing required evaluationMetricKeys', + 'Judge configuration is missing required evaluation metric key', this._aiConfigTracker.getTrackData(), ); return undefined; @@ -78,11 +97,11 @@ export class Judge { let { success } = response.metrics; - const evals = this._parseEvaluationResponse(response.data); + const evals = this._parseEvaluationResponse(response.data, evaluationMetricKey); - if (Object.keys(evals).length !== this._aiConfig.evaluationMetricKeys.length) { + if (!evals[evaluationMetricKey]) { this._logger?.warn( - 'Judge evaluation did not return all evaluations', + 'Judge evaluation did not return the expected evaluation', this._aiConfigTracker.getTrackData(), ); success = false; @@ -169,7 +188,10 @@ export class Judge { /** * Parses the structured evaluation response from the AI provider. */ - private _parseEvaluationResponse(data: Record): Record { + private _parseEvaluationResponse( + data: Record, + evaluationMetricKey: string, + ): Record { const evaluations = data.evaluations as Record; const results: Record = {}; @@ -178,40 +200,38 @@ export class Judge { return results; } - this._aiConfig.evaluationMetricKeys.forEach((metricKey) => { - const evaluation = evaluations[metricKey]; + const evaluation = evaluations[evaluationMetricKey]; - if (!evaluation || typeof evaluation !== 'object') { - this._logger?.warn( - `Missing evaluation for metric key: ${metricKey}`, - this._aiConfigTracker.getTrackData(), - ); - return; - } + if (!evaluation || typeof evaluation !== 'object') { + this._logger?.warn( + `Missing evaluation for metric key: ${evaluationMetricKey}`, + this._aiConfigTracker.getTrackData(), + ); + return results; + } - const evalData = evaluation as Record; + const evalData = evaluation as Record; - if (typeof evalData.score !== 'number' || evalData.score < 0 || evalData.score > 1) { - this._logger?.warn( - `Invalid score evaluated for ${metricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`, - this._aiConfigTracker.getTrackData(), - ); - return; - } + if (typeof evalData.score !== 'number' || evalData.score < 0 || evalData.score > 1) { + this._logger?.warn( + `Invalid score evaluated for ${evaluationMetricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`, + this._aiConfigTracker.getTrackData(), + ); + return results; + } - if (typeof evalData.reasoning !== 'string') { - this._logger?.warn( - `Invalid reasoning evaluated for ${metricKey}: ${evalData.reasoning}. Reasoning must be a string`, - this._aiConfigTracker.getTrackData(), - ); - return; - } + if (typeof evalData.reasoning !== 'string') { + this._logger?.warn( + `Invalid reasoning evaluated for ${evaluationMetricKey}: ${evalData.reasoning}. Reasoning must be a string`, + this._aiConfigTracker.getTrackData(), + ); + return results; + } - results[metricKey] = { - score: evalData.score, - reasoning: evalData.reasoning, - }; - }); + results[evaluationMetricKey] = { + score: evalData.score, + reasoning: evalData.reasoning, + }; return results; }
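
The change set above converges on a single resolution rule: prefer evaluationMetricKey when it is a non-blank string, otherwise fall back to the first non-empty, non-whitespace entry of the deprecated evaluationMetricKeys array, and treat anything else as missing. A minimal standalone sketch of that rule is shown below; JudgeKeyFields and resolveEvaluationMetricKey are illustrative names only and are not exports of the SDK, which keeps this logic private inside Judge and LDAIConfigUtils.

```ts
// Illustrative sketch of the metric-key resolution rule introduced in this change.
// `JudgeKeyFields` and `resolveEvaluationMetricKey` are hypothetical names, not SDK API.
interface JudgeKeyFields {
  evaluationMetricKey?: string;
  /** @deprecated Legacy multi-key field; only the first valid entry is used. */
  evaluationMetricKeys?: string[];
}

function resolveEvaluationMetricKey(config: JudgeKeyFields): string | undefined {
  // 1. Prefer the new single-key field when it is a non-blank string.
  if (config.evaluationMetricKey && config.evaluationMetricKey.trim().length > 0) {
    return config.evaluationMetricKey.trim();
  }
  // 2. Otherwise fall back to the first non-empty, non-whitespace legacy entry.
  const legacy = config.evaluationMetricKeys?.find((key) => key && key.trim().length > 0);
  return legacy ? legacy.trim() : undefined;
}

// The same cases the tests above exercise:
resolveEvaluationMetricKey({
  evaluationMetricKey: 'helpfulness',
  evaluationMetricKeys: ['relevance', 'accuracy'],
}); // 'helpfulness' (single key wins when both are present)

resolveEvaluationMetricKey({
  evaluationMetricKey: '',
  evaluationMetricKeys: ['', '  ', 'relevance', 'accuracy'],
}); // 'relevance' (blank strings are skipped)

resolveEvaluationMetricKey({ evaluationMetricKeys: [] }); // undefined
```

When the rule yields undefined, Judge.evaluate logs the "Judge configuration is missing required evaluation metric key" warning and returns undefined, which is the behavior the empty-key tests assert.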