From d231ee12127806c0d88ebc4a325f56e4945f39a4 Mon Sep 17 00:00:00 2001
From: alrocar <alrocar@tinybird.co>
Date: Tue, 5 Aug 2025 20:27:06 +0200
Subject: [PATCH 1/3] refactor: improve exactness score calculation in
 evaluation

---
 src/src/lib/eval.ts | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts
index 0a2da5e1..4fbbabf6 100644
--- a/src/src/lib/eval.ts
+++ b/src/src/lib/eval.ts
@@ -187,13 +187,32 @@ function blendedExactnessScore(provider: string, model: string) {
     return 0;
   }
 
-  const { avgExactDistance, avgNumericDistance, avgFScore } =
+  const { totalMatches, exactMatches, avgExactDistance, avgNumericDistance, avgFScore } =
     validationSummaries.modelStats[
       modelKey as keyof typeof validationSummaries.modelStats
     ];
 
-  // strong preference for exact, numeric as backup, fscore as minor fallback (it's correlated with jaccard)
-  return blendScore(avgExactDistance, avgNumericDistance, avgFScore);
+  // Calculate match rates
+  const totalMatchRate = totalMatches / validationSummaries.totalQuestions;
+  const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
+  const failedMatchRate = 1 - totalMatchRate;
+
+  // Calculate quality score for successful matches
+  const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore);
+  
+  // Calculate comprehensive exactness score with penalties for failures
+  // Base score from successful matches
+  const baseScore = totalMatchRate * qualityScore;
+  
+  // Penalty for failed matches (each failed match reduces score)
+  const failurePenalty = failedMatchRate * 1; // score here if needed
+  
+  // Bonus for exact matches
+  const exactMatchBonus = exactMatchRate * 1; // score here if needed
+  
+  const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus);
+  
+  return comprehensiveScore;
 }
 
 function blendScore(exact: number, numeric: number, fscore: number) {

From 5f2a606a473b4aebf3ae5cd9032b46d68623bcb7 Mon Sep 17 00:00:00 2001
From: alrocar <alrocar@tinybird.co>
Date: Wed, 24 Sep 2025 09:58:35 +0200
Subject: [PATCH 2/3] feat: implement blended exactness score for model
 evaluation

---
 src/src/lib/eval.ts | 42 ++++++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts
index 4fbbabf6..8d12f0a1 100644
--- a/src/src/lib/eval.ts
+++ b/src/src/lib/eval.ts
@@ -175,6 +175,17 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] {
   });
 }
 
+/**
+ * Calculates a comprehensive exactness score for a model based on validation results.
+ * 
+ * The score combines:
+ * 1. Match rate (what percentage of queries produced valid results)
+ * 2. Quality of matches (how accurate the results are)
+ * 3. Penalty for failures (reduces score for queries that failed)
+ * 
+ * All calculations are done on a 0-1 scale internally for consistency,
+ * then converted to 0-100 scale for final output.
+ */
 function blendedExactnessScore(provider: string, model: string) {
   const modelKey = `${provider}/${model}`;
 
@@ -192,29 +203,40 @@ function blendedExactnessScore(provider: string, model: string) {
       modelKey as keyof typeof validationSummaries.modelStats
     ];
 
-  // Calculate match rates
+  // Calculate match rates (0-1 scale)
   const totalMatchRate = totalMatches / validationSummaries.totalQuestions;
   const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
   const failedMatchRate = 1 - totalMatchRate;
 
-  // Calculate quality score for successful matches
-  const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore);
+  // Calculate quality score for successful matches (0-1 scale)
+  // This already accounts for exact matches through the distance metrics
+  const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore) / 100;
   
-  // Calculate comprehensive exactness score with penalties for failures
-  // Base score from successful matches
+  // Calculate comprehensive exactness score
+  // Base score: successful matches weighted by their quality
   const baseScore = totalMatchRate * qualityScore;
   
-  // Penalty for failed matches (each failed match reduces score)
-  const failurePenalty = failedMatchRate * 1; // score here if needed
+  // Apply penalty for failed matches (reduces score proportionally)
+  // Using a moderate penalty to avoid overly harsh scoring
+  const failurePenalty = failedMatchRate * 0.3;
   
-  // Bonus for exact matches
-  const exactMatchBonus = exactMatchRate * 1; // score here if needed
+  // Apply bonus for exact matches (additional reward for perfect accuracy)
+  const exactMatchBonus = exactMatchRate * 0.1; // 10% bonus for exact matches
   
   const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus);
   
-  return comprehensiveScore;
+  // Convert back to 0-100 scale for consistency with other scores
+  return Math.round(comprehensiveScore * 100);
 }
 
+/**
+ * Blends exact distance, numeric distance, and F-score into one quality score (weights 0.65 / 0.25 / 0.1).
+ * 
+ * @param exact - Exact distance metric (0 = perfect match, 1 = complete mismatch)
+ * @param numeric - Numeric distance metric (0 = perfect match, 1 = complete mismatch)  
+ * @param fscore - F-score metric (0 = worst, 1 = best)
+ * @returns Quality score on 0-100 scale (100 = perfect)
+ */
 function blendScore(exact: number, numeric: number, fscore: number) {
   return 100 * (0.65 * (1 - exact) + 0.25 * (1 - numeric) + 0.1 * fscore);
 }

From c9ab5492b0f264138fbc507f2c78ca296cec6f1b Mon Sep 17 00:00:00 2001
From: alrocar <alrocar@tinybird.co>
Date: Wed, 24 Sep 2025 10:20:48 +0200
Subject: [PATCH 3/3] refactor: improve blended exactness score calculation for
 model evaluation

---
 src/src/lib/eval.ts | 83 ++++++++++++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 28 deletions(-)

diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts
index 8d12f0a1..ee194595 100644
--- a/src/src/lib/eval.ts
+++ b/src/src/lib/eval.ts
@@ -178,17 +178,18 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] {
 /**
  * Calculates a comprehensive exactness score for a model based on validation results.
  * 
- * The score combines:
- * 1. Match rate (what percentage of queries produced valid results)
- * 2. Quality of matches (how accurate the results are)
- * 3. Penalty for failures (reduces score for queries that failed)
+ * This function calculates the average of individual exactness scores across all questions,
+ * which provides a more accurate representation than using average distance metrics.
  * 
- * All calculations are done on a 0-1 scale internally for consistency,
- * then converted to 0-100 scale for final output.
+ * The score properly accounts for:
+ * 1. Individual question scores (including perfect 100 scores)
+ * 2. Failed queries (scored as 0)
+ * 3. Exact match bonus (at most 5 points, limited to the headroom left below 100)
  */
 function blendedExactnessScore(provider: string, model: string) {
   const modelKey = `${provider}/${model}`;
 
+  // Validate that model stats exist
   if (
     !validationSummaries.modelStats[
       modelKey as keyof typeof validationSummaries.modelStats
@@ -198,35 +199,61 @@ function blendedExactnessScore(provider: string, model: string) {
     return 0;
   }
 
-  const { totalMatches, exactMatches, avgExactDistance, avgNumericDistance, avgFScore } =
-    validationSummaries.modelStats[
-      modelKey as keyof typeof validationSummaries.modelStats
-    ];
+  const modelStats = validationSummaries.modelStats[
+    modelKey as keyof typeof validationSummaries.modelStats
+  ];
 
-  // Calculate match rates (0-1 scale)
-  const totalMatchRate = totalMatches / validationSummaries.totalQuestions;
-  const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
-  const failedMatchRate = 1 - totalMatchRate;
+  // Validate required fields exist and are numbers
+  if (
+    typeof modelStats.totalMatches !== 'number' ||
+    typeof modelStats.exactMatches !== 'number' ||
+    typeof validationSummaries.totalQuestions !== 'number' ||
+    validationSummaries.totalQuestions === 0
+  ) {
+    console.log(`Invalid validation data for ${modelKey}`);
+    return 0;
+  }
+
+  const { totalMatches, exactMatches } = modelStats;
 
-  // Calculate quality score for successful matches (0-1 scale)
-  // This already accounts for exact matches through the distance metrics
-  const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore) / 100;
+  // Calculate individual exactness scores for all questions
+  const individualScores: number[] = [];
   
-  // Calculate comprehensive exactness score
-  // Base score: successful matches weighted by their quality
-  const baseScore = totalMatchRate * qualityScore;
+  // Get all question keys from validation results
+  const questionKeys = Object.keys(validationResults).filter(key => key !== '_summary');
   
-  // Apply penalty for failed matches (reduces score proportionally)
-  // Using a moderate penalty to avoid overly harsh scoring
-  const failurePenalty = failedMatchRate * 0.3;
+  // Validate we have questions to process
+  if (questionKeys.length === 0) {
+    console.log(`No questions found in validation results for ${modelKey}`);
+    return 0;
+  }
   
-  // Apply bonus for exact matches (additional reward for perfect accuracy)
-  const exactMatchBonus = exactMatchRate * 0.1; // 10% bonus for exact matches
+  for (const question of questionKeys) {
+    const individualScore = getExactnessScore(provider, model, question);
+    individualScores.push(individualScore);
+  }
+  
+  // Calculate average of individual scores (safe division)
+  const avgIndividualScore = individualScores.length > 0 
+    ? individualScores.reduce((sum, score) => sum + score, 0) / individualScores.length
+    : 0;
+  
+  // Exact match rate; totalQuestions was validated as non-zero above, so the division is safe
+  const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
   
-  const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus);
+  // Scale the bonus by the exact match rate, capping it at 5 points and at the headroom left below 100
+  const maxPossibleBonus = Math.max(0, 100 - avgIndividualScore);
+  const exactMatchBonus = exactMatchRate * Math.min(5, maxPossibleBonus);
+  
+  const finalScore = avgIndividualScore + exactMatchBonus;
+  
+  // Validate final score is a valid number
+  if (!isFinite(finalScore)) {
+    console.log(`Invalid final score calculated for ${modelKey}: ${finalScore}`);
+    return 0;
+  }
   
-  // Convert back to 0-100 scale for consistency with other scores
-  return Math.round(comprehensiveScore * 100);
+  return Math.round(finalScore);
 }
 
 /**