From d231ee12127806c0d88ebc4a325f56e4945f39a4 Mon Sep 17 00:00:00 2001 From: alrocar Date: Tue, 5 Aug 2025 20:27:06 +0200 Subject: [PATCH 1/3] refactor: improve exactness score calculation in evaluation --- src/src/lib/eval.ts | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts index 0a2da5e1..4fbbabf6 100644 --- a/src/src/lib/eval.ts +++ b/src/src/lib/eval.ts @@ -187,13 +187,32 @@ function blendedExactnessScore(provider: string, model: string) { return 0; } - const { avgExactDistance, avgNumericDistance, avgFScore } = + const { totalMatches, exactMatches, avgExactDistance, avgNumericDistance, avgFScore } = validationSummaries.modelStats[ modelKey as keyof typeof validationSummaries.modelStats ]; - // strong preference for exact, numeric as backup, fscore as minor fallback (it's correlated with jaccard) - return blendScore(avgExactDistance, avgNumericDistance, avgFScore); + // Calculate match rates + const totalMatchRate = totalMatches / validationSummaries.totalQuestions; + const exactMatchRate = exactMatches / validationSummaries.totalQuestions; + const failedMatchRate = 1 - totalMatchRate; + + // Calculate quality score for successful matches + const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore); + + // Calculate comprehensive exactness score with penalties for failures + // Base score from successful matches + const baseScore = totalMatchRate * qualityScore; + + // Penalty for failed matches (each failed match reduces score) + const failurePenalty = failedMatchRate * 1; // score here if needed + + // Bonus for exact matches + const exactMatchBonus = exactMatchRate * 1; // score here if needed + + const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus); + + return comprehensiveScore; } function blendScore(exact: number, numeric: number, fscore: number) { From 5f2a606a473b4aebf3ae5cd9032b46d68623bcb7 Mon Sep 17 00:00:00 2001 From: alrocar Date: Wed, 24 Sep 2025 09:58:35 +0200 Subject: [PATCH 2/3] feat: implement blended exactness score for model evaluation --- src/src/lib/eval.ts | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts index 4fbbabf6..8d12f0a1 100644 --- a/src/src/lib/eval.ts +++ b/src/src/lib/eval.ts @@ -175,6 +175,17 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] { }); } +/** + * Calculates a comprehensive exactness score for a model based on validation results. + * + * The score combines: + * 1. Match rate (what percentage of queries produced valid results) + * 2. Quality of matches (how accurate the results are) + * 3. Penalty for failures (reduces score for queries that failed) + * + * All calculations are done on a 0-1 scale internally for consistency, + * then converted to 0-100 scale for final output. + */ function blendedExactnessScore(provider: string, model: string) { const modelKey = `${provider}/${model}`; @@ -192,29 +203,40 @@ function blendedExactnessScore(provider: string, model: string) { modelKey as keyof typeof validationSummaries.modelStats ]; - // Calculate match rates + // Calculate match rates (0-1 scale) const totalMatchRate = totalMatches / validationSummaries.totalQuestions; const exactMatchRate = exactMatches / validationSummaries.totalQuestions; const failedMatchRate = 1 - totalMatchRate; - // Calculate quality score for successful matches - const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore); + // Calculate quality score for successful matches (0-1 scale) + // This already accounts for exact matches through the distance metrics + const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore) / 100; - // Calculate comprehensive exactness score with penalties for failures - // Base score from successful matches + // Calculate comprehensive exactness score + // Base score: successful matches weighted by their quality const baseScore = totalMatchRate * qualityScore; - // Penalty for failed matches (each failed match reduces score) - const failurePenalty = failedMatchRate * 1; // score here if needed + // Apply penalty for failed matches (reduces score proportionally) + // Using a moderate penalty to avoid overly harsh scoring + const failurePenalty = failedMatchRate * 0.3; - // Bonus for exact matches - const exactMatchBonus = exactMatchRate * 1; // score here if needed + // Apply bonus for exact matches (additional reward for perfect accuracy) + const exactMatchBonus = exactMatchRate * 0.1; // 10% bonus for exact matches const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus); - return comprehensiveScore; + // Convert back to 0-100 scale for consistency with other scores + return Math.round(comprehensiveScore * 100); } +/** + * Blends different distance metrics into a single quality score. + * + * @param exact - Exact distance metric (0 = perfect match, 1 = complete mismatch) + * @param numeric - Numeric distance metric (0 = perfect match, 1 = complete mismatch) + * @param fscore - F-score metric (0 = worst, 1 = best) + * @returns Quality score on 0-100 scale (100 = perfect) + */ function blendScore(exact: number, numeric: number, fscore: number) { return 100 * (0.65 * (1 - exact) + 0.25 * (1 - numeric) + 0.1 * fscore); } From c9ab5492b0f264138fbc507f2c78ca296cec6f1b Mon Sep 17 00:00:00 2001 From: alrocar Date: Wed, 24 Sep 2025 10:20:48 +0200 Subject: [PATCH 3/3] refactor: improve blended exactness score calculation for model evaluation --- src/src/lib/eval.ts | 83 ++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts index 8d12f0a1..ee194595 100644 --- a/src/src/lib/eval.ts +++ b/src/src/lib/eval.ts @@ -178,17 +178,18 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] { /** * Calculates a comprehensive exactness score for a model based on validation results. * - * The score combines: - * 1. Match rate (what percentage of queries produced valid results) - * 2. Quality of matches (how accurate the results are) - * 3. Penalty for failures (reduces score for queries that failed) + * This function calculates the average of individual exactness scores across all questions, + * which provides a more accurate representation than using average distance metrics. * - * All calculations are done on a 0-1 scale internally for consistency, - * then converted to 0-100 scale for final output. + * The score properly accounts for: + * 1. Individual question scores (including perfect 100 scores) + * 2. Failed queries (scored as 0) + * 3. Exact match bonus for perfect accuracy */ function blendedExactnessScore(provider: string, model: string) { const modelKey = `${provider}/${model}`; + // Validate that model stats exist if ( !validationSummaries.modelStats[ modelKey as keyof typeof validationSummaries.modelStats @@ -198,35 +199,61 @@ function blendedExactnessScore(provider: string, model: string) { return 0; } - const { totalMatches, exactMatches, avgExactDistance, avgNumericDistance, avgFScore } = - validationSummaries.modelStats[ - modelKey as keyof typeof validationSummaries.modelStats - ]; + const modelStats = validationSummaries.modelStats[ + modelKey as keyof typeof validationSummaries.modelStats + ]; - // Calculate match rates (0-1 scale) - const totalMatchRate = totalMatches / validationSummaries.totalQuestions; - const exactMatchRate = exactMatches / validationSummaries.totalQuestions; - const failedMatchRate = 1 - totalMatchRate; + // Validate required fields exist and are numbers + if ( + typeof modelStats.totalMatches !== 'number' || + typeof modelStats.exactMatches !== 'number' || + typeof validationSummaries.totalQuestions !== 'number' || + validationSummaries.totalQuestions === 0 + ) { + console.log(`Invalid validation data for ${modelKey}`); + return 0; + } + + const { totalMatches, exactMatches } = modelStats; - // Calculate quality score for successful matches (0-1 scale) - // This already accounts for exact matches through the distance metrics - const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore) / 100; + // Calculate individual exactness scores for all questions + const individualScores: number[] = []; - // Calculate comprehensive exactness score - // Base score: successful matches weighted by their quality - const baseScore = totalMatchRate * qualityScore; + // Get all question keys from validation results + const questionKeys = Object.keys(validationResults).filter(key => key !== '_summary'); - // Apply penalty for failed matches (reduces score proportionally) - // Using a moderate penalty to avoid overly harsh scoring - const failurePenalty = failedMatchRate * 0.3; + // Validate we have questions to process + if (questionKeys.length === 0) { + console.log(`No questions found in validation results for ${modelKey}`); + return 0; + } - // Apply bonus for exact matches (additional reward for perfect accuracy) - const exactMatchBonus = exactMatchRate * 0.1; // 10% bonus for exact matches + for (const question of questionKeys) { + const individualScore = getExactnessScore(provider, model, question); + individualScores.push(individualScore); + } + + // Calculate average of individual scores (safe division) + const avgIndividualScore = individualScores.length > 0 + ? individualScores.reduce((sum, score) => sum + score, 0) / individualScores.length + : 0; + + // Apply exact match bonus (safe division) + const exactMatchRate = exactMatches / validationSummaries.totalQuestions; - const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus); + // Calculate bonus that ensures final score never exceeds 100 + const maxPossibleBonus = Math.max(0, 100 - avgIndividualScore); + const exactMatchBonus = exactMatchRate * Math.min(5, maxPossibleBonus); + + const finalScore = avgIndividualScore + exactMatchBonus; + + // Validate final score is a valid number + if (!isFinite(finalScore)) { + console.log(`Invalid final score calculated for ${modelKey}: ${finalScore}`); + return 0; + } - // Convert back to 0-100 scale for consistency with other scores - return Math.round(comprehensiveScore * 100); + return Math.round(finalScore); } /**