diff --git a/benchmarks/stability_benchmarks.py b/benchmarks/stability_benchmarks.py index 0c4082f..243fff1 100644 --- a/benchmarks/stability_benchmarks.py +++ b/benchmarks/stability_benchmarks.py @@ -1,9 +1,6 @@ """ Stability Benchmarking Script for DiffMage -This script uses your existing EvaluationBenchmarks class to test evaluation -stability across various real commit examples and scenarios. - Usage: python benchmark_stability.py --repo-path /path/to/repo --runs 10 python benchmark_stability.py --commit-range "HEAD~20..HEAD" --runs 5 @@ -618,7 +615,7 @@ def main(): description="Stability Benchmarking Script for DiffMage" ) parser.add_argument("--repo-path", default=".", help="Path to git repository") - parser.add_argument("--model", help="AI model to test (default: uses your default)") + parser.add_argument("--model", help="AI model to test (default: uses default)") parser.add_argument("--runs", type=int, default=5, help="Number of runs per test") parser.add_argument( "--variance-threshold", diff --git a/benchmarks/validation_suite.py b/benchmarks/validation_suite.py index bd1ebff..88c079f 100644 --- a/benchmarks/validation_suite.py +++ b/benchmarks/validation_suite.py @@ -1,9 +1,6 @@ """ Evaluation *Smoke Test* Validation Suite for DiffMage -This script performs fundamental sanity checks on your LLM evaluator to ensure -it's working correctly before running extensive benchmarks. - Usage: python validation_suite.py --all python validation_suite.py --test obvious-cases @@ -58,115 +55,412 @@ def get_obvious_test_cases(self) -> List[ValidationCase]: return [ # EXCELLENT cases (4.5-5.0) ValidationCase( - name="security_fix", - commit_message="fix: resolve critical SQL injection vulnerability in user authentication", - git_diff="""--- a/src/auth/UserAuth.py -+++ b/src/auth/UserAuth.py -@@ -23,7 +23,8 @@ class UserAuth: - def authenticate_user(self, username, password): -- query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" -+ query = "SELECT * FROM users WHERE username=? AND password=?" -+ return self.db.execute(query, (username, password)) -- return self.db.execute(query)""", - expected_score_range=(4.0, 5.0), + name="excellent_security_with_impact", + commit_message="fix: resolve authentication bypass allowing unauthorized admin access\n\nCritical security vulnerability discovered in production where users could escalate privileges by manipulating JWT tokens. Affecting ~2,300 enterprise customers with potential for full system compromise.\n\n The root cause is a missing signature verification in token validation middleware. 
Impact: Complete authentication bypass, unauthorized admin panel access\nSolution: Implement proper HMAC signature validation with key rotation\nTesting: Verified with penetration testing and security audit\n\nResolves: SEC-2024-001, addresses compliance requirements for SOC2 certification.", + git_diff="""--- a/src/middleware/auth.py + +++ b/src/middleware/auth.py + @@ -1,4 +1,5 @@ + import jwt + +import hmac + +import hashlib + from flask import request, jsonify + + def verify_token(token): + try: + - payload = jwt.decode(token, verify=False) + + # Verify signature with rotating keys + + payload = jwt.decode(token, get_current_secret_key(), algorithms=['HS256']) + + + + # Additional signature verification + + expected_sig = hmac.new( + + get_signing_key().encode(), + + f"{payload['user_id']}{payload['exp']}".encode(), + + hashlib.sha256 + + ).hexdigest() + + + + if not hmac.compare_digest(payload.get('sig', ''), expected_sig): + + raise jwt.InvalidTokenError("Invalid signature") + + + return payload + except jwt.InvalidTokenError: + return None""", + expected_score_range=(4.5, 5.0), + expected_quality="excellent", + description="Security fix with clear business impact, root cause analysis, and comprehensive solution", + ), + ValidationCase( + name="excellent_performance_with_metrics", + commit_message="perf: optimize user search query reducing response time from 8s to 200ms\n\nUser search was causing timeouts and abandonment during peak hours. Analysis showed 47% of users abandoned search after 5+ second wait times, directly impacting conversion rates.\n\nProblem: Sequential database queries without indexing on 2M+ user records\nMetrics: 8-12 second response times, 47% abandonment rate, 200+ timeout errors/day\nSolution: Implement composite indexes + query optimization + result pagination\nResults: 200ms average response time, 12% abandonment rate, zero timeouts\n\nA/B testing shows 23% increase in search completion rates.\nEstimated impact: +$180K quarterly revenue from improved user engagement.", + git_diff="""--- a/src/models/User.js + +++ b/src/models/User.js + @@ -10,15 +10,25 @@ const userSchema = new mongoose.Schema({ + updatedAt: { type: Date, default: Date.now } + }); + + +// Composite indexes for optimized search + +userSchema.index({ + + firstName: 'text', + + lastName: 'text', + + email: 'text' + +}, { + + weights: { firstName: 10, lastName: 5, email: 1 } + +}); + +userSchema.index({ createdAt: -1 }); + +userSchema.index({ email: 1 }, { unique: true }); + + // Optimized search with pagination and relevance scoring + -userSchema.statics.searchUsers = function(searchTerm) { + +userSchema.statics.searchUsers = function(searchTerm, page = 1, limit = 20) { + const regex = new RegExp(searchTerm, 'i'); + - return this.find({ + + + + return this.find({ + $or: [ + - { email: regex }, + - { firstName: regex }, + - { lastName: regex } + + { $text: { $search: searchTerm } }, + + { email: regex } + ] + - }); + + }, { score: { $meta: 'textScore' } }) + + .sort({ score: { $meta: 'textScore' }, createdAt: -1 }) + + .limit(limit) + + .skip((page - 1) * limit); + };""", + expected_score_range=(4.5, 5.0), expected_quality="excellent", - description="Clear security fix with good explanation", + description="Performance optimization with detailed metrics, business impact, and quantified results", ), + # GOOD cases (3.5-4.4) ValidationCase( name="feature_with_context", - commit_message="feat: implement user password reset with email verification\n\nAdds secure password reset flow:\n- Generate 
time-limited reset tokens\n- Send verification emails\n- Validate tokens before allowing reset\n- Log security events for auditing", + commit_message="feat: implement user password reset with email verification\n\nAdds secure password reset flow:\n- Generate time-limited reset tokens for security\n- Send verification emails\n- Validate tokens before allowing reset\n- Log security events for auditing", git_diff="""--- a/src/auth/PasswordReset.py -+++ b/src/auth/PasswordReset.py -@@ -0,0 +1,45 @@ -+import secrets -+import hashlib -+from datetime import datetime, timedelta -+ -+class PasswordReset: -+ def __init__(self, email_service, user_service): -+ self.email_service = email_service -+ self.user_service = user_service -+ -+ def request_reset(self, email): -+ user = self.user_service.find_by_email(email) -+ if not user: -+ return False # Don't reveal if email exists -+ -+ token = secrets.token_urlsafe(32) -+ expires_at = datetime.now() + timedelta(hours=1) -+ -+ self.user_service.store_reset_token(user.id, token, expires_at) -+ self.email_service.send_reset_email(email, token) -+ -+ return True""", - expected_score_range=(4.0, 5.0), - expected_quality="excellent", + +++ b/src/auth/PasswordReset.py + @@ -0,0 +1,45 @@ + +import secrets + +import hashlib + +from datetime import datetime, timedelta + + + +class PasswordReset: + + def __init__(self, email_service, user_service): + + self.email_service = email_service + + self.user_service = user_service + + + + def request_reset(self, email): + + user = self.user_service.find_by_email(email) + + if not user: + + return False # Don't reveal if email exists + + + + token = secrets.token_urlsafe(32) + + expires_at = datetime.now() + timedelta(hours=1) + + + + self.user_service.store_reset_token(user.id, token, expires_at) + + self.email_service.send_reset_email(email, token) + + + + return True""", + expected_score_range=(3.5, 4.4), + expected_quality="good", description="Comprehensive feature with detailed explanation", ), - # GOOD cases (3.5-4.0) + ValidationCase( + name="good_docs_update", + commit_message="docs: update installation instructions for new deployment process", + git_diff="""--- a/README.md + +++ b/README.md + @@ -15,8 +15,12 @@ A modern web application for managing user accounts. + + ## Installation + + -1. Clone the repository + -2. Run `npm install` + -3. Start with `npm start` + +1. Clone the repository: `git clone https://github.com/company/app.git` + +2. Install dependencies: `npm install` + +3. Copy environment file: `cp .env.example .env` + +4. Configure your database settings in `.env` + +5. Run database migrations: `npm run migrate` + +6. Start the application: `npm start` + + ## Configuration + @@ -25,4 +29,8 @@ A modern web application for managing user accounts. + + - `DATABASE_URL` - Your database connection string + - `JWT_SECRET` - Secret key for JWT tokens + +- `SMTP_HOST` - Email server host + +- `SMTP_PORT` - Email server port + +- `SMTP_USER` - Email username + +- `SMTP_PASS` - Email password""", + expected_score_range=(3.5, 4.4), + expected_quality="good", + description="Updates documentation. Doesn't explain why changes were needed, but this is an example of a low impact change that we don't penalize for. Excellent would be a more detailed explanation of the why.", + ), + ValidationCase( + name="good_feature_with_context", + commit_message="feat: add email verification for new user registrations\n\nNew users must verify their email address before accessing the platform. 
Sends verification email with time-limited token (24hr expiry) and blocks login until confirmed. Reduces fake accounts and improves email deliverability for notifications.", + git_diff="""--- a/src/controllers/AuthController.js + +++ b/src/controllers/AuthController.js + @@ -1,4 +1,5 @@ + const User = require('../models/User'); + +const EmailService = require('../services/EmailService'); + const bcrypt = require('bcrypt'); + +const crypto = require('crypto'); + + class AuthController { + async register(req, res) { + @@ -10,8 +11,18 @@ class AuthController { + + const user = await User.create({ + email: userData.email, + password: hashedPassword, + - isActive: true + + isActive: false, + + emailVerified: false, + + verificationToken: crypto.randomBytes(32).toString('hex'), + + verificationExpires: new Date(Date.now() + 24 * 60 * 60 * 1000) + }); + + + + // Send verification email + + await EmailService.sendVerificationEmail( + + user.email, + + user.verificationToken + + ); + + - res.json({ success: true, userId: user.id }); + + res.json({ + + success: true, + + message: 'Please check your email to verify your account' + + }); + } + + + + async verifyEmail(req, res) { + + const { token } = req.params; + + + + const user = await User.findOne({ + + verificationToken: token, + + verificationExpires: { $gt: new Date() } + + }); + + + + if (!user) { + + return res.status(400).json({ error: 'Invalid or expired token' }); + + } + + + + user.emailVerified = true; + + user.isActive = true; + + user.verificationToken = null; + + user.verificationExpires = null; + + await user.save(); + + + + res.json({ success: true, message: 'Email verified successfully' }); + + } + + async login(req, res) { + const user = await User.findOne({ email: req.body.email }); + - + - if (!user || !bcrypt.compareSync(req.body.password, user.password)) { + + + + if (!user || !user.emailVerified || !bcrypt.compareSync(req.body.password, user.password)) { + return res.status(401).json({ error: 'Invalid credentials' }); + } + + + + if (!user.emailVerified) { + + return res.status(401).json({ error: 'Please verify your email first' }); + + } + """, + expected_score_range=(3.5, 4.4), + expected_quality="good", + description="Feature with clear user benefit and implementation details", + ), + ValidationCase( + name="good_feature_basic", + commit_message="feat: add dark mode toggle to settings page\n\n- Add toggle switch component\n- Store preference in localStorage\n- Apply theme on page load", + git_diff="""--- a/src/pages/Settings.jsx + +++ b/src/pages/Settings.jsx + @@ -1,4 +1,5 @@ + import React, { useState } from 'react'; + +import ToggleSwitch from '../components/ToggleSwitch'; + + const Settings = () => { + const [notifications, setNotifications] = useState(true); + + const [darkMode, setDarkMode] = useState( + + localStorage.getItem('darkMode') === 'true' + + ); + + + const handleDarkModeToggle = (enabled) => { + + setDarkMode(enabled); + + localStorage.setItem('darkMode', enabled); + + document.body.classList.toggle('dark-mode', enabled); + + }; + + return ( +
+     <div className="settings-page">
+       <h1>Settings</h1>
+       <div className="setting-item">
+         <label>Notifications</label>
+         <ToggleSwitch checked={notifications} onChange={setNotifications} />
+       </div>
+ +     <div className="setting-item">
+ +       <label>Dark Mode</label>
+ +       <ToggleSwitch checked={darkMode} onChange={handleDarkModeToggle} />
+ +     </div>
+     </div>
+ ); + };""", + expected_score_range=(3.5, 4.4), + expected_quality="good", + description="Describes the changes but lacks some what context and lacks user motivation or business context", + ), + # AVERAGE cases (2.5-3.4) + ValidationCase( + name="security_fix", + commit_message="fix: resolve critical SQL injection vulnerability in user authentication", + git_diff="""--- a/src/auth/UserAuth.py + +++ b/src/auth/UserAuth.py + @@ -23,7 +23,8 @@ class UserAuth: + def authenticate_user(self, username, password): + - query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" + + query = "SELECT * FROM users WHERE username=? AND password=?" + + return self.db.execute(query, (username, password)) + - return self.db.execute(query)""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Clear security fix with no technical explanation", + ), ValidationCase( name="simple_bug_fix", commit_message="fix: handle null values in user profile display", git_diff="""--- a/src/components/UserProfile.jsx -+++ b/src/components/UserProfile.jsx -@@ -15,7 +15,7 @@ const UserProfile = ({ user }) => { -
-      <h2>{user.name}</h2>
--      <p>Email: {user.email}</p>
-+      <p>Email: {user.email || 'Not provided'}</p>
--      <p>Bio: {user.bio}</p>
-+      <p>Bio: {user.bio || 'No bio available'}</p>
-    </div>
- );""", - expected_score_range=(3.0, 4.5), - expected_quality="good", + +++ b/src/components/UserProfile.jsx + @@ -15,7 +15,7 @@ const UserProfile = ({ user }) => { +
+       <h2>{user.name}</h2>
+ -      <p>Email: {user.email}</p>
+ +      <p>Email: {user.email || 'Not provided'}</p>
+ -      <p>Bio: {user.bio}</p>
+ +      <p>Bio: {user.bio || 'No bio available'}</p>
+     </div>
+ );""", + expected_score_range=(2.5, 3.4), + expected_quality="average", description="Clear bug fix, could use more detail", ), - # AVERAGE cases (2.5-3.5) + ValidationCase( + name="average_refactor", + commit_message="refactor: move validation logic to separate service", + git_diff="""--- a/src/controllers/UserController.js + +++ b/src/controllers/UserController.js + @@ -5,12 +5,7 @@ class UserController { + async createUser(userData) { + try { + - // Validate email format + - if (!userData.email || !userData.email.includes('@')) { + - throw new Error('Invalid email'); + - } + - // Validate required fields + - if (!userData.firstName || !userData.lastName) { + - throw new Error('Missing required fields'); + - } + + ValidationService.validateUserData(userData); + + const user = await User.create(userData); + return { success: true, user }; + @@ -18,4 +13,17 @@ class UserController { + return { success: false, error: error.message }; + } + } + +} + + + +--- /dev/null + ++++ b/src/services/ValidationService.js + @@ -0,0 +1,15 @@ + +class ValidationService { + + static validateUserData(userData) { + + if (!userData.email || !userData.email.includes('@')) { + + throw new Error('Invalid email'); + + } + + + + if (!userData.firstName || !userData.lastName) { + + throw new Error('Missing required fields'); + + } + + } + }""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Basic refactor, describes what but minimal why context", + ), + ValidationCase( + name="average_bug_fix", + commit_message="fix: prevent crash when user profile is empty", + git_diff="""--- a/src/components/UserProfile.jsx + +++ b/src/components/UserProfile.jsx + @@ -8,7 +8,7 @@ const UserProfile = ({ user }) => { + return ( +
+       <h2>{user.name}</h2>
+ -      <p>Bio: {user.bio}</p>
+ +      <p>Bio: {user.bio || 'No bio provided'}</p>
+ -      <p>Joined: {new Date(user.joinedAt).toLocaleDateString()}</p>
+ +      <p>Joined: {user.joinedAt ? new Date(user.joinedAt).toLocaleDateString() : 'Unknown'}</p>
+     </div>
+ ); + };""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Fixes the issue but doesn't explain impact or root cause", + ), + # POOR cases (1.5-2.4) ValidationCase( name="generic_update", commit_message="update user component", git_diff="""--- a/src/components/User.jsx -+++ b/src/components/User.jsx -@@ -10,6 +10,7 @@ const User = ({ userData }) => { - return ( -
-      <span>{userData.name}</span>
-+      <span>{userData.role}</span>
-    </div>
- );""", - expected_score_range=(2.0, 3.5), - expected_quality="average", + +++ b/src/components/User.jsx + @@ -10,6 +10,7 @@ const User = ({ userData }) => { + return ( +
+       <span>{userData.name}</span>
+ +      <span>{userData.role}</span>
+     </div>
+ );""", + expected_score_range=(1.5, 2.4), + expected_quality="poor", description="Vague message, minimal change", ), - # POOR cases (1.5-2.5) + # VERY POOR cases (1.0-1.4) ValidationCase( name="meaningless_message", commit_message="fix stuff", git_diff="""--- a/src/utils/helper.js -+++ b/src/utils/helper.js -@@ -5,7 +5,7 @@ function processData(data) { - if (!data) { - return null; - } -- return data.map(item => item.value); -+ return data.map(item => item.value || 0); - }""", - expected_score_range=(1.0, 2.5), - expected_quality="poor", - description="Meaningless commit message", + +++ b/src/utils/helper.js + @@ -5,7 +5,7 @@ function processData(data) { + if (!data) { + return null; + } + - return data.map(item => item.value); + + return data.map(item => item.value || 0); + }""", + expected_score_range=(1.0, 1.4), + expected_quality="very_poor", + description="Meaningless commit message without any context", ), - # VERY POOR cases (1.0-2.0) ValidationCase( name="gibberish", commit_message="asdf jkl; qwerty", git_diff="""--- a/src/test.js -+++ b/src/test.js -@@ -1,3 +1,4 @@ - // Test file - console.log('hello'); -+console.log('world');""", - expected_score_range=(1.0, 2.0), + +++ b/src/test.js + @@ -1,3 +1,4 @@ + // Test file + console.log('hello'); + +console.log('world');""", + expected_score_range=(1.0, 1.4), expected_quality="very_poor", description="Nonsensical commit message", ), @@ -180,7 +474,7 @@ def get_edge_test_cases(self) -> List[ValidationCase]: name="empty_message", commit_message="", git_diff="--- a/file.txt\n+++ b/file.txt\n@@ -1 +1,2 @@\n hello\n+world", - expected_score_range=(1.0, 2.0), + expected_score_range=(1.0, 1.4), expected_quality="very_poor", description="Empty commit message", ), @@ -191,7 +485,7 @@ def get_edge_test_cases(self) -> List[ValidationCase]: + "long commit message that goes on and on and provides way too much detail about a simple change " * 10, git_diff="--- a/file.txt\n+++ b/file.txt\n@@ -1 +1,2 @@\n hello\n+world", - expected_score_range=(1.0, 3.0), + expected_score_range=(1.0, 1.4), expected_quality="poor", description="Excessively long commit message", ), @@ -204,7 +498,7 @@ def get_edge_test_cases(self) -> List[ValidationCase]: def process_input(text): + text = text.encode('utf-8').decode('utf-8') return text.strip()""", - expected_score_range=(3.0, 4.5), + expected_score_range=(3.5, 4.4), expected_quality="good", description="Special characters and emojis", ), @@ -212,7 +506,7 @@ def process_input(text): name="no_diff", commit_message="fix: important bug fix", git_diff="", - expected_score_range=(1.0, 3.0), + expected_score_range=(1.0, 2.4), expected_quality="poor", description="No diff provided", ), @@ -233,7 +527,7 @@ def login(user, password): + +def reset_password(email): + send_reset_email(email)""", - expected_score_range=(1.0, 3.0), + expected_score_range=(1.0, 2.4), expected_quality="poor", description="Merge commit (usually auto-generated)", ), @@ -643,6 +937,7 @@ def _display_obvious_cases_results( table.add_column("Case", style="cyan") table.add_column("Expected", justify="center") table.add_column("Actual", justify="center") + table.add_column("What/Why", justify="center") table.add_column("Status", justify="center") table.add_column("Deviation", justify="center") @@ -659,6 +954,7 @@ def _display_obvious_cases_results( r["case"].name, expected_range, actual_score, + f"{r['result'].what_score:.1f}/{r['result'].why_score:.1f}", f"[{status_color}]{status}[/{status_color}]", deviation, ) diff --git 
a/benchmarks/why_context_benchmarks.py b/benchmarks/why_context_benchmarks.py new file mode 100644 index 0000000..a779645 --- /dev/null +++ b/benchmarks/why_context_benchmarks.py @@ -0,0 +1,354 @@ +""" +WHY Context Enhancement Benchmarking Suite +""" + +from typing import List, Dict, Any, Optional +from dataclasses import dataclass +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich import box +from enum import Enum +from diffmage.generation.commit_message_generator import CommitMessageGenerator +from diffmage.generation.models import GenerationResult +from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator + +console = Console() + + +class ContextQuality(str, Enum): + GOOD = "good" + BAD = "bad" + TECHNICAL = "technical" + REDUNDANT = "redundant" + + +@dataclass +class WhyContextTestCase: + """Test case for WHY context enhancement""" + + name: str + commit_message: str + git_diff: str + why_context: str + should_enhance: bool # Should this context actually improve the message? + expected_improvement: float # Expected WHY score improvement (0 if no enhancement) + context_quality: ContextQuality + + +class WhyContextBenchmarkSuite: + """Benchmark suite specifically for WHY context enhancement""" + + def __init__(self, model_name: Optional[str] = None): + self.generator = CommitMessageGenerator(model_name=model_name) + self.evaluator = CommitMessageEvaluator(model_name=model_name) + + def get_why_context_test_cases(self) -> List[WhyContextTestCase]: + """Get comprehensive test cases for WHY context enhancement""" + + return [ + # GOOD WHY CONTEXT (should enhance) + WhyContextTestCase( + name="user_problem_context", + commit_message="fix: resolve null pointer exception in user validation", + git_diff="""--- a/src/auth/UserValidator.java + +++ b/src/auth/UserValidator.java + @@ -23,7 +23,7 @@ public class UserValidator { + public boolean isValidUser(User user) { + - return user.getEmail().contains("@"); + + return user != null && user.getEmail() != null && user.getEmail().contains("@"); + } + }""", + why_context="Users were experiencing app crashes during login attempts. Support team received 47 crash reports in the last week, all traced to this validation issue. This fix prevents the crashes and improves user experience.", + should_enhance=True, + expected_improvement=1.5, + context_quality=ContextQuality.GOOD, + ), + WhyContextTestCase( + name="performance_impact_context", + commit_message="perf: optimize database query for user search", + git_diff="""--- a/src/models/User.js + +++ b/src/models/User.js + @@ -15,8 +15,12 @@ const userSchema = new mongoose.Schema({ + +userSchema.index({ email: 1 }); + +userSchema.index({ firstName: 1, lastName: 1 }); + userSchema.statics.searchUsers = function(searchTerm) { + - return this.find({ + + return this.find({ + $or: [ + { email: regex }, + { firstName: regex } + ] + - }); + + }).limit(20).sort({ createdAt: -1 }); + };""", + why_context="User search was taking 8-12 seconds with 100k+ users. Customer support reported users abandoning the search feature. 
This optimization reduces search time to under 500ms, improving user engagement.", + should_enhance=True, + expected_improvement=2.0, + context_quality=ContextQuality.GOOD, + ), + WhyContextTestCase( + name="compliance_requirement_context", + commit_message="feat: add data encryption for user profiles", + git_diff="""--- a/src/models/UserProfile.py + +++ b/src/models/UserProfile.py + @@ -1,4 +1,5 @@ + from django.db import models + +from cryptography.fernet import Fernet + + class UserProfile(models.Model): + user = models.OneToOneField(User, on_delete=models.CASCADE) + - ssn = models.CharField(max_length=11) + + ssn_encrypted = models.BinaryField() + + + def set_ssn(self, ssn): + + f = Fernet(settings.ENCRYPTION_KEY) + + self.ssn_encrypted = f.encrypt(ssn.encode()) + + + + def get_ssn(self): + + f = Fernet(settings.ENCRYPTION_KEY) + + return f.decrypt(self.ssn_encrypted).decode()""", + why_context="Legal compliance requirement for GDPR and HIPAA. Audit found unencrypted PII in database. This change is required before Q2 compliance review to avoid potential $2.7M fine.", + should_enhance=True, + expected_improvement=1.8, + context_quality=ContextQuality.GOOD, + ), + # BAD WHY CONTEXT (should NOT enhance) + WhyContextTestCase( + name="technical_implementation_context", + commit_message="refactor: extract validation logic into separate class", + git_diff="""--- a/src/controllers/UserController.py + +++ b/src/controllers/UserController.py + @@ -8,6 +8,4 @@ class UserController: + def create_user(self, data): + - if not data.get('email'): + - raise ValueError('Email required') + + UserValidator.validate_email(data) + return User.create(data)""", + why_context="The previous implementation had validation logic directly in the controller. This refactor moves it to a separate UserValidator class following single responsibility principle and makes the code more maintainable.", + should_enhance=False, + expected_improvement=0.0, + context_quality=ContextQuality.TECHNICAL, + ), + WhyContextTestCase( + name="redundant_context", + commit_message="fix: handle null values in email validation", + git_diff="""--- a/src/utils/validator.js + +++ b/src/utils/validator.js + @@ -5,7 +5,7 @@ function validateEmail(email) { + - return email.includes('@'); + + return email && email.includes('@'); + }""", + why_context="This change fixes null pointer exceptions when email is null by adding a null check before calling includes method.", + should_enhance=False, + expected_improvement=0.0, + context_quality=ContextQuality.REDUNDANT, + ), + WhyContextTestCase( + name="test_coverage_context", + commit_message="test: add unit tests for authentication service", + git_diff="""--- /dev/null + +++ b/tests/auth.test.js + @@ -0,0 +1,25 @@ + +describe('AuthService', () => { + + test('should authenticate valid user', () => { + + // test implementation + + }); + +});""", + why_context="Added comprehensive test coverage for the authentication service to ensure code quality and catch regressions. 
Tests cover both success and failure scenarios.", + should_enhance=False, + expected_improvement=0.0, + context_quality=ContextQuality.TECHNICAL, + ), + # BORDERLINE CASES + WhyContextTestCase( + name="mixed_context", + commit_message="fix: update API response format for user endpoints", + git_diff="""--- a/src/api/users.py + +++ b/src/api/users.py + @@ -15,7 +15,10 @@ def get_user(user_id): + user = User.find(user_id) + - return {'user': user.to_dict()} + + return { + + 'user': user.to_dict(), + + 'timestamp': datetime.now().isoformat() + + }""", + why_context="The mobile app team requested timestamp information for caching purposes. This was causing cache invalidation issues. Also updated the response format to be more consistent with other endpoints.", + should_enhance=True, + expected_improvement=1.0, + context_quality=ContextQuality.GOOD, # Mixed but has user problem + ), + ] + + def run_enhancement_quality_test( + self, test_cases: List[WhyContextTestCase] + ) -> Dict[str, Any]: + """Test quality of WHY context enhancement""" + + console.print( + Panel( + "[bold]Testing WHY Context Enhancement Quality[/bold]\n" + "Comparing original vs enhanced commit messages", + title="🔍 Enhancement Quality Test", + border_style="blue", + ) + ) + + results = [] + correct_decisions = 0 + total_improvement = 0 + + for case in test_cases: + console.print(f"[blue]Testing: {case.name}[/blue]") + + # Generate original message evaluation + original_eval = self.evaluator.evaluate_commit_message( + case.commit_message, case.git_diff + ) + + # Generate enhanced message + enhanced_result = self.generator.enhance_with_why_context( + GenerationResult(message=case.commit_message), case.why_context + ) + + # Evaluate enhanced message + enhanced_eval = self.evaluator.evaluate_commit_message( + enhanced_result.message, case.git_diff + ) + + # Calculate improvement + why_improvement = enhanced_eval.why_score - original_eval.why_score + overall_improvement = ( + enhanced_eval.overall_score - original_eval.overall_score + ) + + # Check if enhancement decision was correct + was_enhanced = enhanced_result.message != case.commit_message + correct_decision = was_enhanced == case.should_enhance + + if correct_decision: + correct_decisions += 1 + + total_improvement += why_improvement + + result = { + "case_name": case.name, + "context_quality": case.context_quality, + "should_enhance": case.should_enhance, + "was_enhanced": was_enhanced, + "correct_decision": correct_decision, + "original_why_score": original_eval.why_score, + "enhanced_why_score": enhanced_eval.why_score, + "why_improvement": why_improvement, + "overall_improvement": overall_improvement, + "original_message": case.commit_message, + "enhanced_message": enhanced_result.message, + "expected_improvement": case.expected_improvement, + } + + results.append(result) + + # Calculate summary statistics + decision_accuracy = (correct_decisions / len(test_cases)) * 100 + avg_improvement = total_improvement / len(test_cases) + good_context_cases = [ + r for r in results if r["context_quality"] == ContextQuality.GOOD + ] + bad_context_cases = [ + r + for r in results + if r["context_quality"] + in [ContextQuality.TECHNICAL, ContextQuality.REDUNDANT] + ] + + summary = { + "decision_accuracy": decision_accuracy, + "average_why_improvement": avg_improvement, + "good_context_avg_improvement": sum( + r["why_improvement"] for r in good_context_cases + ) + / len(good_context_cases) + if good_context_cases + else 0, + "bad_context_avg_improvement": sum( + r["why_improvement"] 
for r in bad_context_cases + ) + / len(bad_context_cases) + if bad_context_cases + else 0, + "enhancement_rate": sum(1 for r in results if r["was_enhanced"]) + / len(results) + * 100, + } + + self._display_enhancement_results(results, summary) + + return { + "summary": summary, + "individual_results": results, + "passed": decision_accuracy >= 70 + and summary["good_context_avg_improvement"] > 0.5, + } + + def _display_enhancement_results( + self, results: List[Dict], summary: Dict[str, Any] + ): + """Display enhancement quality results""" + + table = Table(title="WHY Context Enhancement Results", box=box.SIMPLE) + table.add_column("Case", style="cyan") + table.add_column("Quality", justify="center") + table.add_column("Should Enhance", justify="center") + table.add_column("Was Enhanced", justify="center") + table.add_column("Decision", justify="center") + table.add_column("WHY Δ", justify="center") + + for result in results: + decision_color = "green" if result["correct_decision"] else "red" + decision_icon = "✅" if result["correct_decision"] else "❌" + + table.add_row( + result["case_name"], + result["context_quality"].value, + "Yes" if result["should_enhance"] else "No", + "Yes" if result["was_enhanced"] else "No", + f"[{decision_color}]{decision_icon}[/{decision_color}]", + f"{result['why_improvement']:+.1f}", + ) + + console.print(table) + + # Summary stats + console.print( + f"\n[blue]Decision Accuracy: {summary['decision_accuracy']:.1f}%[/blue]" + ) + console.print( + f"[blue]Average WHY Improvement: {summary['average_why_improvement']:+.2f}[/blue]" + ) + console.print( + f"[green]Good Context Improvement: {summary['good_context_avg_improvement']:+.2f}[/green]" + ) + console.print( + f"[red]Bad Context Improvement: {summary['bad_context_avg_improvement']:+.2f}[/red]" + ) + + +# Usage example: +def main(): + suite = WhyContextBenchmarkSuite() + + # Test enhancement quality + enhancement_results = suite.run_enhancement_quality_test( + suite.get_why_context_test_cases() + ) + + overall_passed = enhancement_results["passed"] + + if overall_passed: + console.print("\n[green]🎉 WHY Context Enhancement is working well![/green]") + else: + console.print("\n[red]❌ WHY Context Enhancement needs improvement[/red]") + + +if __name__ == "__main__": + main() diff --git a/src/diffmage/ai/client.py b/src/diffmage/ai/client.py index 2923cd7..1e27420 100644 --- a/src/diffmage/ai/client.py +++ b/src/diffmage/ai/client.py @@ -22,7 +22,7 @@ class AIClient: """ def __init__( - self, model_name: str, temperature: float = 0.1, max_tokens: int = 1000 + self, model_name: str, temperature: float = 0.0, max_tokens: int = 1000 ): self.model_config = get_model_by_name(model_name) self.temperature = temperature diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index ed14b49..c7c7efb 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -30,29 +30,29 @@ def get_commit_prompt( prompt = f"""Analyze the following and generate a concise commit message {context_info}: - Instructions: - - Use conventional commits format:(): - - Only return the commit message, nothing else. - - Use imperative mood ("add", "fix", "update", not "added", "fixed", "updated") - - Focus on WHAT changed and WHY, not HOW. Consider WHAT was impacted and WHY it was needed. + Instructions: + - Use conventional commits format:(): + - Only return the commit message, nothing else. 
+ - Use imperative mood ("add", "fix", "update", not "added", "fixed", "updated") + - Focus on WHAT changed and WHY, not HOW. Consider WHAT was impacted and WHY it was needed. - Common types: - - feat: New feature, enhancement, or functionality - - fix: Bug fix or error correction - - refactor: Code restructuring for readability, performance, or maintainability - - docs: Documentation changes only - - test: Adding or updating tests - - chore: Maintenance, dependencies, build changes + Common types: + - feat: New feature, enhancement, or functionality + - fix: Bug fix or error correction + - refactor: Code restructuring for readability, performance, or maintainability + - docs: Documentation changes only + - test: Adding or updating tests + - chore: Maintenance, dependencies, build changes - - {diff_content} - + + {diff_content} + - Generate the commit message, nothing else. + Generate the commit message, nothing else. - Commit message: - """ + Commit message: + """ return prompt @@ -63,15 +63,56 @@ def get_generation_system_prompt() -> str: """ return """ - Your commit messages are: - - Descriptive but brief - - Written in imperative mood - - Focused on the purpose and impact of changes - - Helpful for other developers and future maintenance - - Following industry best practices for writing commit messages + Your commit messages are: + - Descriptive but brief + - Written in imperative mood + - Focused on the purpose and impact of changes + - Helpful for other developers and future maintenance + - Following industry best practices for writing commit messages + + Analyze code changes carefully and generate commit messages that accurately describe what was changed and why. + """ + + +def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: + """ + Build the prompt for enhancing an existing commit message with external 'why' context. + """ + return f"""You are a Git expert. Your task is to decide whether to enhance a commit message with external context. + + STRICT EVALUATION CRITERIA: + - Does the explain a USER PROBLEM or BUSINESS NEED that's not obvious from the code changes or ? + - Does it add meaningful insight about WHY this change was necessary? + - WHY is defined as the rationale and intent behind the changes made in a git commit. It goes beyond simply stating what was changed and delves into why those changes were necessary. + - Does the avoid redundant technical implementation details already clear from the diff? + + If any of these are true, return the EXACTLY as provided. + + + {preliminary_message} + + + + {why_context} + - Analyze code changes carefully and generate commit messages that accurately describe what was changed and why. - """ + + 1. First decide: Does this add valuable WHY information that explains user problems, business needs, or meaningful impact that is not already clear from the ? + + 2. If NO (context is technical details, obvious info, or redundant): + - Return the EXACTLY as provided + + 3. If YES ( explains real user problems, value, impact, reasoning, intent, context, background, implications): + - Focus on: What problem did this solve? What was the reasoning and intent? What is the context or background? What are the implications and impact? + - A gold standard "WHY" of a commit message explains the rationale and intent behind the changes made in a git commit. It goes beyond simply stating what was changed and delves into why those changes were necessary. 
+ - Focus on this scenario: If a developer is to come across this in a git blame months down the road, what would they want and need to know? + - Keep it under 150 words total addition + - DO NOT make up information that is not in the or that can not be inferred directly from the + + + + ONLY RETURN THE FINAL COMMIT MESSAGE that will be submitted to the git commit. DO NOT INCLUDE YOUR REASONING OR ANYTHING ELSE. + + final commit message:""" # Evaluation prompts @@ -83,11 +124,12 @@ def get_evaluation_system_prompt() -> str: """ return """ - Your commit messages are: - - Descriptive but brief - - Written in imperative mood - - Focused on the purpose and impact of changes - """ + Your commit messages are: + - Descriptive but brief + - Written in imperative mood + - Focused on the purpose and impact of changes + - Focus on the accuracy, validity, and truthfulness of what is being described. + """ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: @@ -97,94 +139,167 @@ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: return f"""You are an expert code reviewer evaluating commit message quality using Chain-of-Though reasoning. + If the commit message is untruthful, inaccurate, or misrepresents the code changes, return a score of 1 for both WHAT and WHY. - WHAT (1-5): How accurately and completely does the message describe the actual code changes? - * 5 = All changes captured accurately and completely - * 3 = Main changes described, some details missing - * 1 = Major changes omitted or misrepresented - - WHY (1-5): How clearly does it explain the purpose/reasoning/impact of the changes? Be more lenient on low impact changes. - - Scale: 1=Very Poor, 2=Poor, 3=Average, 4=Good, 5=Excellent + * 5 = All high impact changes captured accurately and completely with precise details. + * 4 = Main changes described accurately and clearly, only minor details missing + * 3 = Core changes mentioned, some important details missing + * 2 = Basic changes mentioned, significant gaps in description + * 1 = Major changes omitted, misrepresented, not described, or inaccurate. The message is completely non-descriptive and could apply to any commit. It provides zero insight into the change, location or substance. All changes are effectively omitted. + + - WHY (1-5): How clearly does it explain the PURPOSE/REASONING/IMPACT of the changes? + * 5 = Extremely clear user problem + impact, intent, reasoning, context, background, implications explained. + * 4 = Strong rationale with clear purpose/reasoning/impact/background/motivation. + * 3 = Basic purpose/reasoning/impact/background/motivation explained, could be more detailed. + * 2 = Minimal or technical-only reasoning, no user problem, impact, intent, reasoning, context, background, or implications explained + * 1 = No explanation of purpose/impact. The message is completely non-descriptive and could apply to any commit. It provides zero insight into the change, location or substance. All changes are effectively omitted. + + - If the change is low impact (docs, refactoring, type safety, performance optimizations, tests, dependencies, configs), be lenient with the WHY score. Otherwise, adhere to the provided . + - If the WHY can be inferred from the , brief benefit mentions are sufficient and leniency is allowed for the WHY score. + - DO NOT penalize the WHY score for not explaining obvious, low impact technical benefits in depth. 
+ + Example 1: - COMMIT: "fix bug" + COMMIT: "update config" DIFF: - --- a/src/auth.py - +++ b/src/auth.py - @@ -15,1 +15,2 @@ def authenticate_user(username, password): - user = get_user(username) - - return user.validate_password(password) - + if user: - + return user.validate_password(password) + --- a/config/database.yml + +++ b/config/database.yml + @@ -5,7 +5,7 @@ production: + adapter: postgresql + encoding: unicode + database: myapp_production + - pool: 5 + + pool: 25 + username: <%= ENV['DATABASE_USER'] %> + password: <%= ENV['DATABASE_PASS'] %> ANALYSIS: - - WHAT: 2/5 - Too vague, doesn't specify what bug or where - - WHY: 1/5 - No explanation of impact or reasoning + - WHAT: 2/5 - Basic changes mentioned but significant gaps: doesn't specify what config change or why pool size matters + - WHY: 1/5 - No explanation of purpose/impact beyond obvious technical changes Example 2: - COMMIT: "feat(auth): add JWT authentication to secure API endpoints" + COMMIT: "chore: upgrade React from v16 to v18 for better performance + + Updates all React dependencies and components to use new APIs. Improves rendering + performance and enables concurrent features for smoother user interactions." + DIFF: - --- a/src/middleware/auth.py - +++ b/src/middleware/auth.py - @@ -0,0 +1,12 @@ - +import jwt - +from flask import request, jsonify + --- a/package.json + +++ b/package.json + @@ -10,8 +10,8 @@ + "dependencies": + - "react": "^16.14.0", + - "react-dom": "^16.14.0", + + "react": "^18.2.0", + + "react-dom": "^18.2.0", + "react-router": "^6.0.0" + + --- a/src/App.jsx + +++ b/src/App.jsx + @@ -1,4 +1,4 @@ + -import (render) from 'react-dom'; + +import (createRoot) from 'react-dom/client'; + + -(render)(, document.getElementById('root')); + +(const root = createRoot(document.getElementById('root')); + +root.render(); + + ANALYSIS: + - WHAT: 4/5 - Main changes described accurately: version upgrade and API updates, minor implementation details missing + - WHY: 3/5 - Basic purpose explained (performance, concurrent features) but could be clearer about specific benefits + + Example 3: + COMMIT: "fix: correct email validation regex to prevent invalid addresses + + Email validation was accepting malformed addresses like 'user@' and 'invalid.email', + causing downstream errors in the notification system and user registration failures." + + DIFF: + --- a/src/utils/validation.js + +++ b/src/utils/validation.js + @@ -8,7 +8,7 @@ export const validateEmail = (email) => ( + return false; + + - const emailRegex = /S+@S+/; + + const emailRegex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+.[a-zA-Z](2,)$/; + return emailRegex.test(email) + + ANALYSIS: + - WHAT: 4/5 - Main changes described clearly: email validation fix with specific problem examples + - WHY: 4/5 - Good rationale with clear benefit: prevents downstream errors and registration issues + + Example 4: + COMMIT: "feat: implement caching layer reducing API response time by 75% + + Added Redis caching for frequently accessed product data after observing + average API response times of 2.4 seconds affecting user experience. + Cache hits now serve requests in 300ms, improving page load performance + and reducing database load by 60%. + + Implements TTL of 1 hour with automatic cache invalidation on product updates." 
+ + DIFF: + --- a/src/services/productService.js + +++ b/src/services/productService.js + @@ -1,4 +1,5 @@ + const database = require('../database'); + +const redis = require('redis'); + +const client = redis.createClient(); + + class ProductService ( + async getProduct(id) + + // Check cache first + + const cached = await client.get(`product:$(id)`); + + if (cached) ) + + return JSON.parse(cached); + + ) + - +def require_auth(f): - + def decorated(*args, **kwargs): - + token = request.headers.get('Authorization') - + if not token: - + return jsonify({{'error': 'No token provided'}}), 401 - + try: - + jwt.decode(token, app.config['SECRET_KEY']) - + except jwt.InvalidTokenError: - + return jsonify({{'error': 'Invalid token'}}), 401 - + return f(*args, **kwargs) - + return decorated - - + def add_items_to_cart(cart, items): - + # adds items to the cart - + if not items: - + raise ValueError('No items to add') - + cart.add_items(items) + const product = await database.query('SELECT * FROM products WHERE id = ?', [id]); + + + + // Cache for 1 hour + + await client.setex(`product:$(id)`, 3600, JSON.stringify(product)); + return product; + ) + ) ANALYSIS: - - WHAT: 3/5 - The commit message is missing important added functionality (doesn't mention the add_items_to_cart function) - - WHY: 4/5 - Clear security purpose, could detail specific benefits + - WHAT: 5/5 - All high impact changes captured with precise details: Redis integration, caching logic, TTL configuration + - WHY: 5/5 - Clear user problem (2.4s response times) + quantified impact (75% reduction, 300ms response) + comprehensive context (database load reduction) + Example 5: + COMMIT: "test: add unit tests for password hashing utility" - Example 3: - COMMIT: "refactor: extract validation logic into UserService class" DIFF: - --- a/src/controllers/user_controller.py - +++ b/src/controllers/user_controller.py - @@ -8,6 +8,4 @@ class UserController: - def create_user(self, data): - - if not data.get('email'): - - raise ValueError('Email required') - - if len(data.get('password', '')) < 8: - - raise ValueError('Password too short') - + UserService.validate_user_data(data) - return User.create(data) - --- a/src/services/user_service.py - +++ b/src/services/user_service.py - @@ -0,0 +1,8 @@ - +class UserService: - + @staticmethod - + def validate_user_data(data): - + if not data.get('email'): - + raise ValueError('Email required') - + if len(data.get('password', '')) < 8: - + raise ValueError('Password too short') + --- /dev/null + +++ b/tests/utils/passwordUtils.test.js + @@ -0,0 +1,25 @@ + +const ( hashPassword, verifyPassword ) = require('../../src/utils/passwordUtils'); + + + +describe('Password Utils', () => ( + + test('should hash password correctly', () => ( + + const password = 'testpassword123'; + + const hash = hashPassword(password); + + expect(hash).toBeDefined(); + + expect(hash).not.toBe(password); + + ) + + + + test('should verify correct password', () => ( + + const password = 'testpassword123'; + + const hash = hashPassword(password); + + expect(verifyPassword(password, hash)).toBe(true); + + ) + +); ANALYSIS: - - WHAT: 5/5 - Exact description of the refactoring action - - WHY: 4/5 - Good architectural reasoning (separation of concerns) + - WHAT: 3/5 - Core changes described: test addition for password utility, some important details missing about test coverage + - WHY: 2/5 - Minimal reasoning: no explanation of why tests were needed or what problems they prevent - NOW EVALUATE THE FOLLOWING COMMIT MESSAGE: {commit_message} 
@@ -211,6 +326,8 @@ def create_user(self, data): - Keep it concise (3-5 sentences) + ONLY RETURN THE JSON RESPONSE, NOTHING ELSE. + REQUIRED JSON RESPONSE: {{ "what_score": <1-5>, diff --git a/src/diffmage/cli/generate.py b/src/diffmage/cli/generate.py index 486504c..41b8675 100644 --- a/src/diffmage/cli/generate.py +++ b/src/diffmage/cli/generate.py @@ -15,6 +15,9 @@ def generate( False, "--list-models", help="List all available models" ), repo_path: str = typer.Option(".", "--repo", "-r"), + why_context: str = typer.Option( + None, "--context", "-c", help="Context to use to improve the WHY generation" + ), ) -> None: """Generate commit message from staged changes""" @@ -33,7 +36,7 @@ def generate( service = GenerationService(model_name=model_config.name) request = GenerationRequest(repo_path=repo_path) - result = service.generate_commit_message(request) + result = service.generate_commit_message(request, why_context) console.print(f"[green]Commit message:[/green] {result.message}") diff --git a/src/diffmage/evaluation/commit_message_evaluator.py b/src/diffmage/evaluation/commit_message_evaluator.py index acb0fe7..7cd7d5f 100644 --- a/src/diffmage/evaluation/commit_message_evaluator.py +++ b/src/diffmage/evaluation/commit_message_evaluator.py @@ -25,7 +25,7 @@ class CommitMessageEvaluator: """ - def __init__(self, model_name: Optional[str] = None, temperature: float = 0.7): + def __init__(self, model_name: Optional[str] = None, temperature: float = 0.0): """ Initialize the LLM evaluator. diff --git a/src/diffmage/generation/commit_message_generator.py b/src/diffmage/generation/commit_message_generator.py index 2a8e04f..be54af1 100644 --- a/src/diffmage/generation/commit_message_generator.py +++ b/src/diffmage/generation/commit_message_generator.py @@ -3,7 +3,7 @@ from typing import Optional from diffmage.ai.client import AIClient from diffmage.ai.models import get_default_model -from diffmage.ai.prompt_manager import get_commit_prompt +from diffmage.ai.prompt_manager import get_commit_prompt, get_why_context_prompt from diffmage.generation.models import GenerationResult @@ -14,7 +14,7 @@ class CommitMessageGenerator: This specialist focuses solely on the generation logic and prompt handling """ - def __init__(self, model_name: Optional[str] = None, temperature: float = 0.1): + def __init__(self, model_name: Optional[str] = None, temperature: float = 0.0): """ Initialize the LLM Generator @@ -55,6 +55,29 @@ def generate_commit_message( except Exception as e: raise ValueError(f"Error generating commit message: {e}") + def enhance_with_why_context( + self, result: GenerationResult, why_context: str + ) -> GenerationResult: + """ + Enhance existing commit message with external why context + + Args: + result: Initial commit message result + why_context: External business/contextual information + + Returns: + Enhanced GenerationResult with integrated context + """ + if not why_context: + return result + + prompt = get_why_context_prompt(result.message, why_context) + try: + message = self.client.generate_commit_message(prompt) + return GenerationResult(message=message.strip()) + except Exception as e: + raise ValueError(f"Error enhancing commit message with why context: {e}") + def _build_prompt( self, git_diff: str, file_count: int, lines_added: int, lines_removed: int ) -> str: diff --git a/src/diffmage/generation/service.py b/src/diffmage/generation/service.py index 0cfeeb2..45fef47 100644 --- a/src/diffmage/generation/service.py +++ b/src/diffmage/generation/service.py @@ -12,7 +12,9 @@ 
class GenerationService: def __init__(self, model_name: Optional[str] = None) -> None: self.generator = CommitMessageGenerator(model_name=model_name) - def generate_commit_message(self, request: GenerationRequest) -> GenerationResult: + def generate_commit_message( + self, request: GenerationRequest, why_context: Optional[str] = None + ) -> GenerationResult: """ Generate a commit message from staged changes @@ -33,4 +35,10 @@ def generate_commit_message(self, request: GenerationRequest) -> GenerationResul git_diff, file_count, lines_added, lines_removed ) + # handle why context separately to separate concerns and handle llm focus. + # handling why context in the intial generate_commit_message was adding too + # much noise to the commit message generation. + if why_context: + return self.generator.enhance_with_why_context(result, why_context) + return result diff --git a/tests/ai/test_client.py b/tests/ai/test_client.py index e5a0b0c..9eb4bd3 100644 --- a/tests/ai/test_client.py +++ b/tests/ai/test_client.py @@ -68,18 +68,18 @@ def test_ai_client_initialization(): client = AIClient(model_name="openai/gpt-4o-mini") assert client.model_config.name == "openai/gpt-4o-mini" - assert client.temperature == 0.1 + assert client.temperature == 0.0 assert client.max_tokens == 1000 def test_ai_client_initialization_with_custom_params(): """Test AIClient initialization with custom parameters.""" client = AIClient( - model_name="anthropic/claude-haiku", temperature=0.5, max_tokens=2000 + model_name="anthropic/claude-haiku", temperature=0.0, max_tokens=2000 ) assert client.model_config.name == "anthropic/claude-haiku" - assert client.temperature == 0.5 + assert client.temperature == 0.0 assert client.max_tokens == 2000 @@ -109,7 +109,7 @@ def test_generate_commit_message_success(mock_completion, mock_ai_response): mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "openai/gpt-4o-mini" - assert call_args[1]["temperature"] == 0.1 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1000 assert call_args[1]["stream"] is False @@ -144,7 +144,7 @@ def test_generate_commit_message_with_custom_params( # Create client with custom parameters client = AIClient( - model_name="anthropic/claude-haiku", temperature=0.7, max_tokens=1500 + model_name="anthropic/claude-haiku", temperature=0.0, max_tokens=1500 ) result = client.generate_commit_message(mock_commit_analysis) @@ -155,7 +155,7 @@ def test_generate_commit_message_with_custom_params( mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "anthropic/claude-haiku" - assert call_args[1]["temperature"] == 0.7 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1500 @@ -215,7 +215,7 @@ def test_evaluate_with_llm_success(mock_completion, mock_evaluation_response): mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "openai/gpt-4o-mini" - assert call_args[1]["temperature"] == 0.1 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1000 assert call_args[1]["stream"] is False @@ -269,7 +269,7 @@ def test_evaluate_with_llm_with_custom_params( # Create client with custom parameters client = AIClient( - model_name="anthropic/claude-haiku", temperature=0.7, max_tokens=1500 + model_name="anthropic/claude-haiku", temperature=0.0, max_tokens=1500 ) evaluation_prompt = "test evaluation prompt" result = 
client.evaluate_with_llm(evaluation_prompt) @@ -281,5 +281,5 @@ def test_evaluate_with_llm_with_custom_params( mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "anthropic/claude-haiku" - assert call_args[1]["temperature"] == 0.7 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1500 diff --git a/tests/evaluation/test_llm_evaluator.py b/tests/evaluation/test_llm_evaluator.py index ae4e327..65a67a5 100644 --- a/tests/evaluation/test_llm_evaluator.py +++ b/tests/evaluation/test_llm_evaluator.py @@ -20,9 +20,9 @@ def test_init_with_custom_model(self): def test_init_with_custom_temperature(self): """Test LLMEvaluator initialization with custom temperature.""" - evaluator = CommitMessageEvaluator(temperature=0.5) + evaluator = CommitMessageEvaluator(temperature=0.0) - assert evaluator.ai_client.temperature == 0.5 + assert evaluator.ai_client.temperature == 0.0 def test_evaluate_commit_message_success(self): """Test successful commit message evaluation.""" diff --git a/tests/generation/test_commit_message_generator.py b/tests/generation/test_commit_message_generator.py index 4aef2bf..d217814 100644 --- a/tests/generation/test_commit_message_generator.py +++ b/tests/generation/test_commit_message_generator.py @@ -36,9 +36,9 @@ def test_init_with_custom_model(self): def test_init_with_custom_temperature(self): """Test CommitMessageGenerator initialization with custom temperature.""" - generator = CommitMessageGenerator(temperature=0.5) + generator = CommitMessageGenerator(temperature=0.0) - assert generator.client.temperature == 0.5 + assert generator.client.temperature == 0.0 def test_generate_commit_message_success(self): """Test successful commit message generation.""" @@ -97,3 +97,58 @@ def test_build_prompt(self): mock_get_prompt.assert_called_once_with( diff_content="test diff", file_count=1, lines_added=5, lines_removed=2 ) + + def test_enhance_with_why_context_success(self): + """Test successful enhancement with why context.""" + generator = CommitMessageGenerator(model_name="openai/gpt-4o-mini") + initial_result = GenerationResult(message="feat: add authentication") + why_context = "Implementing OAuth2 for better security compliance" + + with patch( + "diffmage.generation.commit_message_generator.get_why_context_prompt" + ) as mock_get_prompt: + mock_get_prompt.return_value = "Enhanced prompt" + + with patch.object( + generator.client, + "generate_commit_message", + return_value="feat: implement OAuth2 authentication for security compliance", + ): + result = generator.enhance_with_why_context(initial_result, why_context) + + assert isinstance(result, GenerationResult) + assert ( + result.message + == "feat: implement OAuth2 authentication for security compliance" + ) + mock_get_prompt.assert_called_once_with( + "feat: add authentication", why_context + ) + + def test_enhance_with_why_context_empty_context(self): + """Test enhancement with empty why context returns original result.""" + generator = CommitMessageGenerator() + initial_result = GenerationResult(message="feat: add authentication") + + result = generator.enhance_with_why_context(initial_result, "") + + assert result is initial_result + + def test_enhance_with_why_context_ai_error(self): + """Test enhancement when AI call fails.""" + generator = CommitMessageGenerator() + initial_result = GenerationResult(message="feat: add authentication") + why_context = "Adding for security" + + with patch( + 
"diffmage.generation.commit_message_generator.get_why_context_prompt" + ): + with patch.object( + generator.client, + "generate_commit_message", + side_effect=Exception("API Error"), + ): + with pytest.raises( + ValueError, match="Error enhancing commit message with why context" + ): + generator.enhance_with_why_context(initial_result, why_context) diff --git a/tests/generation/test_service.py b/tests/generation/test_service.py index b159aa6..90c5494 100644 --- a/tests/generation/test_service.py +++ b/tests/generation/test_service.py @@ -109,3 +109,70 @@ def test_generate_commit_message_success( assert call_args[0][1] == mock_commit_analysis.total_files assert call_args[0][2] == mock_commit_analysis.total_lines_added assert call_args[0][3] == mock_commit_analysis.total_lines_removed + + @patch("diffmage.generation.service.CommitMessageGenerator") + @patch("diffmage.generation.service.GitDiffParser") + def test_generate_commit_message_with_why_context( + self, mock_git_parser_class, mock_generator_class, mock_commit_analysis + ): + """Test commit message generation with why context enhancement.""" + mock_parser = Mock() + mock_parser.parse_staged_changes.return_value = mock_commit_analysis + mock_git_parser_class.return_value = mock_parser + + mock_generator = Mock() + initial_result = GenerationResult(message="feat: add new feature") + enhanced_result = GenerationResult( + message="feat: add user authentication for improved security" + ) + mock_generator.generate_commit_message.return_value = initial_result + mock_generator.enhance_with_why_context.return_value = enhanced_result + mock_generator_class.return_value = mock_generator + + # Create service and request + service = GenerationService(model_name="openai/gpt-4o-mini") + request = GenerationRequest(repo_path=".") + why_context = "Adding authentication to improve security posture" + + # Execute + result = service.generate_commit_message(request, why_context=why_context) + + # Verify + assert isinstance(result, GenerationResult) + assert result.message == "feat: add user authentication for improved security" + + # Verify both methods were called + mock_generator.generate_commit_message.assert_called_once() + mock_generator.enhance_with_why_context.assert_called_once_with( + initial_result, why_context + ) + + @patch("diffmage.generation.service.CommitMessageGenerator") + @patch("diffmage.generation.service.GitDiffParser") + def test_generate_commit_message_with_empty_why_context( + self, mock_git_parser_class, mock_generator_class, mock_commit_analysis + ): + """Test commit message generation with empty why context (should not call enhance).""" + mock_parser = Mock() + mock_parser.parse_staged_changes.return_value = mock_commit_analysis + mock_git_parser_class.return_value = mock_parser + + mock_generator = Mock() + initial_result = GenerationResult(message="feat: add new feature") + mock_generator.generate_commit_message.return_value = initial_result + mock_generator_class.return_value = mock_generator + + # Create service and request + service = GenerationService(model_name="openai/gpt-4o-mini") + request = GenerationRequest(repo_path=".") + + # Execute with empty why_context + result = service.generate_commit_message(request, why_context="") + + # Verify + assert isinstance(result, GenerationResult) + assert result.message == "feat: add new feature" + + # Verify enhance was NOT called + mock_generator.generate_commit_message.assert_called_once() + mock_generator.enhance_with_why_context.assert_not_called() diff --git a/uv.lock 
b/uv.lock index 829e8b0..da15447 100644 --- a/uv.lock +++ b/uv.lock @@ -314,7 +314,7 @@ wheels = [ [[package]] name = "diffmage" -version = "0.5.0" +version = "0.6.2" source = { editable = "." } dependencies = [ { name = "gitpython" },
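
Reviewer sketch (not part of the patch): a minimal example of the WHY-context flow this diff introduces. Class, method, and parameter names are taken from the changes above; the import location of GenerationRequest is an assumption (taken to live in diffmage.generation.models alongside GenerationResult).

    from diffmage.generation.service import GenerationService
    from diffmage.generation.models import GenerationRequest  # assumed module path

    def generate_with_context(repo_path: str, why_context: str) -> str:
        # Mirrors what the new --context/-c CLI option wires up: generate a
        # preliminary message, then route it through enhance_with_why_context.
        service = GenerationService(model_name="openai/gpt-4o-mini")
        request = GenerationRequest(repo_path=repo_path)
        result = service.generate_commit_message(request, why_context=why_context)
        return result.message

Passing an empty why_context skips enhancement and returns the preliminary message unchanged, matching the behavior covered by test_generate_commit_message_with_empty_why_context.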