From 741958537349ef7cf8e08cd0feacac83e623d870 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Tue, 5 Aug 2025 20:56:49 +0700 Subject: [PATCH 01/13] feat(generation): add support for enhancing commit messages with external why context - Introduce new prompt template for integrating external business/contextual information - Add method to enhance existing commit messages with why context - Modify service layer to accept optional why_context parameter - Separate why context handling from initial commit message generation to reduce noise and improve focus This change addresses inconsistent "why" scores in commit messages by implementing a two-stage prompting approach. Previously, the model would prioritize the core diff over external context when both were provided in a single prompt, resulting in strong "what" descriptions but weak "whys." The solution separates the concerns: first generating a preliminary message from the diff, then enhancing it with a second prompt that specifically focuses on integrating the "why" context. This approach improves accuracy and reduces noise, aligning with RAG patterns for better context-aware commit messages. --- src/diffmage/ai/prompt_manager.py | 31 +++++++++++++++++++ .../generation/commit_message_generator.py | 25 ++++++++++++++- src/diffmage/generation/service.py | 12 ++++++- 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index ed14b49..ede4637 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -74,6 +74,37 @@ def get_generation_system_prompt() -> str: """ +def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: + """ + Build the prompt for enhancing an existing commit message with external 'why' context. 
+ """ + return f""" + You are a professional software engineer and an expert at writing concise, high-quality Git commit messages that follow the Conventional Commits specification. + + Your task is to refine and enhance the preliminary commit message by improving and expanding upon the "WHY" part of the commit message by integrating external context. + + Do not include any additional explanatory text, commentary, or conversational filler. The output should be ready to be copied and pasted directly into a `git commit` command. + + + {preliminary_message} + + + + {why_context} + + + + Lead with the problem. Start the WHY by clearly stating the problem or issue that the change is solving, state the solution, and end with the benefit. Be concise but thorough. + + Based on the provided information, return the exact with the enhanced "why" from the external context. Ensure the WHY follows the Conventional Commits format, ensuring the body clearly explains the problem and the benefit/impact of the change. + + Do not include any additional explanatory text, commentary, or conversational filler. The output should be ready to be copied and pasted directly into a `git commit` command. 
+ + + Final commit message: + """ + + # Evaluation prompts diff --git a/src/diffmage/generation/commit_message_generator.py b/src/diffmage/generation/commit_message_generator.py index 2a8e04f..35b18f9 100644 --- a/src/diffmage/generation/commit_message_generator.py +++ b/src/diffmage/generation/commit_message_generator.py @@ -3,7 +3,7 @@ from typing import Optional from diffmage.ai.client import AIClient from diffmage.ai.models import get_default_model -from diffmage.ai.prompt_manager import get_commit_prompt +from diffmage.ai.prompt_manager import get_commit_prompt, get_why_context_prompt from diffmage.generation.models import GenerationResult @@ -55,6 +55,29 @@ def generate_commit_message( except Exception as e: raise ValueError(f"Error generating commit message: {e}") + def enhance_with_why_context( + self, result: GenerationResult, why_context: str + ) -> GenerationResult: + """ + Enhance existing commit message with external why context + + Args: + result: Initial commit message result + why_context: External business/contextual information + + Returns: + Enhanced GenerationResult with integrated context + """ + if not why_context: + return result + + prompt = get_why_context_prompt(str(result), why_context) + try: + message = self.client.generate_commit_message(prompt) + return GenerationResult(message=message.strip()) + except Exception as e: + raise ValueError(f"Error enhancing commit message with why context: {e}") + def _build_prompt( self, git_diff: str, file_count: int, lines_added: int, lines_removed: int ) -> str: diff --git a/src/diffmage/generation/service.py b/src/diffmage/generation/service.py index 0cfeeb2..879b721 100644 --- a/src/diffmage/generation/service.py +++ b/src/diffmage/generation/service.py @@ -12,7 +12,9 @@ class GenerationService: def __init__(self, model_name: Optional[str] = None) -> None: self.generator = CommitMessageGenerator(model_name=model_name) - def generate_commit_message(self, request: GenerationRequest) -> 
GenerationResult: + def generate_commit_message( + self, request: GenerationRequest, why_context: Optional[str] = None + ) -> GenerationResult: """ Generate a commit message from staged changes @@ -33,4 +35,12 @@ def generate_commit_message(self, request: GenerationRequest) -> GenerationResul git_diff, file_count, lines_added, lines_removed ) + # handle why context separately to separate concerns and handle llm focus. + # handling why context in the intial generate_commit_message was adding too + # much noise to the commit message generation. + if why_context: + result: GenerationResult = self.generator.enhance_with_why_context( + result, why_context + ) + return result From 61c53068c210ea190c1cbb85868ab647b7b9039d Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 11:24:02 +0700 Subject: [PATCH 02/13] feat(generation): enhance commit messages with why context and add comprehensive tests - Modify CommitMessageGenerator to use message content instead of full object when enhancing with why context - Add test coverage for why context enhancement including success, empty context, and error scenarios - Extend GenerationService to support why context integration in commit message generation - Add service layer tests for commit message generation with and without why context - Improve error handling for AI enhancement failures The previous implementation passed the full result object instead of just the message content, causing unnecessary data transfer and potential serialization issues. This change ensures only the relevant message content is used for AI enhancement, improving efficiency and reducing complexity. Additionally, comprehensive tests were added to validate the new why context feature works correctly across different scenarios including success cases, empty contexts, and error conditions. This provides confidence in the feature's reliability and makes future maintenance easier. 
--- .../generation/commit_message_generator.py | 2 +- .../test_commit_message_generator.py | 55 +++++++++++++++ tests/generation/test_service.py | 67 +++++++++++++++++++ 3 files changed, 123 insertions(+), 1 deletion(-) diff --git a/src/diffmage/generation/commit_message_generator.py b/src/diffmage/generation/commit_message_generator.py index 35b18f9..07545f1 100644 --- a/src/diffmage/generation/commit_message_generator.py +++ b/src/diffmage/generation/commit_message_generator.py @@ -71,7 +71,7 @@ def enhance_with_why_context( if not why_context: return result - prompt = get_why_context_prompt(str(result), why_context) + prompt = get_why_context_prompt(result.message, why_context) try: message = self.client.generate_commit_message(prompt) return GenerationResult(message=message.strip()) diff --git a/tests/generation/test_commit_message_generator.py b/tests/generation/test_commit_message_generator.py index 4aef2bf..88693d4 100644 --- a/tests/generation/test_commit_message_generator.py +++ b/tests/generation/test_commit_message_generator.py @@ -97,3 +97,58 @@ def test_build_prompt(self): mock_get_prompt.assert_called_once_with( diff_content="test diff", file_count=1, lines_added=5, lines_removed=2 ) + + def test_enhance_with_why_context_success(self): + """Test successful enhancement with why context.""" + generator = CommitMessageGenerator(model_name="openai/gpt-4o-mini") + initial_result = GenerationResult(message="feat: add authentication") + why_context = "Implementing OAuth2 for better security compliance" + + with patch( + "diffmage.generation.commit_message_generator.get_why_context_prompt" + ) as mock_get_prompt: + mock_get_prompt.return_value = "Enhanced prompt" + + with patch.object( + generator.client, + "generate_commit_message", + return_value="feat: implement OAuth2 authentication for security compliance", + ): + result = generator.enhance_with_why_context(initial_result, why_context) + + assert isinstance(result, GenerationResult) + assert ( + 
result.message + == "feat: implement OAuth2 authentication for security compliance" + ) + mock_get_prompt.assert_called_once_with( + "feat: add authentication", why_context + ) + + def test_enhance_with_why_context_empty_context(self): + """Test enhancement with empty why context returns original result.""" + generator = CommitMessageGenerator() + initial_result = GenerationResult(message="feat: add authentication") + + result = generator.enhance_with_why_context(initial_result, "") + + assert result is initial_result + + def test_enhance_with_why_context_ai_error(self): + """Test enhancement when AI call fails.""" + generator = CommitMessageGenerator() + initial_result = GenerationResult(message="feat: add authentication") + why_context = "Adding for security" + + with patch( + "diffmage.generation.commit_message_generator.get_why_context_prompt" + ): + with patch.object( + generator.client, + "generate_commit_message", + side_effect=Exception("API Error"), + ): + with pytest.raises( + ValueError, match="Error enhancing commit message with why context" + ): + generator.enhance_with_why_context(initial_result, why_context) diff --git a/tests/generation/test_service.py b/tests/generation/test_service.py index b159aa6..90c5494 100644 --- a/tests/generation/test_service.py +++ b/tests/generation/test_service.py @@ -109,3 +109,70 @@ def test_generate_commit_message_success( assert call_args[0][1] == mock_commit_analysis.total_files assert call_args[0][2] == mock_commit_analysis.total_lines_added assert call_args[0][3] == mock_commit_analysis.total_lines_removed + + @patch("diffmage.generation.service.CommitMessageGenerator") + @patch("diffmage.generation.service.GitDiffParser") + def test_generate_commit_message_with_why_context( + self, mock_git_parser_class, mock_generator_class, mock_commit_analysis + ): + """Test commit message generation with why context enhancement.""" + mock_parser = Mock() + mock_parser.parse_staged_changes.return_value = mock_commit_analysis + 
mock_git_parser_class.return_value = mock_parser + + mock_generator = Mock() + initial_result = GenerationResult(message="feat: add new feature") + enhanced_result = GenerationResult( + message="feat: add user authentication for improved security" + ) + mock_generator.generate_commit_message.return_value = initial_result + mock_generator.enhance_with_why_context.return_value = enhanced_result + mock_generator_class.return_value = mock_generator + + # Create service and request + service = GenerationService(model_name="openai/gpt-4o-mini") + request = GenerationRequest(repo_path=".") + why_context = "Adding authentication to improve security posture" + + # Execute + result = service.generate_commit_message(request, why_context=why_context) + + # Verify + assert isinstance(result, GenerationResult) + assert result.message == "feat: add user authentication for improved security" + + # Verify both methods were called + mock_generator.generate_commit_message.assert_called_once() + mock_generator.enhance_with_why_context.assert_called_once_with( + initial_result, why_context + ) + + @patch("diffmage.generation.service.CommitMessageGenerator") + @patch("diffmage.generation.service.GitDiffParser") + def test_generate_commit_message_with_empty_why_context( + self, mock_git_parser_class, mock_generator_class, mock_commit_analysis + ): + """Test commit message generation with empty why context (should not call enhance).""" + mock_parser = Mock() + mock_parser.parse_staged_changes.return_value = mock_commit_analysis + mock_git_parser_class.return_value = mock_parser + + mock_generator = Mock() + initial_result = GenerationResult(message="feat: add new feature") + mock_generator.generate_commit_message.return_value = initial_result + mock_generator_class.return_value = mock_generator + + # Create service and request + service = GenerationService(model_name="openai/gpt-4o-mini") + request = GenerationRequest(repo_path=".") + + # Execute with empty why_context + result = 
service.generate_commit_message(request, why_context="") + + # Verify + assert isinstance(result, GenerationResult) + assert result.message == "feat: add new feature" + + # Verify enhance was NOT called + mock_generator.generate_commit_message.assert_called_once() + mock_generator.enhance_with_why_context.assert_not_called() From dbdb2fd3a1ba5a0f5b5460b305d5985e358a76b0 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 11:28:51 +0700 Subject: [PATCH 03/13] docs(prompt_manager): clarify prompt instructions for enhancing why context in commit messages - Specify that preliminary message content should not be repeated - Add criteria for including WHY context (relevance, helpfulness, impact level) - Refine instruction to emphasize problem-solution-benefit structure - Maintain focus on concise, conventional commit format output The previous WHY context was too verbose and unfocused, leading to less effective commit message generation. This change streamlines the guidance to focus on impact and conciseness. By clearly stating the problem, solution, and benefit, developers can generate more precise and useful commit messages that improve code maintainability and collaboration. --- src/diffmage/ai/prompt_manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index ede4637..0211676 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -94,9 +94,15 @@ def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: + Do NOT repeat anything that is already in the preliminary commit message. + + Only add WHY context if it is relevant to the changes and is actually helpful for generating a better commit message. + + Only add WHY context if it is a medium to high impact change. + Lead with the problem. 
Start the WHY by clearly stating the problem or issue that the change is solving, state the solution, and end with the benefit. Be concise but thorough. - Based on the provided information, return the exact with the enhanced "why" from the external context. Ensure the WHY follows the Conventional Commits format, ensuring the body clearly explains the problem and the benefit/impact of the change. + Based on the provided information, return the exact with the enhanced "why" from the external context. Ensure the WHY follows the Conventional Commits format, ensuring the body explains the problem and the benefit/impact of the change while being concise and to the point. Do not include any additional explanatory text, commentary, or conversational filler. The output should be ready to be copied and pasted directly into a `git commit` command. From 6ed1df6bcfec0c3a01f347a3c61e62231039b7d1 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 11:44:08 +0700 Subject: [PATCH 04/13] feat(cli): add context option for improved WHY generation - Add / option to command - Pass context to - Improve commit message generation with additional context This enables users to provide custom context for better commit message generation, solving the issue of generic or irrelevant commit messages. By allowing users to pass context, the generated commit messages become more accurate and meaningful, leading to better code documentation and easier maintenance. 
--- src/diffmage/cli/generate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffmage/cli/generate.py b/src/diffmage/cli/generate.py index 486504c..41b8675 100644 --- a/src/diffmage/cli/generate.py +++ b/src/diffmage/cli/generate.py @@ -15,6 +15,9 @@ def generate( False, "--list-models", help="List all available models" ), repo_path: str = typer.Option(".", "--repo", "-r"), + why_context: str = typer.Option( + None, "--context", "-c", help="Context to use to improve the WHY generation" + ), ) -> None: """Generate commit message from staged changes""" @@ -33,7 +36,7 @@ def generate( service = GenerationService(model_name=model_config.name) request = GenerationRequest(repo_path=repo_path) - result = service.generate_commit_message(request) + result = service.generate_commit_message(request, why_context) console.print(f"[green]Commit message:[/green] {result.message}") From 0b0811315ce1206ae15d35763c4b9c1b884e5d57 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 14:54:51 +0700 Subject: [PATCH 05/13] feat(validation): enhance commit message evaluation criteria and examples - Add detailed scoring rubrics for WHAT and WHY components - Include comprehensive examples covering excellent to poor quality messages - Improve prompt logic to better distinguish valuable context from technical noise - Update validation cases with more realistic scenarios and business impact descriptions - Refine evaluation thresholds and criteria descriptions for consistency This change improves the accuracy of commit message quality assessment by providing clearer guidelines and more representative test cases. 
--- benchmarks/validation_suite.py | 353 ++++++++++++++++++++++++------ src/diffmage/ai/prompt_manager.py | 303 +++++++++++++++---------- 2 files changed, 476 insertions(+), 180 deletions(-) diff --git a/benchmarks/validation_suite.py b/benchmarks/validation_suite.py index bd1ebff..fa31ceb 100644 --- a/benchmarks/validation_suite.py +++ b/benchmarks/validation_suite.py @@ -57,89 +57,310 @@ def get_obvious_test_cases(self) -> List[ValidationCase]: return [ # EXCELLENT cases (4.5-5.0) + ValidationCase( + name="excellent_security_with_impact", + commit_message="fix: resolve authentication bypass allowing unauthorized admin access\n\nCritical security vulnerability discovered in production where users could escalate privileges by manipulating JWT tokens. Affecting ~2,300 enterprise customers with potential for full system compromise.\n\n The root cause is a missing signature verification in token validation middleware. Impact: Complete authentication bypass, unauthorized admin panel access\nSolution: Implement proper HMAC signature validation with key rotation\nTesting: Verified with penetration testing and security audit\n\nResolves: SEC-2024-001, addresses compliance requirements for SOC2 certification.", + git_diff="""--- a/src/middleware/auth.py + +++ b/src/middleware/auth.py + @@ -1,4 +1,5 @@ + import jwt + +import hmac + +import hashlib + from flask import request, jsonify + + def verify_token(token): + try: + - payload = jwt.decode(token, verify=False) + + # Verify signature with rotating keys + + payload = jwt.decode(token, get_current_secret_key(), algorithms=['HS256']) + + + + # Additional signature verification + + expected_sig = hmac.new( + + get_signing_key().encode(), + + f"{payload['user_id']}{payload['exp']}".encode(), + + hashlib.sha256 + + ).hexdigest() + + + + if not hmac.compare_digest(payload.get('sig', ''), expected_sig): + + raise jwt.InvalidTokenError("Invalid signature") + + + return payload + except jwt.InvalidTokenError: + return None""", + 
expected_score_range=(4.5, 5.0), + expected_quality="excellent", + description="Security fix with clear business impact, root cause analysis, and comprehensive solution", + ), + ValidationCase( + name="excellent_performance_with_metrics", + commit_message="perf: optimize user search query reducing response time from 8s to 200ms\n\nUser search was causing timeouts and abandonment during peak hours. Analysis showed 47% of users abandoned search after 5+ second wait times, directly impacting conversion rates.\n\nProblem: Sequential database queries without indexing on 2M+ user records\nMetrics: 8-12 second response times, 47% abandonment rate, 200+ timeout errors/day\nSolution: Implement composite indexes + query optimization + result pagination\nResults: 200ms average response time, 12% abandonment rate, zero timeouts\n\nA/B testing shows 23% increase in search completion rates.\nEstimated impact: +$180K quarterly revenue from improved user engagement.", + git_diff="""--- a/src/models/User.js + +++ b/src/models/User.js + @@ -10,15 +10,25 @@ const userSchema = new mongoose.Schema({ + updatedAt: { type: Date, default: Date.now } + }); + + +// Composite indexes for optimized search + +userSchema.index({ + + firstName: 'text', + + lastName: 'text', + + email: 'text' + +}, { + + weights: { firstName: 10, lastName: 5, email: 1 } + +}); + +userSchema.index({ createdAt: -1 }); + +userSchema.index({ email: 1 }, { unique: true }); + + // Optimized search with pagination and relevance scoring + -userSchema.statics.searchUsers = function(searchTerm) { + +userSchema.statics.searchUsers = function(searchTerm, page = 1, limit = 20) { + const regex = new RegExp(searchTerm, 'i'); + - return this.find({ + + + + return this.find({ + $or: [ + - { email: regex }, + - { firstName: regex }, + - { lastName: regex } + + { $text: { $search: searchTerm } }, + + { email: regex } + ] + - }); + + }, { score: { $meta: 'textScore' } }) + + .sort({ score: { $meta: 'textScore' }, createdAt: -1 }) + + 
.limit(limit) + + .skip((page - 1) * limit); + };""", + expected_score_range=(4.5, 5.0), + expected_quality="excellent", + description="Performance optimization with detailed metrics, business impact, and quantified results", + ), + # GOOD cases (3.5-4.4) ValidationCase( name="security_fix", commit_message="fix: resolve critical SQL injection vulnerability in user authentication", git_diff="""--- a/src/auth/UserAuth.py -+++ b/src/auth/UserAuth.py -@@ -23,7 +23,8 @@ class UserAuth: - def authenticate_user(self, username, password): -- query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" -+ query = "SELECT * FROM users WHERE username=? AND password=?" -+ return self.db.execute(query, (username, password)) -- return self.db.execute(query)""", - expected_score_range=(4.0, 5.0), - expected_quality="excellent", + +++ b/src/auth/UserAuth.py + @@ -23,7 +23,8 @@ class UserAuth: + def authenticate_user(self, username, password): + - query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" + + query = "SELECT * FROM users WHERE username=? AND password=?" 
+ + return self.db.execute(query, (username, password)) + - return self.db.execute(query)""", + expected_score_range=(3.5, 4.4), + expected_quality="good", description="Clear security fix with good explanation", ), ValidationCase( name="feature_with_context", commit_message="feat: implement user password reset with email verification\n\nAdds secure password reset flow:\n- Generate time-limited reset tokens\n- Send verification emails\n- Validate tokens before allowing reset\n- Log security events for auditing", git_diff="""--- a/src/auth/PasswordReset.py -+++ b/src/auth/PasswordReset.py -@@ -0,0 +1,45 @@ -+import secrets -+import hashlib -+from datetime import datetime, timedelta -+ -+class PasswordReset: -+ def __init__(self, email_service, user_service): -+ self.email_service = email_service -+ self.user_service = user_service -+ -+ def request_reset(self, email): -+ user = self.user_service.find_by_email(email) -+ if not user: -+ return False # Don't reveal if email exists -+ -+ token = secrets.token_urlsafe(32) -+ expires_at = datetime.now() + timedelta(hours=1) -+ -+ self.user_service.store_reset_token(user.id, token, expires_at) -+ self.email_service.send_reset_email(email, token) -+ -+ return True""", - expected_score_range=(4.0, 5.0), - expected_quality="excellent", + +++ b/src/auth/PasswordReset.py + @@ -0,0 +1,45 @@ + +import secrets + +import hashlib + +from datetime import datetime, timedelta + + + +class PasswordReset: + + def __init__(self, email_service, user_service): + + self.email_service = email_service + + self.user_service = user_service + + + + def request_reset(self, email): + + user = self.user_service.find_by_email(email) + + if not user: + + return False # Don't reveal if email exists + + + + token = secrets.token_urlsafe(32) + + expires_at = datetime.now() + timedelta(hours=1) + + + + self.user_service.store_reset_token(user.id, token, expires_at) + + self.email_service.send_reset_email(email, token) + + + + return True""", + 
expected_score_range=(3.5, 4.4), + expected_quality="good", description="Comprehensive feature with detailed explanation", ), - # GOOD cases (3.5-4.0) ValidationCase( name="simple_bug_fix", commit_message="fix: handle null values in user profile display", git_diff="""--- a/src/components/UserProfile.jsx -+++ b/src/components/UserProfile.jsx -@@ -15,7 +15,7 @@ const UserProfile = ({ user }) => { -
-

{user.name}

--

Email: {user.email}

-+

Email: {user.email || 'Not provided'}

--

Bio: {user.bio}

-+

Bio: {user.bio || 'No bio available'}

-
- );""", - expected_score_range=(3.0, 4.5), + +++ b/src/components/UserProfile.jsx + @@ -15,7 +15,7 @@ const UserProfile = ({ user }) => { +
+

{user.name}

+ -

Email: {user.email}

+ +

Email: {user.email || 'Not provided'}

+ -

Bio: {user.bio}

+ +

Bio: {user.bio || 'No bio available'}

+
+ );""", + expected_score_range=(3.5, 4.4), expected_quality="good", description="Clear bug fix, could use more detail", ), - # AVERAGE cases (2.5-3.5) + # AVERAGE cases (2.5-3.4) + ValidationCase( + name="average_refactor", + commit_message="refactor: move validation logic to separate service", + git_diff="""--- a/src/controllers/UserController.js + +++ b/src/controllers/UserController.js + @@ -5,12 +5,7 @@ class UserController { + async createUser(userData) { + try { + - // Validate email format + - if (!userData.email || !userData.email.includes('@')) { + - throw new Error('Invalid email'); + - } + - // Validate required fields + - if (!userData.firstName || !userData.lastName) { + - throw new Error('Missing required fields'); + - } + + ValidationService.validateUserData(userData); + + const user = await User.create(userData); + return { success: true, user }; + @@ -18,4 +13,17 @@ class UserController { + return { success: false, error: error.message }; + } + } + +} + + + +--- /dev/null + ++++ b/src/services/ValidationService.js + @@ -0,0 +1,15 @@ + +class ValidationService { + + static validateUserData(userData) { + + if (!userData.email || !userData.email.includes('@')) { + + throw new Error('Invalid email'); + + } + + + + if (!userData.firstName || !userData.lastName) { + + throw new Error('Missing required fields'); + + } + + } + }""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Basic refactor, describes what but minimal why context", + ), + ValidationCase( + name="average_bug_fix", + commit_message="fix: prevent crash when user profile is empty", + git_diff="""--- a/src/components/UserProfile.jsx + +++ b/src/components/UserProfile.jsx + @@ -8,7 +8,7 @@ const UserProfile = ({ user }) => { + return ( +
+

{user.name}

+ -

Bio: {user.bio}

+ +

Bio: {user.bio || 'No bio provided'}

+ -

Joined: {new Date(user.joinedAt).toLocaleDateString()}

+ +

Joined: {user.joinedAt ? new Date(user.joinedAt).toLocaleDateString() : 'Unknown'}

+
+ ); + };""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Fixes the issue but doesn't explain impact or root cause", + ), + ValidationCase( + name="average_feature_basic", + commit_message="feat: add dark mode toggle to settings page\n\n- Add toggle switch component\n- Store preference in localStorage\n- Apply theme on page load", + git_diff="""--- a/src/pages/Settings.jsx + +++ b/src/pages/Settings.jsx + @@ -1,4 +1,5 @@ + import React, { useState } from 'react'; + +import ToggleSwitch from '../components/ToggleSwitch'; + + const Settings = () => { + const [notifications, setNotifications] = useState(true); + + const [darkMode, setDarkMode] = useState( + + localStorage.getItem('darkMode') === 'true' + + ); + + + const handleDarkModeToggle = (enabled) => { + + setDarkMode(enabled); + + localStorage.setItem('darkMode', enabled); + + document.body.classList.toggle('dark-mode', enabled); + + }; + + return ( +
+

Settings

+ +
+ + +
+ + +
+ + + + + +
+
+ ); + };""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Describes the feature clearly but lacks user motivation or business context", + ), + ValidationCase( + name="average_docs_update", + commit_message="docs: update installation instructions for new deployment process", + git_diff="""--- a/README.md + +++ b/README.md + @@ -15,8 +15,12 @@ A modern web application for managing user accounts. + + ## Installation + + -1. Clone the repository + -2. Run `npm install` + -3. Start with `npm start` + +1. Clone the repository: `git clone https://github.com/company/app.git` + +2. Install dependencies: `npm install` + +3. Copy environment file: `cp .env.example .env` + +4. Configure your database settings in `.env` + +5. Run database migrations: `npm run migrate` + +6. Start the application: `npm start` + + ## Configuration + @@ -25,4 +29,8 @@ A modern web application for managing user accounts. + + - `DATABASE_URL` - Your database connection string + - `JWT_SECRET` - Secret key for JWT tokens + +- `SMTP_HOST` - Email server host + +- `SMTP_PORT` - Email server port + +- `SMTP_USER` - Email username + +- `SMTP_PASS` - Email password""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Updates documentation but doesn't explain why changes were needed", + ), + # POOR cases (1.5-2.4) ValidationCase( name="generic_update", commit_message="update user component", git_diff="""--- a/src/components/User.jsx -+++ b/src/components/User.jsx -@@ -10,6 +10,7 @@ const User = ({ userData }) => { - return ( -
- {userData.name} -+ {userData.role} -
- );""", - expected_score_range=(2.0, 3.5), - expected_quality="average", + +++ b/src/components/User.jsx + @@ -10,6 +10,7 @@ const User = ({ userData }) => { + return ( +
+ {userData.name} + + {userData.role} +
+ );""", + expected_score_range=(1.5, 2.4), + expected_quality="poor", description="Vague message, minimal change", ), - # POOR cases (1.5-2.5) ValidationCase( name="meaningless_message", commit_message="fix stuff", @@ -152,11 +373,11 @@ def authenticate_user(self, username, password): - return data.map(item => item.value); + return data.map(item => item.value || 0); }""", - expected_score_range=(1.0, 2.5), + expected_score_range=(1.5, 2.4), expected_quality="poor", description="Meaningless commit message", ), - # VERY POOR cases (1.0-2.0) + # VERY POOR cases (1.0-1.4) ValidationCase( name="gibberish", commit_message="asdf jkl; qwerty", @@ -166,7 +387,7 @@ def authenticate_user(self, username, password): // Test file console.log('hello'); +console.log('world');""", - expected_score_range=(1.0, 2.0), + expected_score_range=(1.0, 1.4), expected_quality="very_poor", description="Nonsensical commit message", ), @@ -180,7 +401,7 @@ def get_edge_test_cases(self) -> List[ValidationCase]: name="empty_message", commit_message="", git_diff="--- a/file.txt\n+++ b/file.txt\n@@ -1 +1,2 @@\n hello\n+world", - expected_score_range=(1.0, 2.0), + expected_score_range=(1.0, 1.4), expected_quality="very_poor", description="Empty commit message", ), @@ -191,7 +412,7 @@ def get_edge_test_cases(self) -> List[ValidationCase]: + "long commit message that goes on and on and provides way too much detail about a simple change " * 10, git_diff="--- a/file.txt\n+++ b/file.txt\n@@ -1 +1,2 @@\n hello\n+world", - expected_score_range=(1.0, 3.0), + expected_score_range=(1.0, 1.4), expected_quality="poor", description="Excessively long commit message", ), @@ -204,7 +425,7 @@ def get_edge_test_cases(self) -> List[ValidationCase]: def process_input(text): + text = text.encode('utf-8').decode('utf-8') return text.strip()""", - expected_score_range=(3.0, 4.5), + expected_score_range=(3.5, 4.4), expected_quality="good", description="Special characters and emojis", ), @@ -212,7 +433,7 @@ def 
process_input(text): name="no_diff", commit_message="fix: important bug fix", git_diff="", - expected_score_range=(1.0, 3.0), + expected_score_range=(1.0, 2.4), expected_quality="poor", description="No diff provided", ), @@ -233,7 +454,7 @@ def login(user, password): + +def reset_password(email): + send_reset_email(email)""", - expected_score_range=(1.0, 3.0), + expected_score_range=(1.0, 2.4), expected_quality="poor", description="Merge commit (usually auto-generated)", ), diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index 0211676..14bc090 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -30,29 +30,29 @@ def get_commit_prompt( prompt = f"""Analyze the following and generate a concise commit message {context_info}: - Instructions: - - Use conventional commits format:(): - - Only return the commit message, nothing else. - - Use imperative mood ("add", "fix", "update", not "added", "fixed", "updated") - - Focus on WHAT changed and WHY, not HOW. Consider WHAT was impacted and WHY it was needed. + Instructions: + - Use conventional commits format:(): + - Only return the commit message, nothing else. + - Use imperative mood ("add", "fix", "update", not "added", "fixed", "updated") + - Focus on WHAT changed and WHY, not HOW. Consider WHAT was impacted and WHY it was needed. 
- Common types: - - feat: New feature, enhancement, or functionality - - fix: Bug fix or error correction - - refactor: Code restructuring for readability, performance, or maintainability - - docs: Documentation changes only - - test: Adding or updating tests - - chore: Maintenance, dependencies, build changes + Common types: + - feat: New feature, enhancement, or functionality + - fix: Bug fix or error correction + - refactor: Code restructuring for readability, performance, or maintainability + - docs: Documentation changes only + - test: Adding or updating tests + - chore: Maintenance, dependencies, build changes - - {diff_content} - + + {diff_content} + - Generate the commit message, nothing else. + Generate the commit message, nothing else. - Commit message: - """ + Commit message: + """ return prompt @@ -63,52 +63,61 @@ def get_generation_system_prompt() -> str: """ return """ - Your commit messages are: - - Descriptive but brief - - Written in imperative mood - - Focused on the purpose and impact of changes - - Helpful for other developers and future maintenance - - Following industry best practices for writing commit messages - - Analyze code changes carefully and generate commit messages that accurately describe what was changed and why. - """ + Your commit messages are: + - Descriptive but brief + - Written in imperative mood + - Focused on the purpose and impact of changes + - Helpful for other developers and future maintenance + - Following industry best practices for writing commit messages + + Analyze code changes carefully and generate commit messages that accurately describe what was changed and why. + """ def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: """ Build the prompt for enhancing an existing commit message with external 'why' context. """ - return f""" - You are a professional software engineer and an expert at writing concise, high-quality Git commit messages that follow the Conventional Commits specification. 
- - Your task is to refine and enhance the preliminary commit message by improving and expanding upon the "WHY" part of the commit message by integrating external context. + return f"""You are a Git expert. Your task is to decide whether to enhance a commit message with external context. - Do not include any additional explanatory text, commentary, or conversational filler. The output should be ready to be copied and pasted directly into a `git commit` command. + STRICT EVALUATION CRITERIA: + - Does the context explain a USER PROBLEM or BUSINESS NEED that's not obvious from the code changes? + - Does it add meaningful insight about WHY this change was necessary? + - Does it avoid redundant technical implementation details already clear from the diff? - + {preliminary_message} - + {why_context} - - Do NOT repeat anything that is already in the preliminary commit message. + + 1. First decide: Does this add valuable WHY information that explains user problems, business needs, or meaningful impact? - Only add WHY context if it is relevant to the changes and is actually helpful for generating a better commit message. + 2. If NO (context is technical details, obvious info, or redundant): + - Return the EXACTLY as provided - Only add WHY context if it is a medium to high impact change. + 3. If YES ( explains real user problems, value, impact, reasoning, intent, context, background, implications): + - Add 1-5 concise sentences after the bullet points + - Focus on: What problem did this solve? What was the reasoning and intent? What is the context or background? What are the implications and impact? + - A gold standard "WHY" of a commit message explains the rationale and intent behind the changes made in a git commit. It goes beyond simply stating what was changed and delves into why those changes were necessary. + - Focus on this scenario: If a developer is to come across this in a git blame months down the road, what would they want and need to know? 
+ - Keep it under 150 words total addition + - Lead with the problem. Start the WHY by clearly stating the problem or issue that the change is solving, state the solution, and end with the benefit. Be concise but thorough. + EXAMPLES OF GOOD TO CONSIDER: + - "Users were experiencing 3-second page load delays during peak traffic" + - "Support team received 50+ tickets weekly about login failures" + - "Business requirement to comply with GDPR by Q2" - Based on the provided information, return the exact with the enhanced "why" from the external context. Ensure the WHY follows the Conventional Commits format, ensuring the body explains the problem and the benefit/impact of the change while being concise and to the point. + EXAMPLES OF BAD TO IGNORE: + - "The previous implementation passed the full result object instead of just the message content" + - "This ensures proper test coverage for the new feature" + - "Using result.message to follow the pydantic structure" - Do not include any additional explanatory text, commentary, or conversational filler. The output should be ready to be copied and pasted directly into a `git commit` command. - - - Final commit message: - """ + Output the final commit message only:""" # Evaluation prompts @@ -120,11 +129,11 @@ def get_evaluation_system_prompt() -> str: """ return """ - Your commit messages are: - - Descriptive but brief - - Written in imperative mood - - Focused on the purpose and impact of changes - """ + Your commit messages are: + - Descriptive but brief + - Written in imperative mood + - Focused on the purpose and impact of changes + """ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: @@ -137,91 +146,157 @@ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: - WHAT (1-5): How accurately and completely does the message describe the actual code changes? 
- * 5 = All changes captured accurately and completely - * 3 = Main changes described, some details missing + * 5 = All high impactchanges captured accurately and completely with precise details. + * 4 = Main changes described accurately and clearly, only minor details missing + * 3 = Core changes described, some important details missing + * 2 = Basic changes mentioned, significant gaps in description * 1 = Major changes omitted or misrepresented - - WHY (1-5): How clearly does it explain the purpose/reasoning/impact of the changes? Be more lenient on low impact changes. - - Scale: 1=Very Poor, 2=Poor, 3=Average, 4=Good, 5=Excellent + + - WHY (1-5): How clearly does it explain the PURPOSE/REASONING/IMPACT of the changes? + * 5 = Clear user problem + impact, intent, reasoning, context, background, implications explained. + * 4 = Good rationale with clear benefit/motivation + * 3 = Basic purpose explained, could be clearer, minimal impact context + * 2 = Minimal or technical-only reasoning, no user problem, impact, intent, reasoning, context, background, implications explained + * 1 = No explanation of purpose/impact beyond obvious technical changes Example 1: - COMMIT: "fix bug" + COMMIT: "update config" DIFF: - --- a/src/auth.py - +++ b/src/auth.py - @@ -15,1 +15,2 @@ def authenticate_user(username, password): - user = get_user(username) - - return user.validate_password(password) - + if user: - + return user.validate_password(password) + --- a/config/database.yml + +++ b/config/database.yml + @@ -5,7 +5,7 @@ production: + adapter: postgresql + encoding: unicode + database: myapp_production + - pool: 5 + + pool: 25 + username: <%= ENV['DATABASE_USER'] %> + password: <%= ENV['DATABASE_PASS'] %> ANALYSIS: - - WHAT: 2/5 - Too vague, doesn't specify what bug or where - - WHY: 1/5 - No explanation of impact or reasoning + - WHAT: 2/5 - Basic changes mentioned but significant gaps: doesn't specify what config change or why pool size matters + - WHY: 1/5 - No explanation 
of purpose/impact beyond obvious technical changes Example 2: - COMMIT: "feat(auth): add JWT authentication to secure API endpoints" + COMMIT: "chore: upgrade React from v16 to v18 for better performance + + Updates all React dependencies and components to use new APIs. Improves rendering + performance and enables concurrent features for smoother user interactions." + + DIFF: + --- a/package.json + +++ b/package.json + @@ -10,8 +10,8 @@ + "dependencies": + - "react": "^16.14.0", + - "react-dom": "^16.14.0", + + "react": "^18.2.0", + + "react-dom": "^18.2.0", + "react-router": "^6.0.0" + + --- a/src/App.jsx + +++ b/src/App.jsx + @@ -1,4 +1,4 @@ + -import (render) from 'react-dom'; + +import (createRoot) from 'react-dom/client'; + + -(render)(, document.getElementById('root')); + +(const root = createRoot(document.getElementById('root')); + +root.render(); + + ANALYSIS: + - WHAT: 4/5 - Main changes described accurately: version upgrade and API updates, minor implementation details missing + - WHY: 3/5 - Basic purpose explained (performance, concurrent features) but could be clearer about specific benefits + + Example 3: + COMMIT: "fix: correct email validation regex to prevent invalid addresses + + Email validation was accepting malformed addresses like 'user@' and 'invalid.email', + causing downstream errors in the notification system and user registration failures." 
+ DIFF: - --- a/src/middleware/auth.py - +++ b/src/middleware/auth.py - @@ -0,0 +1,12 @@ - +import jwt - +from flask import request, jsonify + --- a/src/utils/validation.js + +++ b/src/utils/validation.js + @@ -8,7 +8,7 @@ export const validateEmail = (email) => ( + return false; + + - const emailRegex = /S+@S+/; + + const emailRegex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+.[a-zA-Z]{(2,)}$/; + return emailRegex.test(email) + + ANALYSIS: + - WHAT: 4/5 - Main changes described clearly: email validation fix with specific problem examples + - WHY: 4/5 - Good rationale with clear benefit: prevents downstream errors and registration issues + + Example 4: + COMMIT: "feat: implement caching layer reducing API response time by 75% + + Added Redis caching for frequently accessed product data after observing + average API response times of 2.4 seconds affecting user experience. + Cache hits now serve requests in 300ms, improving page load performance + and reducing database load by 60%. + + Implements TTL of 1 hour with automatic cache invalidation on product updates." 
+ + DIFF: + --- a/src/services/productService.js + +++ b/src/services/productService.js + @@ -1,4 +1,5 @@ + const database = require('../database'); + +const redis = require('redis'); + +const client = redis.createClient(); + + class ProductService ( + async getProduct(id) + + // Check cache first + + const cached = await client.get(`product:$(id)`); + + if (cached) ) + + return JSON.parse(cached); + + ) + - +def require_auth(f): - + def decorated(*args, **kwargs): - + token = request.headers.get('Authorization') - + if not token: - + return jsonify({{'error': 'No token provided'}}), 401 - + try: - + jwt.decode(token, app.config['SECRET_KEY']) - + except jwt.InvalidTokenError: - + return jsonify({{'error': 'Invalid token'}}), 401 - + return f(*args, **kwargs) - + return decorated - - + def add_items_to_cart(cart, items): - + # adds items to the cart - + if not items: - + raise ValueError('No items to add') - + cart.add_items(items) + const product = await database.query('SELECT * FROM products WHERE id = ?', [id]); + + + + // Cache for 1 hour + + await client.setex(`product:$(id)`, 3600, JSON.stringify(product)); + return product; + ) + ) ANALYSIS: - - WHAT: 3/5 - The commit message is missing important added functionality (doesn't mention the add_items_to_cart function) - - WHY: 4/5 - Clear security purpose, could detail specific benefits + - WHAT: 5/5 - All high impact changes captured with precise details: Redis integration, caching logic, TTL configuration + - WHY: 5/5 - Clear user problem (2.4s response times) + quantified impact (75% reduction, 300ms response) + comprehensive context (database load reduction) + Example 5: + COMMIT: "test: add unit tests for password hashing utility" - Example 3: - COMMIT: "refactor: extract validation logic into UserService class" DIFF: - --- a/src/controllers/user_controller.py - +++ b/src/controllers/user_controller.py - @@ -8,6 +8,4 @@ class UserController: - def create_user(self, data): - - if not data.get('email'): - - 
raise ValueError('Email required') - - if len(data.get('password', '')) < 8: - - raise ValueError('Password too short') - + UserService.validate_user_data(data) - return User.create(data) - --- a/src/services/user_service.py - +++ b/src/services/user_service.py - @@ -0,0 +1,8 @@ - +class UserService: - + @staticmethod - + def validate_user_data(data): - + if not data.get('email'): - + raise ValueError('Email required') - + if len(data.get('password', '')) < 8: - + raise ValueError('Password too short') + --- /dev/null + +++ b/tests/utils/passwordUtils.test.js + @@ -0,0 +1,25 @@ + +const ( hashPassword, verifyPassword ) = require('../../src/utils/passwordUtils'); + + + +describe('Password Utils', () => ( + + test('should hash password correctly', () => ( + + const password = 'testpassword123'; + + const hash = hashPassword(password); + + expect(hash).toBeDefined(); + + expect(hash).not.toBe(password); + + ) + + + + test('should verify correct password', () => ( + + const password = 'testpassword123'; + + const hash = hashPassword(password); + + expect(verifyPassword(password, hash)).toBe(true); + + ) + +); ANALYSIS: - - WHAT: 5/5 - Exact description of the refactoring action - - WHY: 4/5 - Good architectural reasoning (separation of concerns) + - WHAT: 3/5 - Core changes described: test addition for password utility, some important details missing about test coverage + - WHY: 2/5 - Minimal reasoning: no explanation of why tests were needed or what problems they prevent - NOW EVALUATE THE FOLLOWING COMMIT MESSAGE: {commit_message} From c2422ca37fec857fc1695e0052245ac0e051890f Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 16:13:29 +0700 Subject: [PATCH 06/13] chore(ai): set default temperature to 0.0 for deterministic AI responses --- src/diffmage/ai/client.py | 2 +- src/diffmage/ai/prompt_manager.py | 2 +- .../evaluation/commit_message_evaluator.py | 2 +- .../generation/commit_message_generator.py | 2 +- tests/ai/test_client.py | 18 
+++++++++--------- tests/evaluation/test_llm_evaluator.py | 4 ++-- .../test_commit_message_generator.py | 4 ++-- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/diffmage/ai/client.py b/src/diffmage/ai/client.py index 2923cd7..1e27420 100644 --- a/src/diffmage/ai/client.py +++ b/src/diffmage/ai/client.py @@ -22,7 +22,7 @@ class AIClient: """ def __init__( - self, model_name: str, temperature: float = 0.1, max_tokens: int = 1000 + self, model_name: str, temperature: float = 0.0, max_tokens: int = 1000 ): self.model_config = get_model_by_name(model_name) self.temperature = temperature diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index 14bc090..e8d3e4b 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -224,7 +224,7 @@ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: return false; - const emailRegex = /S+@S+/; - + const emailRegex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+.[a-zA-Z]{(2,)}$/; + + const emailRegex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+.[a-zA-Z](2,)$/; return emailRegex.test(email) ANALYSIS: diff --git a/src/diffmage/evaluation/commit_message_evaluator.py b/src/diffmage/evaluation/commit_message_evaluator.py index acb0fe7..7cd7d5f 100644 --- a/src/diffmage/evaluation/commit_message_evaluator.py +++ b/src/diffmage/evaluation/commit_message_evaluator.py @@ -25,7 +25,7 @@ class CommitMessageEvaluator: """ - def __init__(self, model_name: Optional[str] = None, temperature: float = 0.7): + def __init__(self, model_name: Optional[str] = None, temperature: float = 0.0): """ Initialize the LLM evaluator. 
diff --git a/src/diffmage/generation/commit_message_generator.py b/src/diffmage/generation/commit_message_generator.py index 07545f1..be54af1 100644 --- a/src/diffmage/generation/commit_message_generator.py +++ b/src/diffmage/generation/commit_message_generator.py @@ -14,7 +14,7 @@ class CommitMessageGenerator: This specialist focuses solely on the generation logic and prompt handling """ - def __init__(self, model_name: Optional[str] = None, temperature: float = 0.1): + def __init__(self, model_name: Optional[str] = None, temperature: float = 0.0): """ Initialize the LLM Generator diff --git a/tests/ai/test_client.py b/tests/ai/test_client.py index e5a0b0c..9eb4bd3 100644 --- a/tests/ai/test_client.py +++ b/tests/ai/test_client.py @@ -68,18 +68,18 @@ def test_ai_client_initialization(): client = AIClient(model_name="openai/gpt-4o-mini") assert client.model_config.name == "openai/gpt-4o-mini" - assert client.temperature == 0.1 + assert client.temperature == 0.0 assert client.max_tokens == 1000 def test_ai_client_initialization_with_custom_params(): """Test AIClient initialization with custom parameters.""" client = AIClient( - model_name="anthropic/claude-haiku", temperature=0.5, max_tokens=2000 + model_name="anthropic/claude-haiku", temperature=0.0, max_tokens=2000 ) assert client.model_config.name == "anthropic/claude-haiku" - assert client.temperature == 0.5 + assert client.temperature == 0.0 assert client.max_tokens == 2000 @@ -109,7 +109,7 @@ def test_generate_commit_message_success(mock_completion, mock_ai_response): mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "openai/gpt-4o-mini" - assert call_args[1]["temperature"] == 0.1 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1000 assert call_args[1]["stream"] is False @@ -144,7 +144,7 @@ def test_generate_commit_message_with_custom_params( # Create client with custom parameters client = AIClient( - 
model_name="anthropic/claude-haiku", temperature=0.7, max_tokens=1500 + model_name="anthropic/claude-haiku", temperature=0.0, max_tokens=1500 ) result = client.generate_commit_message(mock_commit_analysis) @@ -155,7 +155,7 @@ def test_generate_commit_message_with_custom_params( mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "anthropic/claude-haiku" - assert call_args[1]["temperature"] == 0.7 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1500 @@ -215,7 +215,7 @@ def test_evaluate_with_llm_success(mock_completion, mock_evaluation_response): mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "openai/gpt-4o-mini" - assert call_args[1]["temperature"] == 0.1 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1000 assert call_args[1]["stream"] is False @@ -269,7 +269,7 @@ def test_evaluate_with_llm_with_custom_params( # Create client with custom parameters client = AIClient( - model_name="anthropic/claude-haiku", temperature=0.7, max_tokens=1500 + model_name="anthropic/claude-haiku", temperature=0.0, max_tokens=1500 ) evaluation_prompt = "test evaluation prompt" result = client.evaluate_with_llm(evaluation_prompt) @@ -281,5 +281,5 @@ def test_evaluate_with_llm_with_custom_params( mock_completion.assert_called_once() call_args = mock_completion.call_args assert call_args[1]["model"] == "anthropic/claude-haiku" - assert call_args[1]["temperature"] == 0.7 + assert call_args[1]["temperature"] == 0.0 assert call_args[1]["max_tokens"] == 1500 diff --git a/tests/evaluation/test_llm_evaluator.py b/tests/evaluation/test_llm_evaluator.py index ae4e327..65a67a5 100644 --- a/tests/evaluation/test_llm_evaluator.py +++ b/tests/evaluation/test_llm_evaluator.py @@ -20,9 +20,9 @@ def test_init_with_custom_model(self): def test_init_with_custom_temperature(self): """Test LLMEvaluator initialization with custom 
temperature.""" - evaluator = CommitMessageEvaluator(temperature=0.5) + evaluator = CommitMessageEvaluator(temperature=0.0) - assert evaluator.ai_client.temperature == 0.5 + assert evaluator.ai_client.temperature == 0.0 def test_evaluate_commit_message_success(self): """Test successful commit message evaluation.""" diff --git a/tests/generation/test_commit_message_generator.py b/tests/generation/test_commit_message_generator.py index 88693d4..d217814 100644 --- a/tests/generation/test_commit_message_generator.py +++ b/tests/generation/test_commit_message_generator.py @@ -36,9 +36,9 @@ def test_init_with_custom_model(self): def test_init_with_custom_temperature(self): """Test CommitMessageGenerator initialization with custom temperature.""" - generator = CommitMessageGenerator(temperature=0.5) + generator = CommitMessageGenerator(temperature=0.0) - assert generator.client.temperature == 0.5 + assert generator.client.temperature == 0.0 def test_generate_commit_message_success(self): """Test successful commit message generation.""" From 99a0bc540b769d73983e3aa524265f747e563c0b Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 17:15:17 +0700 Subject: [PATCH 07/13] refactor(prompt_manager): enhance commit message evaluation criteria and instructions - Update evaluation criteria to reference and placeholders - Clarify definition of WHY in commit messages - Improve instructions for handling external context - Remove outdated examples that were causing confusion - Focus on preserving original commit message when context doesn't add value - Emphasize not making up information not present in provided context - Streamline prompt to reduce redundancy and improve clarity External context: Improved why context decision accuracy from 57.1% to 71.4% in benchmarks. This change correctly skips enhancement for cases like test_coverage_context that shouldn't be processed, marking the first successful implementation of this behavior. 
--- src/diffmage/ai/prompt_manager.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index e8d3e4b..c781474 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -81,9 +81,12 @@ def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: return f"""You are a Git expert. Your task is to decide whether to enhance a commit message with external context. STRICT EVALUATION CRITERIA: - - Does the context explain a USER PROBLEM or BUSINESS NEED that's not obvious from the code changes? + - Does the explain a USER PROBLEM or BUSINESS NEED that's not obvious from the code changes or ? - Does it add meaningful insight about WHY this change was necessary? - - Does it avoid redundant technical implementation details already clear from the diff? + - WHY is defined as the rationale and intent behind the changes made in a git commit. It goes beyond simply stating what was changed and delves into why those changes were necessary. + - Does the avoid redundant technical implementation details already clear from the diff? + + If any of these are true, return the EXACTLY as provided. {preliminary_message} @@ -94,29 +97,19 @@ def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: - 1. First decide: Does this add valuable WHY information that explains user problems, business needs, or meaningful impact? + 1. First decide: Does this add valuable WHY information that explains user problems, business needs, or meaningful impact that is not already clear from the ? 2. If NO (context is technical details, obvious info, or redundant): - Return the EXACTLY as provided 3. If YES ( explains real user problems, value, impact, reasoning, intent, context, background, implications): - - Add 1-5 concise sentences after the bullet points - Focus on: What problem did this solve? What was the reasoning and intent? 
What is the context or background? What are the implications and impact? - A gold standard "WHY" of a commit message explains the rationale and intent behind the changes made in a git commit. It goes beyond simply stating what was changed and delves into why those changes were necessary. - Focus on this scenario: If a developer is to come across this in a git blame months down the road, what would they want and need to know? - Keep it under 150 words total addition + - DO NOT make up information that is not in the or that can not be inferred directly from the + - EXAMPLES OF GOOD TO CONSIDER: - "Users were experiencing 3-second page load delays during peak traffic" - "Support team received 50+ tickets weekly about login failures" - "Business requirement to comply with GDPR by Q2" - - EXAMPLES OF BAD TO IGNORE: - - "The previous implementation passed the full result object instead of just the message content" - - "This ensures proper test coverage for the new feature" - - "Using result.message to follow the pydantic structure" - Output the final commit message only:""" @@ -143,7 +136,6 @@ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: return f"""You are an expert code reviewer evaluating commit message quality using Chain-of-Thought reasoning. - - WHAT (1-5): How accurately and completely does the message describe the actual code changes? * 5 = All high impact changes captured accurately and completely with precise details. From f49c73fc65fa92eb6a6e7efb60636c557766669f Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 17:57:51 +0700 Subject: [PATCH 08/13] fix(prompt_manager): clarify JSON response requirement in evaluation prompt Some models weren't returning JSON responses consistently. Explicitly stating the JSON requirement ensures all models comply, addressing compatibility issues across different model providers. This change prevents parsing errors and ensures reliable evaluation results.
--- src/diffmage/ai/prompt_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index c781474..aa73669 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -315,6 +315,8 @@ class ProductService ( - Keep it concise (3-5 sentences) + ONLY RETURN THE JSON RESPONSE, NOTHING ELSE. + REQUIRED JSON RESPONSE: {{ "what_score": <1-5>, From 594a152125fbb25ceee4de77bf5e7d63429061a1 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 18:14:53 +0700 Subject: [PATCH 09/13] feat(benchmarks): add comprehensive WHY context enhancement benchmark suite Adds a benchmarking suite to evaluate the effectiveness of WHY context enhancement in commit messages, ensuring it improves the WHY score without introducing noise or redundant technical details. Currently achieves a 75% success rate across most models, with areas for improvement identified in simple and average bug fix cases.
--- benchmarks/why_context_benchmarks.py | 344 +++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 benchmarks/why_context_benchmarks.py diff --git a/benchmarks/why_context_benchmarks.py b/benchmarks/why_context_benchmarks.py new file mode 100644 index 0000000..d6f824f --- /dev/null +++ b/benchmarks/why_context_benchmarks.py @@ -0,0 +1,344 @@ +""" +WHY Context Enhancement Benchmarking Suite + +Add this to your benchmarks directory to test WHY context enhancement quality +""" + +from typing import List, Dict, Any, Optional +from dataclasses import dataclass +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich import box + +from diffmage.generation.commit_message_generator import CommitMessageGenerator +from diffmage.generation.models import GenerationResult +from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator + +console = Console() + + +@dataclass +class WhyContextTestCase: + """Test case for WHY context enhancement""" + + name: str + commit_message: str + git_diff: str + why_context: str + should_enhance: bool # Should this context actually improve the message? 
+ expected_improvement: float # Expected WHY score improvement (0 if no enhancement) + context_quality: str # "good", "bad", "technical", "redundant" + + +class WhyContextBenchmarkSuite: + """Benchmark suite specifically for WHY context enhancement""" + + def __init__(self, model_name: Optional[str] = None): + self.generator = CommitMessageGenerator(model_name=model_name) + self.evaluator = CommitMessageEvaluator(model_name=model_name) + + def get_why_context_test_cases(self) -> List[WhyContextTestCase]: + """Get comprehensive test cases for WHY context enhancement""" + + return [ + # GOOD WHY CONTEXT (should enhance) + WhyContextTestCase( + name="user_problem_context", + commit_message="fix: resolve null pointer exception in user validation", + git_diff="""--- a/src/auth/UserValidator.java + +++ b/src/auth/UserValidator.java + @@ -23,7 +23,7 @@ public class UserValidator { + public boolean isValidUser(User user) { + - return user.getEmail().contains("@"); + + return user != null && user.getEmail() != null && user.getEmail().contains("@"); + } + }""", + why_context="Users were experiencing app crashes during login attempts. Support team received 47 crash reports in the last week, all traced to this validation issue. 
This fix prevents the crashes and improves user experience.", + should_enhance=True, + expected_improvement=1.5, + context_quality="good", + ), + WhyContextTestCase( + name="performance_impact_context", + commit_message="perf: optimize database query for user search", + git_diff="""--- a/src/models/User.js + +++ b/src/models/User.js + @@ -15,8 +15,12 @@ const userSchema = new mongoose.Schema({ + +userSchema.index({ email: 1 }); + +userSchema.index({ firstName: 1, lastName: 1 }); + userSchema.statics.searchUsers = function(searchTerm) { + - return this.find({ + + return this.find({ + $or: [ + { email: regex }, + { firstName: regex } + ] + - }); + + }).limit(20).sort({ createdAt: -1 }); + };""", + why_context="User search was taking 8-12 seconds with 100k+ users. Customer support reported users abandoning the search feature. This optimization reduces search time to under 500ms, improving user engagement.", + should_enhance=True, + expected_improvement=2.0, + context_quality="good", + ), + WhyContextTestCase( + name="compliance_requirement_context", + commit_message="feat: add data encryption for user profiles", + git_diff="""--- a/src/models/UserProfile.py + +++ b/src/models/UserProfile.py + @@ -1,4 +1,5 @@ + from django.db import models + +from cryptography.fernet import Fernet + + class UserProfile(models.Model): + user = models.OneToOneField(User, on_delete=models.CASCADE) + - ssn = models.CharField(max_length=11) + + ssn_encrypted = models.BinaryField() + + + def set_ssn(self, ssn): + + f = Fernet(settings.ENCRYPTION_KEY) + + self.ssn_encrypted = f.encrypt(ssn.encode()) + + + + def get_ssn(self): + + f = Fernet(settings.ENCRYPTION_KEY) + + return f.decrypt(self.ssn_encrypted).decode()""", + why_context="Legal compliance requirement for GDPR and HIPAA. Audit found unencrypted PII in database. 
This change is required before Q2 compliance review to avoid potential $2.7M fine.", + should_enhance=True, + expected_improvement=1.8, + context_quality="good", + ), + # BAD WHY CONTEXT (should NOT enhance) + WhyContextTestCase( + name="technical_implementation_context", + commit_message="refactor: extract validation logic into separate class", + git_diff="""--- a/src/controllers/UserController.py + +++ b/src/controllers/UserController.py + @@ -8,6 +8,4 @@ class UserController: + def create_user(self, data): + - if not data.get('email'): + - raise ValueError('Email required') + + UserValidator.validate_email(data) + return User.create(data)""", + why_context="The previous implementation had validation logic directly in the controller. This refactor moves it to a separate UserValidator class following single responsibility principle and makes the code more maintainable.", + should_enhance=False, + expected_improvement=0.0, + context_quality="technical", + ), + WhyContextTestCase( + name="redundant_context", + commit_message="fix: handle null values in email validation", + git_diff="""--- a/src/utils/validator.js + +++ b/src/utils/validator.js + @@ -5,7 +5,7 @@ function validateEmail(email) { + - return email.includes('@'); + + return email && email.includes('@'); + }""", + why_context="This change fixes null pointer exceptions when email is null by adding a null check before calling includes method.", + should_enhance=False, + expected_improvement=0.0, + context_quality="redundant", + ), + WhyContextTestCase( + name="test_coverage_context", + commit_message="test: add unit tests for authentication service", + git_diff="""--- /dev/null + +++ b/tests/auth.test.js + @@ -0,0 +1,25 @@ + +describe('AuthService', () => { + + test('should authenticate valid user', () => { + + // test implementation + + }); + +});""", + why_context="Added comprehensive test coverage for the authentication service to ensure code quality and catch regressions. 
Tests cover both success and failure scenarios.", + should_enhance=False, + expected_improvement=0.0, + context_quality="technical", + ), + # BORDERLINE CASES + WhyContextTestCase( + name="mixed_context", + commit_message="fix: update API response format for user endpoints", + git_diff="""--- a/src/api/users.py + +++ b/src/api/users.py + @@ -15,7 +15,10 @@ def get_user(user_id): + user = User.find(user_id) + - return {'user': user.to_dict()} + + return { + + 'user': user.to_dict(), + + 'timestamp': datetime.now().isoformat() + + }""", + why_context="The mobile app team requested timestamp information for caching purposes. This was causing cache invalidation issues. Also updated the response format to be more consistent with other endpoints.", + should_enhance=True, + expected_improvement=1.0, + context_quality="good", # Mixed but has user problem + ), + ] + + def run_enhancement_quality_test( + self, test_cases: List[WhyContextTestCase] + ) -> Dict[str, Any]: + """Test quality of WHY context enhancement""" + + console.print( + Panel( + "[bold]Testing WHY Context Enhancement Quality[/bold]\n" + "Comparing original vs enhanced commit messages", + title="🔍 Enhancement Quality Test", + border_style="blue", + ) + ) + + results = [] + correct_decisions = 0 + total_improvement = 0 + + for case in test_cases: + console.print(f"[blue]Testing: {case.name}[/blue]") + + # Generate original message evaluation + original_eval = self.evaluator.evaluate_commit_message( + case.commit_message, case.git_diff + ) + + # Generate enhanced message + enhanced_result = self.generator.enhance_with_why_context( + GenerationResult(message=case.commit_message), case.why_context + ) + + # Evaluate enhanced message + enhanced_eval = self.evaluator.evaluate_commit_message( + enhanced_result.message, case.git_diff + ) + + # Calculate improvement + why_improvement = enhanced_eval.why_score - original_eval.why_score + overall_improvement = ( + enhanced_eval.overall_score - 
original_eval.overall_score + ) + + # Check if enhancement decision was correct + was_enhanced = enhanced_result.message != case.commit_message + correct_decision = was_enhanced == case.should_enhance + + if correct_decision: + correct_decisions += 1 + + total_improvement += why_improvement + + result = { + "case_name": case.name, + "context_quality": case.context_quality, + "should_enhance": case.should_enhance, + "was_enhanced": was_enhanced, + "correct_decision": correct_decision, + "original_why_score": original_eval.why_score, + "enhanced_why_score": enhanced_eval.why_score, + "why_improvement": why_improvement, + "overall_improvement": overall_improvement, + "original_message": case.commit_message, + "enhanced_message": enhanced_result.message, + "expected_improvement": case.expected_improvement, + } + + results.append(result) + + # Calculate summary statistics + decision_accuracy = (correct_decisions / len(test_cases)) * 100 + avg_improvement = total_improvement / len(test_cases) + good_context_cases = [r for r in results if r["context_quality"] == "good"] + bad_context_cases = [ + r for r in results if r["context_quality"] in ["technical", "redundant"] + ] + + summary = { + "decision_accuracy": decision_accuracy, + "average_why_improvement": avg_improvement, + "good_context_avg_improvement": sum( + r["why_improvement"] for r in good_context_cases + ) + / len(good_context_cases) + if good_context_cases + else 0, + "bad_context_avg_improvement": sum( + r["why_improvement"] for r in bad_context_cases + ) + / len(bad_context_cases) + if bad_context_cases + else 0, + "enhancement_rate": sum(1 for r in results if r["was_enhanced"]) + / len(results) + * 100, + } + + self._display_enhancement_results(results, summary) + + return { + "summary": summary, + "individual_results": results, + "passed": decision_accuracy >= 70 + and summary["good_context_avg_improvement"] > 0.5, + } + + def _display_enhancement_results( + self, results: List[Dict], summary: Dict[str, Any] 
+ ): + """Display enhancement quality results""" + + table = Table(title="WHY Context Enhancement Results", box=box.SIMPLE) + table.add_column("Case", style="cyan") + table.add_column("Quality", justify="center") + table.add_column("Should Enhance", justify="center") + table.add_column("Was Enhanced", justify="center") + table.add_column("Decision", justify="center") + table.add_column("WHY Δ", justify="center") + + for result in results: + decision_color = "green" if result["correct_decision"] else "red" + decision_icon = "✅" if result["correct_decision"] else "❌" + + table.add_row( + result["case_name"], + result["context_quality"], + "Yes" if result["should_enhance"] else "No", + "Yes" if result["was_enhanced"] else "No", + f"[{decision_color}]{decision_icon}[/{decision_color}]", + f"{result['why_improvement']:+.1f}", + ) + + console.print(table) + + # Summary stats + console.print( + f"\n[blue]Decision Accuracy: {summary['decision_accuracy']:.1f}%[/blue]" + ) + console.print( + f"[blue]Average WHY Improvement: {summary['average_why_improvement']:+.2f}[/blue]" + ) + console.print( + f"[green]Good Context Improvement: {summary['good_context_avg_improvement']:+.2f}[/green]" + ) + console.print( + f"[red]Bad Context Improvement: {summary['bad_context_avg_improvement']:+.2f}[/red]" + ) + + +# Usage example: +def main(): + suite = WhyContextBenchmarkSuite() + + # Test enhancement quality + enhancement_results = suite.run_enhancement_quality_test( + suite.get_why_context_test_cases() + ) + + overall_passed = enhancement_results["passed"] + + if overall_passed: + console.print("\n[green]🎉 WHY Context Enhancement is working well![/green]") + else: + console.print("\n[red]❌ WHY Context Enhancement needs improvement[/red]") + + +if __name__ == "__main__": + main() From 1d689c223be20d95d77feb2d9d650be62a4226fa Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 18:20:46 +0700 Subject: [PATCH 10/13] docs(ai): clarify prompt manager instructions for 
why_context generation output Fixes issue where entire reasoning chain was being sent to commit message generation, causing verbose and confusing output. This change ensures only the essential prompt instructions are used, improving the quality and relevance of generated commit messages. --- src/diffmage/ai/prompt_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index aa73669..1b9ce6f 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -110,7 +110,9 @@ def get_why_context_prompt(preliminary_message: str, why_context: str) -> str: - DO NOT make up information that is not in the or that can not be inferred directly from the + - Output the final commit message only:""" + ONLY RETURN THE FINAL COMMIT MESSAGE that will be submitted to the git commit. DO NOT INCLUDE YOUR REASONING OR ANYTHING ELSE. + + final commit message:""" # Evaluation prompts From b53f169d5fb49bd1ca95c079a38b8b86e51c8100 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Wed, 6 Aug 2025 18:33:48 +0700 Subject: [PATCH 11/13] feat(ai): enhance commit message evaluation criteria for accuracy and truthfulness - Add requirement for messages to focus on accuracy, validity, and truthfulness - Update evaluation criteria to score 1 for WHAT/WHY when messages are untruthful or inaccurate - Clarify that score 1 applies when changes are misrepresented, omitted, or described inaccurately - Improve prompt guidance to ensure high-quality, honest commit message assessment --- src/diffmage/ai/prompt_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index 1b9ce6f..3aac5e5 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -128,6 +128,7 @@ def get_evaluation_system_prompt() -> str: - Descriptive but brief - Written in imperative mood - Focused 
on the purpose and impact of changes + - Focus on the accuracy, validity, and truthfulness of what is being described. """ @@ -138,13 +139,15 @@ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: return f"""You are an expert code reviewer evaluating commit message quality using Chain-of-Though reasoning. + If the commit message is untruthful, inaccurate, or misrepresents the code changes, return a score of 1 for both WHAT and WHY. + - WHAT (1-5): How accurately and completely does the message describe the actual code changes? * 5 = All high impactchanges captured accurately and completely with precise details. * 4 = Main changes described accurately and clearly, only minor details missing * 3 = Core changes described, some important details missing * 2 = Basic changes mentioned, significant gaps in description - * 1 = Major changes omitted or misrepresented + * 1 = Major changes omitted, misrepresented, not described, or inaccurate. - WHY (1-5): How clearly does it explain the PURPOSE/REASONING/IMPACT of the changes? * 5 = Clear user problem + impact, intent, reasoning, context, background, implications explained. 
From be89315dc41ae18efa5b6d896a65c1fc5a75f05c Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Thu, 7 Aug 2025 12:22:15 +0700 Subject: [PATCH 12/13] feat(validation): enhance commit message evaluation criteria and benchmarks - Update validation suite with new test cases for documentation, feature implementations, and security fixes - Refactor validation cases to better categorize quality levels (good, average, poor, very poor) - Improve evaluation criteria for WHAT/WHY scoring with clearer guidelines - Add WHY context guidance for lenient scoring of low-impact changes - Update benchmark suite to use enum-based context quality classification - Enhance result reporting with WHAT/WHY score breakdown - Fix service logic to properly return enhanced results when why_context is provided --- benchmarks/validation_suite.py | 308 +++++++++++++++++---------- benchmarks/why_context_benchmarks.py | 36 ++-- src/diffmage/ai/prompt_manager.py | 22 +- src/diffmage/generation/service.py | 4 +- uv.lock | 2 +- 5 files changed, 233 insertions(+), 139 deletions(-) diff --git a/benchmarks/validation_suite.py b/benchmarks/validation_suite.py index fa31ceb..b5b83a7 100644 --- a/benchmarks/validation_suite.py +++ b/benchmarks/validation_suite.py @@ -136,24 +136,9 @@ def verify_token(token): description="Performance optimization with detailed metrics, business impact, and quantified results", ), # GOOD cases (3.5-4.4) - ValidationCase( - name="security_fix", - commit_message="fix: resolve critical SQL injection vulnerability in user authentication", - git_diff="""--- a/src/auth/UserAuth.py - +++ b/src/auth/UserAuth.py - @@ -23,7 +23,8 @@ class UserAuth: - def authenticate_user(self, username, password): - - query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" - + query = "SELECT * FROM users WHERE username=? AND password=?" 
- + return self.db.execute(query, (username, password)) - - return self.db.execute(query)""", - expected_score_range=(3.5, 4.4), - expected_quality="good", - description="Clear security fix with good explanation", - ), ValidationCase( name="feature_with_context", - commit_message="feat: implement user password reset with email verification\n\nAdds secure password reset flow:\n- Generate time-limited reset tokens\n- Send verification emails\n- Validate tokens before allowing reset\n- Log security events for auditing", + commit_message="feat: implement user password reset with email verification\n\nAdds secure password reset flow:\n- Generate time-limited reset tokens for security\n- Send verification emails\n- Validate tokens before allowing reset\n- Log security events for auditing", git_diff="""--- a/src/auth/PasswordReset.py +++ b/src/auth/PasswordReset.py @@ -0,0 +1,45 @@ @@ -182,6 +167,177 @@ def authenticate_user(self, username, password): expected_quality="good", description="Comprehensive feature with detailed explanation", ), + ValidationCase( + name="good_docs_update", + commit_message="docs: update installation instructions for new deployment process", + git_diff="""--- a/README.md + +++ b/README.md + @@ -15,8 +15,12 @@ A modern web application for managing user accounts. + + ## Installation + + -1. Clone the repository + -2. Run `npm install` + -3. Start with `npm start` + +1. Clone the repository: `git clone https://github.com/company/app.git` + +2. Install dependencies: `npm install` + +3. Copy environment file: `cp .env.example .env` + +4. Configure your database settings in `.env` + +5. Run database migrations: `npm run migrate` + +6. Start the application: `npm start` + + ## Configuration + @@ -25,4 +29,8 @@ A modern web application for managing user accounts. 
+ + - `DATABASE_URL` - Your database connection string + - `JWT_SECRET` - Secret key for JWT tokens + +- `SMTP_HOST` - Email server host + +- `SMTP_PORT` - Email server port + +- `SMTP_USER` - Email username + +- `SMTP_PASS` - Email password""", + expected_score_range=(3.5, 4.4), + expected_quality="good", + description="Updates documentation. Doesn't explain why changes were needed, but this is an example of a low impact change that we don't penalize for. Excellent would be a more detailed explanation of the why.", + ), + ValidationCase( + name="good_feature_with_context", + commit_message="feat: add email verification for new user registrations\n\nNew users must verify their email address before accessing the platform. Sends verification email with time-limited token (24hr expiry) and blocks login until confirmed. Reduces fake accounts and improves email deliverability for notifications.", + git_diff="""--- a/src/controllers/AuthController.js + +++ b/src/controllers/AuthController.js + @@ -1,4 +1,5 @@ + const User = require('../models/User'); + +const EmailService = require('../services/EmailService'); + const bcrypt = require('bcrypt'); + +const crypto = require('crypto'); + + class AuthController { + async register(req, res) { + @@ -10,8 +11,18 @@ class AuthController { + + const user = await User.create({ + email: userData.email, + password: hashedPassword, + - isActive: true + + isActive: false, + + emailVerified: false, + + verificationToken: crypto.randomBytes(32).toString('hex'), + + verificationExpires: new Date(Date.now() + 24 * 60 * 60 * 1000) + }); + + + + // Send verification email + + await EmailService.sendVerificationEmail( + + user.email, + + user.verificationToken + + ); + + - res.json({ success: true, userId: user.id }); + + res.json({ + + success: true, + + message: 'Please check your email to verify your account' + + }); + } + + + + async verifyEmail(req, res) { + + const { token } = req.params; + + + + const user = await User.findOne({ + + 
verificationToken: token, + + verificationExpires: { $gt: new Date() } + + }); + + + + if (!user) { + + return res.status(400).json({ error: 'Invalid or expired token' }); + + } + + + + user.emailVerified = true; + + user.isActive = true; + + user.verificationToken = null; + + user.verificationExpires = null; + + await user.save(); + + + + res.json({ success: true, message: 'Email verified successfully' }); + + } + + async login(req, res) { + const user = await User.findOne({ email: req.body.email }); + - + - if (!user || !bcrypt.compareSync(req.body.password, user.password)) { + + + + if (!user || !user.emailVerified || !bcrypt.compareSync(req.body.password, user.password)) { + return res.status(401).json({ error: 'Invalid credentials' }); + } + + + + if (!user.emailVerified) { + + return res.status(401).json({ error: 'Please verify your email first' }); + + } + """, + expected_score_range=(3.5, 4.4), + expected_quality="good", + description="Feature with clear user benefit and implementation details", + ), + ValidationCase( + name="good_feature_basic", + commit_message="feat: add dark mode toggle to settings page\n\n- Add toggle switch component\n- Store preference in localStorage\n- Apply theme on page load", + git_diff="""--- a/src/pages/Settings.jsx + +++ b/src/pages/Settings.jsx + @@ -1,4 +1,5 @@ + import React, { useState } from 'react'; + +import ToggleSwitch from '../components/ToggleSwitch'; + + const Settings = () => { + const [notifications, setNotifications] = useState(true); + + const [darkMode, setDarkMode] = useState( + + localStorage.getItem('darkMode') === 'true' + + ); + + + const handleDarkModeToggle = (enabled) => { + + setDarkMode(enabled); + + localStorage.setItem('darkMode', enabled); + + document.body.classList.toggle('dark-mode', enabled); + + }; + + return ( +
+

Settings

+ +
+ + +
+ + +
+ + + + + +
+
+ ); + };""", + expected_score_range=(3.5, 4.4), + expected_quality="good", + description="Describes the changes but lacks some what context and lacks user motivation or business context", + ), + # AVERAGE cases (2.5-3.4) + ValidationCase( + name="security_fix", + commit_message="fix: resolve critical SQL injection vulnerability in user authentication", + git_diff="""--- a/src/auth/UserAuth.py + +++ b/src/auth/UserAuth.py + @@ -23,7 +23,8 @@ class UserAuth: + def authenticate_user(self, username, password): + - query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" + + query = "SELECT * FROM users WHERE username=? AND password=?" + + return self.db.execute(query, (username, password)) + - return self.db.execute(query)""", + expected_score_range=(2.5, 3.4), + expected_quality="average", + description="Clear security fix with no technical explanation", + ), ValidationCase( name="simple_bug_fix", commit_message="fix: handle null values in user profile display", @@ -196,11 +352,10 @@ def authenticate_user(self, username, password): +

Bio: {user.bio || 'No bio available'}

);""", - expected_score_range=(3.5, 4.4), - expected_quality="good", + expected_score_range=(2.5, 3.4), + expected_quality="average", description="Clear bug fix, could use more detail", ), - # AVERAGE cases (2.5-3.4) ValidationCase( name="average_refactor", commit_message="refactor: move validation logic to separate service", @@ -265,85 +420,6 @@ def authenticate_user(self, username, password): expected_quality="average", description="Fixes the issue but doesn't explain impact or root cause", ), - ValidationCase( - name="average_feature_basic", - commit_message="feat: add dark mode toggle to settings page\n\n- Add toggle switch component\n- Store preference in localStorage\n- Apply theme on page load", - git_diff="""--- a/src/pages/Settings.jsx - +++ b/src/pages/Settings.jsx - @@ -1,4 +1,5 @@ - import React, { useState } from 'react'; - +import ToggleSwitch from '../components/ToggleSwitch'; - - const Settings = () => { - const [notifications, setNotifications] = useState(true); - + const [darkMode, setDarkMode] = useState( - + localStorage.getItem('darkMode') === 'true' - + ); - - + const handleDarkModeToggle = (enabled) => { - + setDarkMode(enabled); - + localStorage.setItem('darkMode', enabled); - + document.body.classList.toggle('dark-mode', enabled); - + }; - - return ( -
-

Settings

- -
- - -
- - +
- + - + - +
-
- ); - };""", - expected_score_range=(2.5, 3.4), - expected_quality="average", - description="Describes the feature clearly but lacks user motivation or business context", - ), - ValidationCase( - name="average_docs_update", - commit_message="docs: update installation instructions for new deployment process", - git_diff="""--- a/README.md - +++ b/README.md - @@ -15,8 +15,12 @@ A modern web application for managing user accounts. - - ## Installation - - -1. Clone the repository - -2. Run `npm install` - -3. Start with `npm start` - +1. Clone the repository: `git clone https://github.com/company/app.git` - +2. Install dependencies: `npm install` - +3. Copy environment file: `cp .env.example .env` - +4. Configure your database settings in `.env` - +5. Run database migrations: `npm run migrate` - +6. Start the application: `npm start` - - ## Configuration - @@ -25,4 +29,8 @@ A modern web application for managing user accounts. - - - `DATABASE_URL` - Your database connection string - - `JWT_SECRET` - Secret key for JWT tokens - +- `SMTP_HOST` - Email server host - +- `SMTP_PORT` - Email server port - +- `SMTP_USER` - Email username - +- `SMTP_PASS` - Email password""", - expected_score_range=(2.5, 3.4), - expected_quality="average", - description="Updates documentation but doesn't explain why changes were needed", - ), # POOR cases (1.5-2.4) ValidationCase( name="generic_update", @@ -361,32 +437,32 @@ def authenticate_user(self, username, password): expected_quality="poor", description="Vague message, minimal change", ), + # VERY POOR cases (1.0-1.4) ValidationCase( name="meaningless_message", commit_message="fix stuff", git_diff="""--- a/src/utils/helper.js -+++ b/src/utils/helper.js -@@ -5,7 +5,7 @@ function processData(data) { - if (!data) { - return null; - } -- return data.map(item => item.value); -+ return data.map(item => item.value || 0); - }""", - expected_score_range=(1.5, 2.4), - expected_quality="poor", - description="Meaningless commit message", + +++ 
b/src/utils/helper.js + @@ -5,7 +5,7 @@ function processData(data) { + if (!data) { + return null; + } + - return data.map(item => item.value); + + return data.map(item => item.value || 0); + }""", + expected_score_range=(1.0, 1.4), + expected_quality="very_poor", + description="Meaningless commit message without any context", ), - # VERY POOR cases (1.0-1.4) ValidationCase( name="gibberish", commit_message="asdf jkl; qwerty", git_diff="""--- a/src/test.js -+++ b/src/test.js -@@ -1,3 +1,4 @@ - // Test file - console.log('hello'); -+console.log('world');""", + +++ b/src/test.js + @@ -1,3 +1,4 @@ + // Test file + console.log('hello'); + +console.log('world');""", expected_score_range=(1.0, 1.4), expected_quality="very_poor", description="Nonsensical commit message", @@ -864,6 +940,7 @@ def _display_obvious_cases_results( table.add_column("Case", style="cyan") table.add_column("Expected", justify="center") table.add_column("Actual", justify="center") + table.add_column("What/Why", justify="center") table.add_column("Status", justify="center") table.add_column("Deviation", justify="center") @@ -880,6 +957,7 @@ def _display_obvious_cases_results( r["case"].name, expected_range, actual_score, + f"{r['result'].what_score:.1f}/{r['result'].why_score:.1f}", f"[{status_color}]{status}[/{status_color}]", deviation, ) diff --git a/benchmarks/why_context_benchmarks.py b/benchmarks/why_context_benchmarks.py index d6f824f..e0b8ed4 100644 --- a/benchmarks/why_context_benchmarks.py +++ b/benchmarks/why_context_benchmarks.py @@ -10,7 +10,7 @@ from rich.table import Table from rich.panel import Panel from rich import box - +from enum import Enum from diffmage.generation.commit_message_generator import CommitMessageGenerator from diffmage.generation.models import GenerationResult from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator @@ -18,6 +18,13 @@ console = Console() +class ContextQuality(str, Enum): + GOOD = "good" + BAD = "bad" + TECHNICAL = "technical" 
+ REDUNDANT = "redundant" + + @dataclass class WhyContextTestCase: """Test case for WHY context enhancement""" @@ -28,7 +35,7 @@ class WhyContextTestCase: why_context: str should_enhance: bool # Should this context actually improve the message? expected_improvement: float # Expected WHY score improvement (0 if no enhancement) - context_quality: str # "good", "bad", "technical", "redundant" + context_quality: ContextQuality class WhyContextBenchmarkSuite: @@ -57,7 +64,7 @@ def get_why_context_test_cases(self) -> List[WhyContextTestCase]: why_context="Users were experiencing app crashes during login attempts. Support team received 47 crash reports in the last week, all traced to this validation issue. This fix prevents the crashes and improves user experience.", should_enhance=True, expected_improvement=1.5, - context_quality="good", + context_quality=ContextQuality.GOOD, ), WhyContextTestCase( name="performance_impact_context", @@ -80,7 +87,7 @@ def get_why_context_test_cases(self) -> List[WhyContextTestCase]: why_context="User search was taking 8-12 seconds with 100k+ users. Customer support reported users abandoning the search feature. This optimization reduces search time to under 500ms, improving user engagement.", should_enhance=True, expected_improvement=2.0, - context_quality="good", + context_quality=ContextQuality.GOOD, ), WhyContextTestCase( name="compliance_requirement_context", @@ -106,7 +113,7 @@ class UserProfile(models.Model): why_context="Legal compliance requirement for GDPR and HIPAA. Audit found unencrypted PII in database. This change is required before Q2 compliance review to avoid potential $2.7M fine.", should_enhance=True, expected_improvement=1.8, - context_quality="good", + context_quality=ContextQuality.GOOD, ), # BAD WHY CONTEXT (should NOT enhance) WhyContextTestCase( @@ -123,7 +130,7 @@ def create_user(self, data): why_context="The previous implementation had validation logic directly in the controller. 
This refactor moves it to a separate UserValidator class following single responsibility principle and makes the code more maintainable.", should_enhance=False, expected_improvement=0.0, - context_quality="technical", + context_quality=ContextQuality.TECHNICAL, ), WhyContextTestCase( name="redundant_context", @@ -137,7 +144,7 @@ def create_user(self, data): why_context="This change fixes null pointer exceptions when email is null by adding a null check before calling includes method.", should_enhance=False, expected_improvement=0.0, - context_quality="redundant", + context_quality=ContextQuality.REDUNDANT, ), WhyContextTestCase( name="test_coverage_context", @@ -153,7 +160,7 @@ def create_user(self, data): why_context="Added comprehensive test coverage for the authentication service to ensure code quality and catch regressions. Tests cover both success and failure scenarios.", should_enhance=False, expected_improvement=0.0, - context_quality="technical", + context_quality=ContextQuality.TECHNICAL, ), # BORDERLINE CASES WhyContextTestCase( @@ -171,7 +178,7 @@ def create_user(self, data): why_context="The mobile app team requested timestamp information for caching purposes. This was causing cache invalidation issues. 
Also updated the response format to be more consistent with other endpoints.", should_enhance=True, expected_improvement=1.0, - context_quality="good", # Mixed but has user problem + context_quality=ContextQuality.GOOD, # Mixed but has user problem ), ] @@ -246,9 +253,14 @@ def run_enhancement_quality_test( # Calculate summary statistics decision_accuracy = (correct_decisions / len(test_cases)) * 100 avg_improvement = total_improvement / len(test_cases) - good_context_cases = [r for r in results if r["context_quality"] == "good"] + good_context_cases = [ + r for r in results if r["context_quality"] == ContextQuality.GOOD + ] bad_context_cases = [ - r for r in results if r["context_quality"] in ["technical", "redundant"] + r + for r in results + if r["context_quality"] + in [ContextQuality.TECHNICAL, ContextQuality.REDUNDANT] ] summary = { @@ -299,7 +311,7 @@ def _display_enhancement_results( table.add_row( result["case_name"], - result["context_quality"], + result["context_quality"].value, "Yes" if result["should_enhance"] else "No", "Yes" if result["was_enhanced"] else "No", f"[{decision_color}]{decision_icon}[/{decision_color}]", diff --git a/src/diffmage/ai/prompt_manager.py b/src/diffmage/ai/prompt_manager.py index 3aac5e5..c7c7efb 100644 --- a/src/diffmage/ai/prompt_manager.py +++ b/src/diffmage/ai/prompt_manager.py @@ -143,20 +143,26 @@ def get_evaluation_prompt(commit_message: str, git_diff: str) -> str: - WHAT (1-5): How accurately and completely does the message describe the actual code changes? - * 5 = All high impactchanges captured accurately and completely with precise details. + * 5 = All high impact changes captured accurately and completely with precise details. 
* 4 = Main changes described accurately and clearly, only minor details missing - * 3 = Core changes described, some important details missing + * 3 = Core changes mentioned, some important details missing * 2 = Basic changes mentioned, significant gaps in description - * 1 = Major changes omitted, misrepresented, not described, or inaccurate. + * 1 = Major changes omitted, misrepresented, not described, or inaccurate. The message is completely non-descriptive and could apply to any commit. It provides zero insight into the change, location or substance. All changes are effectively omitted. - WHY (1-5): How clearly does it explain the PURPOSE/REASONING/IMPACT of the changes? - * 5 = Clear user problem + impact, intent, reasoning, context, background, implications explained. - * 4 = Good rationale with clear benefit/motivation - * 3 = Basic purpose explained, could be clearer, minimal impact context - * 2 = Minimal or technical-only reasoning, no user problem, impact, intent, reasoning, context, background, implications explained - * 1 = No explanation of purpose/impact beyond obvious technical changes + * 5 = Extremely clear user problem + impact, intent, reasoning, context, background, implications explained. + * 4 = Strong rationale with clear purpose/reasoning/impact/background/motivation. + * 3 = Basic purpose/reasoning/impact/background/motivation explained, could be more detailed. + * 2 = Minimal or technical-only reasoning, no user problem, impact, intent, reasoning, context, background, or implications explained + * 1 = No explanation of purpose/impact. The message is completely non-descriptive and could apply to any commit. It provides zero insight into the change, location or substance. All changes are effectively omitted. + + - If the change is low impact (docs, refactoring, type safety, performance optimizations, tests, dependencies, configs), be lenient with the WHY score. Otherwise, adhere to the provided . 
+ - If the WHY can be inferred from the , brief benefit mentions are sufficient and leniency is allowed for the WHY score. + - DO NOT penalize the WHY score for not explaining obvious, low impact technical benefits in depth. + + Example 1: COMMIT: "update config" diff --git a/src/diffmage/generation/service.py b/src/diffmage/generation/service.py index 879b721..45fef47 100644 --- a/src/diffmage/generation/service.py +++ b/src/diffmage/generation/service.py @@ -39,8 +39,6 @@ def generate_commit_message( # handling why context in the intial generate_commit_message was adding too # much noise to the commit message generation. if why_context: - result: GenerationResult = self.generator.enhance_with_why_context( - result, why_context - ) + return self.generator.enhance_with_why_context(result, why_context) return result diff --git a/uv.lock b/uv.lock index 829e8b0..da15447 100644 --- a/uv.lock +++ b/uv.lock @@ -314,7 +314,7 @@ wheels = [ [[package]] name = "diffmage" -version = "0.5.0" +version = "0.6.2" source = { editable = "." } dependencies = [ { name = "gitpython" }, From fb9a66c5799a14fb3c258f0ebcd2a82042a277e9 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Thu, 7 Aug 2025 13:07:42 +0700 Subject: [PATCH 13/13] docs(benchmarks): remove redundant comments and streamline descriptions No functional changes; improves readability and conciceness of benchmark documentation --- benchmarks/stability_benchmarks.py | 5 +---- benchmarks/validation_suite.py | 3 --- benchmarks/why_context_benchmarks.py | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/benchmarks/stability_benchmarks.py b/benchmarks/stability_benchmarks.py index 0c4082f..243fff1 100644 --- a/benchmarks/stability_benchmarks.py +++ b/benchmarks/stability_benchmarks.py @@ -1,9 +1,6 @@ """ Stability Benchmarking Script for DiffMage -This script uses your existing EvaluationBenchmarks class to test evaluation -stability across various real commit examples and scenarios. 
- Usage: python benchmark_stability.py --repo-path /path/to/repo --runs 10 python benchmark_stability.py --commit-range "HEAD~20..HEAD" --runs 5 @@ -618,7 +615,7 @@ def main(): description="Stability Benchmarking Script for DiffMage" ) parser.add_argument("--repo-path", default=".", help="Path to git repository") - parser.add_argument("--model", help="AI model to test (default: uses your default)") + parser.add_argument("--model", help="AI model to test (default: uses default)") parser.add_argument("--runs", type=int, default=5, help="Number of runs per test") parser.add_argument( "--variance-threshold", diff --git a/benchmarks/validation_suite.py b/benchmarks/validation_suite.py index b5b83a7..88c079f 100644 --- a/benchmarks/validation_suite.py +++ b/benchmarks/validation_suite.py @@ -1,9 +1,6 @@ """ Evaluation *Smoke Test* Validation Suite for DiffMage -This script performs fundamental sanity checks on your LLM evaluator to ensure -it's working correctly before running extensive benchmarks. - Usage: python validation_suite.py --all python validation_suite.py --test obvious-cases diff --git a/benchmarks/why_context_benchmarks.py b/benchmarks/why_context_benchmarks.py index e0b8ed4..a779645 100644 --- a/benchmarks/why_context_benchmarks.py +++ b/benchmarks/why_context_benchmarks.py @@ -1,7 +1,5 @@ """ WHY Context Enhancement Benchmarking Suite - -Add this to your benchmarks directory to test WHY context enhancement quality """ from typing import List, Dict, Any, Optional