#!/usr/bin/env python3
"""
Manual validation script for Task 7 quality metrics.

This script helps validate that confidence scores and quality flags
are accurate by presenting requirements for human review.
"""

import json
from pathlib import Path
import random
import sys

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))


def load_benchmark_results(results_file: Path) -> dict:
    """Load benchmark results from JSON file."""
    with open(results_file) as f:
        return json.load(f)


def select_sample_requirements(all_results: list[dict], sample_size: int = 20) -> list[dict]:
    """
    Select a stratified sample of requirements for validation.

    Intended to ensure the sample includes:
    - Mix of confidence levels
    - Mix of document types
    - Requirements with and without quality flags
    """

    # Collect all requirements with metadata
    all_reqs = []
    for doc_result in all_results:
        doc_file = doc_result.get('file', 'unknown')
        # The benchmark summary only records per-document counts, not the
        # extracted requirements themselves, so placeholders are built below.
        req_count = doc_result.get('requirements_total', 0)

        # Create placeholder requirements for demonstration
        for i in range(req_count):
            all_reqs.append({
                'document': doc_file,
                'index': i,
                'confidence': 0.965,  # From benchmark results
                'quality_flags': [],
                'req_id': f'REQ-{i+1:03d}',
                'category': 'functional',
                'body': f'Sample requirement {i+1} from {doc_file}'
            })

    # The placeholder data is uniform, so there is nothing to stratify on yet;
    # see the _stratified_sample sketch below for how real data could be handled.
    if len(all_reqs) <= sample_size:
        return all_reqs

    # Simple random sample
    return random.sample(all_reqs, sample_size)
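

# One possible shape for the stratification described in the docstring above,
# usable once real per-requirement data (varying confidence values, quality
# flags, document types) is available. This is a sketch only: the helper name
# and the confidence-band thresholds are assumptions, not part of the current
# framework, and nothing calls it yet.
def _stratified_sample(reqs: list[dict], sample_size: int) -> list[dict]:
    """Sample roughly evenly across confidence bands and flagged/unflagged requirements."""
    buckets: dict[tuple[str, bool], list[dict]] = {}
    for req in reqs:
        confidence = req.get('confidence', 0)
        band = 'high' if confidence >= 0.9 else 'medium' if confidence >= 0.7 else 'low'
        # Group by (confidence band, has quality flags) so each stratum is represented
        buckets.setdefault((band, bool(req.get('quality_flags'))), []).append(req)

    per_bucket = max(1, sample_size // max(1, len(buckets)))
    sampled: list[dict] = []
    for bucket in buckets.values():
        sampled.extend(random.sample(bucket, min(per_bucket, len(bucket))))
    return sampled[:sample_size]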


def validate_requirement(req: dict, index: int, total: int) -> dict:
    """
    Present a requirement for manual validation.

    Returns validation results.
    """
    print("\n" + "=" * 70)
    print(f"Requirement {index}/{total}")
    print("=" * 70)

    print(f"\n📄 Document: {req.get('document', 'unknown')}")
    print(f"🆔 ID: {req.get('req_id', 'N/A')}")
    print(f"📂 Category: {req.get('category', 'N/A')}")
    print("\n📝 Body:")
    print(f"   {req.get('body', 'N/A')}")

    print("\n🎯 Task 7 Metrics:")
    print(f"   • Confidence: {req.get('confidence', 0):.3f}")
    quality_flags = req.get('quality_flags', [])
    print(f"   • Quality Flags: {', '.join(quality_flags) if quality_flags else 'None'}")

    print("\n" + "-" * 70)
    print("Manual Validation:")
    print("-" * 70)

    # Question 1: Is the requirement complete?
    print("\n1. Is this requirement complete and well-formed?")
    print("   (Does it have all necessary information?)")
    complete = input("   [y/n]: ").strip().lower() == 'y'

    # Question 2: Is the ID correct?
    print("\n2. Is the requirement ID appropriate?")
    id_correct = input("   [y/n]: ").strip().lower() == 'y'

    # Question 3: Is the category correct?
    print("\n3. Is the category classification correct?")
    category_correct = input("   [y/n]: ").strip().lower() == 'y'

    # Question 4: Are there any quality issues?
    print("\n4. Are there any quality issues you notice?")
    print("   (vague, ambiguous, missing context, etc.)")
    has_issues = input("   [y/n]: ").strip().lower() == 'y'

    issues = []
    if has_issues:
        print("\n   Describe the issues (comma-separated):")
        issues_str = input("   > ")
        issues = [i.strip() for i in issues_str.split(',') if i.strip()]

    # Question 5: Would you approve this requirement?
    print("\n5. Would you approve this requirement as-is?")
    would_approve = input("   [y/n]: ").strip().lower() == 'y'

    # Question 6: Rate the confidence score accuracy
    print(f"\n6. The system assigned confidence: {req.get('confidence', 0):.3f}")
    print("   Do you agree with this confidence level?")
    print("   1 = Too high, 2 = About right, 3 = Too low")
    confidence_rating = input("   [1/2/3]: ").strip()

    return {
        'requirement_id': req.get('req_id'),
        'document': req.get('document'),
        'complete': complete,
        'id_correct': id_correct,
        'category_correct': category_correct,
        'has_issues': has_issues,
        'issues': issues,
        'would_approve': would_approve,
        'confidence_rating': confidence_rating,
        'system_confidence': req.get('confidence', 0),
        'system_flags': req.get('quality_flags', [])
    }


def generate_validation_report(validations: list[dict]) -> dict:
    """Generate summary report from validations."""

    total = len(validations)
    if total == 0:
        return {}

    report = {
        'total_validated': total,
        'complete_count': sum(1 for v in validations if v['complete']),
        'id_correct_count': sum(1 for v in validations if v['id_correct']),
        'category_correct_count': sum(1 for v in validations if v['category_correct']),
        'has_issues_count': sum(1 for v in validations if v['has_issues']),
        'would_approve_count': sum(1 for v in validations if v['would_approve']),
        'confidence_ratings': {
            'too_high': sum(1 for v in validations if v['confidence_rating'] == '1'),
            'about_right': sum(1 for v in validations if v['confidence_rating'] == '2'),
            'too_low': sum(1 for v in validations if v['confidence_rating'] == '3')
        }
    }

    # Calculate percentages
    report['complete_percentage'] = report['complete_count'] / total * 100
    report['id_correct_percentage'] = report['id_correct_count'] / total * 100
    report['category_correct_percentage'] = report['category_correct_count'] / total * 100
    report['would_approve_percentage'] = report['would_approve_count'] / total * 100

    # Aggregate issues
    all_issues = []
    for v in validations:
        all_issues.extend(v['issues'])
    report['common_issues'] = list(set(all_issues))

    return report


def print_validation_report(report: dict) -> None:
    """Print validation report."""

    print("\n" + "=" * 70)
    print("VALIDATION REPORT")
    print("=" * 70)

    total = report['total_validated']

    print(f"\n📊 Overall Results (n={total}):")
    print("-" * 70)
    print(f"  • Complete & well-formed: {report['complete_count']}/{total} ({report['complete_percentage']:.1f}%)")
    print(f"  • ID correct: {report['id_correct_count']}/{total} ({report['id_correct_percentage']:.1f}%)")
    print(f"  • Category correct: {report['category_correct_count']}/{total} ({report['category_correct_percentage']:.1f}%)")
    print(f"  • Would approve: {report['would_approve_count']}/{total} ({report['would_approve_percentage']:.1f}%)")
    print(f"  • Has quality issues: {report['has_issues_count']}/{total}")

    print("\n🎯 Confidence Score Assessment:")
    print("-" * 70)
    ratings = report['confidence_ratings']
    print(f"  • Too high (overconfident): {ratings['too_high']}/{total}")
    print(f"  • About right (accurate): {ratings['about_right']}/{total}")
    print(f"  • Too low (underconfident): {ratings['too_low']}/{total}")

    if report['common_issues']:
        print("\n🚩 Common Issues Found:")
        print("-" * 70)
        for issue in report['common_issues']:
            print(f"  • {issue}")

    print("\n" + "=" * 70)

    # Recommendations
    print("\n💡 Recommendations:")
    print("-" * 70)

    approve_pct = report['would_approve_percentage']
    if approve_pct >= 90:
        print("  ✅ Validation confirms high quality extraction")
        print("     → Confidence scores appear accurate")
        print("     → System ready for production use")
    elif approve_pct >= 75:
        print("  ⚠️ Good quality but some issues detected")
        print("     → Review common issues for patterns")
        print("     → Consider threshold adjustments")
    else:
        print("  ❌ Significant quality concerns detected")
        print("     → Investigate extraction pipeline")
        print("     → Consider re-tuning Task 7 parameters")

    # Confidence rating assessment
    too_high = ratings['too_high']
    if too_high > total * 0.3:  # >30% say too high
        print("\n  ⚠️ Confidence scores may be inflated")
        print("     → Consider lowering confidence calculations")
        print("     → Add more stringent quality checks")
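    # Worked example (illustrative numbers, not from the benchmark): with
    # n=20 validated requirements and 7 responses of "too high",
    # 7 > 20 * 0.3 = 6, so the inflation warning above would be printed.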
| 229 | + |
| 230 | + |
| 231 | +def main(): |
| 232 | + """Run manual validation process.""" |
| 233 | + |
| 234 | + print("=" * 70) |
| 235 | + print("Task 7 Quality Metrics - Manual Validation") |
| 236 | + print("=" * 70) |
| 237 | + print() |
| 238 | + print("This tool helps validate that Task 7 confidence scores and") |
| 239 | + print("quality flags are accurate through manual review.") |
| 240 | + print() |
| 241 | + |
| 242 | + # Load latest benchmark results |
| 243 | + results_file = project_root / "test/test_results/benchmark_logs/benchmark_latest.json" |
| 244 | + |
| 245 | + if not results_file.exists(): |
| 246 | + print(f"❌ Benchmark results not found: {results_file}") |
| 247 | + print(" Please run the benchmark first:") |
| 248 | + print(" python test/debug/benchmark_performance.py") |
| 249 | + return |
| 250 | + |
| 251 | + print(f"📂 Loading benchmark results from: {results_file.name}") |
| 252 | + results = load_benchmark_results(results_file) |
| 253 | + |
| 254 | + # Get sample size |
| 255 | + print("\n⚙️ Configuration:") |
| 256 | + default_sample_size = 20 |
| 257 | + try: |
| 258 | + sample_size_input = input(f" Sample size (default: {default_sample_size}): ").strip() |
| 259 | + sample_size = int(sample_size_input) if sample_size_input else default_sample_size |
| 260 | + except ValueError: |
| 261 | + sample_size = default_sample_size |
| 262 | + |
| 263 | + print(f" → Validating {sample_size} requirements") |
| 264 | + |
| 265 | + # Select sample |
| 266 | + all_results = results.get('results', []) |
| 267 | + # Note: In real implementation, we'd load actual requirements from documents |
| 268 | + # For now, this is a framework |
| 269 | + |
| 270 | + print(f"\n📋 Total documents in benchmark: {len(all_results)}") |
| 271 | + total_reqs = sum(r.get('requirements_total', 0) for r in all_results) |
| 272 | + print(f"📋 Total requirements: {total_reqs}") |
| 273 | + |
| 274 | + # For demonstration, we'll use a simplified approach |
| 275 | + print("\n⚠️ Note: This is a validation framework.") |
| 276 | + print(" Actual requirement data would be loaded from benchmark results.") |
| 277 | + print(" For full validation, integrate with actual requirement extraction.") |
| 278 | + |
| 279 | + print("\n" + "=" * 70) |
| 280 | + print("✨ Validation framework ready!") |
| 281 | + print("=" * 70) |
| 282 | + |
| 283 | + print("\n💡 To perform actual validation:") |
| 284 | + print(" 1. Load specific requirements from benchmark results") |
| 285 | + print(" 2. Present each requirement to reviewer") |
| 286 | + print(" 3. Collect validation responses") |
| 287 | + print(" 4. Generate validation report") |
| 288 | + print(" 5. Adjust Task 7 parameters based on findings") |
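    # See the run_full_validation sketch below for one way these steps
    # could be wired together.


# A sketch of the validation loop described at the end of main(), built from the
# helpers defined above. Illustrative only: the run_full_validation name is an
# assumption, nothing calls it yet, and it still relies on
# select_sample_requirements(), which currently builds placeholder requirements
# from the per-document counts.
def run_full_validation(results_file: Path, sample_size: int = 20) -> dict:
    """Run the manual validation loop end-to-end and return the summary report."""
    results = load_benchmark_results(results_file)
    sample = select_sample_requirements(results.get('results', []), sample_size)

    validations = []
    for i, req in enumerate(sample, start=1):
        validations.append(validate_requirement(req, i, len(sample)))

    report = generate_validation_report(validations)
    if report:
        print_validation_report(report)
    return report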


if __name__ == "__main__":
    main()