#!/usr/bin/env python3
"""
Manual validation script for Task 7 quality metrics.

This script helps validate that confidence scores and quality flags
are accurate by presenting requirements for human review.
"""

import json
from pathlib import Path
import random
import sys

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))


def load_benchmark_results(results_file: Path) -> dict:
    """Load benchmark results from JSON file."""
    with open(results_file) as f:
        return json.load(f)


def select_sample_requirements(all_results: list[dict], sample_size: int = 20) -> list[dict]:
    """
    Select a stratified sample of requirements for validation.

    Intended to ensure the sample includes:
    - Mix of confidence levels
    - Mix of document types
    - Requirements with and without quality flags
    """

    # Collect all requirements with metadata
    all_reqs = []
    for doc_result in all_results:
        doc_file = doc_result.get('file', 'unknown')
        # The benchmark summary only records per-document counts, not the
        # extracted requirements themselves, so placeholders are built below.
        req_count = doc_result.get('requirements_total', 0)

        # Create placeholder requirements for demonstration
        for i in range(req_count):
            all_reqs.append({
                'document': doc_file,
                'index': i,
                'confidence': 0.965,  # From benchmark results
                'quality_flags': [],
                'req_id': f'REQ-{i+1:03d}',
                'category': 'functional',
                'body': f'Sample requirement {i+1} from {doc_file}'
            })

    # The placeholder data is uniform, so there is nothing to stratify on yet;
    # see the _stratified_sample sketch below for how real data could be handled.
    if len(all_reqs) <= sample_size:
        return all_reqs

    # Simple random sample
    return random.sample(all_reqs, sample_size)
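

# One possible shape for the stratification described in the docstring above,
# usable once real per-requirement data (varying confidence values, quality
# flags, document types) is available. This is a sketch only: the helper name
# and the confidence-band thresholds are assumptions, not part of the current
# framework, and nothing calls it yet.
def _stratified_sample(reqs: list[dict], sample_size: int) -> list[dict]:
    """Sample roughly evenly across confidence bands and flagged/unflagged requirements."""
    buckets: dict[tuple[str, bool], list[dict]] = {}
    for req in reqs:
        confidence = req.get('confidence', 0)
        band = 'high' if confidence >= 0.9 else 'medium' if confidence >= 0.7 else 'low'
        # Group by (confidence band, has quality flags) so each stratum is represented
        buckets.setdefault((band, bool(req.get('quality_flags'))), []).append(req)

    per_bucket = max(1, sample_size // max(1, len(buckets)))
    sampled: list[dict] = []
    for bucket in buckets.values():
        sampled.extend(random.sample(bucket, min(per_bucket, len(bucket))))
    return sampled[:sample_size]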


def validate_requirement(req: dict, index: int, total: int) -> dict:
    """
    Present a requirement for manual validation.

    Returns validation results.
    """
    print("\n" + "=" * 70)
    print(f"Requirement {index}/{total}")
    print("=" * 70)

    print(f"\n📄 Document: {req.get('document', 'unknown')}")
    print(f"🆔 ID: {req.get('req_id', 'N/A')}")
    print(f"📂 Category: {req.get('category', 'N/A')}")
    print("\n📝 Body:")
    print(f"   {req.get('body', 'N/A')}")

    print("\n🎯 Task 7 Metrics:")
    print(f"   • Confidence: {req.get('confidence', 0):.3f}")
    quality_flags = req.get('quality_flags', [])
    print(f"   • Quality Flags: {', '.join(quality_flags) if quality_flags else 'None'}")

    print("\n" + "-" * 70)
    print("Manual Validation:")
    print("-" * 70)

    # Question 1: Is the requirement complete?
    print("\n1. Is this requirement complete and well-formed?")
    print("   (Does it have all necessary information?)")
    complete = input("   [y/n]: ").strip().lower() == 'y'

    # Question 2: Is the ID correct?
    print("\n2. Is the requirement ID appropriate?")
    id_correct = input("   [y/n]: ").strip().lower() == 'y'

    # Question 3: Is the category correct?
    print("\n3. Is the category classification correct?")
    category_correct = input("   [y/n]: ").strip().lower() == 'y'

    # Question 4: Are there any quality issues?
    print("\n4. Are there any quality issues you notice?")
    print("   (vague, ambiguous, missing context, etc.)")
    has_issues = input("   [y/n]: ").strip().lower() == 'y'

    issues = []
    if has_issues:
        print("\n   Describe the issues (comma-separated):")
        issues_str = input("   > ")
        issues = [i.strip() for i in issues_str.split(',') if i.strip()]

    # Question 5: Would you approve this requirement?
    print("\n5. Would you approve this requirement as-is?")
    would_approve = input("   [y/n]: ").strip().lower() == 'y'

    # Question 6: Rate the confidence score accuracy
    print(f"\n6. The system assigned confidence: {req.get('confidence', 0):.3f}")
    print("   Do you agree with this confidence level?")
    print("   1 = Too high, 2 = About right, 3 = Too low")
    confidence_rating = input("   [1/2/3]: ").strip()

    return {
        'requirement_id': req.get('req_id'),
        'document': req.get('document'),
        'complete': complete,
        'id_correct': id_correct,
        'category_correct': category_correct,
        'has_issues': has_issues,
        'issues': issues,
        'would_approve': would_approve,
        'confidence_rating': confidence_rating,
        'system_confidence': req.get('confidence', 0),
        'system_flags': req.get('quality_flags', [])
    }


def generate_validation_report(validations: list[dict]) -> dict:
    """Generate summary report from validations."""

    total = len(validations)
    if total == 0:
        return {}

    report = {
        'total_validated': total,
        'complete_count': sum(1 for v in validations if v['complete']),
        'id_correct_count': sum(1 for v in validations if v['id_correct']),
        'category_correct_count': sum(1 for v in validations if v['category_correct']),
        'has_issues_count': sum(1 for v in validations if v['has_issues']),
        'would_approve_count': sum(1 for v in validations if v['would_approve']),
        'confidence_ratings': {
            'too_high': sum(1 for v in validations if v['confidence_rating'] == '1'),
            'about_right': sum(1 for v in validations if v['confidence_rating'] == '2'),
            'too_low': sum(1 for v in validations if v['confidence_rating'] == '3')
        }
    }

    # Calculate percentages
    report['complete_percentage'] = report['complete_count'] / total * 100
    report['id_correct_percentage'] = report['id_correct_count'] / total * 100
    report['category_correct_percentage'] = report['category_correct_count'] / total * 100
    report['would_approve_percentage'] = report['would_approve_count'] / total * 100

    # Aggregate issues
    all_issues = []
    for v in validations:
        all_issues.extend(v['issues'])
    report['common_issues'] = list(set(all_issues))

    return report


def print_validation_report(report: dict) -> None:
    """Print validation report."""

    print("\n" + "=" * 70)
    print("VALIDATION REPORT")
    print("=" * 70)

    total = report['total_validated']

    print(f"\n📊 Overall Results (n={total}):")
    print("-" * 70)
    print(f"  • Complete & well-formed: {report['complete_count']}/{total} ({report['complete_percentage']:.1f}%)")
    print(f"  • ID correct: {report['id_correct_count']}/{total} ({report['id_correct_percentage']:.1f}%)")
    print(f"  • Category correct: {report['category_correct_count']}/{total} ({report['category_correct_percentage']:.1f}%)")
    print(f"  • Would approve: {report['would_approve_count']}/{total} ({report['would_approve_percentage']:.1f}%)")
    print(f"  • Has quality issues: {report['has_issues_count']}/{total}")

    print("\n🎯 Confidence Score Assessment:")
    print("-" * 70)
    ratings = report['confidence_ratings']
    print(f"  • Too high (overconfident): {ratings['too_high']}/{total}")
    print(f"  • About right (accurate): {ratings['about_right']}/{total}")
    print(f"  • Too low (underconfident): {ratings['too_low']}/{total}")

    if report['common_issues']:
        print("\n🚩 Common Issues Found:")
        print("-" * 70)
        for issue in report['common_issues']:
            print(f"  • {issue}")

    print("\n" + "=" * 70)

    # Recommendations
    print("\n💡 Recommendations:")
    print("-" * 70)

    approve_pct = report['would_approve_percentage']
    if approve_pct >= 90:
        print("  ✅ Validation confirms high quality extraction")
        print("     → Confidence scores appear accurate")
        print("     → System ready for production use")
    elif approve_pct >= 75:
        print("  ⚠️ Good quality but some issues detected")
        print("     → Review common issues for patterns")
        print("     → Consider threshold adjustments")
    else:
        print("  ❌ Significant quality concerns detected")
        print("     → Investigate extraction pipeline")
        print("     → Consider re-tuning Task 7 parameters")

    # Confidence rating assessment
    too_high = ratings['too_high']
    if too_high > total * 0.3:  # >30% say too high
        print("\n  ⚠️ Confidence scores may be inflated")
        print("     → Consider lowering confidence calculations")
        print("     → Add more stringent quality checks")
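    # Worked example (illustrative numbers, not from the benchmark): with
    # n=20 validated requirements and 7 responses of "too high",
    # 7 > 20 * 0.3 = 6, so the inflation warning above would be printed.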
| 229 | + |
| 230 | + |
| 231 | +def main(): |
| 232 | + """Run manual validation process.""" |
| 233 | + |
| 234 | + print("=" * 70) |
| 235 | + print("Task 7 Quality Metrics - Manual Validation") |
| 236 | + print("=" * 70) |
| 237 | + print() |
| 238 | + print("This tool helps validate that Task 7 confidence scores and") |
| 239 | + print("quality flags are accurate through manual review.") |
| 240 | + print() |
| 241 | + |
| 242 | + # Load latest benchmark results |
| 243 | + results_file = project_root / "test/test_results/benchmark_logs/benchmark_latest.json" |
| 244 | + |
| 245 | + if not results_file.exists(): |
| 246 | + print(f"❌ Benchmark results not found: {results_file}") |
| 247 | + print(" Please run the benchmark first:") |
| 248 | + print(" python test/debug/benchmark_performance.py") |
| 249 | + return |
| 250 | + |
| 251 | + print(f"📂 Loading benchmark results from: {results_file.name}") |
| 252 | + results = load_benchmark_results(results_file) |
| 253 | + |
| 254 | + # Get sample size |
| 255 | + print("\n⚙️ Configuration:") |
| 256 | + default_sample_size = 20 |
| 257 | + try: |
| 258 | + sample_size_input = input(f" Sample size (default: {default_sample_size}): ").strip() |
| 259 | + sample_size = int(sample_size_input) if sample_size_input else default_sample_size |
| 260 | + except ValueError: |
| 261 | + sample_size = default_sample_size |
| 262 | + |
| 263 | + print(f" → Validating {sample_size} requirements") |
| 264 | + |
| 265 | + # Select sample |
| 266 | + all_results = results.get('results', []) |
| 267 | + # Note: In real implementation, we'd load actual requirements from documents |
| 268 | + # For now, this is a framework |
| 269 | + |
| 270 | + print(f"\n📋 Total documents in benchmark: {len(all_results)}") |
| 271 | + total_reqs = sum(r.get('requirements_total', 0) for r in all_results) |
| 272 | + print(f"📋 Total requirements: {total_reqs}") |
| 273 | + |
| 274 | + # For demonstration, we'll use a simplified approach |
| 275 | + print("\n⚠️ Note: This is a validation framework.") |
| 276 | + print(" Actual requirement data would be loaded from benchmark results.") |
| 277 | + print(" For full validation, integrate with actual requirement extraction.") |
| 278 | + |
| 279 | + print("\n" + "=" * 70) |
| 280 | + print("✨ Validation framework ready!") |
| 281 | + print("=" * 70) |
| 282 | + |
| 283 | + print("\n💡 To perform actual validation:") |
| 284 | + print(" 1. Load specific requirements from benchmark results") |
| 285 | + print(" 2. Present each requirement to reviewer") |
| 286 | + print(" 3. Collect validation responses") |
| 287 | + print(" 4. Generate validation report") |
| 288 | + print(" 5. Adjust Task 7 parameters based on findings") |
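    # See the run_full_validation sketch below for one way these steps
    # could be wired together.


# A sketch of the validation loop described at the end of main(), built from the
# helpers defined above. Illustrative only: the run_full_validation name is an
# assumption, nothing calls it yet, and it still relies on
# select_sample_requirements(), which currently builds placeholder requirements
# from the per-document counts.
def run_full_validation(results_file: Path, sample_size: int = 20) -> dict:
    """Run the manual validation loop end-to-end and return the summary report."""
    results = load_benchmark_results(results_file)
    sample = select_sample_requirements(results.get('results', []), sample_size)

    validations = []
    for i, req in enumerate(sample, start=1):
        validations.append(validate_requirement(req, i, len(sample)))

    report = generate_validation_report(validations)
    if report:
        print_validation_report(report)
    return report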


if __name__ == "__main__":
    main()