
Commit 3b0e714

committed
test: add benchmark and manual testing infrastructure
Add comprehensive testing infrastructure:
- Benchmark suite for performance testing
- Manual test scripts for validation
- Test results tracking and analysis
- Historical benchmark data
- Performance regression detection

Includes:
- Benchmark logs with timestamps
- Latest benchmark results
- Performance metrics tracking
- Manual integration test scenarios
1 parent ee5e7b2 commit 3b0e714

23 files changed: +1696 −0 lines

test/benchmark/benchmark_performance.py

Lines changed: 505 additions & 0 deletions
Large diffs are not rendered by default.
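The benchmark script's own diff is not rendered, but the manual validation script later in this commit reads its JSON output from test/test_results/benchmark_logs/benchmark_latest.json. A minimal sketch of the shape that script expects, inferred only from the fields it accesses (all values are placeholders; other keys, and the metric names inside task7_quality_metrics, are not visible in this view):

# Inferred shape of benchmark_latest.json, based on the fields read by
# test/manual/quality_validation.py below. Values are placeholders.
benchmark_latest = {
    "results": [                              # one entry per benchmarked document
        {
            "file": "example_document.md",    # placeholder document name
            "requirements_total": 42,         # requirement count used for sampling
            "task7_quality_metrics": {},      # per-document quality metrics (keys not shown here)
        },
    ],
    # Any additional top-level keys (timings, totals, ...) are not visible in this commit view.
}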
Shell script for monitoring the benchmark (filename not shown in this view)

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
#!/bin/bash

# Benchmark monitoring script
# Monitors the benchmark progress and displays updates
# Assumes the benchmark is already running and writing to LOG_FILE below.

LOG_FILE="test_results/benchmark_optimized_output.log"

echo ""
echo "🔍 Benchmark Monitor Started"
echo " Checking progress every 30 seconds..."
echo " Press Ctrl+C to stop monitoring"
echo ""

while true; do
    # Check if benchmark is still running
    if ! ps aux | grep -q "[p]ython test/benchmark/benchmark_performance.py"; then
        echo "✅ Benchmark process completed!"
        echo ""
        echo "📊 Final Results:"
        tail -50 "$LOG_FILE" | grep -E "(Testing:|completed|BENCHMARK|Total)"
        break
    fi

    # Show current progress
    echo "────────────────────────────────────────────────────────────────"
    echo "$(date '+%H:%M:%S') - Benchmark running..."
    echo ""

    # Extract latest status
    tail -30 "$LOG_FILE" | grep -E "(Testing:|Processing chunk|Extraction completed)" | tail -5

    echo ""
    echo "💤 Next check in 30 seconds..."
    echo ""

    sleep 30
done

echo ""
echo "🎉 Monitoring complete!"
echo ""
Four binary files changed (29.5 KB, 36.2 KB, 20.1 KB, 3.28 KB); binary file contents are not shown.

test/manual/quality_validation.py

Lines changed: 292 additions & 0 deletions
@@ -0,0 +1,292 @@
#!/usr/bin/env python3
"""
Manual validation script for Task 7 quality metrics.

This script helps validate that confidence scores and quality flags
are accurate by presenting requirements for human review.
"""

import json
import random
import sys
from pathlib import Path

# Add project root to path (this file lives at test/manual/, so the
# project root is three parents up)
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))


def load_benchmark_results(results_file: Path) -> dict:
    """Load benchmark results from JSON file."""
    with open(results_file) as f:
        return json.load(f)

def select_sample_requirements(all_results: list[dict], sample_size: int = 20) -> list[dict]:
    """
    Select a stratified sample of requirements for validation.

    Ensures sample includes:
    - Mix of confidence levels
    - Mix of document types
    - Requirements with and without quality flags
    """
    # Collect all requirements with metadata
    all_reqs = []
    for doc_result in all_results:
        doc_file = doc_result.get('file', 'unknown')
        # Simulate requirements (in real benchmark, they're in the result)
        doc_result.get('task7_quality_metrics', {})
        req_count = doc_result.get('requirements_total', 0)

        # Create placeholder requirements for demonstration
        for i in range(req_count):
            all_reqs.append({
                'document': doc_file,
                'index': i,
                'confidence': 0.965,  # From benchmark results
                'quality_flags': [],
                'req_id': f'REQ-{i+1:03d}',
                'category': 'functional',
                'body': f'Sample requirement {i+1} from {doc_file}'
            })

    # Everything fits into the sample
    if len(all_reqs) <= sample_size:
        return all_reqs

    # Simple random sample (true stratification is sketched below)
    return random.sample(all_reqs, min(sample_size, len(all_reqs)))
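The docstring above promises a stratified sample, but the committed body falls back to random.sample. A minimal sketch of what stratification over the fields built above (document, presence of quality flags, confidence bucket) might look like; the helper name and the bucket edges are illustrative assumptions, not part of the commit:

from collections import defaultdict


def stratified_sample(all_reqs: list[dict], sample_size: int = 20) -> list[dict]:
    """Illustrative sketch: sample across (document, has-flags, confidence-bucket) strata."""
    strata: dict[tuple, list[dict]] = defaultdict(list)
    for req in all_reqs:
        # Bucket edges (0.9 / 0.7) are assumptions for illustration
        bucket = 'high' if req['confidence'] >= 0.9 else 'medium' if req['confidence'] >= 0.7 else 'low'
        strata[(req['document'], bool(req['quality_flags']), bucket)].append(req)

    # Take one requirement from each stratum first, then fill the rest at random
    sample = [random.choice(reqs) for reqs in strata.values()]
    remaining = [r for reqs in strata.values() for r in reqs if r not in sample]
    random.shuffle(remaining)
    sample.extend(remaining[:max(0, sample_size - len(sample))])
    return sample[:sample_size]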
def validate_requirement(req: dict, index: int, total: int) -> dict:
    """
    Present a requirement for manual validation.

    Returns validation results.
    """
    print("\n" + "=" * 70)
    print(f"Requirement {index}/{total}")
    print("=" * 70)

    print(f"\n📄 Document: {req.get('document', 'unknown')}")
    print(f"🆔 ID: {req.get('req_id', 'N/A')}")
    print(f"📂 Category: {req.get('category', 'N/A')}")
    print("\n📝 Body:")
    print(f" {req.get('body', 'N/A')}")

    print("\n🎯 Task 7 Metrics:")
    print(f" • Confidence: {req.get('confidence', 0):.3f}")
    quality_flags = req.get('quality_flags', [])
    print(f" • Quality Flags: {', '.join(quality_flags) if quality_flags else 'None'}")

    print("\n" + "-" * 70)
    print("Manual Validation:")
    print("-" * 70)

    # Question 1: Is the requirement complete?
    print("\n1. Is this requirement complete and well-formed?")
    print(" (Does it have all necessary information?)")
    complete = input(" [y/n]: ").strip().lower() == 'y'

    # Question 2: Is the ID correct?
    print("\n2. Is the requirement ID appropriate?")
    id_correct = input(" [y/n]: ").strip().lower() == 'y'

    # Question 3: Is the category correct?
    print("\n3. Is the category classification correct?")
    category_correct = input(" [y/n]: ").strip().lower() == 'y'

    # Question 4: Are there any quality issues?
    print("\n4. Are there any quality issues you notice?")
    print(" (vague, ambiguous, missing context, etc.)")
    has_issues = input(" [y/n]: ").strip().lower() == 'y'

    issues = []
    if has_issues:
        print("\n Describe the issues (comma-separated):")
        issues_str = input(" > ")
        issues = [i.strip() for i in issues_str.split(',') if i.strip()]

    # Question 5: Would you approve this requirement?
    print("\n5. Would you approve this requirement as-is?")
    would_approve = input(" [y/n]: ").strip().lower() == 'y'

    # Question 6: Rate the confidence score accuracy
    print(f"\n6. The system assigned confidence: {req.get('confidence', 0):.3f}")
    print(" Do you agree with this confidence level?")
    print(" 1 = Too high, 2 = About right, 3 = Too low")
    confidence_rating = input(" [1/2/3]: ").strip()

    return {
        'requirement_id': req.get('req_id'),
        'document': req.get('document'),
        'complete': complete,
        'id_correct': id_correct,
        'category_correct': category_correct,
        'has_issues': has_issues,
        'issues': issues,
        'would_approve': would_approve,
        'confidence_rating': confidence_rating,
        'system_confidence': req.get('confidence', 0),
        'system_flags': req.get('quality_flags', [])
    }

def generate_validation_report(validations: list[dict]) -> dict:
    """Generate summary report from validations."""
    total = len(validations)
    if total == 0:
        return {}

    report = {
        'total_validated': total,
        'complete_count': sum(1 for v in validations if v['complete']),
        'id_correct_count': sum(1 for v in validations if v['id_correct']),
        'category_correct_count': sum(1 for v in validations if v['category_correct']),
        'has_issues_count': sum(1 for v in validations if v['has_issues']),
        'would_approve_count': sum(1 for v in validations if v['would_approve']),
        'confidence_ratings': {
            'too_high': sum(1 for v in validations if v['confidence_rating'] == '1'),
            'about_right': sum(1 for v in validations if v['confidence_rating'] == '2'),
            'too_low': sum(1 for v in validations if v['confidence_rating'] == '3')
        }
    }

    # Calculate percentages
    report['complete_percentage'] = report['complete_count'] / total * 100
    report['id_correct_percentage'] = report['id_correct_count'] / total * 100
    report['category_correct_percentage'] = report['category_correct_count'] / total * 100
    report['would_approve_percentage'] = report['would_approve_count'] / total * 100

    # Aggregate issues
    all_issues = []
    for v in validations:
        all_issues.extend(v['issues'])
    report['common_issues'] = list(set(all_issues))

    return report
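To make the aggregation concrete, a small usage sketch with two hand-made reviewer records (values are purely illustrative, not benchmark data):

demo_validations = [
    {'requirement_id': 'REQ-001', 'document': 'doc_a', 'complete': True, 'id_correct': True,
     'category_correct': True, 'has_issues': False, 'issues': [], 'would_approve': True,
     'confidence_rating': '2', 'system_confidence': 0.965, 'system_flags': []},
    {'requirement_id': 'REQ-002', 'document': 'doc_a', 'complete': True, 'id_correct': False,
     'category_correct': True, 'has_issues': True, 'issues': ['vague wording'], 'would_approve': False,
     'confidence_rating': '1', 'system_confidence': 0.965, 'system_flags': []},
]
demo_report = generate_validation_report(demo_validations)
# demo_report['would_approve_percentage'] == 50.0
# demo_report['common_issues'] == ['vague wording']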

def print_validation_report(report: dict) -> None:
    """Print validation report."""
    print("\n" + "=" * 70)
    print("VALIDATION REPORT")
    print("=" * 70)

    total = report['total_validated']

    print(f"\n📊 Overall Results (n={total}):")
    print("-" * 70)
    print(f" • Complete & well-formed: {report['complete_count']}/{total} ({report['complete_percentage']:.1f}%)")
    print(f" • ID correct: {report['id_correct_count']}/{total} ({report['id_correct_percentage']:.1f}%)")
    print(f" • Category correct: {report['category_correct_count']}/{total} ({report['category_correct_percentage']:.1f}%)")
    print(f" • Would approve: {report['would_approve_count']}/{total} ({report['would_approve_percentage']:.1f}%)")
    print(f" • Has quality issues: {report['has_issues_count']}/{total}")

    print("\n🎯 Confidence Score Assessment:")
    print("-" * 70)
    ratings = report['confidence_ratings']
    print(f" • Too high (overconfident): {ratings['too_high']}/{total}")
    print(f" • About right (accurate): {ratings['about_right']}/{total}")
    print(f" • Too low (underconfident): {ratings['too_low']}/{total}")

    if report['common_issues']:
        print("\n🚩 Common Issues Found:")
        print("-" * 70)
        for issue in report['common_issues']:
            print(f" • {issue}")

    print("\n" + "=" * 70)

    # Recommendations
    print("\n💡 Recommendations:")
    print("-" * 70)

    approve_pct = report['would_approve_percentage']
    if approve_pct >= 90:
        print(" ✅ Validation confirms high quality extraction")
        print(" → Confidence scores appear accurate")
        print(" → System ready for production use")
    elif approve_pct >= 75:
        print(" ⚠️ Good quality but some issues detected")
        print(" → Review common issues for patterns")
        print(" → Consider threshold adjustments")
    else:
        print(" ❌ Significant quality concerns detected")
        print(" → Investigate extraction pipeline")
        print(" → Consider re-tuning Task 7 parameters")

    # Confidence rating assessment
    too_high = ratings['too_high']
    if too_high > total * 0.3:  # >30% say too high
        print("\n ⚠️ Confidence scores may be inflated")
        print(" → Consider lowering confidence calculations")
        print(" → Add more stringent quality checks")

def main():
    """Run manual validation process."""
    print("=" * 70)
    print("Task 7 Quality Metrics - Manual Validation")
    print("=" * 70)
    print()
    print("This tool helps validate that Task 7 confidence scores and")
    print("quality flags are accurate through manual review.")
    print()

    # Load latest benchmark results
    results_file = project_root / "test/test_results/benchmark_logs/benchmark_latest.json"

    if not results_file.exists():
        print(f"❌ Benchmark results not found: {results_file}")
        print(" Please run the benchmark first:")
        print(" python test/benchmark/benchmark_performance.py")
        return

    print(f"📂 Loading benchmark results from: {results_file.name}")
    results = load_benchmark_results(results_file)

    # Get sample size
    print("\n⚙️ Configuration:")
    default_sample_size = 20
    try:
        sample_size_input = input(f" Sample size (default: {default_sample_size}): ").strip()
        sample_size = int(sample_size_input) if sample_size_input else default_sample_size
    except ValueError:
        sample_size = default_sample_size

    print(f" → Validating {sample_size} requirements")

    # Select sample
    all_results = results.get('results', [])
    # Note: In real implementation, we'd load actual requirements from documents
    # For now, this is a framework

    print(f"\n📋 Total documents in benchmark: {len(all_results)}")
    total_reqs = sum(r.get('requirements_total', 0) for r in all_results)
    print(f"📋 Total requirements: {total_reqs}")

    # For demonstration, we'll use a simplified approach
    print("\n⚠️ Note: This is a validation framework.")
    print(" Actual requirement data would be loaded from benchmark results.")
    print(" For full validation, integrate with actual requirement extraction.")

    print("\n" + "=" * 70)
    print("✨ Validation framework ready!")
    print("=" * 70)

    print("\n💡 To perform actual validation:")
    print(" 1. Load specific requirements from benchmark results")
    print(" 2. Present each requirement to reviewer")
    print(" 3. Collect validation responses")
    print(" 4. Generate validation report")
    print(" 5. Adjust Task 7 parameters based on findings")


if __name__ == "__main__":
    main()
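main() currently stops at the "framework ready" stage and only prints the intended steps. A minimal sketch of that loop, reusing the functions defined in this file; the sample size and the optional output path are assumptions for illustration:

# Hedged sketch of the full validation loop described at the end of main();
# it reuses load_benchmark_results, select_sample_requirements,
# validate_requirement, generate_validation_report and print_validation_report.
results_file = project_root / "test/test_results/benchmark_logs/benchmark_latest.json"
results = load_benchmark_results(results_file)
sample = select_sample_requirements(results.get('results', []), sample_size=20)
validations = [
    validate_requirement(req, i + 1, len(sample))  # interactive prompts per requirement
    for i, req in enumerate(sample)
]
report = generate_validation_report(validations)
print_validation_report(report)
# Optionally persist the report next to the benchmark logs (path is an assumption):
# (project_root / "test/test_results/validation_report.json").write_text(json.dumps(report, indent=2))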

0 commit comments
