-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_summary_table.py
More file actions
122 lines (98 loc) · 4.62 KB
/
generate_summary_table.py
File metadata and controls
122 lines (98 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""
Generate a markdown summary table from scorecard JSON.
Usage: python3 generate_summary_table.py test_scorecard.json
"""
import json
import sys
from pathlib import Path
def _print_run_config(meta):
    """Print the "Run Configuration" table from the scorecard's ``run_meta`` entry."""
    print("## Run Configuration\n")
    print("| Setting | Value |")
    print("|---------|-------|")
    print(f"| Seed | `{meta['seed']}` |")
    print(f"| Scale | `{meta['scale']}` |")
    print(f"| Mode | `{meta['mode']}` |")
    print(f"| SDK Version | `{meta['sdk_version']}` |")
    print(f"| Duration | `{meta['duration_s']:.2f}s` |")
    print(f"| Timestamp | `{meta['started_at']}` |")


def _print_overall(overall):
    """Print the "Overall Results" table (status, score, scenario and check counts)."""
    print("\n## Overall Results\n")
    passed = overall['pass']
    status_emoji = "✅" if passed else "❌"
    print("| Metric | Value |")
    print("|--------|-------|")
    print(f"| **Status** | **{status_emoji} {'PASS' if passed else 'FAIL'}** |")
    print(f"| **Score** | **{overall['score_0_100']:.1f}/100** |")
    print(f"| Passed Scenarios | {overall['passed_scenarios']}/{overall['total_scenarios']} |")
    print(f"| Failed Checks | {len(overall['failed_checks'])} |")


def _print_scenarios(scenario_scores):
    """Print one numbered table row per scenario with its retrieval/correctness metrics."""
    print("\n## Scenario Results\n")
    print("| # | Scenario | Status | NDCG@10 | Recall@10 | Leakage | Atomicity |")
    print("|---|----------|--------|---------|-----------|---------|-----------|")
    for i, (scenario_id, score) in enumerate(scenario_scores.items(), 1):
        status = "✅ PASS" if score['pass'] else "❌ FAIL"
        retrieval = score['metrics']['retrieval']
        correctness = score['metrics']['correctness']

        # Retrieval metrics may be null in the scorecard; render those as N/A.
        ndcg = retrieval['ndcg_at_10']
        ndcg_str = f"{ndcg:.3f}" if ndcg is not None else "N/A"
        recall = retrieval['recall_at_10']
        recall_str = f"{recall:.3f}" if recall is not None else "N/A"

        leakage_str = f"{correctness['leakage_rate']:.1%}"
        atomicity = correctness['atomicity_failures']

        # "some_scenario_id" -> "Some Scenario Id" for display.
        name = scenario_id.replace('_', ' ').title()
        print(f"| {i} | {name} | {status} | {ndcg_str} | {recall_str} | {leakage_str} | {atomicity} |")


def _print_performance(p95):
    """Print the P95 latency table; only the heading is printed when *p95* is empty."""
    print("\n## Performance Metrics (P95 Latencies)\n")
    if not p95:
        return
    print("| Operation | P95 Latency | Target | Status |")
    print("|-----------|-------------|--------|--------|")
    # Per-operation latency targets in milliseconds.
    thresholds = {
        'vector_search': 20,
        'hybrid_search': 50,
        'txn_commit': 10,
        'ledger_commit': 10,
    }
    for op_type, latency in p95.items():
        # Operations without an explicit target fall back to 50 ms.
        threshold = thresholds.get(op_type, 50)
        status = "✅" if latency < threshold else "⚠️"
        print(f"| {op_type.replace('_', ' ').title()} | {latency:.2f}ms | <{threshold}ms | {status} |")


def _print_failed_checks(failed_checks):
    """Print a bullet list of failed checks; prints nothing when there are none."""
    if not failed_checks:
        return
    print("\n## Failed Checks\n")
    for check in failed_checks:
        print(f"- ❌ {check}")


def _print_success_criteria(scenario_scores, overall_score):
    """Print the "Success Criteria" table aggregated over all scenarios."""
    print("\n## Success Criteria\n")
    print("| Criterion | Target | Actual | Status |")
    print("|-----------|--------|--------|--------|")

    correctness = [score['metrics']['correctness'] for score in scenario_scores.values()]
    total_leakage = sum(c['leakage_rate'] for c in correctness)
    total_atomicity = sum(c['atomicity_failures'] for c in correctness)
    total_consistency = sum(c['consistency_failures'] for c in correctness)
    # Leakage is averaged across scenarios; with no scenarios it is reported as 0.
    avg_leakage = total_leakage / len(correctness) if correctness else 0

    print(f"| Zero Leakage | 0% | {avg_leakage:.1%} | {'✅' if avg_leakage == 0 else '❌'} |")
    print(f"| Zero Atomicity Failures | 0 | {total_atomicity} | {'✅' if total_atomicity == 0 else '❌'} |")
    print(f"| Zero Consistency Failures | 0 | {total_consistency} | {'✅' if total_consistency == 0 else '❌'} |")
    print(f"| Overall Pass Rate | ≥90% | {overall_score:.1f}% | {'✅' if overall_score >= 90 else '⚠️'} |")


def generate_table(scorecard_path):
    """Print a markdown summary of a scorecard JSON file to stdout.

    The report contains, in order: run configuration, overall results,
    per-scenario results, P95 latencies, failed checks (if any), aggregate
    success criteria, and a footer with the scorecard file's modification
    time (formatted in UTC) and the harness version.

    Args:
        scorecard_path: Path to the scorecard JSON file. The document is
            read as UTF-8 and must provide the ``run_meta``, ``overall``,
            ``scenario_scores`` and ``global_metrics`` keys used below.

    Raises:
        OSError: If the file cannot be opened or stat'ed.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an expected key is missing from the scorecard.
    """
    # Local import keeps this block self-contained; the file's top-level
    # imports do not include datetime.
    from datetime import datetime, timezone

    # Explicit UTF-8 so reading does not depend on the platform's default
    # encoding (the scorecard may contain non-ASCII text).
    with open(scorecard_path, encoding="utf-8") as f:
        data = json.load(f)

    print("\n# SochDB Comprehensive Test Harness - Results Summary\n")

    _print_run_config(data['run_meta'])

    overall = data['overall']
    _print_overall(overall)
    _print_scenarios(data['scenario_scores'])
    _print_performance(data['global_metrics']['p95_latency_ms'])
    _print_failed_checks(overall['failed_checks'])
    _print_success_criteria(data['scenario_scores'], overall['score_0_100'])

    print("\n---\n")
    # st_mtime is seconds since the epoch; render it as a readable UTC
    # timestamp instead of printing the raw float.
    mtime = Path(scorecard_path).stat().st_mtime
    generated = datetime.fromtimestamp(mtime, tz=timezone.utc)
    print(f"**Generated**: {generated.strftime('%Y-%m-%d %H:%M:%S UTC')}")
    print("**Harness Version**: 1.0.0")
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the scorecard
    # path; without it, show the usage line and exit with status 1.
    if len(sys.argv) >= 2:
        generate_table(sys.argv[1])
    else:
        print("Usage: python3 generate_summary_table.py <scorecard.json>")
        sys.exit(1)