Skip to content

Commit 3287bff

Browse files
committed
merge: evaluation harness with 38 fixtures and aggregate reporting
- Extended EvalRunner with 5 assertion sections (evidence, prompt, connections, subject, breaking) and symbols.toml injection
- Added run_sync() for integration test usage
- 38 fixtures covering all 11 commit types, AST features, edge cases
- Per-type accuracy breakdown and overall score reporting
- 348 tests passing with eval feature

* feat/eval-harness:
  feat(eval): complete eval harness with 38 fixtures and aggregate reporting
  feat(eval): add 10 fixtures and integration test suite
  feat(eval): extend EvalRunner with assertion sections and symbols.toml loading
2 parents a86345d + a778dac commit 3287bff

File tree

85 files changed

+2887
-35
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+2887
-35
lines changed

src/eval.rs

Lines changed: 609 additions & 35 deletions
Large diffs are not rendered by default.

tests/eval.rs

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
// SPDX-FileCopyrightText: 2026 Sephyi <me@sephy.io>
2+
//
3+
// SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
4+
5+
//! Integration tests for the evaluation harness.
6+
//!
7+
//! Runs all fixtures through the deterministic (no-LLM) pipeline and
8+
//! asserts type inference, evidence flags, prompt content, connections,
9+
//! and breaking change detection.
10+
11+
#![cfg(feature = "eval")]
12+
13+
use std::path::PathBuf;
14+
15+
use commitbee::eval::EvalRunner;
16+
17+
fn fixtures_dir() -> PathBuf {
18+
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/eval")
19+
}
20+
21+
/// Runs every fixture and fails with one combined report if any fixture
/// did not pass.
///
/// The report has one entry per failing fixture, listing whichever of the
/// type, scope, prompt-assembly, assertion and error details apply.
#[test]
fn all_fixtures_pass() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    assert!(!results.is_empty(), "should discover at least one fixture");

    // Build one human-readable entry per fixture whose `passed()` is false.
    let failures: Vec<String> = results
        .iter()
        .filter(|outcome| !outcome.passed())
        .map(|outcome| {
            let mut report = format!("FIXTURE FAILED: {}\n", outcome.fixture_name);
            if !outcome.type_passed {
                report.push_str(&format!(
                    " Type: expected={}, actual={}\n",
                    outcome.expected_type, outcome.actual_type
                ));
            }
            if !outcome.scope_passed {
                report.push_str(&format!(
                    " Scope: expected={:?}, actual={:?}\n",
                    outcome.expected_scope, outcome.actual_scope
                ));
            }
            if !outcome.prompt_assembled {
                report.push_str(" Prompt: failed to assemble\n");
            }
            for failure in &outcome.assertion_failures {
                report.push_str(&format!(" {}\n", failure));
            }
            if let Some(err) = &outcome.error {
                report.push_str(&format!(" Error: {}\n", err));
            }
            report
        })
        .collect();

    if failures.is_empty() {
        return;
    }

    panic!(
        "{} of {} fixtures failed:\n\n{}",
        failures.len(),
        results.len(),
        failures.join("\n")
    );
}
67+
68+
/// Each fixture category runs independently; this test covers commit-type
/// inference.
///
/// Fails on the first fixture (in runner order) whose `type_passed` flag is
/// false, reporting the expected and actual type.
#[test]
fn type_inference_fixtures() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    if let Some(mismatch) = results.iter().find(|outcome| !outcome.type_passed) {
        panic!(
            "Type mismatch in {}: expected={}, actual={}",
            mismatch.fixture_name, mismatch.expected_type, mismatch.actual_type
        );
    }
}
82+
83+
/// No fixture may report an assertion failure in the `"evidence"` category.
#[test]
fn evidence_flag_fixtures() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    for outcome in &results {
        // Messages of this fixture's evidence-category failures only.
        let messages: Vec<_> = outcome
            .assertion_failures
            .iter()
            .filter(|f| f.category == "evidence")
            .map(|f| &f.message)
            .collect();

        assert!(
            messages.is_empty(),
            "Evidence failures in {}: {:?}",
            outcome.fixture_name,
            messages
        );
    }
}
106+
107+
/// Every fixture must assemble a prompt and report no assertion failure in
/// the `"prompt"` category.
#[test]
fn prompt_content_fixtures() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    for outcome in &results {
        // Assembly is checked first, then the content assertions.
        assert!(
            outcome.prompt_assembled,
            "Prompt assembly failed for {}",
            outcome.fixture_name
        );

        let messages: Vec<_> = outcome
            .assertion_failures
            .iter()
            .filter(|f| f.category == "prompt")
            .map(|f| &f.message)
            .collect();

        assert!(
            messages.is_empty(),
            "Prompt content failures in {}: {:?}",
            outcome.fixture_name,
            messages
        );
    }
}
136+
137+
/// No fixture may report an assertion failure in the `"connections"`
/// category.
#[test]
fn connection_detection_fixtures() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    for outcome in &results {
        // Messages of this fixture's connections-category failures only.
        let messages: Vec<_> = outcome
            .assertion_failures
            .iter()
            .filter(|f| f.category == "connections")
            .map(|f| &f.message)
            .collect();

        assert!(
            messages.is_empty(),
            "Connection failures in {}: {:?}",
            outcome.fixture_name,
            messages
        );
    }
}
157+
158+
/// No fixture may report an assertion failure in the `"breaking"` category.
#[test]
fn breaking_change_fixtures() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    for outcome in &results {
        // Messages of this fixture's breaking-category failures only.
        let messages: Vec<_> = outcome
            .assertion_failures
            .iter()
            .filter(|f| f.category == "breaking")
            .map(|f| &f.message)
            .collect();

        assert!(
            messages.is_empty(),
            "Breaking change failures in {}: {:?}",
            outcome.fixture_name,
            messages
        );
    }
}
181+
182+
/// Guards against accidental fixture deletion by requiring a minimum number
/// of discovered fixtures. This is a lower bound, so adding fixtures never
/// breaks the test.
#[test]
fn fixture_count() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let found = harness
        .run_sync()
        .expect("eval runner should not error")
        .len();

    // 12 original + 26 new = 38
    assert!(found >= 38, "Expected at least 38 fixtures, found {}", found);
}
194+
195+
/// Builds an `EvalSummary` from all fixture results, prints its formatted
/// report to stderr, and checks that the summary counters are consistent.
#[test]
fn aggregate_summary() {
    let harness = EvalRunner::new(fixtures_dir(), None);
    let results = harness.run_sync().expect("eval runner should not error");

    let summary = commitbee::eval::EvalSummary::from_results(&results);

    // Emitted on stderr so the report is visible with --nocapture.
    eprintln!("\n{}", summary.format_report());

    // The pass and fail counters must add up to the fixture total.
    assert_eq!(
        summary.total_passed + summary.total_failed,
        summary.total_fixtures,
        "passed + failed should equal total"
    );
    // The summary must account for every result it was built from.
    assert_eq!(
        summary.total_fixtures,
        results.len(),
        "summary total should match results count"
    );
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
diff --git a/src/services/api.rs b/src/services/api.rs
2+
index abc1234..def5678 100644
3+
--- a/src/services/api.rs
4+
+++ b/src/services/api.rs
5+
@@ -5,20 +5,5 @@ use crate::error::Result;
6+
7+
pub struct ApiClient {
8+
base_url: String,
9+
}
10+
11+
-/// Send a GET request to the given path.
12+
-pub fn api_get(client: &ApiClient, path: &str) -> Result<Response> {
13+
- let url = format!("{}/{}", client.base_url, path);
14+
- reqwest::blocking::get(&url).map_err(|e| Error::Network(e.to_string()))
15+
-}
16+
-
17+
-/// Send a POST request with a JSON body.
18+
-pub fn api_post(client: &ApiClient, path: &str, body: &str) -> Result<Response> {
19+
- let url = format!("{}/{}", client.base_url, path);
20+
- reqwest::blocking::Client::new()
21+
- .post(&url)
22+
- .body(body.to_string())
23+
- .send()
24+
- .map_err(|e| Error::Network(e.to_string()))
25+
-}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# SPDX-FileCopyrightText: 2026 Sephyi <me@sephy.io>
2+
#
3+
# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
4+
5+
name = "ast-breaking-public-removal"
6+
description = "Removing public API without replacement should signal breaking change"
7+
language = "rust"
8+
category = "ast"
9+
expected_type = "refactor"
10+
expected_scope = "optional"
11+
12+
[evidence]
13+
has_new_public_api = false
14+
public_api_removed_count = 2
15+
16+
[prompt]
17+
must_contain = ["SYMBOLS CHANGED", "Removed"]
18+
19+
[breaking]
20+
expected = true
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[[symbols]]
2+
kind = "Function"
3+
name = "api_get"
4+
file = "src/services/api.rs"
5+
line = 10
6+
end_line = 13
7+
is_public = true
8+
is_added = false
9+
signature = "pub fn api_get(client: &ApiClient, path: &str) -> Result<Response>"
10+
11+
[[symbols]]
12+
kind = "Function"
13+
name = "api_post"
14+
file = "src/services/api.rs"
15+
line = 16
16+
end_line = 22
17+
is_public = true
18+
is_added = false
19+
signature = "pub fn api_post(client: &ApiClient, path: &str, body: &str) -> Result<Response>"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
diff --git a/src/services/validator.rs b/src/services/validator.rs
2+
new file mode 100644
3+
index 0000000..abc1234
4+
--- /dev/null
5+
+++ b/src/services/validator.rs
6+
@@ -0,0 +1,12 @@
7+
+use crate::error::Result;
8+
+
9+
+/// Validate user input before processing.
10+
+pub fn validate_input(input: &str) -> Result<()> {
11+
+ if input.is_empty() {
12+
+ return Err(crate::error::Error::Config("empty input".into()));
13+
+ }
14+
+ if input.len() > 1024 {
15+
+ return Err(crate::error::Error::Config("input too long".into()));
16+
+ }
17+
+ Ok(())
18+
+}
19+
diff --git a/src/services/handler.rs b/src/services/handler.rs
20+
new file mode 100644
21+
index 0000000..def5678
22+
--- /dev/null
23+
+++ b/src/services/handler.rs
24+
@@ -0,0 +1,15 @@
25+
+use crate::error::Result;
26+
+use super::validator::validate_input;
27+
+
28+
+pub struct RequestHandler;
29+
+
30+
+impl RequestHandler {
31+
+ pub fn handle(&self, request: &str) -> Result<String> {
32+
+ // Validate first, then process
33+
+ validate_input(request)?;
34+
+
35+
+ // Process the validated input
36+
+ let result = request.to_uppercase();
37+
+ Ok(result)
38+
+ }
39+
+}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# SPDX-FileCopyrightText: 2026 Sephyi <me@sephy.io>
2+
#
3+
# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
4+
5+
name = "ast-cross-file-connection"
6+
description = "Cross-file call should produce CONNECTIONS section in prompt"
7+
language = "rust"
8+
category = "feat"
9+
expected_type = "feat"
10+
expected_scope = "optional"
11+
12+
[prompt]
13+
must_contain = ["CONNECTIONS", "calls validate_input"]
14+
15+
[connections]
16+
min_count = 1
17+
must_contain = ["calls validate_input"]
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
[[symbols]]
2+
kind = "Function"
3+
name = "validate_input"
4+
file = "src/services/validator.rs"
5+
line = 4
6+
end_line = 11
7+
is_public = true
8+
is_added = true
9+
signature = "pub fn validate_input(input: &str) -> Result<()>"
10+
11+
[[symbols]]
12+
kind = "Struct"
13+
name = "RequestHandler"
14+
file = "src/services/handler.rs"
15+
line = 4
16+
end_line = 4
17+
is_public = true
18+
is_added = true
19+
signature = "pub struct RequestHandler"
20+
21+
[[symbols]]
22+
kind = "Function"
23+
name = "handle"
24+
file = "src/services/handler.rs"
25+
line = 7
26+
end_line = 14
27+
is_public = true
28+
is_added = true
29+
signature = "pub fn handle(&self, request: &str) -> Result<String>"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
diff --git a/src/services/validator.rs b/src/services/validator.rs
2+
index abc1234..def5678 100644
3+
--- a/src/services/validator.rs
4+
+++ b/src/services/validator.rs
5+
@@ -5,8 +5,8 @@ use crate::error::Result;
6+
7+
impl Validator {
8+
- pub fn check(&self, input: &str) -> bool {
9+
- !input.is_empty() && input.len() < 1024
10+
+ pub fn check(&self, input: &str) -> bool {
11+
+ !input.is_empty() && input.len() < 1024
12+
}
13+
}
14+
15+
@@ -15,0 +16,10 @@
16+
+/// Validate a batch of inputs, returning the first error.
17+
+pub fn validate_batch(inputs: &[&str]) -> Result<()> {
18+
+ for input in inputs {
19+
+ if input.is_empty() {
20+
+ return Err(crate::error::Error::Config("empty input in batch".into()));
21+
+ }
22+
+ }
23+
+ Ok(())
24+
+}

0 commit comments

Comments
 (0)