Skip to content

Commit 0e5ebb6

Browse files
fix(router): use max_completion_tokens for OpenAI GPT-5+ validation (#575)
* fix(router): use max_completion_tokens for OpenAI GPT-5+ validation probe. OpenAI GPT-5 models reject the legacy max_tokens parameter and require max_completion_tokens. The inference validation probe now sends max_completion_tokens as the primary parameter, with an automatic fallback to max_tokens when the backend returns HTTP 400 (for legacy/self-hosted backends that only support the older parameter). Closes #517. Signed-off-by: Maxime Grenu <maxime.grenu@gmail.com> * style(router): fix cargo fmt import order and line length --------- Signed-off-by: Maxime Grenu <maxime.grenu@gmail.com> Co-authored-by: John Myers <johntmyers@users.noreply.github.com>
1 parent bd7b388 commit 0e5ebb6

File tree

2 files changed

+157
-6
lines changed

2 files changed

+157
-6
lines changed

crates/openshell-router/src/backend.rs

Lines changed: 156 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ struct ValidationProbe {
3131
path: &'static str,
3232
protocol: &'static str,
3333
body: bytes::Bytes,
34+
/// Alternate body to try when the primary probe fails with HTTP 400.
35+
/// Used for OpenAI chat completions where newer models require
36+
/// `max_completion_tokens` while legacy/self-hosted backends only
37+
/// accept `max_tokens`.
38+
fallback_body: Option<bytes::Bytes>,
3439
}
3540

3641
/// Response from a proxied HTTP request to a backend (fully buffered).
@@ -163,12 +168,17 @@ fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, Validation
163168
.iter()
164169
.any(|protocol| protocol == "openai_chat_completions")
165170
{
171+
// Use max_completion_tokens (modern OpenAI parameter, required by GPT-5+)
172+
// with max_tokens as fallback for legacy/self-hosted backends.
166173
return Ok(ValidationProbe {
167174
path: "/v1/chat/completions",
168175
protocol: "openai_chat_completions",
169176
body: bytes::Bytes::from_static(
170-
br#"{"messages":[{"role":"user","content":"ping"}],"max_tokens":32}"#,
177+
br#"{"messages":[{"role":"user","content":"ping"}],"max_completion_tokens":32}"#,
171178
),
179+
fallback_body: Some(bytes::Bytes::from_static(
180+
br#"{"messages":[{"role":"user","content":"ping"}],"max_tokens":32}"#,
181+
)),
172182
});
173183
}
174184

@@ -183,6 +193,7 @@ fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, Validation
183193
body: bytes::Bytes::from_static(
184194
br#"{"messages":[{"role":"user","content":"ping"}],"max_tokens":32}"#,
185195
),
196+
fallback_body: None,
186197
});
187198
}
188199

@@ -195,6 +206,7 @@ fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, Validation
195206
path: "/v1/responses",
196207
protocol: "openai_responses",
197208
body: bytes::Bytes::from_static(br#"{"input":"ping","max_output_tokens":32}"#),
209+
fallback_body: None,
198210
});
199211
}
200212

@@ -207,6 +219,7 @@ fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, Validation
207219
path: "/v1/completions",
208220
protocol: "openai_completions",
209221
body: bytes::Bytes::from_static(br#"{"prompt":"ping","max_tokens":32}"#),
222+
fallback_body: None,
210223
});
211224
}
212225

@@ -233,7 +246,47 @@ pub async fn verify_backend_endpoint(
233246
});
234247
}
235248

236-
let response = send_backend_request(client, route, "POST", probe.path, headers, probe.body)
249+
let result = try_validation_request(
250+
client,
251+
route,
252+
probe.path,
253+
probe.protocol,
254+
headers.clone(),
255+
probe.body,
256+
)
257+
.await;
258+
259+
// If the primary probe failed with a request-shape error (HTTP 400) and
260+
// there is a fallback body, retry with the alternate token parameter.
261+
// This handles the split between `max_completion_tokens` (GPT-5+) and
262+
// `max_tokens` (legacy/self-hosted backends).
263+
if let (Err(err), Some(fallback_body)) = (&result, probe.fallback_body) {
264+
if err.kind == ValidationFailureKind::RequestShape {
265+
return try_validation_request(
266+
client,
267+
route,
268+
probe.path,
269+
probe.protocol,
270+
headers,
271+
fallback_body,
272+
)
273+
.await;
274+
}
275+
}
276+
277+
result
278+
}
279+
280+
/// Send a single validation request and classify the response.
281+
async fn try_validation_request(
282+
client: &reqwest::Client,
283+
route: &ResolvedRoute,
284+
path: &str,
285+
protocol: &str,
286+
headers: Vec<(String, String)>,
287+
body: bytes::Bytes,
288+
) -> Result<ValidatedEndpoint, ValidationFailure> {
289+
let response = send_backend_request(client, route, "POST", path, headers, body)
237290
.await
238291
.map_err(|err| match err {
239292
RouterError::UpstreamUnavailable(details) => ValidationFailure {
@@ -253,12 +306,12 @@ pub async fn verify_backend_endpoint(
253306
details,
254307
},
255308
})?;
256-
let url = build_backend_url(&route.endpoint, probe.path);
309+
let url = build_backend_url(&route.endpoint, path);
257310

258311
if response.status().is_success() {
259312
return Ok(ValidatedEndpoint {
260313
url,
261-
protocol: probe.protocol.to_string(),
314+
protocol: protocol.to_string(),
262315
});
263316
}
264317

@@ -376,7 +429,7 @@ fn build_backend_url(endpoint: &str, path: &str) -> String {
376429

377430
#[cfg(test)]
378431
mod tests {
379-
use super::{build_backend_url, verify_backend_endpoint};
432+
use super::{ValidationFailureKind, build_backend_url, verify_backend_endpoint};
380433
use crate::config::ResolvedRoute;
381434
use openshell_core::inference::AuthHeader;
382435
use wiremock::matchers::{body_partial_json, header, method, path};
@@ -463,4 +516,102 @@ mod tests {
463516
assert_eq!(validated.protocol, "openai_chat_completions");
464517
assert_eq!(validated.url, "mock://test-backend/v1/chat/completions");
465518
}
519+
520+
/// GPT-5+ models reject `max_tokens` — the primary probe uses
521+
/// `max_completion_tokens` so validation should succeed directly.
522+
#[tokio::test]
523+
async fn verify_openai_chat_uses_max_completion_tokens() {
524+
let mock_server = MockServer::start().await;
525+
let route = test_route(
526+
&mock_server.uri(),
527+
&["openai_chat_completions"],
528+
AuthHeader::Bearer,
529+
);
530+
531+
Mock::given(method("POST"))
532+
.and(path("/v1/chat/completions"))
533+
.and(body_partial_json(serde_json::json!({
534+
"max_completion_tokens": 32,
535+
})))
536+
.respond_with(
537+
ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "chatcmpl-1"})),
538+
)
539+
.mount(&mock_server)
540+
.await;
541+
542+
let client = reqwest::Client::builder().build().unwrap();
543+
let validated = verify_backend_endpoint(&client, &route).await.unwrap();
544+
545+
assert_eq!(validated.protocol, "openai_chat_completions");
546+
}
547+
548+
/// Legacy/self-hosted backends that reject `max_completion_tokens`
549+
/// should succeed on the fallback probe using `max_tokens`.
550+
#[tokio::test]
551+
async fn verify_openai_chat_falls_back_to_max_tokens() {
552+
let mock_server = MockServer::start().await;
553+
let route = test_route(
554+
&mock_server.uri(),
555+
&["openai_chat_completions"],
556+
AuthHeader::Bearer,
557+
);
558+
559+
// Reject the primary probe (max_completion_tokens) with 400.
560+
Mock::given(method("POST"))
561+
.and(path("/v1/chat/completions"))
562+
.and(body_partial_json(serde_json::json!({
563+
"max_completion_tokens": 32,
564+
})))
565+
.respond_with(ResponseTemplate::new(400).set_body_string(
566+
r#"{"error":{"message":"Unsupported parameter: 'max_completion_tokens'"}}"#,
567+
))
568+
.expect(1)
569+
.mount(&mock_server)
570+
.await;
571+
572+
// Accept the fallback probe (max_tokens).
573+
Mock::given(method("POST"))
574+
.and(path("/v1/chat/completions"))
575+
.and(body_partial_json(serde_json::json!({
576+
"max_tokens": 32,
577+
})))
578+
.respond_with(
579+
ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "chatcmpl-2"})),
580+
)
581+
.expect(1)
582+
.mount(&mock_server)
583+
.await;
584+
585+
let client = reqwest::Client::builder().build().unwrap();
586+
let validated = verify_backend_endpoint(&client, &route).await.unwrap();
587+
588+
assert_eq!(validated.protocol, "openai_chat_completions");
589+
}
590+
591+
/// Non-chat-completions probes (e.g. anthropic_messages) should not
592+
/// have a fallback — a 400 remains a hard failure.
593+
#[tokio::test]
594+
async fn verify_non_chat_completions_no_fallback() {
595+
let mock_server = MockServer::start().await;
596+
let route = test_route(
597+
&mock_server.uri(),
598+
&["anthropic_messages"],
599+
AuthHeader::Custom("x-api-key"),
600+
);
601+
602+
Mock::given(method("POST"))
603+
.and(path("/v1/messages"))
604+
.respond_with(ResponseTemplate::new(400).set_body_string("bad request"))
605+
.mount(&mock_server)
606+
.await;
607+
608+
let client = reqwest::Client::builder().build().unwrap();
609+
let result = verify_backend_endpoint(&client, &route).await;
610+
611+
assert!(result.is_err());
612+
assert_eq!(
613+
result.unwrap_err().kind,
614+
ValidationFailureKind::RequestShape
615+
);
616+
}
466617
}

crates/openshell-server/src/inference.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ mod tests {
854854
.and(header("content-type", "application/json"))
855855
.and(body_partial_json(serde_json::json!({
856856
"model": "gpt-4o-mini",
857-
"max_tokens": 32,
857+
"max_completion_tokens": 32,
858858
})))
859859
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
860860
"id": "chatcmpl-123",

0 commit comments

Comments (0)