From 5e717a06cdf5af0a4df74f56339e7d58365667f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Houl=C3=A9?= <13155277+tomhoule@users.noreply.github.com> Date: Wed, 27 May 2026 11:17:23 +0200 Subject: [PATCH] open_ai: Support fast mode in BYOK via the Responses API service_tier (#57412) Maps the existing `Speed::Fast` plumbing to OpenAI's `service_tier: "priority"`, which matches what "fast mode" in Codex does. Relevant docs [here](https://platform.openai.com/docs/api-reference/chat/create#chat-create-service_tier). As with the existing Anthropic fast mode, a `Model::supports_priority` method gates this to the model variants listed at https://openai.com/api-priority-processing. Pro, nano, and legacy gpt-4 are excluded; Custom defaults to false. Fast mode is staff-only for now (the gate is part of the existing fast mode feature, not this diff) until we have a mechanism to require confirmation before you enable it. Release Notes: - Added support for Fast Mode (priority service tier) on the OpenAI API provider. 
--- crates/edit_prediction/src/mercury.rs | 1 + .../edit_prediction_cli/src/openai_client.rs | 2 + .../language_models/src/provider/open_ai.rs | 9 +- crates/open_ai/src/completion.rs | 105 +++++++++++++++++- crates/open_ai/src/open_ai.rs | 41 +++++++ crates/open_ai/src/responses.rs | 4 +- 6 files changed, 157 insertions(+), 5 deletions(-) diff --git a/crates/edit_prediction/src/mercury.rs b/crates/edit_prediction/src/mercury.rs index cd7cb6aff34..ddbe8993130 100644 --- a/crates/edit_prediction/src/mercury.rs +++ b/crates/edit_prediction/src/mercury.rs @@ -147,6 +147,7 @@ impl Mercury { tools: vec![], prompt_cache_key: None, reasoning_effort: None, + service_tier: None, }; let buf = serde_json::to_vec(&request_body)?; diff --git a/crates/edit_prediction_cli/src/openai_client.rs b/crates/edit_prediction_cli/src/openai_client.rs index 205b339226f..3352fcf508f 100644 --- a/crates/edit_prediction_cli/src/openai_client.rs +++ b/crates/edit_prediction_cli/src/openai_client.rs @@ -46,6 +46,7 @@ impl PlainOpenAiClient { temperature: None, tool_choice: None, parallel_tool_calls: None, + service_tier: None, tools: Vec::new(), prompt_cache_key: None, reasoning_effort: None, @@ -506,6 +507,7 @@ impl BatchingOpenAiClient { temperature: None, tool_choice: None, parallel_tool_calls: None, + service_tier: None, tools: Vec::new(), prompt_cache_key: None, reasoning_effort: None, diff --git a/crates/language_models/src/provider/open_ai.rs b/crates/language_models/src/provider/open_ai.rs index ddf8b0e6885..3d4826bbd27 100644 --- a/crates/language_models/src/provider/open_ai.rs +++ b/crates/language_models/src/provider/open_ai.rs @@ -454,6 +454,10 @@ impl LanguageModel for OpenAiLanguageModel { supports_selectable_thinking_effort(&self.model) } + fn supports_fast_mode(&self) -> bool { + self.model.supports_priority() + } + fn supported_effort_levels(&self) -> Vec { supported_thinking_effort_levels(&self.model) } @@ -476,7 +480,7 @@ impl LanguageModel for OpenAiLanguageModel { fn 
stream_completion( &self, - request: LanguageModelRequest, + mut request: LanguageModelRequest, cx: &AsyncApp, ) -> BoxFuture< 'static, @@ -488,6 +492,9 @@ impl LanguageModel for OpenAiLanguageModel { LanguageModelCompletionError, >, > { + if !self.model.supports_priority() { + request.speed = None; + } if self.model.uses_responses_api() { let request = into_open_ai_response( request, diff --git a/crates/open_ai/src/completion.rs b/crates/open_ai/src/completion.rs index 4f1a671aa80..af140237564 100644 --- a/crates/open_ai/src/completion.rs +++ b/crates/open_ai/src/completion.rs @@ -21,12 +21,22 @@ use crate::responses::{ }; use crate::{ FunctionContent, FunctionDefinition, ImageUrl, MessagePart, ReasoningEffort, - ResponseStreamEvent, ToolCall, ToolCallContent, + ResponseStreamEvent, ServiceTier, ToolCall, ToolCallContent, }; const RESPONSE_MESSAGE_PHASE_COMMENTARY: &str = "commentary"; const RESPONSE_MESSAGE_PHASE_FINAL_ANSWER: &str = "final_answer"; +/// Translates the request's `Speed` into the corresponding OpenAI service tier. +/// Only `Fast` produces a value; `Standard` leaves the field unset so that the +/// project's default tier applies. +fn service_tier_for(speed: Option) -> Option { + match speed? 
{ + language_model_core::Speed::Fast => Some(ServiceTier::Priority), + language_model_core::Speed::Standard => None, + } +} + pub fn into_open_ai( request: LanguageModelRequest, model_id: &str, @@ -37,6 +47,7 @@ pub fn into_open_ai( interleaved_reasoning: bool, ) -> crate::Request { let stream = !model_id.starts_with("o1-"); + let service_tier = service_tier_for(request.speed); let mut messages = Vec::new(); let mut current_reasoning: Option = None; @@ -173,6 +184,7 @@ pub fn into_open_ai( LanguageModelToolChoice::None => crate::ToolChoice::None, }), reasoning_effort, + service_tier, } } @@ -198,9 +210,11 @@ pub fn into_open_ai_response( temperature, thinking_allowed, thinking_effort, - speed: _, + speed, } = request; + let service_tier = service_tier_for(speed); + let mut input_items = Vec::new(); let mut replayed_reasoning_item_indexes = HashMap::default(); for (index, message) in messages.into_iter().enumerate() { @@ -284,6 +298,7 @@ pub fn into_open_ai_response( None }, reasoning, + service_tier, } } @@ -1170,7 +1185,7 @@ mod tests { use language_model_core::{ LanguageModelImage, LanguageModelRequestMessage, LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent, LanguageModelToolUse, - LanguageModelToolUseId, SharedString, + LanguageModelToolUseId, SharedString, Speed, }; use pretty_assertions::assert_eq; use serde_json::json; @@ -1666,6 +1681,90 @@ mod tests { assert_eq!(serialized.get("reasoning"), None); } + /// `Speed::Fast` should translate to `service_tier: "priority"` on the + /// outgoing Responses request, while `Standard` / `None` should leave the + /// field unset so the project's default tier wins. 
+ #[test] + fn into_open_ai_response_sets_service_tier_for_fast_speed() -> Result<()> { + for (speed, expected) in [ + (None, None), + (Some(Speed::Standard), None), + (Some(Speed::Fast), Some("priority")), + ] { + let request = LanguageModelRequest { + thread_id: None, + prompt_id: None, + intent: None, + messages: vec![LanguageModelRequestMessage { + role: Role::User, + content: vec![MessageContent::Text("Hello".into())], + cache: false, + reasoning_details: None, + }], + tools: Vec::new(), + tool_choice: None, + stop: Vec::new(), + temperature: None, + thinking_allowed: false, + thinking_effort: None, + speed, + }; + + let response = into_open_ai_response(request, "gpt-5.4", true, true, None, None, true); + + let serialized = serde_json::to_value(&response)?; + assert_eq!( + serialized + .get("service_tier") + .and_then(|value| value.as_str()), + expected, + "speed = {speed:?} should produce service_tier = {expected:?}", + ); + } + Ok(()) + } + + /// Same as above but for the Chat Completions code path. 
+ #[test] + fn into_open_ai_sets_service_tier_for_fast_speed() -> Result<()> { + for (speed, expected) in [ + (None, None), + (Some(Speed::Standard), None), + (Some(Speed::Fast), Some("priority")), + ] { + let request = LanguageModelRequest { + thread_id: None, + prompt_id: None, + intent: None, + messages: vec![LanguageModelRequestMessage { + role: Role::User, + content: vec![MessageContent::Text("Hello".into())], + cache: false, + reasoning_details: None, + }], + tools: Vec::new(), + tool_choice: None, + stop: Vec::new(), + temperature: None, + thinking_allowed: false, + thinking_effort: None, + speed, + }; + + let chat = into_open_ai(request, "gpt-5.4", true, true, None, None, false); + + let serialized = serde_json::to_value(&chat)?; + assert_eq!( + serialized + .get("service_tier") + .and_then(|value| value.as_str()), + expected, + "speed = {speed:?} should produce service_tier = {expected:?}", + ); + } + Ok(()) + } + #[test] fn into_open_ai_response_sends_none_reasoning_when_thinking_is_disabled() -> Result<()> { let request = LanguageModelRequest { diff --git a/crates/open_ai/src/open_ai.rs b/crates/open_ai/src/open_ai.rs index 1b4b4958f21..0ff1308d52a 100644 --- a/crates/open_ai/src/open_ai.rs +++ b/crates/open_ai/src/open_ai.rs @@ -342,6 +342,30 @@ impl Model { pub fn supports_prompt_cache_key(&self) -> bool { true } + + /// Whether OpenAI's Priority processing tier is available for this model. + /// Sourced from <https://openai.com/api-priority-processing>. The `*-pro`, + /// `*-nano`, and legacy `gpt-4` variants are not eligible. + pub fn supports_priority(&self) -> bool { + match self { + Self::FourOmniMini + | Self::O3 + | Self::Five + | Self::FiveMini + | Self::FivePointOne + | Self::FivePointTwo + | Self::FivePointThreeCodex + | Self::FivePointFourMini + | Self::FivePointFour + | Self::FivePointFive => true, + Self::Four + | Self::FiveNano + | Self::FivePointFourNano + | Self::FivePointFourPro + | Self::FivePointFivePro + | Self::Custom { .. 
} => false, + } + } } #[cfg(test)] @@ -456,6 +480,23 @@ pub struct Request { pub prompt_cache_key: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub reasoning_effort: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_tier: Option, +} + +/// Service tier for OpenAI requests. Maps to the top-level `service_tier` +/// field on Responses and Chat Completions. We only ever send `Priority` +/// today (in response to Fast Mode being enabled); the other variants are +/// included for symmetry with the API and so deserialization of echoed +/// values does not fail. +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ServiceTier { + Auto, + Default, + Flex, + Scale, + Priority, } #[derive(Debug, Serialize, Deserialize)] diff --git a/crates/open_ai/src/responses.rs b/crates/open_ai/src/responses.rs index 954f9b5ad56..7465bc1ca46 100644 --- a/crates/open_ai/src/responses.rs +++ b/crates/open_ai/src/responses.rs @@ -4,7 +4,7 @@ use http_client::{AsyncBody, HttpClient, Method, Request as HttpRequest}; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::{ReasoningEffort, RequestError, Role, ToolChoice}; +use crate::{ReasoningEffort, RequestError, Role, ServiceTier, ToolChoice}; #[derive(Serialize, Debug)] pub struct Request { @@ -35,6 +35,8 @@ pub struct Request { pub reasoning: Option, #[serde(skip_serializing_if = "Option::is_none")] pub store: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option, } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]