diff --git a/crates/edit_prediction/src/mercury.rs b/crates/edit_prediction/src/mercury.rs index cd7cb6aff34..ddbe8993130 100644 --- a/crates/edit_prediction/src/mercury.rs +++ b/crates/edit_prediction/src/mercury.rs @@ -147,6 +147,7 @@ impl Mercury { tools: vec![], prompt_cache_key: None, reasoning_effort: None, + service_tier: None, }; let buf = serde_json::to_vec(&request_body)?; diff --git a/crates/edit_prediction_cli/src/openai_client.rs b/crates/edit_prediction_cli/src/openai_client.rs index 205b339226f..3352fcf508f 100644 --- a/crates/edit_prediction_cli/src/openai_client.rs +++ b/crates/edit_prediction_cli/src/openai_client.rs @@ -46,6 +46,7 @@ impl PlainOpenAiClient { temperature: None, tool_choice: None, parallel_tool_calls: None, + service_tier: None, tools: Vec::new(), prompt_cache_key: None, reasoning_effort: None, @@ -506,6 +507,7 @@ impl BatchingOpenAiClient { temperature: None, tool_choice: None, parallel_tool_calls: None, + service_tier: None, tools: Vec::new(), prompt_cache_key: None, reasoning_effort: None, diff --git a/crates/language_models/src/provider/open_ai.rs b/crates/language_models/src/provider/open_ai.rs index ddf8b0e6885..3d4826bbd27 100644 --- a/crates/language_models/src/provider/open_ai.rs +++ b/crates/language_models/src/provider/open_ai.rs @@ -454,6 +454,10 @@ impl LanguageModel for OpenAiLanguageModel { supports_selectable_thinking_effort(&self.model) } + fn supports_fast_mode(&self) -> bool { + self.model.supports_priority() + } + fn supported_effort_levels(&self) -> Vec { supported_thinking_effort_levels(&self.model) } @@ -476,7 +480,7 @@ impl LanguageModel for OpenAiLanguageModel { fn stream_completion( &self, - request: LanguageModelRequest, + mut request: LanguageModelRequest, cx: &AsyncApp, ) -> BoxFuture< 'static, @@ -488,6 +492,9 @@ impl LanguageModel for OpenAiLanguageModel { LanguageModelCompletionError, >, > { + if !self.model.supports_priority() { + request.speed = None; + } if self.model.uses_responses_api() { let request = into_open_ai_response( request, diff --git a/crates/open_ai/src/completion.rs b/crates/open_ai/src/completion.rs index 4f1a671aa80..af140237564 100644 --- a/crates/open_ai/src/completion.rs +++ b/crates/open_ai/src/completion.rs @@ -21,12 +21,22 @@ use crate::responses::{ }; use crate::{ FunctionContent, FunctionDefinition, ImageUrl, MessagePart, ReasoningEffort, - ResponseStreamEvent, ToolCall, ToolCallContent, + ResponseStreamEvent, ServiceTier, ToolCall, ToolCallContent, }; const RESPONSE_MESSAGE_PHASE_COMMENTARY: &str = "commentary"; const RESPONSE_MESSAGE_PHASE_FINAL_ANSWER: &str = "final_answer"; +/// Translates the request's `Speed` into the corresponding OpenAI service tier. +/// Only `Fast` produces a value; `Standard` leaves the field unset so that the +/// project's default tier applies. +fn service_tier_for(speed: Option) -> Option { + match speed? { + language_model_core::Speed::Fast => Some(ServiceTier::Priority), + language_model_core::Speed::Standard => None, + } +} + pub fn into_open_ai( request: LanguageModelRequest, model_id: &str, @@ -37,6 +47,7 @@ pub fn into_open_ai( interleaved_reasoning: bool, ) -> crate::Request { let stream = !model_id.starts_with("o1-"); + let service_tier = service_tier_for(request.speed); let mut messages = Vec::new(); let mut current_reasoning: Option = None; @@ -173,6 +184,7 @@ pub fn into_open_ai( LanguageModelToolChoice::None => crate::ToolChoice::None, }), reasoning_effort, + service_tier, } } @@ -198,9 +210,11 @@ pub fn into_open_ai_response( temperature, thinking_allowed, thinking_effort, - speed: _, + speed, } = request; + let service_tier = service_tier_for(speed); + let mut input_items = Vec::new(); let mut replayed_reasoning_item_indexes = HashMap::default(); for (index, message) in messages.into_iter().enumerate() { @@ -284,6 +298,7 @@ pub fn into_open_ai_response( None }, reasoning, + service_tier, } } @@ -1170,7 +1185,7 @@ mod tests { use language_model_core::{ LanguageModelImage, LanguageModelRequestMessage, LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent, LanguageModelToolUse, - LanguageModelToolUseId, SharedString, + LanguageModelToolUseId, SharedString, Speed, }; use pretty_assertions::assert_eq; use serde_json::json; @@ -1666,6 +1681,90 @@ mod tests { assert_eq!(serialized.get("reasoning"), None); } + /// `Speed::Fast` should translate to `service_tier: "priority"` on the + /// outgoing Responses request, while `Standard` / `None` should leave the + /// field unset so the project's default tier wins. + #[test] + fn into_open_ai_response_sets_service_tier_for_fast_speed() -> Result<()> { + for (speed, expected) in [ + (None, None), + (Some(Speed::Standard), None), + (Some(Speed::Fast), Some("priority")), + ] { + let request = LanguageModelRequest { + thread_id: None, + prompt_id: None, + intent: None, + messages: vec![LanguageModelRequestMessage { + role: Role::User, + content: vec![MessageContent::Text("Hello".into())], + cache: false, + reasoning_details: None, + }], + tools: Vec::new(), + tool_choice: None, + stop: Vec::new(), + temperature: None, + thinking_allowed: false, + thinking_effort: None, + speed, + }; + + let response = into_open_ai_response(request, "gpt-5.4", true, true, None, None, true); + + let serialized = serde_json::to_value(&response)?; + assert_eq!( + serialized + .get("service_tier") + .and_then(|value| value.as_str()), + expected, + "speed = {speed:?} should produce service_tier = {expected:?}", + ); + } + Ok(()) + } + + /// Same as above but for the Chat Completions code path. + #[test] + fn into_open_ai_sets_service_tier_for_fast_speed() -> Result<()> { + for (speed, expected) in [ + (None, None), + (Some(Speed::Standard), None), + (Some(Speed::Fast), Some("priority")), + ] { + let request = LanguageModelRequest { + thread_id: None, + prompt_id: None, + intent: None, + messages: vec![LanguageModelRequestMessage { + role: Role::User, + content: vec![MessageContent::Text("Hello".into())], + cache: false, + reasoning_details: None, + }], + tools: Vec::new(), + tool_choice: None, + stop: Vec::new(), + temperature: None, + thinking_allowed: false, + thinking_effort: None, + speed, + }; + + let chat = into_open_ai(request, "gpt-5.4", true, true, None, None, false); + + let serialized = serde_json::to_value(&chat)?; + assert_eq!( + serialized + .get("service_tier") + .and_then(|value| value.as_str()), + expected, + "speed = {speed:?} should produce service_tier = {expected:?}", + ); + } + Ok(()) + } + #[test] fn into_open_ai_response_sends_none_reasoning_when_thinking_is_disabled() -> Result<()> { let request = LanguageModelRequest { diff --git a/crates/open_ai/src/open_ai.rs b/crates/open_ai/src/open_ai.rs index 1b4b4958f21..0ff1308d52a 100644 --- a/crates/open_ai/src/open_ai.rs +++ b/crates/open_ai/src/open_ai.rs @@ -342,6 +342,30 @@ impl Model { pub fn supports_prompt_cache_key(&self) -> bool { true } + + /// Whether OpenAI's Priority processing tier is available for this model. + /// Sourced from . The `*-pro`, + /// `*-nano`, and legacy `gpt-4` variants are not eligible. + pub fn supports_priority(&self) -> bool { + match self { + Self::FourOmniMini + | Self::O3 + | Self::Five + | Self::FiveMini + | Self::FivePointOne + | Self::FivePointTwo + | Self::FivePointThreeCodex + | Self::FivePointFourMini + | Self::FivePointFour + | Self::FivePointFive => true, + Self::Four + | Self::FiveNano + | Self::FivePointFourNano + | Self::FivePointFourPro + | Self::FivePointFivePro + | Self::Custom { .. } => false, + } + } } #[cfg(test)] @@ -456,6 +480,23 @@ pub struct Request { pub prompt_cache_key: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub reasoning_effort: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_tier: Option, +} + +/// Service tier for OpenAI requests. Maps to the top-level `service_tier` +/// field on Responses and Chat Completions. We only ever send `Priority` +/// today (in response to Fast Mode being enabled); the other variants are +/// included for symmetry with the API and so deserialization of echoed +/// values does not fail. +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ServiceTier { + Auto, + Default, + Flex, + Scale, + Priority, } #[derive(Debug, Serialize, Deserialize)] diff --git a/crates/open_ai/src/responses.rs b/crates/open_ai/src/responses.rs index 954f9b5ad56..7465bc1ca46 100644 --- a/crates/open_ai/src/responses.rs +++ b/crates/open_ai/src/responses.rs @@ -4,7 +4,7 @@ use http_client::{AsyncBody, HttpClient, Method, Request as HttpRequest}; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::{ReasoningEffort, RequestError, Role, ToolChoice}; +use crate::{ReasoningEffort, RequestError, Role, ServiceTier, ToolChoice}; #[derive(Serialize, Debug)] pub struct Request { @@ -35,6 +35,8 @@ pub struct Request { pub reasoning: Option, #[serde(skip_serializing_if = "Option::is_none")] pub store: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option, } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]