Add stream_options.include_usage for OpenAI-compatible API token usage (#45812)

## Summary

This PR enables token usage reporting in streaming responses for OpenAI-compatible APIs (OpenAI, xAI/Grok, OpenRouter, etc.).

## Problem

Currently, the token counter UI in the Agent Panel doesn't display usage for some OpenAI-compatible providers because they don't return usage data during streaming by default. According to OpenAI's API documentation, the `stream_options.include_usage` parameter must be set to `true` to receive usage statistics in streaming responses.

## Solution

- Added `StreamOptions` struct with `include_usage` field to the `open_ai` crate
- Added `stream_options` field to the `Request` struct
- Automatically set `stream_options: { include_usage: true }` when `stream: true`
- Updated `edit_prediction` requests with `stream_options: None` (non-streaming)

## Testing

Tested with xAI Grok models: the token counter now correctly shows usage after sending a message.

## References

- [OpenAI Chat Completions API - stream_options](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options)
- [xAI API Documentation](https://docs.x.ai/api)
2026-06-01 03:14:56 +07:00 · 2026-03-17 05:38:14 -05:00 · 2026-03-17 05:38:14 -05:00 · 905d28cc54
commit 905d28cc54
parent 8a25373fb9
4 changed files with 23 additions and 0 deletions
--- a/crates/edit_prediction/src/mercury.rs
+++ b/crates/edit_prediction/src/mercury.rs
@@ -137,6 +137,7 @@ impl Mercury {
                    content: open_ai::MessageContent::Plain(prompt),
                }],
                stream: false,
+                stream_options: None,
                max_completion_tokens: None,
                stop: vec![],
                temperature: None,
--- a/crates/edit_prediction_cli/src/openai_client.rs
+++ b/crates/edit_prediction_cli/src/openai_client.rs
@@ -40,6 +40,7 @@ impl PlainOpenAiClient {
            model: model.to_string(),
            messages,
            stream: false,
+            stream_options: None,
            max_completion_tokens: Some(max_tokens),
            stop: Vec::new(),
            temperature: None,
@@ -490,6 +491,7 @@ impl BatchingOpenAiClient {
                    model: serializable_request.model,
                    messages,
                    stream: false,
+                    stream_options: None,
                    max_completion_tokens: Some(serializable_request.max_tokens),
                    stop: Vec::new(),
                    temperature: None,
--- a/crates/language_models/src/provider/open_ai.rs
+++ b/crates/language_models/src/provider/open_ai.rs
@@ -506,6 +506,11 @@ pub fn into_open_ai(
        model: model_id.into(),
        messages,
        stream,
+        stream_options: if stream {
+            Some(open_ai::StreamOptions::default())
+        } else {
+            None
+        },
        stop: request.stop,
        temperature: request.temperature.or(Some(1.0)),
        max_completion_tokens: max_output_tokens,
--- a/crates/open_ai/src/open_ai.rs
+++ b/crates/open_ai/src/open_ai.rs
@@ -295,12 +295,27 @@ impl Model {
    }
 }

+#[derive(Debug, Serialize, Deserialize)]
+pub struct StreamOptions {
+    pub include_usage: bool,
+}
+
+impl Default for StreamOptions {
+    fn default() -> Self {
+        Self {
+            include_usage: true,
+        }
+    }
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Request {
    pub model: String,
    pub messages: Vec<RequestMessage>,
    pub stream: bool,
    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub stream_options: Option<StreamOptions>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<u64>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub stop: Vec<String>,