From da6f241ff20120966bf6747a379fa13dc2c70407 Mon Sep 17 00:00:00 2001
From: Gabriele Ancillai <gabriele.ancillai@sofka.com.co>
Date: Wed, 27 May 2026 12:40:59 -0500
Subject: [PATCH] lmstudio: Fix context wheel by including token usage in
 streaming responses

Add stream_options with include_usage: true to the ChatCompletionRequest so
LM Studio returns token usage in streaming responses. Previously, without
this field, the API never included usage data, so the context wheel had nothing
to display.

Also move usage handling in the event mapper to run before the empty-choices
guard. OpenAI-compatible servers send the final usage summary as a chunk with
an empty choices array, so the old guard was discarding usage data instead of
emitting a UsageUpdate event.

Fixes #53790
---
 .../language_models/src/provider/lmstudio.rs  | 32 +++++++++++--------
 crates/lmstudio/src/lmstudio.rs               |  7 ++++
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/crates/language_models/src/provider/lmstudio.rs b/crates/language_models/src/provider/lmstudio.rs
index ea19c265e9c..be633e46ec1 100644
--- a/crates/language_models/src/provider/lmstudio.rs
+++ b/crates/language_models/src/provider/lmstudio.rs
@@ -1,4 +1,4 @@
-use anyhow::{Result, anyhow};
+use anyhow::Result;
 use collections::HashMap;
 use credentials_provider::CredentialsProvider;
 use fs::Fs;
@@ -413,6 +413,9 @@ impl LmStudioLanguageModel {
             model: self.model.name.clone(),
             messages,
             stream: true,
+            stream_options: Some(lmstudio::StreamOptions {
+                include_usage: true,
+            }),
             max_tokens: Some(-1),
             stop: Some(request.stop),
             // In LM Studio you can configure specific settings you'd like to use for your model.
@@ -558,13 +561,23 @@ impl LmStudioEventMapper {
         &mut self,
         event: lmstudio::ResponseStreamEvent,
     ) -> Vec<Result<LanguageModelCompletionEvent, LanguageModelCompletionError>> {
+        let mut events = Vec::new();
+
+        if let Some(usage) = event.usage {
+            events.push(Ok(LanguageModelCompletionEvent::UsageUpdate(TokenUsage {
+                input_tokens: usage.prompt_tokens,
+                output_tokens: usage.completion_tokens,
+                cache_creation_input_tokens: 0,
+                cache_read_input_tokens: 0,
+            })));
+        }
+
+        // The final usage summary chunk from OpenAI-compatible servers has an empty choices array.
+        // Return accumulated events instead of treating it as an error.
         let Some(choice) = event.choices.into_iter().next() else {
-            return vec![Err(LanguageModelCompletionError::from(anyhow!(
-                "Response contained no choices"
-            )))];
+            return events;
         };
 
-        let mut events = Vec::new();
         if let Some(content) = choice.delta.content {
             events.push(Ok(LanguageModelCompletionEvent::Text(content)));
         }
@@ -603,15 +616,6 @@ impl LmStudioEventMapper {
             }
         }
 
-        if let Some(usage) = event.usage {
-            events.push(Ok(LanguageModelCompletionEvent::UsageUpdate(TokenUsage {
-                input_tokens: usage.prompt_tokens,
-                output_tokens: usage.completion_tokens,
-                cache_creation_input_tokens: 0,
-                cache_read_input_tokens: 0,
-            })));
-        }
-
         match choice.finish_reason.as_deref() {
             Some("stop") => {
                 events.push(Ok(LanguageModelCompletionEvent::Stop(StopReason::EndTurn)));
diff --git a/crates/lmstudio/src/lmstudio.rs b/crates/lmstudio/src/lmstudio.rs
index 8a44b7fdefe..0a0ce8271fd 100644
--- a/crates/lmstudio/src/lmstudio.rs
+++ b/crates/lmstudio/src/lmstudio.rs
@@ -205,12 +205,19 @@ pub struct FunctionContent {
     pub arguments: String,
 }
 
+#[derive(Serialize, Debug)]
+pub struct StreamOptions {
+    pub include_usage: bool,
+}
+
 #[derive(Serialize, Debug)]
 pub struct ChatCompletionRequest {
     pub model: String,
     pub messages: Vec<ChatMessage>,
     pub stream: bool,
     #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream_options: Option<StreamOptions>,
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub max_tokens: Option<i32>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub stop: Option<Vec<String>>,