lmstudio: Fix context wheel by including token usage in streaming responses

Add `stream_options` with `include_usage: true` to the `ChatCompletionRequest` so that LM Studio returns token usage in streaming responses. Previously, without this field, the API never included usage data, so the context wheel had nothing to display. Also move usage handling in the event mapper to run before the empty-choices guard: OpenAI-compatible servers send the final usage summary as a chunk with an empty `choices` array, so the old guard returned a "Response contained no choices" error for that chunk instead of emitting a `UsageUpdate` event. Fixes #53790.
2026-06-01 03:14:56 +07:00 · 2026-05-27 12:40:59 -05:00 · 2026-05-27 12:40:59 -05:00 · da6f241ff2
commit da6f241ff2
parent e25458243b
2 changed files with 25 additions and 14 deletions
--- a/crates/language_models/src/provider/lmstudio.rs
+++ b/crates/language_models/src/provider/lmstudio.rs
@@ -1,4 +1,4 @@
-use anyhow::{Result, anyhow};
+use anyhow::Result;
 use collections::HashMap;
 use credentials_provider::CredentialsProvider;
 use fs::Fs;
@@ -413,6 +413,9 @@ impl LmStudioLanguageModel {
            model: self.model.name.clone(),
            messages,
            stream: true,
+            stream_options: Some(lmstudio::StreamOptions {
+                include_usage: true,
+            }),
            max_tokens: Some(-1),
            stop: Some(request.stop),
            // In LM Studio you can configure specific settings you'd like to use for your model.
@@ -558,13 +561,23 @@ impl LmStudioEventMapper {
        &mut self,
        event: lmstudio::ResponseStreamEvent,
    ) -> Vec<Result<LanguageModelCompletionEvent, LanguageModelCompletionError>> {
+        let mut events = Vec::new();
+
+        if let Some(usage) = event.usage {
+            events.push(Ok(LanguageModelCompletionEvent::UsageUpdate(TokenUsage {
+                input_tokens: usage.prompt_tokens,
+                output_tokens: usage.completion_tokens,
+                cache_creation_input_tokens: 0,
+                cache_read_input_tokens: 0,
+            })));
+        }
+
+        // The final usage summary chunk from OpenAI-compatible servers has an empty choices array.
+        // Return accumulated events instead of treating it as an error.
        let Some(choice) = event.choices.into_iter().next() else {
-            return vec![Err(LanguageModelCompletionError::from(anyhow!(
-                "Response contained no choices"
-            )))];
+            return events;
        };

-        let mut events = Vec::new();
        if let Some(content) = choice.delta.content {
            events.push(Ok(LanguageModelCompletionEvent::Text(content)));
        }
@@ -603,15 +616,6 @@ impl LmStudioEventMapper {
            }
        }

-        if let Some(usage) = event.usage {
-            events.push(Ok(LanguageModelCompletionEvent::UsageUpdate(TokenUsage {
-                input_tokens: usage.prompt_tokens,
-                output_tokens: usage.completion_tokens,
-                cache_creation_input_tokens: 0,
-                cache_read_input_tokens: 0,
-            })));
-        }
-
        match choice.finish_reason.as_deref() {
            Some("stop") => {
                events.push(Ok(LanguageModelCompletionEvent::Stop(StopReason::EndTurn)));
--- a/crates/lmstudio/src/lmstudio.rs
+++ b/crates/lmstudio/src/lmstudio.rs
@@ -205,12 +205,19 @@ pub struct FunctionContent {
    pub arguments: String,
 }

+#[derive(Serialize, Debug)]
+pub struct StreamOptions {
+    pub include_usage: bool,
+}
+
 #[derive(Serialize, Debug)]
 pub struct ChatCompletionRequest {
    pub model: String,
    pub messages: Vec<ChatMessage>,
    pub stream: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream_options: Option<StreamOptions>,
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_tokens: Option<i32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop: Option<Vec<String>>,