From da43bdb648202474bc7402590f0d6042a62c6ba2 Mon Sep 17 00:00:00 2001 From: Bennet Bo Fenner Date: Tue, 19 May 2026 14:51:42 +0200 Subject: [PATCH] agent: Support image output from MCP tools (#57134) Release Notes: - agent: Support image output from MCP tools --- crates/agent/src/tests/mod.rs | 25 +- .../src/tools/context_server_registry.rs | 47 +++- .../src/conversation_view/thread_view.rs | 44 ++-- crates/language_model/src/request.rs | 230 +++++++++++------- crates/language_model_core/src/request.rs | 1 - 5 files changed, 225 insertions(+), 122 deletions(-) diff --git a/crates/agent/src/tests/mod.rs b/crates/agent/src/tests/mod.rs index 7713907f893..9592d8f7928 100644 --- a/crates/agent/src/tests/mod.rs +++ b/crates/agent/src/tests/mod.rs @@ -26,10 +26,10 @@ use gpui::{ use indoc::indoc; use language_model::{ CompletionIntent, LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent, - LanguageModelId, LanguageModelProviderId, LanguageModelProviderName, LanguageModelRegistry, - LanguageModelRequest, LanguageModelRequestMessage, LanguageModelToolResult, - LanguageModelToolSchemaFormat, LanguageModelToolUse, MessageContent, Role, StopReason, - TokenUsage, + LanguageModelId, LanguageModelImageExt, LanguageModelProviderId, LanguageModelProviderName, + LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage, + LanguageModelToolResult, LanguageModelToolSchemaFormat, LanguageModelToolUse, MessageContent, + Role, StopReason, TokenUsage, fake_provider::{FakeLanguageModel, FakeLanguageModelProvider}, }; use pretty_assertions::assert_eq; @@ -1656,6 +1656,7 @@ async fn test_mcp_tool_multi_content_response(cx: &mut TestAppContext) { let (tool_call_params, tool_call_response) = mcp_tool_calls.next().await.unwrap(); assert_eq!(tool_call_params.name, "screenshot"); + let image_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg=="; tool_call_response .send(context_server::types::CallToolResponse { content: vec![ @@ -1663,7 +1664,7 @@ async fn test_mcp_tool_multi_content_response(cx: &mut TestAppContext) { text: "Some text".into(), }, context_server::types::ToolResponseContent::Image { - data: "aGVsbG8=".into(), + data: image_data.into(), mime_type: "image/png".into(), }, context_server::types::ToolResponseContent::Text { @@ -1691,13 +1692,25 @@ async fn test_mcp_tool_multi_content_response(cx: &mut TestAppContext) { }) .expect("expected a tool result"); assert_eq!(tool_result.tool_use_id, "tool_1".into()); - assert_eq!(tool_result.content.len(), 2); + assert_eq!(tool_result.content.len(), 3); + assert_eq!( + tool_result.content[0], + language_model::LanguageModelToolResultContent::Text(Arc::from("Some text")) + ); + let expected_image = + language_model::LanguageModelImage::from_base64_image(image_data, "image/png") + .expect("image conversion should not error") + .expect("image conversion should succeed"); assert_eq!( tool_result.content[0], language_model::LanguageModelToolResultContent::Text(Arc::from("Some text")) ); assert_eq!( tool_result.content[1], + language_model::LanguageModelToolResultContent::Image(expected_image) + ); + assert_eq!( + tool_result.content[2], language_model::LanguageModelToolResultContent::Text(Arc::from("Some more text")) ); fake_model.end_last_completion_stream(); diff --git a/crates/agent/src/tools/context_server_registry.rs b/crates/agent/src/tools/context_server_registry.rs index 01601679c90..6c0e8d31557 100644 --- a/crates/agent/src/tools/context_server_registry.rs +++ b/crates/agent/src/tools/context_server_registry.rs @@ -5,7 +5,7 @@ use collections::{BTreeMap, HashMap}; use context_server::{ContextServerId, client::NotificationSubscription}; use futures::FutureExt as _; use gpui::{App, AppContext, AsyncApp, Context, Entity, EventEmitter, SharedString, Task}; -use language_model::LanguageModelToolResultContent; +use language_model::{LanguageModelImage, LanguageModelImageExt, LanguageModelToolResultContent}; use project::context_server_store::{ContextServerStatus, ContextServerStore}; use std::sync::Arc; use util::ResultExt; @@ -346,7 +346,7 @@ impl AnyAgentTool for ContextServerTool { let authorize = event_stream.authorize_third_party_tool(initial_title, tool_id, display_name, cx); - cx.spawn(async move |_cx| { + cx.spawn(async move |cx| { let input = input .recv() .await @@ -394,15 +394,50 @@ impl AnyAgentTool for ContextServerTool { } let mut llm_output = Vec::new(); + let mut tool_call_content = Vec::new(); let mut concatenated_text = String::new(); for content in response.content { match content { context_server::types::ToolResponseContent::Text { text } => { concatenated_text.push_str(&text); + tool_call_content.push(acp::ToolCallContent::Content(acp::Content::new( + acp::ContentBlock::Text(acp::TextContent::new(text.clone())), + ))); llm_output.push(LanguageModelToolResultContent::Text(text.into())); } - context_server::types::ToolResponseContent::Image { .. } => { - log::warn!("Ignoring image content from tool response"); + context_server::types::ToolResponseContent::Image { data, mime_type } => { + tool_call_content.push(acp::ToolCallContent::Content(acp::Content::new( + acp::ContentBlock::Image(acp::ImageContent::new( + data.clone(), + mime_type.clone(), + )), + ))); + let language_model_image = cx + .background_spawn({ + let mime_type = mime_type.clone(); + async move { + LanguageModelImage::from_base64_image(&data, &mime_type) + } + }) + .await; + match language_model_image { + Ok(Some(image)) => { + llm_output.push(LanguageModelToolResultContent::Image(image)); + } + Ok(None) => { + log::warn!( + "Skipping MCP tool response image with MIME type `{}` because it cannot be converted for language model input", + mime_type + ); + } + Err(error) => { + log::warn!( + "Failed to convert MCP tool response image with MIME type `{}` for language model input: {:#}", + mime_type, + error + ); + } + } } context_server::types::ToolResponseContent::Audio { .. } => { log::warn!("Ignoring audio content from tool response"); @@ -415,6 +450,10 @@ impl AnyAgentTool for ContextServerTool { } } } + if !tool_call_content.is_empty() { + event_stream + .update_fields(acp::ToolCallUpdateFields::new().content(tool_call_content)); + } let raw_output = serde_json::Value::String(concatenated_text); Ok(AgentToolOutput { raw_output, diff --git a/crates/agent_ui/src/conversation_view/thread_view.rs b/crates/agent_ui/src/conversation_view/thread_view.rs index 6b63abd50ea..bb8d65fa868 100644 --- a/crates/agent_ui/src/conversation_view/thread_view.rs +++ b/crates/agent_ui/src/conversation_view/thread_view.rs @@ -6585,6 +6585,32 @@ impl ThreadView { ) }), ) + .when(!use_card_layout, |this| { + let button_id = + SharedString::from(format!("tool_output-collapse-{:?}", tool_call.id)); + let tool_call_id = tool_call.id.clone(); + + this.child( + div() + .ml(rems(0.4)) + .px_3p5() + .pt_2() + .border_l_1() + .border_color(self.tool_card_border_color(cx)) + .child( + IconButton::new(button_id, IconName::ChevronUp) + .full_width() + .style(ButtonStyle::Outlined) + .icon_color(Color::Muted) + .on_click(cx.listener({ + move |this: &mut Self, _, _, cx: &mut Context| { + this.expanded_tool_calls.remove(&tool_call_id); + cx.notify(); + } + })), + ), + ) + }) .into_any(), ToolCallStatus::Rejected => Empty.into_any(), } @@ -7580,7 +7606,6 @@ impl ThreadView { } else if let Some(markdown) = content.markdown() { self.render_markdown_output( markdown.clone(), - tool_call.id.clone(), context_ix, card_layout, window, @@ -7724,14 +7749,11 @@ impl ThreadView { fn render_markdown_output( &self, markdown: Entity, - tool_call_id: acp::ToolCallId, context_ix: usize, card_layout: bool, window: &Window, cx: &Context, ) -> AnyElement { - let button_id = SharedString::from(format!("tool_output-{:?}", tool_call_id)); - v_flex() .gap_2() .map(|this| { @@ -7754,20 +7776,6 @@ impl ThreadView { MarkdownStyle::themed(MarkdownFont::Agent, window, cx), cx, )) - .when(!card_layout, |this| { - this.child( - IconButton::new(button_id, IconName::ChevronUp) - .full_width() - .style(ButtonStyle::Outlined) - .icon_color(Color::Muted) - .on_click(cx.listener({ - move |this: &mut Self, _, _, cx: &mut Context| { - this.expanded_tool_calls.remove(&tool_call_id); - cx.notify(); - } - })), - ) - }) .into_any_element() } diff --git a/crates/language_model/src/request.rs b/crates/language_model/src/request.rs index edb5645a8d1..b4dedbbd7ba 100644 --- a/crates/language_model/src/request.rs +++ b/crates/language_model/src/request.rs @@ -1,8 +1,8 @@ use std::io::{Cursor, Write}; use std::sync::Arc; -use anyhow::Result; -use base64::write::EncoderWriter; +use anyhow::{Result, anyhow}; +use base64::{Engine as _, write::EncoderWriter}; use gpui::{ App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, Size, Task, point, px, size, }; @@ -29,6 +29,7 @@ const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8; pub trait LanguageModelImageExt { const FORMAT: ImageFormat; fn from_image(data: Arc, cx: &mut App) -> Task>; + fn from_base64_image(data: &str, mime_type: &str) -> Result>; } impl LanguageModelImageExt for LanguageModelImage { @@ -36,93 +37,104 @@ impl LanguageModelImageExt for LanguageModelImage { fn from_image(data: Arc, cx: &mut App) -> Task> { cx.background_spawn(async move { - let image_bytes = Cursor::new(data.bytes()); - let dynamic_image = match data.format() { - ImageFormat::Png => image::codecs::png::PngDecoder::new(image_bytes) - .and_then(image::DynamicImage::from_decoder), - ImageFormat::Jpeg => image::codecs::jpeg::JpegDecoder::new(image_bytes) - .and_then(image::DynamicImage::from_decoder), - ImageFormat::Webp => image::codecs::webp::WebPDecoder::new(image_bytes) - .and_then(image::DynamicImage::from_decoder), - ImageFormat::Gif => image::codecs::gif::GifDecoder::new(image_bytes) - .and_then(image::DynamicImage::from_decoder), - ImageFormat::Bmp => image::codecs::bmp::BmpDecoder::new(image_bytes) - .and_then(image::DynamicImage::from_decoder), - ImageFormat::Tiff => image::codecs::tiff::TiffDecoder::new(image_bytes) - .and_then(image::DynamicImage::from_decoder), - _ => return None, - } - .log_err()?; - - let width = dynamic_image.width(); - let height = dynamic_image.height(); - let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32)); - - // First apply any provider-specific dimension constraints we know about (Anthropic). - let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32 - || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32 - { - let new_bounds = ObjectFit::ScaleDown.get_bounds( - gpui::Bounds { - origin: point(px(0.0), px(0.0)), - size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)), - }, - image_size, - ); - dynamic_image.resize( - new_bounds.size.width.into(), - new_bounds.size.height.into(), - image::imageops::FilterType::Triangle, - ) - } else { - dynamic_image + let format = match data.format() { + ImageFormat::Png => image::ImageFormat::Png, + ImageFormat::Jpeg => image::ImageFormat::Jpeg, + ImageFormat::Webp => image::ImageFormat::WebP, + ImageFormat::Gif => image::ImageFormat::Gif, + ImageFormat::Bmp => image::ImageFormat::Bmp, + ImageFormat::Tiff => image::ImageFormat::Tiff, + ImageFormat::Ico => image::ImageFormat::Ico, + ImageFormat::Pnm => image::ImageFormat::Pnm, + ImageFormat::Svg => return None, }; - - // Then enforce a default per-image size cap on the encoded PNG bytes. - // - // We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd. - // The upstream provider limit we want to respect is effectively on the binary image - // payload size, so we enforce against the encoded PNG bytes before base64 encoding. - let mut encoded_png = encode_png_bytes(&processed_image).log_err()?; - for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES { - if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES { - break; - } - - // Scale down geometrically to converge quickly. We don't know the final PNG size - // as a function of pixels, so we iteratively shrink. - let (w, h) = processed_image.dimensions(); - if w <= 1 || h <= 1 { - break; - } - - // Shrink by ~15% each pass (0.85). This is a compromise between speed and - // preserving image detail. - let new_w = ((w as f32) * 0.85).round().max(1.0) as u32; - let new_h = ((h as f32) * 0.85).round().max(1.0) as u32; - - processed_image = - processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle); - encoded_png = encode_png_bytes(&processed_image).log_err()?; - } - - if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES { - // Still too large after multiple passes; treat as non-convertible for now. - // (Provider-specific handling can be introduced later.) - return None; - } - - // Now base64 encode the PNG bytes. - let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?; - - // SAFETY: The base64 encoder should not produce non-UTF8. - let source = unsafe { String::from_utf8_unchecked(base64_image) }; - - Some(LanguageModelImage { - source: source.into(), - }) + let dynamic_image = + image::load_from_memory_with_format(data.bytes(), format).log_err()?; + language_model_image_from_dynamic_image(dynamic_image) + .log_err() + .flatten() }) } + + fn from_base64_image(data: &str, mime_type: &str) -> Result> { + let format = image::ImageFormat::from_mime_type(mime_type) + .ok_or_else(|| anyhow!("unsupported image MIME type `{}`", mime_type))?; + let bytes = base64::engine::general_purpose::STANDARD.decode(data.as_bytes())?; + let dynamic_image = image::load_from_memory_with_format(&bytes, format)?; + language_model_image_from_dynamic_image(dynamic_image) + } +} + +fn language_model_image_from_dynamic_image( + dynamic_image: image::DynamicImage, +) -> Result> { + let width = dynamic_image.width(); + let height = dynamic_image.height(); + let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32)); + + // First apply any provider-specific dimension constraints we know about (Anthropic). + let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32 + || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32 + { + let new_bounds = ObjectFit::ScaleDown.get_bounds( + gpui::Bounds { + origin: point(px(0.0), px(0.0)), + size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)), + }, + image_size, + ); + dynamic_image.resize( + new_bounds.size.width.into(), + new_bounds.size.height.into(), + image::imageops::FilterType::Triangle, + ) + } else { + dynamic_image + }; + + // Then enforce a default per-image size cap on the encoded PNG bytes. + // + // We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd. + // The upstream provider limit we want to respect is effectively on the binary image + // payload size, so we enforce against the encoded PNG bytes before base64 encoding. + let mut encoded_png = encode_png_bytes(&processed_image)?; + for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES { + if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES { + break; + } + + // Scale down geometrically to converge quickly. We don't know the final PNG size + // as a function of pixels, so we iteratively shrink. + let (width, height) = processed_image.dimensions(); + if width <= 1 || height <= 1 { + break; + } + + // Shrink by ~15% each pass (0.85). This is a compromise between speed and + // preserving image detail. + let new_width = ((width as f32) * 0.85).round().max(1.0) as u32; + let new_height = ((height as f32) * 0.85).round().max(1.0) as u32; + + processed_image = + processed_image.resize(new_width, new_height, image::imageops::FilterType::Triangle); + encoded_png = encode_png_bytes(&processed_image)?; + } + + if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES { + // Still too large after multiple passes; treat as non-convertible for now. + // (Provider-specific handling can be introduced later.) + return Ok(None); + } + + // Now base64 encode the PNG bytes. + let base64_image = encode_bytes_as_base64(encoded_png.as_slice())?; + + // SAFETY: The base64 encoder should not produce non-UTF8. + let source = unsafe { String::from_utf8_unchecked(base64_image) }; + + Ok(Some(LanguageModelImage { + source: source.into(), + })) } fn encode_png_bytes(image: &image::DynamicImage) -> Result> { @@ -162,7 +174,6 @@ pub fn gpui_size_to_image_size(size: Size) -> ImageSize { #[cfg(test)] mod tests { use super::*; - use base64::Engine as _; use gpui::TestAppContext; fn base64_to_png_bytes(base64: &str) -> Vec { @@ -202,13 +213,46 @@ mod tests { raw_png.len() ); - let image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png)); + let image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png.clone())); let lm_image = cx .update(|cx| LanguageModelImage::from_image(Arc::clone(&image), cx)) .await .expect("from_image should succeed"); - let decoded_png = base64_to_png_bytes(lm_image.source.as_ref()); + assert_downscaled_from_original(lm_image.source.as_ref(), 4096, 4096); + + let base64_png = base64::engine::general_purpose::STANDARD.encode(raw_png); + let lm_image = LanguageModelImage::from_base64_image(&base64_png, "image/png") + .expect("from_base64_image should not error") + .expect("from_base64_image should succeed"); + + assert_downscaled_from_original(lm_image.source.as_ref(), 4096, 4096); + } + + #[test] + fn test_from_base64_image_converts_jpeg_to_png() { + use image::ImageEncoder as _; + + let mut jpeg_bytes = Vec::new(); + image::codecs::jpeg::JpegEncoder::new(&mut jpeg_bytes) + .write_image(&[255, 0, 0], 1, 1, image::ExtendedColorType::Rgb8) + .expect("encode jpeg"); + let jpeg_data = base64::engine::general_purpose::STANDARD.encode(jpeg_bytes); + + let image = LanguageModelImage::from_base64_image(&jpeg_data, "image/jpeg") + .expect("from_base64_image should not error") + .expect("from_base64_image should succeed"); + let png_bytes = base64_to_png_bytes(image.source.as_ref()); + + assert_eq!( + image::guess_format(&png_bytes).expect("guess image format"), + image::ImageFormat::Png + ); + assert_eq!(png_dimensions(&png_bytes), (1, 1)); + } + + fn assert_downscaled_from_original(base64_png: &str, width: u32, height: u32) { + let decoded_png = base64_to_png_bytes(base64_png); assert!( decoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES, "Encoded PNG should be ≤ {} bytes after downscale, but was {} bytes", @@ -216,12 +260,12 @@ mod tests { decoded_png.len() ); - let (w, h) = png_dimensions(&decoded_png); + let (downsized_width, downsized_height) = png_dimensions(&decoded_png); assert!( - w < 4096 && h < 4096, + downsized_width < width && downsized_height < height, "Dimensions should have shrunk: got {}×{}", - w, - h + downsized_width, + downsized_height ); } } diff --git a/crates/language_model_core/src/request.rs b/crates/language_model_core/src/request.rs index 7f8f7c7d764..19a642ba01b 100644 --- a/crates/language_model_core/src/request.rs +++ b/crates/language_model_core/src/request.rs @@ -438,7 +438,6 @@ mod tests { // Test image object let json = serde_json::json!({ "source": "base64encodedimagedata", - "size": {"width": 100, "height": 200} }); let content: LanguageModelToolResultContent = serde_json::from_value(json).unwrap(); match content {