agent: Support image output from MCP tools (#57134)

Release Notes:

- agent: Support image output from MCP tools
This commit is contained in:
Bennet Bo Fenner 2026-05-19 14:51:42 +02:00 committed by GitHub
parent c352cad169
commit da43bdb648
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 225 additions and 122 deletions

View file

@ -26,10 +26,10 @@ use gpui::{
use indoc::indoc;
use language_model::{
CompletionIntent, LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
LanguageModelId, LanguageModelProviderId, LanguageModelProviderName, LanguageModelRegistry,
LanguageModelRequest, LanguageModelRequestMessage, LanguageModelToolResult,
LanguageModelToolSchemaFormat, LanguageModelToolUse, MessageContent, Role, StopReason,
TokenUsage,
LanguageModelId, LanguageModelImageExt, LanguageModelProviderId, LanguageModelProviderName,
LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage,
LanguageModelToolResult, LanguageModelToolSchemaFormat, LanguageModelToolUse, MessageContent,
Role, StopReason, TokenUsage,
fake_provider::{FakeLanguageModel, FakeLanguageModelProvider},
};
use pretty_assertions::assert_eq;
@ -1656,6 +1656,7 @@ async fn test_mcp_tool_multi_content_response(cx: &mut TestAppContext) {
let (tool_call_params, tool_call_response) = mcp_tool_calls.next().await.unwrap();
assert_eq!(tool_call_params.name, "screenshot");
let image_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
tool_call_response
.send(context_server::types::CallToolResponse {
content: vec![
@ -1663,7 +1664,7 @@ async fn test_mcp_tool_multi_content_response(cx: &mut TestAppContext) {
text: "Some text".into(),
},
context_server::types::ToolResponseContent::Image {
data: "aGVsbG8=".into(),
data: image_data.into(),
mime_type: "image/png".into(),
},
context_server::types::ToolResponseContent::Text {
@ -1691,13 +1692,25 @@ async fn test_mcp_tool_multi_content_response(cx: &mut TestAppContext) {
})
.expect("expected a tool result");
assert_eq!(tool_result.tool_use_id, "tool_1".into());
assert_eq!(tool_result.content.len(), 2);
assert_eq!(tool_result.content.len(), 3);
assert_eq!(
tool_result.content[0],
language_model::LanguageModelToolResultContent::Text(Arc::from("Some text"))
);
let expected_image =
language_model::LanguageModelImage::from_base64_image(image_data, "image/png")
.expect("image conversion should not error")
.expect("image conversion should succeed");
assert_eq!(
tool_result.content[0],
language_model::LanguageModelToolResultContent::Text(Arc::from("Some text"))
);
assert_eq!(
tool_result.content[1],
language_model::LanguageModelToolResultContent::Image(expected_image)
);
assert_eq!(
tool_result.content[2],
language_model::LanguageModelToolResultContent::Text(Arc::from("Some more text"))
);
fake_model.end_last_completion_stream();

View file

@ -5,7 +5,7 @@ use collections::{BTreeMap, HashMap};
use context_server::{ContextServerId, client::NotificationSubscription};
use futures::FutureExt as _;
use gpui::{App, AppContext, AsyncApp, Context, Entity, EventEmitter, SharedString, Task};
use language_model::LanguageModelToolResultContent;
use language_model::{LanguageModelImage, LanguageModelImageExt, LanguageModelToolResultContent};
use project::context_server_store::{ContextServerStatus, ContextServerStore};
use std::sync::Arc;
use util::ResultExt;
@ -346,7 +346,7 @@ impl AnyAgentTool for ContextServerTool {
let authorize =
event_stream.authorize_third_party_tool(initial_title, tool_id, display_name, cx);
cx.spawn(async move |_cx| {
cx.spawn(async move |cx| {
let input = input
.recv()
.await
@ -394,15 +394,50 @@ impl AnyAgentTool for ContextServerTool {
}
let mut llm_output = Vec::new();
let mut tool_call_content = Vec::new();
let mut concatenated_text = String::new();
for content in response.content {
match content {
context_server::types::ToolResponseContent::Text { text } => {
concatenated_text.push_str(&text);
tool_call_content.push(acp::ToolCallContent::Content(acp::Content::new(
acp::ContentBlock::Text(acp::TextContent::new(text.clone())),
)));
llm_output.push(LanguageModelToolResultContent::Text(text.into()));
}
context_server::types::ToolResponseContent::Image { .. } => {
log::warn!("Ignoring image content from tool response");
context_server::types::ToolResponseContent::Image { data, mime_type } => {
tool_call_content.push(acp::ToolCallContent::Content(acp::Content::new(
acp::ContentBlock::Image(acp::ImageContent::new(
data.clone(),
mime_type.clone(),
)),
)));
let language_model_image = cx
.background_spawn({
let mime_type = mime_type.clone();
async move {
LanguageModelImage::from_base64_image(&data, &mime_type)
}
})
.await;
match language_model_image {
Ok(Some(image)) => {
llm_output.push(LanguageModelToolResultContent::Image(image));
}
Ok(None) => {
log::warn!(
"Skipping MCP tool response image with MIME type `{}` because it cannot be converted for language model input",
mime_type
);
}
Err(error) => {
log::warn!(
"Failed to convert MCP tool response image with MIME type `{}` for language model input: {:#}",
mime_type,
error
);
}
}
}
context_server::types::ToolResponseContent::Audio { .. } => {
log::warn!("Ignoring audio content from tool response");
@ -415,6 +450,10 @@ impl AnyAgentTool for ContextServerTool {
}
}
}
if !tool_call_content.is_empty() {
event_stream
.update_fields(acp::ToolCallUpdateFields::new().content(tool_call_content));
}
let raw_output = serde_json::Value::String(concatenated_text);
Ok(AgentToolOutput {
raw_output,

View file

@ -6585,6 +6585,32 @@ impl ThreadView {
)
}),
)
.when(!use_card_layout, |this| {
let button_id =
SharedString::from(format!("tool_output-collapse-{:?}", tool_call.id));
let tool_call_id = tool_call.id.clone();
this.child(
div()
.ml(rems(0.4))
.px_3p5()
.pt_2()
.border_l_1()
.border_color(self.tool_card_border_color(cx))
.child(
IconButton::new(button_id, IconName::ChevronUp)
.full_width()
.style(ButtonStyle::Outlined)
.icon_color(Color::Muted)
.on_click(cx.listener({
move |this: &mut Self, _, _, cx: &mut Context<Self>| {
this.expanded_tool_calls.remove(&tool_call_id);
cx.notify();
}
})),
),
)
})
.into_any(),
ToolCallStatus::Rejected => Empty.into_any(),
}
@ -7580,7 +7606,6 @@ impl ThreadView {
} else if let Some(markdown) = content.markdown() {
self.render_markdown_output(
markdown.clone(),
tool_call.id.clone(),
context_ix,
card_layout,
window,
@ -7724,14 +7749,11 @@ impl ThreadView {
fn render_markdown_output(
&self,
markdown: Entity<Markdown>,
tool_call_id: acp::ToolCallId,
context_ix: usize,
card_layout: bool,
window: &Window,
cx: &Context<Self>,
) -> AnyElement {
let button_id = SharedString::from(format!("tool_output-{:?}", tool_call_id));
v_flex()
.gap_2()
.map(|this| {
@ -7754,20 +7776,6 @@ impl ThreadView {
MarkdownStyle::themed(MarkdownFont::Agent, window, cx),
cx,
))
.when(!card_layout, |this| {
this.child(
IconButton::new(button_id, IconName::ChevronUp)
.full_width()
.style(ButtonStyle::Outlined)
.icon_color(Color::Muted)
.on_click(cx.listener({
move |this: &mut Self, _, _, cx: &mut Context<Self>| {
this.expanded_tool_calls.remove(&tool_call_id);
cx.notify();
}
})),
)
})
.into_any_element()
}

View file

@ -1,8 +1,8 @@
use std::io::{Cursor, Write};
use std::sync::Arc;
use anyhow::Result;
use base64::write::EncoderWriter;
use anyhow::{Result, anyhow};
use base64::{Engine as _, write::EncoderWriter};
use gpui::{
App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, Size, Task, point, px, size,
};
@ -29,6 +29,7 @@ const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8;
pub trait LanguageModelImageExt {
const FORMAT: ImageFormat;
fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>>;
fn from_base64_image(data: &str, mime_type: &str) -> Result<Option<LanguageModelImage>>;
}
impl LanguageModelImageExt for LanguageModelImage {
@ -36,24 +37,37 @@ impl LanguageModelImageExt for LanguageModelImage {
fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>> {
cx.background_spawn(async move {
let image_bytes = Cursor::new(data.bytes());
let dynamic_image = match data.format() {
ImageFormat::Png => image::codecs::png::PngDecoder::new(image_bytes)
.and_then(image::DynamicImage::from_decoder),
ImageFormat::Jpeg => image::codecs::jpeg::JpegDecoder::new(image_bytes)
.and_then(image::DynamicImage::from_decoder),
ImageFormat::Webp => image::codecs::webp::WebPDecoder::new(image_bytes)
.and_then(image::DynamicImage::from_decoder),
ImageFormat::Gif => image::codecs::gif::GifDecoder::new(image_bytes)
.and_then(image::DynamicImage::from_decoder),
ImageFormat::Bmp => image::codecs::bmp::BmpDecoder::new(image_bytes)
.and_then(image::DynamicImage::from_decoder),
ImageFormat::Tiff => image::codecs::tiff::TiffDecoder::new(image_bytes)
.and_then(image::DynamicImage::from_decoder),
_ => return None,
let format = match data.format() {
ImageFormat::Png => image::ImageFormat::Png,
ImageFormat::Jpeg => image::ImageFormat::Jpeg,
ImageFormat::Webp => image::ImageFormat::WebP,
ImageFormat::Gif => image::ImageFormat::Gif,
ImageFormat::Bmp => image::ImageFormat::Bmp,
ImageFormat::Tiff => image::ImageFormat::Tiff,
ImageFormat::Ico => image::ImageFormat::Ico,
ImageFormat::Pnm => image::ImageFormat::Pnm,
ImageFormat::Svg => return None,
};
let dynamic_image =
image::load_from_memory_with_format(data.bytes(), format).log_err()?;
language_model_image_from_dynamic_image(dynamic_image)
.log_err()
.flatten()
})
}
.log_err()?;
fn from_base64_image(data: &str, mime_type: &str) -> Result<Option<LanguageModelImage>> {
let format = image::ImageFormat::from_mime_type(mime_type)
.ok_or_else(|| anyhow!("unsupported image MIME type `{}`", mime_type))?;
let bytes = base64::engine::general_purpose::STANDARD.decode(data.as_bytes())?;
let dynamic_image = image::load_from_memory_with_format(&bytes, format)?;
language_model_image_from_dynamic_image(dynamic_image)
}
}
fn language_model_image_from_dynamic_image(
dynamic_image: image::DynamicImage,
) -> Result<Option<LanguageModelImage>> {
let width = dynamic_image.width();
let height = dynamic_image.height();
let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
@ -83,7 +97,7 @@ impl LanguageModelImageExt for LanguageModelImage {
// We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd.
// The upstream provider limit we want to respect is effectively on the binary image
// payload size, so we enforce against the encoded PNG bytes before base64 encoding.
let mut encoded_png = encode_png_bytes(&processed_image).log_err()?;
let mut encoded_png = encode_png_bytes(&processed_image)?;
for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES {
if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES {
break;
@ -91,38 +105,36 @@ impl LanguageModelImageExt for LanguageModelImage {
// Scale down geometrically to converge quickly. We don't know the final PNG size
// as a function of pixels, so we iteratively shrink.
let (w, h) = processed_image.dimensions();
if w <= 1 || h <= 1 {
let (width, height) = processed_image.dimensions();
if width <= 1 || height <= 1 {
break;
}
// Shrink by ~15% each pass (0.85). This is a compromise between speed and
// preserving image detail.
let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
let new_width = ((width as f32) * 0.85).round().max(1.0) as u32;
let new_height = ((height as f32) * 0.85).round().max(1.0) as u32;
processed_image =
processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle);
encoded_png = encode_png_bytes(&processed_image).log_err()?;
processed_image.resize(new_width, new_height, image::imageops::FilterType::Triangle);
encoded_png = encode_png_bytes(&processed_image)?;
}
if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES {
// Still too large after multiple passes; treat as non-convertible for now.
// (Provider-specific handling can be introduced later.)
return None;
return Ok(None);
}
// Now base64 encode the PNG bytes.
let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?;
let base64_image = encode_bytes_as_base64(encoded_png.as_slice())?;
// SAFETY: The base64 encoder should not produce non-UTF8.
let source = unsafe { String::from_utf8_unchecked(base64_image) };
Some(LanguageModelImage {
Ok(Some(LanguageModelImage {
source: source.into(),
})
})
}
}))
}
fn encode_png_bytes(image: &image::DynamicImage) -> Result<Vec<u8>> {
@ -162,7 +174,6 @@ pub fn gpui_size_to_image_size(size: Size<DevicePixels>) -> ImageSize {
#[cfg(test)]
mod tests {
use super::*;
use base64::Engine as _;
use gpui::TestAppContext;
fn base64_to_png_bytes(base64: &str) -> Vec<u8> {
@ -202,13 +213,46 @@ mod tests {
raw_png.len()
);
let image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png));
let image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png.clone()));
let lm_image = cx
.update(|cx| LanguageModelImage::from_image(Arc::clone(&image), cx))
.await
.expect("from_image should succeed");
let decoded_png = base64_to_png_bytes(lm_image.source.as_ref());
assert_downscaled_from_original(lm_image.source.as_ref(), 4096, 4096);
let base64_png = base64::engine::general_purpose::STANDARD.encode(raw_png);
let lm_image = LanguageModelImage::from_base64_image(&base64_png, "image/png")
.expect("from_base64_image should not error")
.expect("from_base64_image should succeed");
assert_downscaled_from_original(lm_image.source.as_ref(), 4096, 4096);
}
#[test]
fn test_from_base64_image_converts_jpeg_to_png() {
use image::ImageEncoder as _;
let mut jpeg_bytes = Vec::new();
image::codecs::jpeg::JpegEncoder::new(&mut jpeg_bytes)
.write_image(&[255, 0, 0], 1, 1, image::ExtendedColorType::Rgb8)
.expect("encode jpeg");
let jpeg_data = base64::engine::general_purpose::STANDARD.encode(jpeg_bytes);
let image = LanguageModelImage::from_base64_image(&jpeg_data, "image/jpeg")
.expect("from_base64_image should not error")
.expect("from_base64_image should succeed");
let png_bytes = base64_to_png_bytes(image.source.as_ref());
assert_eq!(
image::guess_format(&png_bytes).expect("guess image format"),
image::ImageFormat::Png
);
assert_eq!(png_dimensions(&png_bytes), (1, 1));
}
fn assert_downscaled_from_original(base64_png: &str, width: u32, height: u32) {
let decoded_png = base64_to_png_bytes(base64_png);
assert!(
decoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES,
"Encoded PNG should be ≤ {} bytes after downscale, but was {} bytes",
@ -216,12 +260,12 @@ mod tests {
decoded_png.len()
);
let (w, h) = png_dimensions(&decoded_png);
let (downsized_width, downsized_height) = png_dimensions(&decoded_png);
assert!(
w < 4096 && h < 4096,
downsized_width < width && downsized_height < height,
"Dimensions should have shrunk: got {}×{}",
w,
h
downsized_width,
downsized_height
);
}
}

View file

@ -438,7 +438,6 @@ mod tests {
// Test image object
let json = serde_json::json!({
"source": "base64encodedimagedata",
"size": {"width": 100, "height": 200}
});
let content: LanguageModelToolResultContent = serde_json::from_value(json).unwrap();
match content {