From e50e5801428749d75c607794e867231f8fb95887 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Fri, 5 Jun 2026 18:41:12 +0300 Subject: [PATCH] fix: bound screenshot payloads --- CHANGELOG.md | 5 + Cargo.lock | 17 ++ Cargo.toml | 2 +- README.md | 8 +- src/main.rs | 13 +- src/screenshot.rs | 409 +++++++++++++++++++++++++++++++++++++++++++++- src/server.rs | 175 +++++++++++++++++--- 7 files changed, 589 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54ba79d..a700246 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 embedded input/backend knobs. ### Fixed +- Bounded screenshot payloads by default before returning them to MCP hosts, + while exposing opt-in screenshot sizing controls and coordinate metadata for + downscaled captures. +- Added opt-in JPEG screenshot output with a caller-selected quality so agents + can choose compression before the byte cap forces additional resizing. - Ported downstream Linux readiness fixes: `doctor` now treats direct `/dev/uinput` and the XDG RemoteDesktop portal as valid development-input backends instead of requiring `ydotoold` in every ready setup. diff --git a/Cargo.lock b/Cargo.lock index 5218d2c..f3107e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -737,6 +737,8 @@ dependencies = [ "moxcms", "num-traits", "png", + "zune-core", + "zune-jpeg", ] [[package]] @@ -1880,6 +1882,21 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] + [[package]] name = "zvariant" version = "5.11.0" diff --git a/Cargo.toml b/Cargo.toml index fdd6869..412cdfe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ base64 = "0.22.1" evdev = "0.13.2" cosmic-protocols = { version = "0.2.0", default-features = false, features = ["client"] } futures-util = "0.3.32" -image = { version = "0.25", default-features = false, features = ["png"] } +image = { version = "0.25", default-features = false, features = ["jpeg", "png"] } rmcp = { version = "1.5.0", features = ["transport-io"] } schemars = "1.0" serde = { version = "1.0.228", features = ["derive"] } diff --git a/README.md b/README.md index 6960eac..ab749e0 100644 --- a/README.md +++ b/README.md @@ -45,11 +45,13 @@ MCP tools exposed by the server: - `list_windows` — compositor windows with title, app id, wm_class, focus state, client type (Wayland/X11), and bounds - `focused_window` — the window currently holding keyboard focus - `get_app_state` — combined screenshot + accessibility tree for a chosen app, with element indices that the input tools accept -- `screenshot` — capture the screen as a PNG; can target a window, which is raised to the front and cropped to just that window +- `screenshot` — capture the screen as a bounded PNG or JPEG image; can target a window, which is raised to the front and cropped to just that window + +Screenshot payloads are size-bounded by default before they are returned to the MCP host: max 1920 px width/height and 2 MiB image bytes, with hard caps even when callers request more. Agents that need more detail can pass `max_width`, `max_height`, `max_bytes`, `scale`, `format: "jpeg"`, or `quality`, preferably with a window target or crop. PNG remains the default; JPEG lets callers trade lossless pixels for a smaller payload before the byte cap forces further resizing. Returned screenshot metadata includes `coordinate_width`, `coordinate_height`, `scale`, `format`, and `quality` so callers can convert from a downscaled preview to desktop coordinate pixels. **Input** -- `click` — by element index, semantic selector, or pixel coordinates -- `drag` — pixel-coordinate drag (start / end) +- `click` — by element index, semantic selector, or desktop coordinate pixels +- `drag` — desktop coordinate drag (start / end) - `scroll` — page-based scroll on an element or at a pixel location - `press_key` — keys / chords; can focus a window or terminal first - `type_text` — literal text input, optionally targeted at a window or terminal diff --git a/src/main.rs b/src/main.rs index 9274038..c4b15b0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -70,7 +70,7 @@ async fn main() -> Result<()> { .nth(3) .and_then(|s| s.parse().ok()) .unwrap_or(0); - let cap = screenshot::capture_screenshot().await?; + let cap = screenshot::capture_screenshot_raw().await?; eprintln!("desktop logical size: {}x{}", cap.width, cap.height); let mut p = abs_pointer::AbsPointer::create(cap.width as i32, cap.height as i32)?; p.click(x, y, abs_pointer::PointerButton::Left, 1)?; @@ -87,6 +87,17 @@ async fn main() -> Result<()> { serde_json::to_string_pretty(&serde_json::json!({ "mime_type": capture.mime_type, "source": capture.source, + "width": capture.width, + "height": capture.height, + "coordinate_width": capture.coordinate_width, + "coordinate_height": capture.coordinate_height, + "scale": capture.scale, + "resized": capture.resized, + "bytes": capture.bytes, + "original_bytes": capture.original_bytes, + "max_bytes": capture.max_bytes, + "format": capture.format, + "quality": capture.quality, "data_url_length": capture.data_url.len() })) .context("failed to serialize screenshot report")? diff --git a/src/screenshot.rs b/src/screenshot.rs index da1b633..c4eff42 100644 --- a/src/screenshot.rs +++ b/src/screenshot.rs @@ -2,11 +2,14 @@ use crate::diagnostics::hydrate_session_bus_env; use anyhow::{anyhow, bail, Context, Result}; use base64::{engine::general_purpose::STANDARD, Engine}; use futures_util::StreamExt; +use image::codecs::jpeg::JpegEncoder; +use image::imageops::FilterType; use schemars::JsonSchema; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use std::{ collections::HashMap, fs, + io::Cursor, path::{Path, PathBuf}, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -19,13 +22,81 @@ use zbus::{ const PORTAL_REQUEST_INTERFACE: &str = "org.freedesktop.portal.Request"; const PORTAL_REQUEST_PATH_NAMESPACE: &str = "/org/freedesktop/portal/desktop/request"; +pub const DEFAULT_SCREENSHOT_MAX_DIMENSION: u32 = 1920; +pub const DEFAULT_SCREENSHOT_MAX_BYTES: usize = 2 * 1024 * 1024; +pub const ABSOLUTE_SCREENSHOT_MAX_DIMENSION: u32 = 4096; +pub const ABSOLUTE_SCREENSHOT_MAX_BYTES: usize = 4 * 1024 * 1024; +pub const DEFAULT_SCREENSHOT_JPEG_QUALITY: u8 = 80; +pub const MIN_SCREENSHOT_JPEG_QUALITY: u8 = 1; +pub const MAX_SCREENSHOT_JPEG_QUALITY: u8 = 95; +const MIN_SCREENSHOT_MAX_BYTES: usize = 1024; + +#[derive(Debug, Clone)] +pub struct RawScreenshotCapture { + pub mime_type: String, + pub bytes: Vec, + pub source: String, + pub width: u32, + pub height: u32, +} + #[derive(Debug, Clone, Serialize, JsonSchema)] pub struct ScreenshotCapture { pub mime_type: String, pub data_url: String, pub source: String, + /// Width of the returned image payload. pub width: u32, + /// Height of the returned image payload. pub height: u32, + /// Coordinate-space width before payload downscaling. + pub coordinate_width: u32, + /// Coordinate-space height before payload downscaling. + pub coordinate_height: u32, + /// Returned pixels per coordinate-space pixel. + pub scale: f32, + pub resized: bool, + pub bytes: usize, + pub original_bytes: usize, + pub max_bytes: usize, + pub format: ScreenshotOutputFormat, + pub quality: Option, +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct ScreenshotPayloadOptions { + pub max_width: Option, + pub max_height: Option, + pub max_bytes: Option, + pub scale: Option, + pub format: Option, + pub quality: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "lowercase")] +pub enum ScreenshotOutputFormat { + Png, + Jpeg, +} + +impl ScreenshotOutputFormat { + fn mime_type(self) -> &'static str { + match self { + Self::Png => "image/png", + Self::Jpeg => "image/jpeg", + } + } +} + +#[derive(Debug, Clone, Copy)] +struct ResolvedScreenshotPayloadOptions { + max_width: u32, + max_height: u32, + max_bytes: usize, + scale: f32, + format: ScreenshotOutputFormat, + quality: u8, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -34,7 +105,43 @@ enum ScreenshotCleanup { Preserve, } -pub async fn capture_screenshot() -> Result { +impl ScreenshotPayloadOptions { + fn resolve(self) -> ResolvedScreenshotPayloadOptions { + let max_width = self + .max_width + .unwrap_or(DEFAULT_SCREENSHOT_MAX_DIMENSION) + .clamp(1, ABSOLUTE_SCREENSHOT_MAX_DIMENSION); + let max_height = self + .max_height + .unwrap_or(DEFAULT_SCREENSHOT_MAX_DIMENSION) + .clamp(1, ABSOLUTE_SCREENSHOT_MAX_DIMENSION); + let max_bytes = self + .max_bytes + .unwrap_or(DEFAULT_SCREENSHOT_MAX_BYTES) + .clamp(MIN_SCREENSHOT_MAX_BYTES, ABSOLUTE_SCREENSHOT_MAX_BYTES); + let scale = self + .scale + .filter(|value| value.is_finite() && *value > 0.0) + .unwrap_or(1.0) + .min(1.0); + let format = self.format.unwrap_or(ScreenshotOutputFormat::Png); + let quality = self + .quality + .unwrap_or(DEFAULT_SCREENSHOT_JPEG_QUALITY) + .clamp(MIN_SCREENSHOT_JPEG_QUALITY, MAX_SCREENSHOT_JPEG_QUALITY); + + ResolvedScreenshotPayloadOptions { + max_width, + max_height, + max_bytes, + scale, + format, + quality, + } + } +} + +pub async fn capture_screenshot_raw() -> Result { hydrate_session_bus_env(); match capture_with_gnome_shell().await { @@ -48,7 +155,67 @@ pub async fn capture_screenshot() -> Result { } } -async fn capture_with_gnome_shell() -> Result { +pub async fn capture_screenshot() -> Result { + let raw = capture_screenshot_raw().await?; + prepare_screenshot_payload(raw, ScreenshotPayloadOptions::default()) +} + +pub fn prepare_screenshot_payload( + raw: RawScreenshotCapture, + options: ScreenshotPayloadOptions, +) -> Result { + if raw.bytes.is_empty() { + bail!("screenshot file was empty"); + } + let (coordinate_width, coordinate_height) = png_dimensions(&raw.bytes)?; + let original_bytes = raw.bytes.len(); + let options = options.resolve(); + let (target_width, target_height) = + target_dimensions(coordinate_width, coordinate_height, options); + + let (bytes, width, height) = if options.format == ScreenshotOutputFormat::Png + && target_width == coordinate_width + && target_height == coordinate_height + && original_bytes <= options.max_bytes + { + (raw.bytes, coordinate_width, coordinate_height) + } else { + encode_screenshot_to_fit_bytes( + &raw.bytes, + coordinate_width, + coordinate_height, + target_width, + target_height, + options, + )? + }; + + let encoded = STANDARD.encode(&bytes); + let scale = if coordinate_width == 0 { + 1.0 + } else { + width as f32 / coordinate_width as f32 + }; + + Ok(ScreenshotCapture { + mime_type: options.format.mime_type().to_string(), + data_url: format!("data:{};base64,{encoded}", options.format.mime_type()), + source: raw.source, + width, + height, + coordinate_width, + coordinate_height, + scale, + resized: width != coordinate_width || height != coordinate_height, + bytes: bytes.len(), + original_bytes, + max_bytes: options.max_bytes, + format: options.format, + quality: (options.format == ScreenshotOutputFormat::Jpeg).then_some(options.quality), + }) +} + +async fn capture_with_gnome_shell() -> Result { let connection = zbus::Connection::session() .await .context("failed to connect to session bus")?; @@ -86,7 +253,7 @@ async fn capture_with_gnome_shell() -> Result { .await } -async fn capture_with_portal() -> Result { +async fn capture_with_portal() -> Result { let connection = zbus::Connection::session() .await .context("failed to connect to session bus")?; @@ -181,7 +348,7 @@ async fn read_png_as_capture( path: PathBuf, source: &str, cleanup: ScreenshotCleanup, -) -> Result { +) -> Result { let result = read_png_as_capture_inner(&path, source); if let ScreenshotCleanup::DeletePath(path) = cleanup { let _ = fs::remove_file(path); @@ -189,23 +356,126 @@ async fn read_png_as_capture( result } -fn read_png_as_capture_inner(path: &Path, source: &str) -> Result { +fn read_png_as_capture_inner(path: &Path, source: &str) -> Result { let bytes = fs::read(path) .with_context(|| format!("failed to read screenshot file {}", path.display()))?; if bytes.is_empty() { bail!("screenshot file was empty: {}", path.display()); } let (width, height) = png_dimensions(&bytes)?; - let encoded = STANDARD.encode(bytes); - Ok(ScreenshotCapture { + Ok(RawScreenshotCapture { mime_type: "image/png".to_string(), - data_url: format!("data:image/png;base64,{encoded}"), + bytes, source: source.to_string(), width, height, }) } +fn target_dimensions( + width: u32, + height: u32, + options: ResolvedScreenshotPayloadOptions, +) -> (u32, u32) { + let width_scale = options.max_width as f64 / width as f64; + let height_scale = options.max_height as f64 / height as f64; + let scale = f64::from(options.scale) + .min(width_scale) + .min(height_scale) + .min(1.0); + + let target_width = ((width as f64 * scale).round() as u32).clamp(1, width); + let target_height = ((height as f64 * scale).round() as u32).clamp(1, height); + (target_width, target_height) +} + +fn encode_screenshot_to_fit_bytes( + raw: &[u8], + original_width: u32, + original_height: u32, + mut target_width: u32, + mut target_height: u32, + options: ResolvedScreenshotPayloadOptions, +) -> Result<(Vec, u32, u32)> { + let img = image::load_from_memory_with_format(raw, image::ImageFormat::Png) + .context("failed to decode screenshot PNG for encoding")?; + + loop { + let bytes = if options.format == ScreenshotOutputFormat::Png + && target_width == original_width + && target_height == original_height + { + raw.to_vec() + } else { + let output = if target_width == original_width && target_height == original_height { + img.clone() + } else { + img.resize_exact(target_width, target_height, FilterType::Lanczos3) + }; + encode_image(&output, options)? + }; + + if bytes.len() <= options.max_bytes { + return Ok((bytes, target_width, target_height)); + } + + if target_width == 1 && target_height == 1 { + bail!( + "screenshot payload is {} bytes at 1x1, over max_bytes {}", + bytes.len(), + options.max_bytes + ); + } + + (target_width, target_height) = next_dimensions_for_byte_cap( + target_width, + target_height, + bytes.len(), + options.max_bytes, + ); + } +} + +fn encode_image( + img: &image::DynamicImage, + options: ResolvedScreenshotPayloadOptions, +) -> Result> { + let mut out = Vec::new(); + match options.format { + ScreenshotOutputFormat::Png => { + img.write_to(&mut Cursor::new(&mut out), image::ImageFormat::Png) + .context("failed to encode screenshot PNG")?; + } + ScreenshotOutputFormat::Jpeg => { + let rgb = img.to_rgb8(); + JpegEncoder::new_with_quality(&mut out, options.quality) + .encode_image(&rgb) + .context("failed to encode screenshot JPEG")?; + } + } + Ok(out) +} + +fn next_dimensions_for_byte_cap( + width: u32, + height: u32, + encoded_bytes: usize, + max_bytes: usize, +) -> (u32, u32) { + let shrink = ((max_bytes as f64 / encoded_bytes as f64).sqrt() * 0.9).clamp(0.1, 0.95); + let mut next_width = ((width as f64 * shrink).floor() as u32).max(1); + let mut next_height = ((height as f64 * shrink).floor() as u32).max(1); + + if next_width >= width && width > 1 { + next_width = width - 1; + } + if next_height >= height && height > 1 { + next_height = height - 1; + } + + (next_width, next_height) +} + fn cleanup_gnome_requested_path(path: &Path) { let _ = fs::remove_file(path); } @@ -294,6 +564,41 @@ mod tests { png } + fn solid_png(width: u32, height: u32) -> Vec { + let img = image::RgbaImage::from_pixel(width, height, image::Rgba([24, 96, 160, 255])); + encode_test_png(img) + } + + fn noisy_png(width: u32, height: u32) -> Vec { + let mut img = image::RgbaImage::new(width, height); + for (x, y, pixel) in img.enumerate_pixels_mut() { + let r = ((x * 31 + y * 17) % 256) as u8; + let g = ((x * 13 + y * 47) % 256) as u8; + let b = ((x * 97 + y * 7) % 256) as u8; + *pixel = image::Rgba([r, g, b, 255]); + } + encode_test_png(img) + } + + fn encode_test_png(img: image::RgbaImage) -> Vec { + let mut out = Vec::new(); + image::DynamicImage::ImageRgba8(img) + .write_to(&mut Cursor::new(&mut out), image::ImageFormat::Png) + .unwrap(); + out + } + + fn raw_capture(bytes: Vec) -> RawScreenshotCapture { + let (width, height) = png_dimensions(&bytes).unwrap(); + RawScreenshotCapture { + mime_type: "image/png".to_string(), + bytes, + source: "test".to_string(), + width, + height, + } + } + #[test] fn decodes_file_uri_percent_escapes() { assert_eq!( @@ -316,6 +621,92 @@ mod tests { assert_eq!(png_dimensions(&png).unwrap(), (3840, 1080)); } + #[test] + fn default_payload_downscales_long_edge() { + let capture = + prepare_screenshot_payload(raw_capture(solid_png(4000, 1000)), Default::default()) + .unwrap(); + + assert_eq!((capture.width, capture.height), (1920, 480)); + assert_eq!( + (capture.coordinate_width, capture.coordinate_height), + (4000, 1000) + ); + assert!(capture.resized); + assert!(capture.bytes <= DEFAULT_SCREENSHOT_MAX_BYTES); + assert!(capture.data_url.starts_with("data:image/png;base64,")); + } + + #[test] + fn larger_bounded_request_can_keep_more_detail() { + let capture = prepare_screenshot_payload( + raw_capture(solid_png(3000, 1000)), + ScreenshotPayloadOptions { + max_width: Some(3000), + max_height: Some(3000), + max_bytes: Some(DEFAULT_SCREENSHOT_MAX_BYTES), + ..Default::default() + }, + ) + .unwrap(); + + assert_eq!((capture.width, capture.height), (3000, 1000)); + assert_eq!( + (capture.coordinate_width, capture.coordinate_height), + (3000, 1000) + ); + assert!(!capture.resized); + } + + #[test] + fn byte_cap_downscales_until_payload_fits() { + let capture = prepare_screenshot_payload( + raw_capture(noisy_png(512, 512)), + ScreenshotPayloadOptions { + max_width: Some(512), + max_height: Some(512), + max_bytes: Some(20_000), + ..Default::default() + }, + ) + .unwrap(); + + assert!(capture.bytes <= 20_000); + assert!(capture.width < 512); + assert_eq!( + (capture.coordinate_width, capture.coordinate_height), + (512, 512) + ); + assert!(capture.resized); + } + + #[test] + fn jpeg_format_compresses_when_requested() { + let capture = prepare_screenshot_payload( + raw_capture(noisy_png(512, 512)), + ScreenshotPayloadOptions { + max_width: Some(512), + max_height: Some(512), + max_bytes: Some(DEFAULT_SCREENSHOT_MAX_BYTES), + format: Some(ScreenshotOutputFormat::Jpeg), + quality: Some(60), + ..Default::default() + }, + ) + .unwrap(); + + assert_eq!(capture.mime_type, "image/jpeg"); + assert_eq!(capture.format, ScreenshotOutputFormat::Jpeg); + assert_eq!(capture.quality, Some(60)); + assert_eq!((capture.width, capture.height), (512, 512)); + assert_eq!( + (capture.coordinate_width, capture.coordinate_height), + (512, 512) + ); + assert!(capture.bytes < capture.original_bytes); + assert!(capture.data_url.starts_with("data:image/jpeg;base64,")); + } + #[tokio::test] async fn portal_capture_preserves_valid_returned_path() { let path = test_path("portal-valid"); diff --git a/src/server.rs b/src/server.rs index a0f6909..09076d5 100644 --- a/src/server.rs +++ b/src/server.rs @@ -11,7 +11,10 @@ use crate::remote_desktop::{ type_text_with_keysyms, PointerButton, PortalKeyboardSession, PortalPointerSession, ScrollDirection, }; -use crate::screenshot::{capture_screenshot, ScreenshotCapture}; +use crate::screenshot::{ + capture_screenshot_raw, prepare_screenshot_payload, RawScreenshotCapture, ScreenshotCapture, + ScreenshotOutputFormat, ScreenshotPayloadOptions, +}; use crate::windowing::registry; use crate::windows::{ focus_window_target, focused_window, list_windows, resolve_window_target, @@ -211,7 +214,7 @@ impl ComputerUseLinux { #[tool( name = "get_app_state", - description = "Start an app use session if needed, then get screenshot and accessibility state for a Linux app.", + description = "Start an app use session if needed, then get a size-bounded screenshot and accessibility state for a Linux app. Screenshot results include coordinate_width, coordinate_height, scale, format, and quality when the returned image is downscaled or compressed; callers can request jpeg/quality for compression before resizing.", annotations( read_only_hint = true, destructive_hint = false, @@ -229,11 +232,15 @@ impl ComputerUseLinux { let max_nodes = params.max_nodes.unwrap_or(120).clamp(1, 500); let max_depth = params.max_depth.unwrap_or(12).min(12); let include_screenshot = params.include_screenshot.unwrap_or(true); + let screenshot_options = params.screenshot_options(); let app_filter = self .resolve_accessibility_app_filter(¶ms, window_context.as_ref()) .await; let (screenshot, screenshot_error) = if include_screenshot { - match capture_screenshot().await { + match capture_screenshot_raw() + .await + .and_then(|raw| prepare_screenshot_payload(raw, screenshot_options)) + { Ok(capture) => (Some(capture), None), Err(error) => (None, Some(error.to_string())), } @@ -312,7 +319,7 @@ impl ComputerUseLinux { #[tool( name = "screenshot", - description = "Capture the screen and return it as a viewable image. Optionally target a window (window_id/pid/wm_class/title/app_id): the window is raised to the front and the image is cropped to just that window, so you see the app on its own rather than the whole desktop. Returns the PNG image plus a short caption (dimensions, source, and crop bounds).", + description = "Capture the screen and return it as a viewable, size-bounded image. Optionally target a window (window_id/pid/wm_class/title/app_id): the window is raised to the front and the image is cropped before any resize. Returns the image plus a short caption with returned dimensions, coordinate dimensions, scale, format, quality, source, and crop bounds; callers can request jpeg/quality for compression before resizing.", annotations( read_only_hint = false, destructive_hint = false, @@ -345,31 +352,50 @@ impl ComputerUseLinux { } } - let capture = capture_screenshot() + let raw_capture = capture_screenshot_raw() .await .map_err(|e| ErrorData::internal_error(format!("screenshot failed: {e}"), None))?; - let raw = - decode_data_url(&capture.data_url).map_err(|e| ErrorData::internal_error(e, None))?; - let (png, width, height, cropped) = match crop.as_ref().and_then(window_crop_rect) { - Some((x, y, w, h)) => match crop_png(&raw, x, y, w, h) { - Ok((bytes, cw, ch)) => (bytes, cw, ch, true), + let (capture, cropped) = match crop.as_ref().and_then(window_crop_rect) { + Some((x, y, w, h)) => match crop_png(&raw_capture.bytes, x, y, w, h) { + Ok((bytes, cw, ch)) => ( + RawScreenshotCapture { + mime_type: raw_capture.mime_type.clone(), + bytes, + source: raw_capture.source.clone(), + width: cw, + height: ch, + }, + true, + ), // If cropping fails, fall back to the full frame rather than erroring. - Err(_) => (raw, capture.width, capture.height, false), + Err(_) => (raw_capture, false), }, - None => (raw, capture.width, capture.height, false), + None => (raw_capture, false), }; + let capture = + prepare_screenshot_payload(capture, params.screenshot_options()).map_err(|e| { + ErrorData::internal_error(format!("screenshot resize failed: {e}"), None) + })?; - let b64 = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, &png); let caption = serde_json::json!({ - "width": width, - "height": height, + "width": capture.width, + "height": capture.height, + "coordinate_width": capture.coordinate_width, + "coordinate_height": capture.coordinate_height, + "scale": capture.scale, + "resized": capture.resized, + "bytes": capture.bytes, + "original_bytes": capture.original_bytes, + "max_bytes": capture.max_bytes, + "format": capture.format, + "quality": capture.quality, "source": capture.source, "cropped_to_window": cropped, "window_title": window_label, }); Ok(CallToolResult::success(vec![ - Content::image(b64, "image/png".to_string()), + Content::image(data_url_payload(&capture.data_url), capture.mime_type), Content::text(caption.to_string()), ])) } @@ -393,7 +419,7 @@ impl ComputerUseLinux { { return true; } - let Ok(cap) = crate::screenshot::capture_screenshot().await else { + let Ok(cap) = capture_screenshot_raw().await else { return false; }; match crate::abs_pointer::AbsPointer::create(cap.width as i32, cap.height as i32) { @@ -428,7 +454,7 @@ impl ComputerUseLinux { #[tool( name = "click", - description = "Click an element by index, semantic selector, or pixel coordinates from screenshot.", + description = "Click an element by index, semantic selector, or desktop coordinate pixels from screenshot metadata.", annotations( read_only_hint = false, destructive_hint = true, @@ -1017,7 +1043,7 @@ impl ComputerUseLinux { // can't be env!("CARGO_PKG_VERSION"); the MCP safety check (CI) fails the // build if it drifts from the Cargo version. version = "0.2.4", - instructions = "Begin every turn that uses Computer Use by calling get_app_state. If diagnostics report disabled GNOME accessibility, call setup_accessibility before asking the user to retry. Use list_windows/focused_window before targeted keyboard input. If diagnostics report windowing.can_list_windows=false on GNOME, call setup_window_targeting to install the optional GNOME Shell extension backend, then ask the user to log out and back in if the setup report says a shell reload is required. This Linux backend can capture screenshots through GNOME Shell or XDG Desktop Portal, read AT-SPI trees with action/value metadata, invoke native AT-SPI actions, set AT-SPI values or editable text, list/focus compositor windows through registered Linux window backends when the session permits it, attach best-effort terminal tty/process metadata to terminal windows, send coordinate or element-targeted click/scroll/drag input through the Wayland remote desktop portal when available, and send layout-safe literal type_text through KDE clipboard integration on Plasma Wayland or through portal keysyms on other Wayland sessions before falling back to ydotool. Tools with readOnlyHint=false may mutate local desktop or application state; hosts should require approval for actions that can submit, delete, send, purchase, or overwrite data. For element-targeted actions, prefer element_index from the latest get_app_state result; click, perform_action, and set_value can also use semantic role/name/text/states selectors when the target is unique. type_text and press_key accept optional window_id, pid, app_id, wm_class, title, tty, terminal_pid, terminal_command, or terminal_cwd selectors and refuse targeted input if focus cannot be verified." + instructions = "Begin every turn that uses Computer Use by calling get_app_state. If diagnostics report disabled GNOME accessibility, call setup_accessibility before asking the user to retry. Use list_windows/focused_window before targeted keyboard input. If diagnostics report windowing.can_list_windows=false on GNOME, call setup_window_targeting to install the optional GNOME Shell extension backend, then ask the user to log out and back in if the setup report says a shell reload is required. This Linux backend can capture size-bounded screenshots through GNOME Shell or XDG Desktop Portal, read AT-SPI trees with action/value metadata, invoke native AT-SPI actions, set AT-SPI values or editable text, list/focus compositor windows through registered Linux window backends when the session permits it, attach best-effort terminal tty/process metadata to terminal windows, send coordinate or element-targeted click/scroll/drag input through the Wayland remote desktop portal when available, and send layout-safe literal type_text through KDE clipboard integration on Plasma Wayland or through portal keysyms on other Wayland sessions before falling back to ydotool. Screenshot results include width/height for the returned image plus coordinate_width/coordinate_height and scale for desktop coordinate conversion; request more detail with max_width, max_height, max_bytes, format=jpeg, quality, or a smaller target/crop instead of relying on unbounded screenshots. Tools with readOnlyHint=false may mutate local desktop or application state; hosts should require approval for actions that can submit, delete, send, purchase, or overwrite data. For element-targeted actions, prefer element_index from the latest get_app_state result; click, perform_action, and set_value can also use semantic role/name/text/states selectors when the target is unique. type_text and press_key accept optional window_id, pid, app_id, wm_class, title, tty, terminal_pid, terminal_command, or terminal_cwd selectors and refuse targeted input if focus cannot be verified." )] impl ServerHandler for ComputerUseLinux {} @@ -1145,6 +1171,25 @@ struct GetAppStateParams { max_depth: Option, #[serde(default)] include_screenshot: Option, + /// Maximum returned screenshot width in pixels (default 1920, hard-capped). + #[serde(default)] + max_width: Option, + /// Maximum returned screenshot height in pixels (default 1920, hard-capped). + #[serde(default)] + max_height: Option, + /// Maximum returned screenshot image bytes before base64 (default 2 MiB, hard-capped). + #[serde(default)] + max_bytes: Option, + /// Additional downscale factor from 0.0 to 1.0, applied before max dimensions. + #[serde(default)] + scale: Option, + /// Output image format (default png). Use jpeg with quality to trade exact pixels for smaller payloads. + #[serde(default)] + format: Option, + /// JPEG quality from 1 to 95 (default 80). Ignored for png. + #[serde(default)] + #[schemars(range(min = 1, max = 95))] + quality: Option, } impl GetAppStateParams { @@ -1161,6 +1206,17 @@ impl GetAppStateParams { title: self.title.clone(), } } + + fn screenshot_options(&self) -> ScreenshotPayloadOptions { + ScreenshotPayloadOptions { + max_width: self.max_width, + max_height: self.max_height, + max_bytes: self.max_bytes, + scale: self.scale, + format: self.format, + quality: self.quality, + } + } } #[derive(Debug, Clone, Default, Deserialize, Serialize, JsonSchema)] @@ -1182,6 +1238,25 @@ struct ScreenshotParams { /// Capture the whole desktop even when a window is targeted (default false). #[serde(default)] full_screen: Option, + /// Maximum returned screenshot width in pixels (default 1920, hard-capped). + #[serde(default)] + max_width: Option, + /// Maximum returned screenshot height in pixels (default 1920, hard-capped). + #[serde(default)] + max_height: Option, + /// Maximum returned screenshot image bytes before base64 (default 2 MiB, hard-capped). + #[serde(default)] + max_bytes: Option, + /// Additional downscale factor from 0.0 to 1.0, applied before max dimensions. + #[serde(default)] + scale: Option, + /// Output image format (default png). Use jpeg with quality to trade exact pixels for smaller payloads. + #[serde(default)] + format: Option, + /// JPEG quality from 1 to 95 (default 80). Ignored for png. + #[serde(default)] + #[schemars(range(min = 1, max = 95))] + quality: Option, } impl ScreenshotParams { @@ -1206,6 +1281,17 @@ impl ScreenshotParams { title: self.title.clone(), }) } + + fn screenshot_options(&self) -> ScreenshotPayloadOptions { + ScreenshotPayloadOptions { + max_width: self.max_width, + max_height: self.max_height, + max_bytes: self.max_bytes, + scale: self.scale, + format: self.format, + quality: self.quality, + } + } } #[derive(Debug, Clone, Serialize, JsonSchema)] @@ -2262,13 +2348,13 @@ fn env_flag_enabled_any(keys: &[&str]) -> bool { keys.iter().any(|key| env_flag_enabled(key)) } -/// Decode the base64 payload of a `data:` URL (or a bare base64 string) to bytes. -fn decode_data_url(data_url: &str) -> std::result::Result, String> { - use base64::Engine; - let b64 = data_url.split_once(',').map(|(_, b)| b).unwrap_or(data_url); - base64::engine::general_purpose::STANDARD - .decode(b64) - .map_err(|e| format!("invalid screenshot base64: {e}")) +/// Return the base64 payload of a `data:` URL (or the original string if bare). +fn data_url_payload(data_url: &str) -> String { + data_url + .split_once(',') + .map(|(_, payload)| payload) + .unwrap_or(data_url) + .to_string() } /// Convert a window's bounds into a crop rectangle, if it has a usable origin @@ -2854,6 +2940,43 @@ mod tests { } } + fn solid_png(width: u32, height: u32) -> Vec { + let img = image::RgbaImage::from_pixel(width, height, image::Rgba([32, 128, 192, 255])); + let mut out = Vec::new(); + image::DynamicImage::ImageRgba8(img) + .write_to(&mut std::io::Cursor::new(&mut out), image::ImageFormat::Png) + .unwrap(); + out + } + + #[test] + fn window_crop_happens_before_screenshot_payload_resize() { + let (cropped, width, height) = crop_png(&solid_png(400, 200), 50, 20, 200, 100).unwrap(); + let capture = prepare_screenshot_payload( + RawScreenshotCapture { + mime_type: "image/png".to_string(), + bytes: cropped, + source: "test".to_string(), + width, + height, + }, + ScreenshotPayloadOptions { + max_width: Some(100), + max_height: Some(100), + max_bytes: Some(1024 * 1024), + ..Default::default() + }, + ) + .unwrap(); + + assert_eq!( + (capture.coordinate_width, capture.coordinate_height), + (200, 100) + ); + assert_eq!((capture.width, capture.height), (100, 50)); + assert!(capture.resized); + } + fn window_info( window_id: u64, title: Option<&str>,