From 650935e7ab68cc61b9166bf3bbc52925982b0b59 Mon Sep 17 00:00:00 2001 From: wgqqqqq Date: Thu, 5 Mar 2026 22:40:00 +0800 Subject: [PATCH 1/2] feat: migrate image tool to view_image and harden image flow --- Cargo.toml | 1 + .../desktop/src/api/image_analysis_api.rs | 74 +- src/apps/desktop/src/api/tool_api.rs | 6 +- src/crates/core/Cargo.toml | 1 + .../core/src/agentic/agents/agentic_mode.rs | 2 +- .../image_analysis/image_processing.rs | 328 +++++++++ .../core/src/agentic/image_analysis/mod.rs | 17 +- .../src/agentic/image_analysis/processor.rs | 304 +++----- .../implementations/analyze_image_tool.rs | 687 ------------------ .../src/agentic/tools/implementations/mod.rs | 62 +- .../tools/implementations/view_image_tool.rs | 396 ++++++++++ src/crates/core/src/agentic/tools/registry.rs | 12 +- .../component-library/components/registry.tsx | 6 +- .../src/flow_chat/hooks/useMessageSender.ts | 228 +++++- .../tool-cards/ImageAnalysisCard.tsx | 40 +- src/web-ui/src/flow_chat/tool-cards/index.ts | 6 +- src/web-ui/src/locales/en-US/flow-chat.json | 9 +- src/web-ui/src/locales/zh-CN/flow-chat.json | 9 +- 18 files changed, 1131 insertions(+), 1057 deletions(-) create mode 100644 src/crates/core/src/agentic/image_analysis/image_processing.rs delete mode 100644 src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs create mode 100644 src/crates/core/src/agentic/tools/implementations/view_image_tool.rs diff --git a/Cargo.toml b/Cargo.toml index accfe9ec..cfbbf464 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,7 @@ uuid = { version = "1.0", features = ["v4", "serde"] } chrono = { version = "0.4", features = ["serde", "clock"] } regex = "1.10" base64 = "0.21" +image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp", "bmp"] } md5 = "0.7" once_cell = "1.19.0" lazy_static = "1.4" diff --git a/src/apps/desktop/src/api/image_analysis_api.rs b/src/apps/desktop/src/api/image_analysis_api.rs index 09035c0b..369272ca 100644 --- a/src/apps/desktop/src/api/image_analysis_api.rs +++ b/src/apps/desktop/src/api/image_analysis_api.rs @@ -2,7 +2,10 @@ use crate::api::app_state::AppState; use bitfun_core::agentic::coordination::ConversationCoordinator; -use bitfun_core::agentic::image_analysis::*; +use bitfun_core::agentic::image_analysis::{ + resolve_vision_model_from_ai_config, AnalyzeImagesRequest, ImageAnalysisResult, ImageAnalyzer, + MessageEnhancer, SendEnhancedMessageRequest, +}; use log::error; use std::sync::Arc; use tauri::State; @@ -21,65 +24,26 @@ pub async fn analyze_images( format!("Failed to get AI config: {}", e) })?; - let image_model_id = ai_config - .default_models - .image_understanding - .ok_or_else(|| { - error!("Image understanding model not configured"); - "Image understanding model not configured".to_string() - })?; - - let image_model_id = if image_model_id.is_empty() { - let vision_model = ai_config - .models - .iter() - .find(|m| { - m.enabled - && m.capabilities.iter().any(|cap| { - matches!( - cap, - bitfun_core::service::config::types::ModelCapability::ImageUnderstanding - ) - }) - }) - .map(|m| m.id.as_str()); - - match vision_model { - Some(model_id) => model_id, - None => { - error!("No image understanding model found"); - return Err( - "Image understanding model not configured and no compatible model found.\n\n\ - Please add a model that supports image understanding\ - in [Settings → AI Model Config], enable 'image_understanding' capability, \ - and assign it in [Settings → Super Agent]." - .to_string(), - ); - } - } - } else { - &image_model_id - }; - - let image_model = ai_config - .models - .iter() - .find(|m| &m.id == image_model_id) - .ok_or_else(|| { - error!( - "Model not found: model_id={}, available_models={:?}", - image_model_id, - ai_config.models.iter().map(|m| &m.id).collect::>() - ); - format!("Model not found: {}", image_model_id) - })? - .clone(); + let image_model = resolve_vision_model_from_ai_config(&ai_config).map_err(|e| { + error!( + "No image understanding model available: available_models={:?}, error={}", + ai_config.models.iter().map(|m| &m.id).collect::>(), + e + ); + format!( + "Image understanding model not configured and no compatible model found.\n\n\ + Please add a model that supports image understanding \ + in [Settings → AI Model Config], enable 'image_understanding' capability, \ + and assign it in [Settings → Super Agent].\n\nDetails: {}", + e + ) + })?; let workspace_path = state.workspace_path.read().await.clone(); let ai_client = state .ai_client_factory - .get_client_by_id(image_model_id) + .get_client_by_id(&image_model.id) .await .map_err(|e| format!("Failed to create AI client: {}", e))?; diff --git a/src/apps/desktop/src/api/tool_api.rs b/src/apps/desktop/src/api/tool_api.rs index 3cca80df..86fa2928 100644 --- a/src/apps/desktop/src/api/tool_api.rs +++ b/src/apps/desktop/src/api/tool_api.rs @@ -3,7 +3,9 @@ use log::error; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::sync::Arc; +use crate::api::context_upload_api::create_image_context_provider; use bitfun_core::agentic::{ tools::framework::ToolUseContext, tools::{get_all_tools, get_readonly_tools}, @@ -171,7 +173,7 @@ pub async fn validate_tool_input( read_file_timestamps: HashMap::new(), options: None, response_state: None, - image_context_provider: None, + image_context_provider: Some(Arc::new(create_image_context_provider())), subagent_parent_info: None, cancellation_token: None, }; @@ -210,7 +212,7 @@ pub async fn execute_tool(request: ToolExecutionRequest) -> Result, + pub mime_type: String, + pub width: u32, + pub height: u32, +} + +pub fn resolve_vision_model_from_ai_config( + ai_config: &ServiceAIConfig, +) -> BitFunResult { + let target_model_id = ai_config + .default_models + .image_understanding + .as_ref() + .filter(|id| !id.is_empty()); + + if let Some(id) = target_model_id { + return ai_config + .models + .iter() + .find(|m| m.id == *id) + .cloned() + .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id))); + } + + ai_config + .models + .iter() + .find(|m| { + m.enabled + && m.capabilities + .iter() + .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding)) + }) + .cloned() + .ok_or_else(|| { + BitFunError::service( + "No image understanding model found.\nPlease configure an image understanding model in settings" + .to_string(), + ) + }) +} + +pub async fn resolve_vision_model_from_global_config() -> BitFunResult { + let config_service = get_global_config_service().await?; + let ai_config: ServiceAIConfig = config_service + .get_config(Some("ai")) + .await + .map_err(|e| BitFunError::service(format!("Failed to get AI config: {}", e)))?; + + resolve_vision_model_from_ai_config(&ai_config) +} + +pub fn resolve_image_path(path: &str, workspace_path: Option<&Path>) -> BitFunResult { + let path_buf = PathBuf::from(path); + + if path_buf.is_absolute() { + Ok(path_buf) + } else if let Some(workspace) = workspace_path { + Ok(workspace.join(path_buf)) + } else { + Ok(path_buf) + } +} + +pub async fn load_image_from_path( + path: &Path, + _workspace_path: Option<&Path>, +) -> BitFunResult> { + fs::read(path) + .await + .map_err(|e| BitFunError::io(format!("Failed to read image: {}", e))) +} + +pub fn decode_data_url(data_url: &str) -> BitFunResult<(Vec, Option)> { + if !data_url.starts_with("data:") { + return Err(BitFunError::validation("Invalid data URL format")); + } + + let parts: Vec<&str> = data_url.splitn(2, ',').collect(); + if parts.len() != 2 { + return Err(BitFunError::validation("Data URL format error")); + } + + let header = parts[0]; + let mime_type = header + .strip_prefix("data:") + .and_then(|s| s.split(';').next()) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(ToString::to_string); + + let base64_data = parts[1]; + let image_data = BASE64 + .decode(base64_data) + .map_err(|e| BitFunError::parse(format!("Base64 decode failed: {}", e)))?; + + Ok((image_data, mime_type)) +} + +pub fn detect_mime_type_from_bytes( + image_data: &[u8], + fallback_mime: Option<&str>, +) -> BitFunResult { + if let Ok(format) = image::guess_format(image_data) { + if let Some(mime) = image_format_to_mime(format) { + return Ok(mime.to_string()); + } + } + + if let Some(fallback) = fallback_mime { + if fallback.starts_with("image/") { + return Ok(fallback.to_string()); + } + } + + Err(BitFunError::validation( + "Unsupported or unrecognized image format", + )) +} + +pub fn optimize_image_for_provider( + image_data: Vec, + provider: &str, + fallback_mime: Option<&str>, +) -> BitFunResult { + let limits = ImageLimits::for_provider(provider); + + let guessed_format = image::guess_format(&image_data).ok(); + let dynamic = image::load_from_memory(&image_data) + .map_err(|e| BitFunError::validation(format!("Failed to decode image data: {}", e)))?; + + let (orig_width, orig_height) = (dynamic.width(), dynamic.height()); + let needs_resize = orig_width > limits.max_width || orig_height > limits.max_height; + + if !needs_resize && image_data.len() <= limits.max_size { + let mime_type = detect_mime_type_from_bytes(&image_data, fallback_mime)?; + return Ok(ProcessedImage { + data: image_data, + mime_type, + width: orig_width, + height: orig_height, + }); + } + + let mut working = if needs_resize { + dynamic.resize(limits.max_width, limits.max_height, FilterType::Triangle) + } else { + dynamic + }; + + let preferred_format = match guessed_format { + Some(ImageFormat::Jpeg) => ImageFormat::Jpeg, + _ => ImageFormat::Png, + }; + + let mut encoded = encode_dynamic_image(&working, preferred_format, 85)?; + + if encoded.0.len() > limits.max_size { + for quality in [80u8, 65, 50, 35] { + encoded = encode_dynamic_image(&working, ImageFormat::Jpeg, quality)?; + if encoded.0.len() <= limits.max_size { + break; + } + } + } + + if encoded.0.len() > limits.max_size { + for _ in 0..3 { + let next_w = ((working.width() as f32) * 0.85).round().max(64.0) as u32; + let next_h = ((working.height() as f32) * 0.85).round().max(64.0) as u32; + if next_w == working.width() && next_h == working.height() { + break; + } + + working = working.resize(next_w, next_h, FilterType::Triangle); + + for quality in [70u8, 55, 40] { + encoded = encode_dynamic_image(&working, ImageFormat::Jpeg, quality)?; + if encoded.0.len() <= limits.max_size { + break; + } + } + + if encoded.0.len() <= limits.max_size { + break; + } + } + } + + Ok(ProcessedImage { + data: encoded.0, + mime_type: encoded.1, + width: working.width(), + height: working.height(), + }) +} + +pub fn build_multimodal_message( + prompt: &str, + image_data: &[u8], + mime_type: &str, + provider: &str, +) -> BitFunResult> { + let base64_data = BASE64.encode(image_data); + let provider_lower = provider.to_lowercase(); + + let message = if provider_lower.contains("anthropic") { + Message { + role: "user".to_string(), + content: Some(serde_json::to_string(&json!([ + { + "type": "image", + "source": { + "type": "base64", + "media_type": mime_type, + "data": base64_data + } + }, + { + "type": "text", + "text": prompt + } + ]))?), + reasoning_content: None, + thinking_signature: None, + tool_calls: None, + tool_call_id: None, + name: None, + } + } else { + // Default to OpenAI-compatible payload shape for OpenAI and most OpenAI-compatible providers. + Message { + role: "user".to_string(), + content: Some(serde_json::to_string(&json!([ + { + "type": "image_url", + "image_url": { + "url": format!("data:{};base64,{}", mime_type, base64_data) + } + }, + { + "type": "text", + "text": prompt + } + ]))?), + reasoning_content: None, + thinking_signature: None, + tool_calls: None, + tool_call_id: None, + name: None, + } + }; + + Ok(vec![message]) +} + +fn image_format_to_mime(format: ImageFormat) -> Option<&'static str> { + match format { + ImageFormat::Png => Some("image/png"), + ImageFormat::Jpeg => Some("image/jpeg"), + ImageFormat::Gif => Some("image/gif"), + ImageFormat::WebP => Some("image/webp"), + ImageFormat::Bmp => Some("image/bmp"), + _ => None, + } +} + +fn encode_dynamic_image( + image: &DynamicImage, + format: ImageFormat, + jpeg_quality: u8, +) -> BitFunResult<(Vec, String)> { + let target_format = match format { + ImageFormat::Jpeg => ImageFormat::Jpeg, + _ => ImageFormat::Png, + }; + + let mut buffer = Vec::new(); + + match target_format { + ImageFormat::Png => { + let rgba = image.to_rgba8(); + let encoder = PngEncoder::new(&mut buffer); + encoder + .write_image( + rgba.as_raw(), + image.width(), + image.height(), + ColorType::Rgba8.into(), + ) + .map_err(|e| BitFunError::tool(format!("PNG encode failed: {}", e)))?; + } + ImageFormat::Jpeg => { + let mut encoder = JpegEncoder::new_with_quality(&mut buffer, jpeg_quality); + encoder + .encode_image(image) + .map_err(|e| BitFunError::tool(format!("JPEG encode failed: {}", e)))?; + } + _ => unreachable!("unsupported target format"), + } + + let mime = image_format_to_mime(target_format) + .unwrap_or("image/png") + .to_string(); + + Ok((buffer, mime)) +} diff --git a/src/crates/core/src/agentic/image_analysis/mod.rs b/src/crates/core/src/agentic/image_analysis/mod.rs index 2b02ebf4..814afb66 100644 --- a/src/crates/core/src/agentic/image_analysis/mod.rs +++ b/src/crates/core/src/agentic/image_analysis/mod.rs @@ -1,12 +1,17 @@ //! Image Analysis Module -//! +//! //! Implements image pre-understanding functionality, converting image content to text descriptions -pub mod types; -pub mod processor; pub mod enhancer; +pub mod image_processing; +pub mod processor; +pub mod types; -pub use types::*; -pub use processor::ImageAnalyzer; pub use enhancer::MessageEnhancer; - +pub use image_processing::{ + build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path, + optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_ai_config, + resolve_vision_model_from_global_config, ProcessedImage, +}; +pub use processor::ImageAnalyzer; +pub use types::*; diff --git a/src/crates/core/src/agentic/image_analysis/processor.rs b/src/crates/core/src/agentic/image_analysis/processor.rs index 145b0ae1..2363738d 100644 --- a/src/crates/core/src/agentic/image_analysis/processor.rs +++ b/src/crates/core/src/agentic/image_analysis/processor.rs @@ -1,18 +1,18 @@ //! Image Processor //! -//! Handles image loading, compression, format conversion, and other operations +//! Handles image loading, preprocessing, multimodal message construction, and response parsing. -use super::types::{AnalyzeImagesRequest, ImageAnalysisResult, ImageContextData, ImageLimits}; +use super::image_processing::{ + build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path, + optimize_image_for_provider, resolve_image_path, +}; +use super::types::{AnalyzeImagesRequest, ImageAnalysisResult, ImageContextData}; use crate::infrastructure::ai::AIClient; use crate::service::config::types::AIModelConfig; use crate::util::errors::*; -use crate::util::types::Message; -use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; -use log::{debug, error, info}; -use serde_json::json; -use std::path::{Path, PathBuf}; +use log::{debug, error, info, warn}; +use std::path::PathBuf; use std::sync::Arc; -use tokio::fs; /// Image Analyzer pub struct ImageAnalyzer { @@ -36,7 +36,6 @@ impl ImageAnalyzer { ) -> BitFunResult> { info!("Starting analysis of {} images", request.images.len()); - // Process multiple images in parallel let mut tasks = vec![]; for img_ctx in request.images { @@ -59,7 +58,6 @@ impl ImageAnalyzer { tasks.push(task); } - // Wait for all analyses to complete let mut results = vec![]; for task in tasks { match task.await { @@ -70,7 +68,10 @@ impl ImageAnalyzer { } Err(e) => { error!("Image analysis task failed: {:?}", e); - return Err(BitFunError::service(format!("Image analysis task failed: {}", e))); + return Err(BitFunError::service(format!( + "Image analysis task failed: {}", + e + ))); } } } @@ -79,7 +80,6 @@ impl ImageAnalyzer { Ok(results) } - /// Analyze a single image async fn analyze_single_image( image_ctx: ImageContextData, model: &AIModelConfig, @@ -91,42 +91,35 @@ impl ImageAnalyzer { debug!("Analyzing image: {}", image_ctx.id); - // 1. Load image - let image_data = + let (image_data, fallback_mime) = Self::load_image_from_context(&image_ctx, workspace_path.as_deref()).await?; - // 2. Image preprocessing (compression, format conversion) - let (optimized_data, mime_type) = - Self::optimize_image_for_model(image_data, &image_ctx.mime_type, model)?; - - // 3. Convert to Base64 - let base64_data = BASE64.encode(&optimized_data); + let processed = + optimize_image_for_provider(image_data, &model.provider, fallback_mime.as_deref())?; debug!( - "Image processing completed: original_type={}, optimized_type={}, size={}KB", - image_ctx.mime_type, - mime_type, - optimized_data.len() / 1024 + "Image processing completed: mime={}, size={}KB, dimensions={}x{}", + processed.mime_type, + processed.data.len() / 1024, + processed.width, + processed.height ); - // 4. Build analysis prompt let analysis_prompt = Self::build_image_analysis_prompt(user_context); - // 5. Build multimodal message - let messages = Self::build_multimodal_message( + let messages = build_multimodal_message( &analysis_prompt, - &base64_data, - &mime_type, + &processed.data, + &processed.mime_type, &model.provider, )?; - // Save complete multimodal message to AI log debug!(target: "ai::image_analysis_request", "Complete multimodal message:\n{}", - serde_json::to_string_pretty(&messages).unwrap_or_else(|_| "Serialization failed".to_string()) + serde_json::to_string_pretty(&messages) + .unwrap_or_else(|_| "Serialization failed".to_string()) ); - // 6. Call AI model for image analysis debug!( "Calling vision model: image_id={}, model={}", image_ctx.id, model.model_name @@ -138,100 +131,38 @@ impl ImageAnalyzer { debug!("AI response content: {}", ai_response.text); - // 7. Parse response into structured result - let mut analysis_result = Self::parse_analysis_response(&ai_response.text, &image_ctx.id)?; - - let elapsed = start.elapsed().as_millis() as u64; - analysis_result.analysis_time_ms = elapsed; + let mut analysis_result = Self::parse_analysis_response(&ai_response.text, &image_ctx.id); + analysis_result.analysis_time_ms = start.elapsed().as_millis() as u64; info!( "Image analysis completed: image_id={}, duration={}ms", - image_ctx.id, elapsed + image_ctx.id, analysis_result.analysis_time_ms ); Ok(analysis_result) } - /// Load image from context async fn load_image_from_context( ctx: &ImageContextData, - workspace_path: Option<&Path>, - ) -> BitFunResult> { + workspace_path: Option<&std::path::Path>, + ) -> BitFunResult<(Vec, Option)> { if let Some(data_url) = &ctx.data_url { - // Parse from data URL - Self::decode_data_url(data_url) - } else if let Some(path_str) = &ctx.image_path { - // Load from file path - let path = PathBuf::from(path_str); - - // Security check: ensure path is within workspace - if let Some(workspace) = workspace_path { - let canonical_path = tokio::fs::canonicalize(&path) - .await - .map_err(|e| BitFunError::io(format!("Image file does not exist: {}", e)))?; - let canonical_workspace = tokio::fs::canonicalize(workspace) - .await - .map_err(|e| BitFunError::io(format!("Invalid workspace path: {}", e)))?; - - if !canonical_path.starts_with(&canonical_workspace) { - return Err(BitFunError::validation("Image path must be within workspace")); - } - } - - fs::read(&path) - .await - .map_err(|e| BitFunError::io(format!("Failed to read image: {}", e))) - } else { - Err(BitFunError::validation("Image context missing path or data")) - } - } - - /// Decode data URL - fn decode_data_url(data_url: &str) -> BitFunResult> { - // data:image/png;base64,iVBORw0KG... - if !data_url.starts_with("data:") { - return Err(BitFunError::validation("Invalid data URL format")); + let (data, mime) = decode_data_url(data_url)?; + return Ok((data, mime.or_else(|| Some(ctx.mime_type.clone())))); } - let parts: Vec<&str> = data_url.splitn(2, ',').collect(); - if parts.len() != 2 { - return Err(BitFunError::validation("Data URL format error")); + if let Some(path_str) = &ctx.image_path { + let path = resolve_image_path(path_str, workspace_path)?; + let data = load_image_from_path(&path, workspace_path).await?; + let detected_mime = detect_mime_type_from_bytes(&data, Some(&ctx.mime_type)).ok(); + return Ok((data, detected_mime.or_else(|| Some(ctx.mime_type.clone())))); } - let base64_data = parts[1]; - BASE64 - .decode(base64_data) - .map_err(|e| BitFunError::parse(format!("Base64 decoding failed: {}", e))) - } - - /// Optimize image (compression, format conversion) - fn optimize_image_for_model( - image_data: Vec, - original_mime: &str, - model: &AIModelConfig, - ) -> BitFunResult<(Vec, String)> { - // Get model limits - let limits = ImageLimits::for_provider(&model.provider); - - // If image size is within limit, return directly - if image_data.len() <= limits.max_size { - debug!("Image size within limit, no compression needed"); - return Ok((image_data, original_mime.to_string())); - } - - info!( - "Image size {}KB exceeds limit {}KB, compression needed", - image_data.len() / 1024, - limits.max_size / 1024 - ); - - // TODO: Use image crate for actual compression - - // Temporarily return original image, compression logic to be implemented later - Ok((image_data, original_mime.to_string())) + Err(BitFunError::validation( + "Image context missing path or data", + )) } - /// Build image analysis prompt fn build_image_analysis_prompt(user_context: Option<&str>) -> String { let mut prompt = String::from( "Please analyze the content of this image in detail. Output in the following JSON format:\n\n\ @@ -261,119 +192,63 @@ impl ImageAnalyzer { prompt } - /// Build multimodal message - fn build_multimodal_message( - prompt: &str, - base64_data: &str, - mime_type: &str, - provider: &str, - ) -> BitFunResult> { - let message = match provider.to_lowercase().as_str() { - "openai" => { - // OpenAI format (Zhipu AI compatible) - // Note: - // 1. Zhipu AI only supports url field, does not support detail parameter - // 2. Image must come first, text after (consistent with official examples) - Message { - role: "user".to_string(), - content: Some(serde_json::to_string(&json!([ - { - "type": "image_url", - "image_url": { - "url": format!("data:{};base64,{}", mime_type, base64_data) - } - }, - { - "type": "text", - "text": prompt - } - ]))?), - reasoning_content: None, - thinking_signature: None, - tool_calls: None, - tool_call_id: None, - name: None, - } - } - "anthropic" => { - // Anthropic format (content is an array) - Message { - role: "user".to_string(), - content: Some(serde_json::to_string(&json!([ - { - "type": "image", - "source": { - "type": "base64", - "media_type": mime_type, - "data": base64_data - } - }, - { - "type": "text", - "text": prompt - } - ]))?), - reasoning_content: None, - thinking_signature: None, - tool_calls: None, - tool_call_id: None, - name: None, - } - } - _ => { - return Err(BitFunError::validation(format!( - "Unsupported provider: {}", - provider - ))); - } - }; + fn parse_analysis_response(response: &str, image_id: &str) -> ImageAnalysisResult { + let json_str = Self::extract_json_from_markdown(response).unwrap_or(response); - Ok(vec![message]) - } + if let Ok(parsed) = serde_json::from_str::(json_str) { + return ImageAnalysisResult { + image_id: image_id.to_string(), + summary: parsed["summary"] + .as_str() + .unwrap_or("Image analysis completed") + .to_string(), + detailed_description: parsed["detailed_description"] + .as_str() + .unwrap_or(response) + .to_string(), + detected_elements: parsed["detected_elements"] + .as_array() + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str()) + .map(String::from) + .collect() + }) + .unwrap_or_default(), + confidence: parsed["confidence"].as_f64().unwrap_or(0.8) as f32, + analysis_time_ms: 0, + }; + } - /// Parse AI response into structured result - fn parse_analysis_response( - response: &str, - image_id: &str, - ) -> BitFunResult { - // Extract JSON - let json_str = Self::extract_json_from_markdown(response).unwrap_or(response); + warn!( + "Image analysis response is not valid JSON, falling back to plain text: image_id={}", + image_id + ); - // Parse JSON - let parsed: serde_json::Value = serde_json::from_str(json_str).map_err(|e| { - BitFunError::parse(format!( - "Failed to parse image analysis result: {}. Original response: {}", - e, response - )) - })?; + let cleaned = response.trim(); + let summary = if cleaned.is_empty() { + "Image analysis completed".to_string() + } else { + cleaned + .lines() + .next() + .unwrap_or("Image analysis completed") + .chars() + .take(140) + .collect() + }; - Ok(ImageAnalysisResult { + ImageAnalysisResult { image_id: image_id.to_string(), - summary: parsed["summary"] - .as_str() - .unwrap_or("Image analysis completed") - .to_string(), - detailed_description: parsed["detailed_description"] - .as_str() - .unwrap_or("") - .to_string(), - detected_elements: parsed["detected_elements"] - .as_array() - .map(|arr| { - arr.iter() - .filter_map(|v| v.as_str()) - .map(String::from) - .collect() - }) - .unwrap_or_default(), - confidence: parsed["confidence"].as_f64().unwrap_or(0.8) as f32, - analysis_time_ms: 0, // Will be filled externally - }) + summary, + detailed_description: cleaned.to_string(), + detected_elements: Vec::new(), + confidence: 0.5, + analysis_time_ms: 0, + } } - /// Extract JSON from Markdown code block fn extract_json_from_markdown(text: &str) -> Option<&str> { - // 1. Try to extract Zhipu AI's special marker format <|begin_of_box|>...<|end_of_box|> if let Some(start_idx) = text.find("<|begin_of_box|>") { let content_start = start_idx + "<|begin_of_box|>".len(); if let Some(end_idx) = text[content_start..].find("<|end_of_box|>") { @@ -383,7 +258,6 @@ impl ImageAnalyzer { } } - // 2. Try to extract Markdown code block format ```json ... ``` or ``` ... ``` let start_markers = ["```json\n", "```\n"]; for marker in &start_markers { diff --git a/src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs b/src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs deleted file mode 100644 index 4e4475fe..00000000 --- a/src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs +++ /dev/null @@ -1,687 +0,0 @@ -//! Image analysis tool - allows Agent to analyze image content on demand -//! -//! Provides flexible image analysis capabilities, Agent can customize analysis prompts and focus areas - -use async_trait::async_trait; -use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; -use log::{debug, info, trace}; -use serde::Deserialize; -use serde_json::{json, Value}; -use std::path::{Path, PathBuf}; -use std::sync::Arc; -use tokio::fs; - -use crate::agentic::tools::framework::{ - Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult, -}; -use crate::infrastructure::ai::AIClient; -use crate::infrastructure::{get_path_manager_arc, get_workspace_path}; -use crate::service::config::types::{AIConfig as ServiceAIConfig, AIModelConfig, GlobalConfig}; -use crate::util::errors::{BitFunError, BitFunResult}; -use crate::util::types::{AIConfig as ModelConfig, Message}; - -/// Image analysis tool input -#[derive(Debug, Deserialize)] -struct AnalyzeImageInput { - /// Image path (relative to workspace or absolute path) - #[serde(default)] - image_path: Option, - /// Base64-encoded image data (clipboard image) - #[serde(default)] - data_url: Option, - /// Image ID (retrieved from temporary storage, for clipboard images) - #[serde(default)] - image_id: Option, - /// Analysis prompt - analysis_prompt: String, - /// Focus areas (optional) - #[serde(default)] - focus_areas: Option>, - /// Detail level (optional) - #[serde(default)] - detail_level: Option, -} - -/// Image analysis tool -pub struct AnalyzeImageTool; - -impl AnalyzeImageTool { - pub fn new() -> Self { - Self - } - - /// Resolve image path (supports relative and absolute paths) - fn resolve_image_path(&self, path: &str) -> BitFunResult { - let path_buf = PathBuf::from(path); - - if path_buf.is_absolute() { - Ok(path_buf) - } else { - let workspace_path = get_workspace_path() - .ok_or_else(|| BitFunError::tool("Workspace path not set".to_string()))?; - Ok(workspace_path.join(path)) - } - } - - /// Load image file - async fn load_image(&self, path: &Path) -> BitFunResult> { - // Security check: ensure path is within workspace - if let Some(workspace_path) = get_workspace_path() { - let canonical_path = tokio::fs::canonicalize(path) - .await - .map_err(|e| BitFunError::io(format!("Image file does not exist: {}", e)))?; - let canonical_workspace = tokio::fs::canonicalize(&workspace_path) - .await - .map_err(|e| BitFunError::io(format!("Invalid workspace path: {}", e)))?; - - if !canonical_path.starts_with(&canonical_workspace) { - return Err(BitFunError::validation( - "Image path must be within workspace", - )); - } - } - - fs::read(path) - .await - .map_err(|e| BitFunError::io(format!("Failed to read image: {}", e))) - } - - /// Detect image MIME type - fn detect_mime_type(&self, path: &Path) -> BitFunResult { - let extension = path - .extension() - .and_then(|e| e.to_str()) - .ok_or_else(|| BitFunError::validation("Unable to determine image format"))? - .to_lowercase(); - - let mime_type = match extension.as_str() { - "png" => "image/png", - "jpg" | "jpeg" => "image/jpeg", - "gif" => "image/gif", - "webp" => "image/webp", - "bmp" => "image/bmp", - _ => { - return Err(BitFunError::validation(format!( - "Unsupported image format: {}", - extension - ))) - } - }; - - Ok(mime_type.to_string()) - } - - /// Get image dimensions (simple implementation) - fn get_image_dimensions(&self, _data: &[u8]) -> (u32, u32) { - // TODO: Implement real image dimension detection - (0, 0) - } - - /// Decode data URL - fn decode_data_url(&self, data_url: &str) -> BitFunResult<(Vec, String)> { - // data:image/png;base64,iVBORw0KG... - if !data_url.starts_with("data:") { - return Err(BitFunError::validation("Invalid data URL format")); - } - - let parts: Vec<&str> = data_url.splitn(2, ',').collect(); - if parts.len() != 2 { - return Err(BitFunError::validation("Data URL format error")); - } - - // Extract MIME type - let header = parts[0]; - let mime_type = header - .strip_prefix("data:") - .and_then(|s| s.split(';').next()) - .unwrap_or("image/png") - .to_string(); - - // Decode base64 - let base64_data = parts[1]; - let image_data = BASE64 - .decode(base64_data) - .map_err(|e| BitFunError::parse(format!("Base64 decode failed: {}", e)))?; - - debug!( - "Decoded image from data URL: mime={}, size_kb={}", - mime_type, - image_data.len() / 1024 - ); - - Ok((image_data, mime_type)) - } - - /// Load AI configuration from config file - async fn load_ai_config(&self) -> BitFunResult { - let path_manager = get_path_manager_arc(); - let config_file = path_manager.app_config_file(); - - if !config_file.exists() { - return Err(BitFunError::tool("Config file does not exist".to_string())); - } - - let config_content = tokio::fs::read_to_string(&config_file) - .await - .map_err(|e| BitFunError::tool(format!("Failed to read config file: {}", e)))?; - - let global_config: GlobalConfig = serde_json::from_str(&config_content) - .map_err(|e| BitFunError::tool(format!("Failed to parse config file: {}", e)))?; - - Ok(global_config.ai) - } - - /// Get vision model configuration - async fn get_vision_model(&self) -> BitFunResult { - let ai_config = self.load_ai_config().await?; - - let target_model_id = ai_config - .default_models - .image_understanding - .as_ref() - .filter(|id| !id.is_empty()); - - let model = if let Some(id) = target_model_id { - ai_config - .models - .iter() - .find(|m| m.id == *id) - .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id)))? - .clone() - } else { - ai_config - .models - .iter() - .find(|m| { - m.enabled - && m.capabilities.iter().any(|cap| { - matches!( - cap, - crate::service::config::types::ModelCapability::ImageUnderstanding - ) - }) - }) - .ok_or_else(|| { - BitFunError::service( - "No image understanding model found.\n\ - Please configure an image understanding model in settings" - .to_string(), - ) - })? - .clone() - }; - - Ok(model) - } - - /// Build analysis prompt - fn build_prompt( - &self, - analysis_prompt: &str, - focus_areas: &Option>, - detail_level: &Option, - ) -> String { - let mut prompt = String::new(); - - // 1. User's analysis prompt - prompt.push_str(analysis_prompt); - prompt.push_str("\n\n"); - - if let Some(areas) = focus_areas { - if !areas.is_empty() { - prompt.push_str("Please pay special attention to the following aspects:\n"); - for area in areas { - prompt.push_str(&format!("- {}\n", area)); - } - prompt.push_str("\n"); - } - } - - let detail_guide = match detail_level.as_deref() { - Some("brief") => "Please answer concisely in 1-2 sentences.", - Some("detailed") => { - "Please provide a detailed analysis including all relevant details." - } - _ => "Please provide a moderate level of analysis detail.", - }; - prompt.push_str(detail_guide); - - prompt - } - - /// Build multimodal message - fn build_multimodal_message( - &self, - prompt: &str, - base64_data: &str, - mime_type: &str, - provider: &str, - ) -> BitFunResult> { - let message = match provider.to_lowercase().as_str() { - "openai" => Message { - role: "user".to_string(), - content: Some(serde_json::to_string(&json!([ - { - "type": "image_url", - "image_url": { - "url": format!("data:{};base64,{}", mime_type, base64_data) - } - }, - { - "type": "text", - "text": prompt - } - ]))?), - reasoning_content: None, - thinking_signature: None, - tool_calls: None, - tool_call_id: None, - name: None, - }, - "anthropic" => Message { - role: "user".to_string(), - content: Some(serde_json::to_string(&json!([ - { - "type": "image", - "source": { - "type": "base64", - "media_type": mime_type, - "data": base64_data - } - }, - { - "type": "text", - "text": prompt - } - ]))?), - reasoning_content: None, - thinking_signature: None, - tool_calls: None, - tool_call_id: None, - name: None, - }, - _ => { - return Err(BitFunError::validation(format!( - "Unsupported provider: {}", - provider - ))); - } - }; - - Ok(vec![message]) - } -} - -#[async_trait] -impl Tool for AnalyzeImageTool { - fn name(&self) -> &str { - "AnalyzeImage" - } - - async fn description(&self) -> BitFunResult { - Ok(r#"Analyzes image content and returns detailed descriptions. Use this tool when the user uploads images and asks related questions. - -Core Capabilities: -- Identify objects, text, structures and other content in images -- Understand technical diagrams (architecture diagrams, flowcharts, UML diagrams, etc.) -- Extract code and error messages from code screenshots -- Analyze UI designs and interface layouts -- Recognize data, tables, and charts in images - -Usage Scenarios: -1. User uploads architecture diagram and asks architecture questions → Analyze components and relationships -2. User uploads error screenshot → Extract error messages and stack traces -3. User uploads code screenshot → Identify code content -4. User uploads UI design → Analyze design elements and layout -5. User uploads data charts → Interpret data and trends - -Important Notes: -- You can customize analysis_prompt to precisely control the analysis angle and focus -- Use focus_areas parameter to specify aspects to emphasize -- Choose detail_level as needed (brief/normal/detailed) -- The same image can be analyzed multiple times for different aspects"#.to_string()) - } - - fn input_schema(&self) -> Value { - json!({ - "type": "object", - "properties": { - "image_path": { - "type": "string", - "description": "Path to the image file (relative to workspace or absolute path).\nExamples: 'screenshot.png' or 'docs/architecture.png'\nNote: Provide ONE of: image_path, data_url, or (image_id + session_id)." - }, - "data_url": { - "type": "string", - "description": "Base64-encoded image data.\nFormat: 'data:image/png;base64,iVBORw0KG...'\nNot recommended for large images due to token cost." - }, - "image_id": { - "type": "string", - "description": "Image ID for clipboard images stored in temporary cache.\nExample: 'img-clipboard-1234567890-abc123'" - }, - "analysis_prompt": { - "type": "string", - "description": "Analysis prompt describing what information you want to extract from the image.\n\ - Examples:\n\ - - 'What is this architecture diagram? What components and connections does it contain?'\n\ - - 'Extract all error messages and stack traces from this screenshot'\n\ - - 'Describe the layout structure and interactive elements of this UI'" - }, - "focus_areas": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Optional. List of aspects to focus on.\nExamples: ['technical architecture', 'data flow'] or ['UI layout', 'color scheme']" - }, - "detail_level": { - "type": "string", - "enum": ["brief", "normal", "detailed"], - "description": "Optional. Level of analysis detail.\n- brief: Brief summary (1-2 sentences)\n- normal: Normal detail (default)\n- detailed: Detailed analysis (includes all relevant details)" - } - }, - "required": ["analysis_prompt"] - }) - } - - fn is_readonly(&self) -> bool { - true - } - - fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool { - true - } - - fn needs_permissions(&self, _input: Option<&Value>) -> bool { - false - } - - async fn validate_input( - &self, - input: &Value, - _context: Option<&ToolUseContext>, - ) -> ValidationResult { - // Check if image_path, data_url, or (image_id + session_id) is provided - let has_path = input - .get("image_path") - .and_then(|v| v.as_str()) - .filter(|s| !s.is_empty()) - .is_some(); - let has_data_url = input - .get("data_url") - .and_then(|v| v.as_str()) - .filter(|s| !s.is_empty()) - .is_some(); - let has_image_id = input - .get("image_id") - .and_then(|v| v.as_str()) - .filter(|s| !s.is_empty()) - .is_some(); - - if !has_path && !has_data_url && !has_image_id { - return ValidationResult { - result: false, - message: Some("Must provide one of image_path, data_url, or image_id".to_string()), - error_code: Some(400), - meta: None, - }; - } - - if let Some(prompt) = input.get("analysis_prompt").and_then(|v| v.as_str()) { - if prompt.is_empty() { - return ValidationResult { - result: false, - message: Some("analysis_prompt cannot be empty".to_string()), - error_code: Some(400), - meta: None, - }; - } - } else { - return ValidationResult { - result: false, - message: Some("analysis_prompt is required".to_string()), - error_code: Some(400), - meta: None, - }; - } - - if let Some(image_path) = input.get("image_path").and_then(|v| v.as_str()) { - if !image_path.is_empty() { - match self.resolve_image_path(image_path) { - Ok(path) => { - if !path.exists() { - return ValidationResult { - result: false, - message: Some(format!("Image file does not exist: {}", image_path)), - error_code: Some(404), - meta: None, - }; - } - - if !path.is_file() { - return ValidationResult { - result: false, - message: Some(format!("Path is not a file: {}", image_path)), - error_code: Some(400), - meta: None, - }; - } - } - Err(e) => { - return ValidationResult { - result: false, - message: Some(format!("Path parsing failed: {}", e)), - error_code: Some(400), - meta: None, - }; - } - } - } - } - - ValidationResult { - result: true, - message: None, - error_code: None, - meta: None, - } - } - - fn render_tool_use_message(&self, input: &Value, options: &ToolRenderOptions) -> String { - // Determine image source - let image_source = if let Some(path) = input.get("image_path").and_then(|v| v.as_str()) { - if !path.is_empty() { - path.to_string() - } else { - "Clipboard image".to_string() - } - } else if input.get("data_url").is_some() { - "Clipboard image".to_string() - } else { - "unknown".to_string() - }; - - if options.verbose { - let prompt = input - .get("analysis_prompt") - .and_then(|v| v.as_str()) - .unwrap_or("..."); - format!( - "Analyzing image: {} (prompt: {})", - image_source, - if prompt.len() > 50 { - // Safe truncation: find the maximum character boundary not exceeding 50 bytes - let pos = prompt - .char_indices() - .take_while(|(i, _)| *i < 50) - .last() - .map(|(i, c)| i + c.len_utf8()) - .unwrap_or(0); - format!("{}...", &prompt[..pos]) - } else { - prompt.to_string() - } - ) - } else { - format!("Analyzing image: {}", image_source) - } - } - - async fn call_impl( - &self, - input: &Value, - _context: &ToolUseContext, - ) -> BitFunResult> { - let start = std::time::Instant::now(); - - // Parse input - let input_data: AnalyzeImageInput = serde_json::from_value(input.clone()) - .map_err(|e| BitFunError::parse(format!("Failed to parse input: {}", e)))?; - - let has_data_url = input_data.data_url.is_some(); - let has_path = input_data.image_path.is_some(); - let has_image_id = input_data.image_id.is_some(); - - if !has_data_url && !has_path && !has_image_id { - return Err(BitFunError::validation( - "Must provide one of image_path, data_url, or image_id", - )); - } - - debug!( - "Starting image analysis: source={}", - if has_image_id { - "temporary_storage(image_id)" - } else if has_data_url { - "direct_input(data_url)" - } else { - "file_path(image_path)" - } - ); - debug!("Analysis prompt: {}", input_data.analysis_prompt); - - let (image_data, mime_type, image_source_description) = if let Some(image_id) = - &input_data.image_id - { - let provider = _context.image_context_provider.as_ref() - .ok_or_else(|| BitFunError::tool( - "image_id mode requires ImageContextProvider support, but no provider was injected.\n\ - Please inject image_context_provider when calling the tool, or use image_path/data_url mode.".to_string() - ))?; - - let image_context = provider.get_image(image_id) - .ok_or_else(|| BitFunError::tool(format!( - "Image context not found: image_id={}. Image may have expired (5-minute validity) or was never uploaded.", - image_id - )))?; - - debug!( - "Retrieved image from context provider: name={}, source={}", - image_context.image_name, image_context.mime_type - ); - - if let Some(data_url) = &image_context.data_url { - let (data, mime) = self.decode_data_url(data_url)?; - ( - data, - mime, - format!("{} (clipboard)", image_context.image_name), - ) - } else if let Some(image_path_str) = &image_context.image_path { - let image_path = self.resolve_image_path(image_path_str)?; - let data = self.load_image(&image_path).await?; - let mime = self.detect_mime_type(&image_path)?; - (data, mime, image_path.display().to_string()) - } else { - return Err(BitFunError::tool(format!( - "Image context {} has neither data_url nor image_path", - image_id - ))); - } - } else if let Some(data_url) = &input_data.data_url { - // Decode from data URL - let (data, mime) = self.decode_data_url(data_url)?; - (data, mime, "clipboard_image".to_string()) - } else if let Some(image_path_str) = &input_data.image_path { - // Load from file path - let image_path = self.resolve_image_path(image_path_str)?; - debug!("Parsed image path: {}", image_path.display()); - - let data = self.load_image(&image_path).await?; - let mime = self.detect_mime_type(&image_path)?; - - debug!("Image size: {} KB, mime: {}", data.len() / 1024, mime); - - (data, mime, image_path.display().to_string()) - } else { - unreachable!("Input already checked above") - }; - - let base64_data = BASE64.encode(&image_data); - - let vision_model = self.get_vision_model().await?; - debug!( - "Using vision model: name={}, model={}", - vision_model.name, vision_model.model_name - ); - - let prompt = self.build_prompt( - &input_data.analysis_prompt, - &input_data.focus_areas, - &input_data.detail_level, - ); - trace!("Full analysis prompt: {}", prompt); - - let messages = self.build_multimodal_message( - &prompt, - &base64_data, - &mime_type, - &vision_model.provider, - )?; - - // Vision models cannot set max_tokens (e.g., glm-4v doesn't support this parameter) - // and should never use the thinking process. - let mut model_config = ModelConfig::try_from(vision_model.clone()) - .map_err(|e| BitFunError::parse(format!("Config conversion failed for vision model {}: {}", vision_model.name, e)))?; - model_config.max_tokens = None; - model_config.enable_thinking_process = false; - model_config.support_preserved_thinking = false; - - let ai_client = Arc::new(AIClient::new(model_config)); - - debug!("Calling vision model for analysis..."); - let ai_response = ai_client - .send_message(messages, None) - .await - .map_err(|e| BitFunError::service(format!("AI call failed: {}", e)))?; - - let elapsed = start.elapsed(); - info!("Image analysis completed: duration={:?}", elapsed); - - let (width, height) = self.get_image_dimensions(&image_data); - - let result_for_assistant = format!( - "Image analysis result ({})\n\n{}", - image_source_description, ai_response.text - ); - - let result = ToolResult::Result { - data: json!({ - "success": true, - "image_source": image_source_description, - "analysis": ai_response.text, - "metadata": { - "mime_type": mime_type, - "file_size": image_data.len(), - "width": width, - "height": height, - "analysis_time_ms": elapsed.as_millis() as u64, - "model_used": vision_model.name, - "prompt_used": input_data.analysis_prompt, - } - }), - result_for_assistant: Some(result_for_assistant), - }; - - Ok(vec![result]) - } -} diff --git a/src/crates/core/src/agentic/tools/implementations/mod.rs b/src/crates/core/src/agentic/tools/implementations/mod.rs index 4912528b..f6d2f6c0 100644 --- a/src/crates/core/src/agentic/tools/implementations/mod.rs +++ b/src/crates/core/src/agentic/tools/implementations/mod.rs @@ -1,51 +1,51 @@ //! Tool implementation module +pub mod ask_user_question_tool; +pub mod bash_tool; +pub mod code_review_tool; +pub mod create_plan_tool; +pub mod delete_file_tool; +pub mod file_edit_tool; pub mod file_read_tool; pub mod file_write_tool; -pub mod file_edit_tool; -pub mod delete_file_tool; -pub mod bash_tool; -pub mod grep_tool; +pub mod get_file_diff_tool; +pub mod git_tool; pub mod glob_tool; -pub mod web_tools; -pub mod todo_write_tool; +pub mod grep_tool; pub mod ide_control_tool; -pub mod mermaid_interactive_tool; -pub mod log_tool; pub mod linter_tool; -pub mod analyze_image_tool; +pub mod log_tool; +pub mod ls_tool; +pub mod mermaid_interactive_tool; pub mod skill_tool; pub mod skills; -pub mod ask_user_question_tool; -pub mod ls_tool; pub mod task_tool; -pub mod git_tool; -pub mod create_plan_tool; -pub mod get_file_diff_tool; -pub mod code_review_tool; pub mod terminal_control_tool; +pub mod todo_write_tool; pub mod util; +pub mod view_image_tool; +pub mod web_tools; +pub use ask_user_question_tool::AskUserQuestionTool; +pub use bash_tool::BashTool; +pub use code_review_tool::CodeReviewTool; +pub use create_plan_tool::CreatePlanTool; +pub use delete_file_tool::DeleteFileTool; +pub use file_edit_tool::FileEditTool; pub use file_read_tool::FileReadTool; pub use file_write_tool::FileWriteTool; -pub use file_edit_tool::FileEditTool; -pub use delete_file_tool::DeleteFileTool; -pub use bash_tool::BashTool; -pub use grep_tool::GrepTool; +pub use get_file_diff_tool::GetFileDiffTool; +pub use git_tool::GitTool; pub use glob_tool::GlobTool; -pub use web_tools::{WebSearchTool, WebFetchTool}; -pub use todo_write_tool::TodoWriteTool; +pub use grep_tool::GrepTool; pub use ide_control_tool::IdeControlTool; -pub use mermaid_interactive_tool::MermaidInteractiveTool; -pub use log_tool::LogTool; pub use linter_tool::ReadLintsTool; -pub use analyze_image_tool::AnalyzeImageTool; -pub use skill_tool::SkillTool; -pub use ask_user_question_tool::AskUserQuestionTool; +pub use log_tool::LogTool; pub use ls_tool::LSTool; +pub use mermaid_interactive_tool::MermaidInteractiveTool; +pub use skill_tool::SkillTool; pub use task_tool::TaskTool; -pub use git_tool::GitTool; -pub use create_plan_tool::CreatePlanTool; -pub use get_file_diff_tool::GetFileDiffTool; -pub use code_review_tool::CodeReviewTool; -pub use terminal_control_tool::TerminalControlTool; \ No newline at end of file +pub use terminal_control_tool::TerminalControlTool; +pub use todo_write_tool::TodoWriteTool; +pub use view_image_tool::ViewImageTool; +pub use web_tools::{WebFetchTool, WebSearchTool}; diff --git a/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs new file mode 100644 index 00000000..cbd59b2f --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs @@ -0,0 +1,396 @@ +//! view_image tool - analyzes image content for text-only or multimodal main models. +//! +//! Current default behavior is to convert image content into structured text analysis. +//! This keeps the tool useful for text-only primary models while preserving an interface +//! that can evolve toward direct multimodal attachment in the future. + +use async_trait::async_trait; +use log::{debug, info, trace}; +use serde::Deserialize; +use serde_json::{json, Value}; + +use crate::agentic::image_analysis::{ + build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path, + optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_global_config, +}; +use crate::agentic::tools::framework::{ + Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult, +}; +use crate::infrastructure::ai::get_global_ai_client_factory; +use crate::infrastructure::get_workspace_path; +use crate::util::errors::{BitFunError, BitFunResult}; + +#[derive(Debug, Deserialize)] +struct ViewImageInput { + #[serde(default)] + image_path: Option, + #[serde(default)] + data_url: Option, + #[serde(default)] + image_id: Option, + #[serde(default)] + analysis_prompt: Option, + #[serde(default)] + focus_areas: Option>, + #[serde(default)] + detail_level: Option, +} + +pub struct ViewImageTool; + +impl ViewImageTool { + pub fn new() -> Self { + Self + } + + fn build_prompt( + &self, + analysis_prompt: Option<&str>, + focus_areas: &Option>, + detail_level: &Option, + ) -> String { + let mut prompt = String::new(); + + prompt.push_str( + analysis_prompt + .filter(|s| !s.trim().is_empty()) + .unwrap_or("Please analyze this image and describe the relevant details."), + ); + prompt.push_str("\n\n"); + + if let Some(areas) = focus_areas { + if !areas.is_empty() { + prompt.push_str("Please pay special attention to the following aspects:\n"); + for area in areas { + prompt.push_str(&format!("- {}\n", area)); + } + prompt.push('\n'); + } + } + + let detail_guide = match detail_level.as_deref() { + Some("brief") => "Please answer concisely in 1-2 sentences.", + Some("detailed") => { + "Please provide a detailed analysis including all relevant details." + } + _ => "Please provide a moderate level of analysis detail.", + }; + prompt.push_str(detail_guide); + + prompt + } + + async fn load_source( + &self, + input_data: &ViewImageInput, + context: &ToolUseContext, + ) -> BitFunResult<(Vec, Option, String)> { + let workspace_path = get_workspace_path(); + + if let Some(image_id) = &input_data.image_id { + let provider = context.image_context_provider.as_ref().ok_or_else(|| { + BitFunError::tool( + "image_id mode requires ImageContextProvider support, but no provider was injected.\n\ + Please inject image_context_provider when calling the tool, or use image_path/data_url mode.".to_string() + ) + })?; + + let image_context = provider.get_image(image_id).ok_or_else(|| { + BitFunError::tool(format!( + "Image context not found: image_id={}. Image may have expired (5-minute validity) or was never uploaded.", + image_id + )) + })?; + + if let Some(data_url) = &image_context.data_url { + let (data, data_url_mime) = decode_data_url(data_url)?; + let fallback_mime = data_url_mime.or_else(|| Some(image_context.mime_type.clone())); + return Ok(( + data, + fallback_mime, + format!("{} (clipboard)", image_context.image_name), + )); + } + + if let Some(image_path_str) = &image_context.image_path { + let image_path = resolve_image_path(image_path_str, workspace_path.as_deref())?; + let data = load_image_from_path(&image_path, workspace_path.as_deref()).await?; + let detected_mime = + detect_mime_type_from_bytes(&data, Some(&image_context.mime_type)).ok(); + return Ok((data, detected_mime, image_path.display().to_string())); + } + + return Err(BitFunError::tool(format!( + "Image context {} has neither data_url nor image_path", + image_id + ))); + } + + if let Some(data_url) = &input_data.data_url { + let (data, data_url_mime) = decode_data_url(data_url)?; + return Ok((data, data_url_mime, "clipboard_image".to_string())); + } + + if let Some(image_path_str) = &input_data.image_path { + let image_path = resolve_image_path(image_path_str, workspace_path.as_deref())?; + let data = load_image_from_path(&image_path, workspace_path.as_deref()).await?; + let detected_mime = detect_mime_type_from_bytes(&data, None).ok(); + return Ok((data, detected_mime, image_path.display().to_string())); + } + + Err(BitFunError::validation( + "Must provide one of image_path, data_url, or image_id", + )) + } +} + +#[async_trait] +impl Tool for ViewImageTool { + fn name(&self) -> &str { + "view_image" + } + + async fn description(&self) -> BitFunResult { + Ok(r#"Analyzes image content and returns detailed text descriptions. + +Use this tool when the user provides an image (file path, data URL, or uploaded clipboard image_id) and asks questions about it. + +Current behavior: +- For text-only primary models, this tool converts image content to structured text. +- For multimodal-capable setups, this interface can be extended to direct image attachment in future. + +Parameters: +- image_path / data_url / image_id: provide one image source +- analysis_prompt: optional custom analysis goal +- focus_areas: optional analysis focus list +- detail_level: brief / normal / detailed"#.to_string()) + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "image_path": { + "type": "string", + "description": "Path to image file (relative to workspace or absolute path). Example: 'screenshot.png'" + }, + "data_url": { + "type": "string", + "description": "Base64-encoded image data URL. Example: 'data:image/png;base64,...'" + }, + "image_id": { + "type": "string", + "description": "Temporary image ID from clipboard upload. Example: 'img-clipboard-1234567890-abc123'" + }, + "analysis_prompt": { + "type": "string", + "description": "Optional custom prompt describing what to extract from the image" + }, + "focus_areas": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Optional list of aspects to emphasize" + }, + "detail_level": { + "type": "string", + "enum": ["brief", "normal", "detailed"], + "description": "Optional detail level" + } + } + }) + } + + fn is_readonly(&self) -> bool { + true + } + + fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool { + true + } + + fn needs_permissions(&self, _input: Option<&Value>) -> bool { + false + } + + async fn validate_input( + &self, + input: &Value, + _context: Option<&ToolUseContext>, + ) -> ValidationResult { + let has_path = input + .get("image_path") + .and_then(|v| v.as_str()) + .is_some_and(|s| !s.is_empty()); + let has_data_url = input + .get("data_url") + .and_then(|v| v.as_str()) + .is_some_and(|s| !s.is_empty()); + let has_image_id = input + .get("image_id") + .and_then(|v| v.as_str()) + .is_some_and(|s| !s.is_empty()); + + if !has_path && !has_data_url && !has_image_id { + return ValidationResult { + result: false, + message: Some("Must provide one of image_path, data_url, or image_id".to_string()), + error_code: Some(400), + meta: None, + }; + } + + if let Some(image_path) = input.get("image_path").and_then(|v| v.as_str()) { + if !image_path.is_empty() { + let workspace_path = get_workspace_path(); + match resolve_image_path(image_path, workspace_path.as_deref()) { + Ok(path) => { + if !path.exists() { + return ValidationResult { + result: false, + message: Some(format!("Image file does not exist: {}", image_path)), + error_code: Some(404), + meta: None, + }; + } + + if !path.is_file() { + return ValidationResult { + result: false, + message: Some(format!("Path is not a file: {}", image_path)), + error_code: Some(400), + meta: None, + }; + } + } + Err(e) => { + return ValidationResult { + result: false, + message: Some(format!("Path parsing failed: {}", e)), + error_code: Some(400), + meta: None, + }; + } + } + } + } + + ValidationResult::default() + } + + fn render_tool_use_message(&self, input: &Value, options: &ToolRenderOptions) -> String { + let image_source = if let Some(path) = input.get("image_path").and_then(|v| v.as_str()) { + if !path.is_empty() { + path.to_string() + } else { + "Clipboard image".to_string() + } + } else if input + .get("image_id") + .and_then(|v| v.as_str()) + .is_some_and(|id| !id.is_empty()) + { + "Clipboard image (image_id)".to_string() + } else if input.get("data_url").is_some() { + "Clipboard image".to_string() + } else { + "unknown".to_string() + }; + + if options.verbose { + let prompt = input + .get("analysis_prompt") + .and_then(|v| v.as_str()) + .unwrap_or("default analysis"); + format!("Viewing image: {} (prompt: {})", image_source, prompt) + } else { + format!("Viewing image: {}", image_source) + } + } + + async fn call_impl( + &self, + input: &Value, + context: &ToolUseContext, + ) -> BitFunResult> { + let start = std::time::Instant::now(); + + let input_data: ViewImageInput = serde_json::from_value(input.clone()) + .map_err(|e| BitFunError::parse(format!("Failed to parse input: {}", e)))?; + + let (image_data, fallback_mime, image_source_description) = + self.load_source(&input_data, context).await?; + + let vision_model = resolve_vision_model_from_global_config().await?; + debug!( + "Using image understanding model: id={}, name={}, provider={}", + vision_model.id, vision_model.name, vision_model.provider + ); + + let processed = optimize_image_for_provider( + image_data, + &vision_model.provider, + fallback_mime.as_deref(), + )?; + + let prompt = self.build_prompt( + input_data.analysis_prompt.as_deref(), + &input_data.focus_areas, + &input_data.detail_level, + ); + trace!("Full view_image prompt: {}", prompt); + + let messages = build_multimodal_message( + &prompt, + &processed.data, + &processed.mime_type, + &vision_model.provider, + )?; + + let ai_client_factory = get_global_ai_client_factory() + .await + .map_err(|e| BitFunError::service(format!("Failed to get AI client factory: {}", e)))?; + let ai_client = ai_client_factory + .get_client_by_id(&vision_model.id) + .await + .map_err(|e| { + BitFunError::service(format!( + "Failed to create vision model client for {}: {}", + vision_model.id, e + )) + })?; + + debug!("Calling vision model for image analysis..."); + let ai_response = ai_client + .send_message(messages, None) + .await + .map_err(|e| BitFunError::service(format!("AI call failed: {}", e)))?; + + let elapsed = start.elapsed(); + info!("view_image completed: duration={:?}", elapsed); + + let result_for_assistant = format!( + "Image analysis result ({})\n\n{}", + image_source_description, ai_response.text + ); + + Ok(vec![ToolResult::Result { + data: json!({ + "success": true, + "image_source": image_source_description, + "analysis": ai_response.text, + "metadata": { + "mime_type": processed.mime_type, + "file_size": processed.data.len(), + "width": processed.width, + "height": processed.height, + "analysis_time_ms": elapsed.as_millis() as u64, + "model_used": vision_model.name, + "prompt_used": input_data.analysis_prompt.unwrap_or_else(|| "default".to_string()), + } + }), + result_for_assistant: Some(result_for_assistant), + }]) + } +} diff --git a/src/crates/core/src/agentic/tools/registry.rs b/src/crates/core/src/agentic/tools/registry.rs index 1eefdb51..6f822601 100644 --- a/src/crates/core/src/agentic/tools/registry.rs +++ b/src/crates/core/src/agentic/tools/registry.rs @@ -122,8 +122,8 @@ impl ToolRegistry { // Linter tool (LSP diagnosis) self.register_tool(Arc::new(ReadLintsTool::new())); - // Image analysis tool - self.register_tool(Arc::new(AnalyzeImageTool::new())); + // Image analysis / viewing tool + self.register_tool(Arc::new(ViewImageTool::new())); // Git version control tool self.register_tool(Arc::new(GitTool::new())); @@ -173,11 +173,11 @@ mod tests { } /// Get all tools -/// - Snapshot initialized: +/// - Snapshot initialized: /// return tools only in the snapshot manager (wrapped file tools + built-in non-file tools) /// **not containing** dynamically registered MCP tools. -/// - Snapshot not initialized: -/// return all tools in the global registry, +/// - Snapshot not initialized: +/// return all tools in the global registry, /// **containing** MCP tools. /// If you need **always include** MCP tools, use [get_all_registered_tools] pub async fn get_all_tools() -> Vec> { @@ -234,7 +234,7 @@ pub fn get_global_tool_registry() -> Arc> { } /// Get all registered tools (**always include** dynamically registered MCP tools) -/// - Snapshot initialized: +/// - Snapshot initialized: /// return wrapped file tools + other tools in the global registry (containing MCP tools) /// - Snapshot not initialized: return all tools in the global registry. pub async fn get_all_registered_tools() -> Vec> { diff --git a/src/web-ui/src/component-library/components/registry.tsx b/src/web-ui/src/component-library/components/registry.tsx index abf4a77e..b4df6113 100644 --- a/src/web-ui/src/component-library/components/registry.tsx +++ b/src/web-ui/src/component-library/components/registry.tsx @@ -1587,14 +1587,14 @@ All requirements met`, }, { id: 'image-analysis-card', - name: 'AnalyzeImage - ????', + name: 'view_image - ????', description: '????????', category: 'flowchat-cards', component: () => (

Read - Success

diff --git a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts index 1a39275f..71968fb5 100644 --- a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts +++ b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts @@ -33,6 +33,127 @@ interface UseMessageSenderReturn { isSending: boolean; } +type ImageInputStrategy = 'vision-preanalysis' | 'direct-attach'; + +interface StrategyDecision { + strategy: ImageInputStrategy; + modelId: string | null; + supportsImageUnderstanding: boolean; + reason: string; +} + +interface ImageAnalysisResult { + image_id: string; + summary: string; + detailed_description: string; + detected_elements: string[]; + confidence: number; + analysis_time_ms: number; +} + +// Keep this off for now: transport currently accepts text-only `userInput`. +// When backend supports multimodal turn input, this can be flipped (or moved to config). +const ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED = false; + +async function resolveSessionModelId( + flowChatManager: FlowChatManager, + sessionId: string | undefined +): Promise { + const state = flowChatManager.getFlowChatState(); + const session = sessionId ? state.sessions.get(sessionId) : undefined; + const configuredModel = session?.config?.modelName; + + if (configuredModel && configuredModel !== 'default') { + return configuredModel; + } + + const { getDefaultPrimaryModel } = await import('@/infrastructure/config/utils/modelConfigHelpers'); + return getDefaultPrimaryModel(); +} + +async function modelSupportsImageUnderstanding(modelId: string | null): Promise { + if (!modelId) return false; + + const { configManager } = await import('@/infrastructure/config/services/ConfigManager'); + const allModels = await configManager.getConfig('ai.models') || []; + const model = allModels.find(m => m.id === modelId || m.name === modelId); + const capabilities = Array.isArray(model?.capabilities) ? model.capabilities : []; + return capabilities.includes('image_understanding'); +} + +async function chooseImageInputStrategy( + flowChatManager: FlowChatManager, + sessionId: string | undefined +): Promise { + const modelId = await resolveSessionModelId(flowChatManager, sessionId); + const supportsImageUnderstanding = await modelSupportsImageUnderstanding(modelId); + + if (supportsImageUnderstanding && ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED) { + return { + strategy: 'direct-attach', + modelId, + supportsImageUnderstanding, + reason: 'model_supports_image_understanding', + }; + } + + return { + strategy: 'vision-preanalysis', + modelId, + supportsImageUnderstanding, + reason: supportsImageUnderstanding + ? 'direct_attach_disabled_until_multimodal_turn_input_is_available' + : 'primary_model_is_text_only', + }; +} + +async function analyzeImagesBeforeSend( + imageContexts: ImageContext[], + sessionId: string, + userMessage: string +): Promise { + if (imageContexts.length === 0) return []; + + const { imageAnalysisAPI } = await import('@/infrastructure/api/service-api/ImageAnalysisAPI'); + return imageAnalysisAPI.analyzeImages({ + session_id: sessionId, + user_message: userMessage, + images: imageContexts.map(ctx => ({ + id: ctx.id, + image_path: ctx.isLocal ? ctx.imagePath : undefined, + data_url: !ctx.isLocal ? ctx.dataUrl : undefined, + mime_type: ctx.mimeType, + metadata: { + name: ctx.imageName, + width: ctx.width, + height: ctx.height, + file_size: ctx.fileSize, + source: ctx.source, + }, + })), + }); +} + +function formatImageContextLine( + ctx: ImageContext, + analysis?: ImageAnalysisResult +): string { + const imgName = ctx.imageName || 'Untitled image'; + const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : ''; + const sourceLine = ctx.isLocal + ? `Path: ${ctx.imagePath}` + : `Image ID: ${ctx.id}`; + + if (!analysis) { + return `[Image: ${imgName}${imgSize}]\n${sourceLine}\nTip: You can use the view_image tool (${ctx.isLocal ? 'image_path' : 'image_id'}).`; + } + + const topElements = (analysis.detected_elements || []).slice(0, 5).join(', '); + const keyElementsLine = topElements ? `\nPre-analysis key elements: ${topElements}` : ''; + + return `[Image: ${imgName}${imgSize}]\n${sourceLine}\nPre-analysis summary: ${analysis.summary}${keyElementsLine}`; +} + export function useMessageSender(props: UseMessageSenderProps): UseMessageSenderReturn { const { currentSessionId, @@ -56,14 +177,14 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender hasSession: !!sessionId, agentType: currentAgentType || 'agentic', }); - + try { const flowChatManager = FlowChatManager.getInstance(); - + if (!sessionId) { const { getDefaultPrimaryModel } = await import('@/infrastructure/config/utils/modelConfigHelpers'); const modelId = await getDefaultPrimaryModel(); - + sessionId = await flowChatManager.createChatSession({ modelName: modelId || undefined }, currentAgentType || 'agentic'); @@ -71,12 +192,10 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender } else { log.debug('Reusing existing session', { sessionId }); } - - // Upload clipboard images to temporary backend storage first. - const clipboardImages = contexts.filter(ctx => - ctx.type === 'image' && !ctx.isLocal && ctx.dataUrl - ) as ImageContext[]; - + + const imageContexts = contexts.filter(ctx => ctx.type === 'image') as ImageContext[]; + const clipboardImages = imageContexts.filter(ctx => !ctx.isLocal && ctx.dataUrl); + if (clipboardImages.length > 0) { try { const { api } = await import('@/infrastructure/api/service-api/ApiClient'); @@ -95,7 +214,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender })) } }; - + await api.invoke('upload_image_contexts', uploadData); log.debug('Clipboard images uploaded', { imageCount: clipboardImages.length, @@ -110,13 +229,63 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender throw error; } } - - // Build both backend and display versions of the message. + + let strategyDecision: StrategyDecision = { + strategy: 'vision-preanalysis', + modelId: null, + supportsImageUnderstanding: false, + reason: 'fallback_default_preanalysis', + }; + try { + strategyDecision = await chooseImageInputStrategy(flowChatManager, sessionId); + } catch (error) { + log.warn('Failed to resolve image input strategy, using pre-analysis fallback', { + sessionId, + error: (error as Error)?.message ?? 'unknown', + }); + } + + log.debug('Image input strategy selected', { + sessionId, + strategy: strategyDecision.strategy, + modelId: strategyDecision.modelId, + supportsImageUnderstanding: strategyDecision.supportsImageUnderstanding, + reason: strategyDecision.reason, + }); + + let imageAnalyses: ImageAnalysisResult[] = []; + if (imageContexts.length > 0) { + if (strategyDecision.strategy === 'direct-attach') { + // Future extensibility hook: + // once start_dialog_turn supports multimodal payloads, this branch can send image items directly. + log.info('Direct image attach strategy is selected but transport is still text-only; using pre-analysis fallback', { + sessionId, + modelId: strategyDecision.modelId, + }); + } + + try { + imageAnalyses = await analyzeImagesBeforeSend(imageContexts, sessionId!, trimmedMessage); + log.debug('Image pre-analysis completed', { + sessionId, + imageCount: imageContexts.length, + analysisCount: imageAnalyses.length, + }); + } catch (error) { + log.warn('Image pre-analysis failed, continuing with context hints only', { + sessionId, + imageCount: imageContexts.length, + error: (error as Error)?.message ?? 'unknown', + }); + } + } + let fullMessage = trimmedMessage; const displayMessage = trimmedMessage; - + if (contexts.length > 0) { - // Full version includes absolute details for the backend. + const analysisByImageId = new Map(imageAnalyses.map(result => [result.image_id, result])); + const fullContextSection = contexts.map(ctx => { switch (ctx.type) { case 'file': @@ -126,20 +295,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender case 'code-snippet': return `[Code Snippet: ${ctx.filePath}:${ctx.startLine}-${ctx.endLine}]`; case 'image': { - const imgName = ctx.imageName || 'Untitled image'; - const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : ''; - - // Distinguish local files and clipboard images. - if (ctx.isLocal && ctx.imagePath) { - return `[Image: ${imgName}${imgSize}]\n` + - `Path: ${ctx.imagePath}\n` + - `Tip: You can use the AnalyzeImage tool with the image_path parameter.`; - } else { - return `[Image: ${imgName}${imgSize} (from clipboard)]\n` + - `Image ID: ${ctx.id}\n` + - `Tip: You can use the AnalyzeImage tool.\n` + - `Parameter: image_id="${ctx.id}"`; - } + return formatImageContextLine(ctx, analysisByImageId.get(ctx.id)); } case 'terminal-command': return `[Command: ${ctx.command}]`; @@ -155,21 +311,21 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender return ''; } }).filter(Boolean).join('\n'); - + fullMessage = `${fullContextSection}\n\n${trimmedMessage}`; } - + await flowChatManager.sendMessage( - fullMessage, - sessionId || undefined, + fullMessage, + sessionId || undefined, displayMessage, currentAgentType || 'agentic' ); - + onClearContexts(); - + onExitTemplateMode?.(); - + onSuccess?.(trimmedMessage); log.info('Message sent successfully', { sessionId, @@ -185,7 +341,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender }); throw error; } - }, [currentSessionId, contexts, onClearContexts, onSuccess, onExitTemplateMode]); + }, [currentSessionId, contexts, onClearContexts, onSuccess, onExitTemplateMode, currentAgentType]); return { sendMessage, diff --git a/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx b/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx index a286ee2e..1204d058 100644 --- a/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx +++ b/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx @@ -1,23 +1,32 @@ /** * Image analysis tool card - compact mode - * Used for AnalyzeImage tool + * Used for view_image tool */ -import React, { useState, useMemo } from 'react'; +import React, { useState, useMemo, useEffect, useCallback } from 'react'; import { Loader2, Clock, Check } from 'lucide-react'; import { useTranslation } from 'react-i18next'; import type { ToolCardProps } from '../types/flow-chat'; import { CompactToolCard, CompactToolCardHeader } from './CompactToolCard'; import './ImageAnalysisCard.scss'; +const imageAnalysisExpandedStateCache = new Map(); + export const ImageAnalysisCard: React.FC = ({ toolItem, onExpand }) => { const { t } = useTranslation('flow-chat'); const { toolCall, toolResult, status } = toolItem; + const toolId = toolItem.id || toolCall?.id; const [isExpanded, setIsExpanded] = useState(false); + useEffect(() => { + if (!toolId) return; + const cached = imageAnalysisExpandedStateCache.get(toolId); + setIsExpanded(cached ?? false); + }, [toolId]); + const getStatusIcon = () => { switch (status) { case 'running': @@ -66,10 +75,15 @@ export const ImageAnalysisCard: React.FC = ({ const getAnalysisResult = () => { if (!toolResult?.result) return null; - - const result = toolResult.result; - - if (result.analysis || result.description || result.content) { + + const raw = toolResult.result; + const result = + (raw?.analysis || raw?.description || raw?.content) ? raw : + (raw?.result?.analysis || raw?.result?.description || raw?.result?.content) ? raw.result : + (raw?.data?.analysis || raw?.data?.description || raw?.data?.content) ? raw.data : + null; + + if (result) { return { analysis: result.analysis || result.description || result.content, modelUsed: result.model_used || result.model, @@ -80,10 +94,17 @@ export const ImageAnalysisCard: React.FC = ({ return null; }; - const handleToggleExpand = () => { - setIsExpanded(!isExpanded); + const handleToggleExpand = useCallback(() => { + window.dispatchEvent(new CustomEvent('tool-card-toggle')); + setIsExpanded(prev => { + const next = !prev; + if (toolId) { + imageAnalysisExpandedStateCache.set(toolId, next); + } + return next; + }); onExpand?.(); - }; + }, [onExpand, toolId]); const analysisInfo = useMemo(() => getAnalysisInfo(), [toolCall?.input]); const analysisResult = useMemo(() => getAnalysisResult(), [toolResult?.result]); @@ -181,4 +202,3 @@ export const ImageAnalysisCard: React.FC = ({ /> ); }; - diff --git a/src/web-ui/src/flow_chat/tool-cards/index.ts b/src/web-ui/src/flow_chat/tool-cards/index.ts index f32facba..f3724216 100644 --- a/src/web-ui/src/flow_chat/tool-cards/index.ts +++ b/src/web-ui/src/flow_chat/tool-cards/index.ts @@ -191,8 +191,8 @@ export const TOOL_CARD_CONFIGS: Record = { displayMode: 'compact', primaryColor: '#8b5cf6' }, - 'AnalyzeImage': { - toolName: 'AnalyzeImage', + 'view_image': { + toolName: 'view_image', displayName: 'Image Analysis', icon: 'IMG', requiresConfirmation: false, @@ -329,7 +329,7 @@ export const TOOL_CARD_COMPONENTS = { 'submit_code_review': CodeReviewToolCard, // Image analysis tools - 'AnalyzeImage': ImageAnalysisCard, + 'view_image': ImageAnalysisCard, // Context compression 'ContextCompression': ContextCompressionDisplay, diff --git a/src/web-ui/src/locales/en-US/flow-chat.json b/src/web-ui/src/locales/en-US/flow-chat.json index f272e4b9..6e12e5e3 100644 --- a/src/web-ui/src/locales/en-US/flow-chat.json +++ b/src/web-ui/src/locales/en-US/flow-chat.json @@ -582,7 +582,14 @@ "parsingAnalysisInfo": "Parsing analysis info...", "unknownImage": "Unknown image", "clipboardImage": "Clipboard image", - "analyzeImageContent": "Analyze image content" + "analyzeImageContent": "Analyze image content", + "imageAnalysis": "Image analysis", + "completed": "Completed", + "analyzing": "Analyzing", + "preparing": "Preparing", + "analysisPrompt": "Analysis prompt", + "focusAreas": "Focus areas", + "analysisResult": "Analysis result" }, "contextCompression": { "beforeUserMessage": "Before user message", diff --git a/src/web-ui/src/locales/zh-CN/flow-chat.json b/src/web-ui/src/locales/zh-CN/flow-chat.json index e8053306..f3ed8632 100644 --- a/src/web-ui/src/locales/zh-CN/flow-chat.json +++ b/src/web-ui/src/locales/zh-CN/flow-chat.json @@ -582,7 +582,14 @@ "parsingAnalysisInfo": "解析分析信息中...", "unknownImage": "未知图片", "clipboardImage": "剪贴板图片", - "analyzeImageContent": "分析图片内容" + "analyzeImageContent": "分析图片内容", + "imageAnalysis": "图片分析", + "completed": "已完成", + "analyzing": "正在分析", + "preparing": "准备分析", + "analysisPrompt": "分析提示", + "focusAreas": "关注点", + "analysisResult": "分析结果" }, "contextCompression": { "beforeUserMessage": "用户消息前", From b1ce49dd12c557c7a44f88e472d08c8dd7fe1634 Mon Sep 17 00:00:00 2001 From: wgqqqqq Date: Thu, 5 Mar 2026 22:50:00 +0800 Subject: [PATCH 2/2] feat: support multimodal image turn flow with persistence redaction --- src/apps/desktop/src/api/agentic_api.rs | 152 +++++++++- src/apps/desktop/src/api/commands.rs | 73 +++++ .../desktop/src/api/image_analysis_api.rs | 9 +- .../src/agentic/coordination/coordinator.rs | 38 ++- src/crates/core/src/agentic/core/message.rs | 140 +++++++++- .../core/src/agentic/core/messages_helper.rs | 38 ++- .../src/agentic/execution/execution_engine.rs | 259 ++++++++++++++++-- .../src/agentic/execution/round_executor.rs | 41 ++- .../image_analysis/image_processing.rs | 156 +++++++++-- .../core/src/agentic/image_analysis/mod.rs | 5 +- .../core/src/agentic/persistence/manager.rs | 74 ++++- .../src/agentic/session/session_manager.rs | 10 +- .../tools/implementations/view_image_tool.rs | 225 ++++++++++++++- .../agentic/tools/pipeline/state_manager.rs | 28 +- .../agentic/tools/pipeline/tool_pipeline.rs | 35 ++- .../core/src/infrastructure/ai/client.rs | 148 ++++++++++ .../flow_chat/components/ModelSelector.tsx | 5 +- .../src/flow_chat/hooks/useMessageSender.ts | 130 ++++++--- .../src/flow_chat/services/FlowChatManager.ts | 15 +- .../flow-chat-manager/MessageModule.ts | 8 +- .../api/service-api/AgentAPI.ts | 5 +- .../api/service-api/ApiClient.ts | 86 +++++- src/web-ui/src/locales/zh-CN/settings.json | 4 +- .../src/locales/zh-CN/settings/ai-model.json | 10 +- .../locales/zh-CN/settings/default-model.json | 6 +- 25 files changed, 1556 insertions(+), 144 deletions(-) diff --git a/src/apps/desktop/src/api/agentic_api.rs b/src/apps/desktop/src/api/agentic_api.rs index 10fa5c9d..1143c4a4 100644 --- a/src/apps/desktop/src/api/agentic_api.rs +++ b/src/apps/desktop/src/api/agentic_api.rs @@ -6,8 +6,10 @@ use std::sync::Arc; use tauri::{AppHandle, State}; use crate::api::app_state::AppState; +use crate::api::context_upload_api::get_image_context; use bitfun_core::agentic::coordination::ConversationCoordinator; use bitfun_core::agentic::core::*; +use bitfun_core::agentic::image_analysis::ImageContextData; #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] @@ -45,6 +47,8 @@ pub struct StartDialogTurnRequest { pub user_input: String, pub agent_type: String, pub turn_id: Option, + #[serde(default)] + pub image_contexts: Option>, } #[derive(Debug, Serialize)] @@ -179,16 +183,42 @@ pub async fn start_dialog_turn( coordinator: State<'_, Arc>, request: StartDialogTurnRequest, ) -> Result { - let _stream = coordinator - .start_dialog_turn( - request.session_id, - request.user_input, - request.turn_id, - request.agent_type, - false, - ) - .await - .map_err(|e| format!("Failed to start dialog turn: {}", e))?; + let StartDialogTurnRequest { + session_id, + user_input, + agent_type, + turn_id, + image_contexts, + } = request; + + if let Some(image_contexts) = image_contexts + .as_ref() + .filter(|images| !images.is_empty()) + .cloned() + { + let resolved_image_contexts = resolve_missing_image_payloads(image_contexts)?; + coordinator + .start_dialog_turn_with_image_contexts( + session_id, + user_input, + resolved_image_contexts, + turn_id, + agent_type, + ) + .await + .map_err(|e| format!("Failed to start dialog turn: {}", e))?; + } else { + coordinator + .start_dialog_turn( + session_id, + user_input, + turn_id, + agent_type, + false, + ) + .await + .map_err(|e| format!("Failed to start dialog turn: {}", e))?; + } Ok(StartDialogTurnResponse { success: true, @@ -196,6 +226,88 @@ pub async fn start_dialog_turn( }) } +fn is_blank_text(value: Option<&String>) -> bool { + value.map(|s| s.trim().is_empty()).unwrap_or(true) +} + +fn resolve_missing_image_payloads( + image_contexts: Vec, +) -> Result, String> { + let mut resolved = Vec::with_capacity(image_contexts.len()); + + for mut image in image_contexts { + let missing_payload = + is_blank_text(image.image_path.as_ref()) && is_blank_text(image.data_url.as_ref()); + if !missing_payload { + resolved.push(image); + continue; + } + + let stored = get_image_context(&image.id).ok_or_else(|| { + format!( + "Image context not found for image_id={}. It may have expired. Please re-attach the image and retry.", + image.id + ) + })?; + + if is_blank_text(image.image_path.as_ref()) { + image.image_path = stored + .image_path + .clone() + .filter(|s| !s.trim().is_empty()); + } + if is_blank_text(image.data_url.as_ref()) { + image.data_url = stored + .data_url + .clone() + .filter(|s| !s.trim().is_empty()); + } + if image.mime_type.trim().is_empty() { + image.mime_type = stored.mime_type.clone(); + } + + let mut metadata = image.metadata.take().unwrap_or_else(|| serde_json::json!({})); + if !metadata.is_object() { + metadata = serde_json::json!({ "raw_metadata": metadata }); + } + if let Some(obj) = metadata.as_object_mut() { + if !obj.contains_key("name") { + obj.insert("name".to_string(), serde_json::json!(stored.image_name)); + } + if !obj.contains_key("width") { + obj.insert("width".to_string(), serde_json::json!(stored.width)); + } + if !obj.contains_key("height") { + obj.insert("height".to_string(), serde_json::json!(stored.height)); + } + if !obj.contains_key("file_size") { + obj.insert("file_size".to_string(), serde_json::json!(stored.file_size)); + } + if !obj.contains_key("source") { + obj.insert("source".to_string(), serde_json::json!(stored.source)); + } + obj.insert( + "resolved_from_upload_cache".to_string(), + serde_json::json!(true), + ); + } + image.metadata = Some(metadata); + + let still_missing = + is_blank_text(image.image_path.as_ref()) && is_blank_text(image.data_url.as_ref()); + if still_missing { + return Err(format!( + "Image context {} is missing image_path/data_url after cache resolution", + image.id + )); + } + + resolved.push(image); + } + + Ok(resolved) +} + #[tauri::command] pub async fn cancel_dialog_turn( coordinator: State<'_, Arc>, @@ -394,6 +506,26 @@ fn message_to_dto(message: Message) -> MessageDTO { let content = match message.content { MessageContent::Text(text) => serde_json::json!({ "type": "text", "text": text }), + MessageContent::Multimodal { text, images } => { + let images: Vec = images + .into_iter() + .map(|img| { + serde_json::json!({ + "id": img.id, + "image_path": img.image_path, + "mime_type": img.mime_type, + "metadata": img.metadata, + "has_data_url": img.data_url.as_ref().is_some_and(|s| !s.is_empty()), + }) + }) + .collect(); + + serde_json::json!({ + "type": "multimodal", + "text": text, + "images": images, + }) + } MessageContent::ToolResult { tool_id, tool_name, diff --git a/src/apps/desktop/src/api/commands.rs b/src/apps/desktop/src/api/commands.rs index eedb6d57..3e61f9f9 100644 --- a/src/apps/desktop/src/api/commands.rs +++ b/src/apps/desktop/src/api/commands.rs @@ -197,6 +197,21 @@ pub async fn test_ai_config_connection( request: TestAIConfigConnectionRequest, ) -> Result { let model_name = request.config.name.clone(); + let supports_image_input = request + .config + .capabilities + .iter() + .any(|cap| { + matches!( + cap, + bitfun_core::service::config::types::ModelCapability::ImageUnderstanding + ) + }) + || matches!( + request.config.category, + bitfun_core::service::config::types::ModelCategory::Multimodal + ); + let ai_config = match request.config.try_into() { Ok(config) => config, Err(e) => { @@ -209,6 +224,64 @@ pub async fn test_ai_config_connection( match ai_client.test_connection().await { Ok(result) => { + if !result.success { + info!( + "AI config connection test completed: model={}, success={}, response_time={}ms", + model_name, result.success, result.response_time_ms + ); + return Ok(result); + } + + if supports_image_input { + match ai_client.test_image_input_connection().await { + Ok(image_result) => { + let response_time_ms = + result.response_time_ms + image_result.response_time_ms; + + if !image_result.success { + let image_error = image_result + .error_details + .unwrap_or_else(|| "Unknown image input test error".to_string()); + let merged = bitfun_core::util::types::ConnectionTestResult { + success: false, + response_time_ms, + model_response: image_result.model_response.or(result.model_response), + error_details: Some(format!( + "Basic connection passed, but multimodal image input test failed: {}", + image_error + )), + }; + info!( + "AI config connection test completed: model={}, success={}, response_time={}ms", + model_name, merged.success, merged.response_time_ms + ); + return Ok(merged); + } + + let merged = bitfun_core::util::types::ConnectionTestResult { + success: true, + response_time_ms, + model_response: image_result + .model_response + .or(result.model_response), + error_details: None, + }; + info!( + "AI config connection test completed: model={}, success={}, response_time={}ms", + model_name, merged.success, merged.response_time_ms + ); + return Ok(merged); + } + Err(e) => { + error!( + "AI config multimodal image input test failed unexpectedly: model={}, error={}", + model_name, e + ); + return Err(format!("Connection test failed: {}", e)); + } + } + } + info!( "AI config connection test completed: model={}, success={}, response_time={}ms", model_name, result.success, result.response_time_ms diff --git a/src/apps/desktop/src/api/image_analysis_api.rs b/src/apps/desktop/src/api/image_analysis_api.rs index 369272ca..25438837 100644 --- a/src/apps/desktop/src/api/image_analysis_api.rs +++ b/src/apps/desktop/src/api/image_analysis_api.rs @@ -26,15 +26,14 @@ pub async fn analyze_images( let image_model = resolve_vision_model_from_ai_config(&ai_config).map_err(|e| { error!( - "No image understanding model available: available_models={:?}, error={}", + "Image understanding model resolution failed: available_models={:?}, error={}", ai_config.models.iter().map(|m| &m.id).collect::>(), e ); format!( - "Image understanding model not configured and no compatible model found.\n\n\ - Please add a model that supports image understanding \ - in [Settings → AI Model Config], enable 'image_understanding' capability, \ - and assign it in [Settings → Super Agent].\n\nDetails: {}", + "Image understanding model is not configured.\n\n\ + Please select a model for [Settings → Default Model Config → Image Understanding Model].\n\n\ + Details: {}", e ) })?; diff --git a/src/crates/core/src/agentic/coordination/coordinator.rs b/src/crates/core/src/agentic/coordination/coordinator.rs index f84b8cf4..811bc567 100644 --- a/src/crates/core/src/agentic/coordination/coordinator.rs +++ b/src/crates/core/src/agentic/coordination/coordinator.rs @@ -13,6 +13,7 @@ use crate::agentic::events::{ use crate::agentic::execution::{ExecutionContext, ExecutionEngine}; use crate::agentic::session::SessionManager; use crate::agentic::tools::pipeline::{SubagentParentInfo, ToolPipeline}; +use crate::agentic::image_analysis::ImageContextData; use crate::util::errors::{BitFunError, BitFunResult}; use log::{debug, error, info, warn}; use std::sync::Arc; @@ -171,6 +172,36 @@ impl ConversationCoordinator { turn_id: Option, agent_type: String, skip_tool_confirmation: bool, + ) -> BitFunResult<()> { + self.start_dialog_turn_internal(session_id, user_input, None, turn_id, agent_type) + .await + } + + pub async fn start_dialog_turn_with_image_contexts( + &self, + session_id: String, + user_input: String, + image_contexts: Vec, + turn_id: Option, + agent_type: String, + ) -> BitFunResult<()> { + self.start_dialog_turn_internal( + session_id, + user_input, + Some(image_contexts), + turn_id, + agent_type, + ) + .await + } + + async fn start_dialog_turn_internal( + &self, + session_id: String, + user_input: String, + image_contexts: Option>, + turn_id: Option, + agent_type: String, ) -> BitFunResult<()> { // Get latest session (re-fetch each time to ensure latest state) let session = self @@ -286,7 +317,12 @@ impl ConversationCoordinator { // Pass frontend turnId, generate if not provided let turn_id = self .session_manager - .start_dialog_turn(&session_id, wrapped_user_input.clone(), turn_id) + .start_dialog_turn( + &session_id, + wrapped_user_input.clone(), + turn_id, + image_contexts, + ) .await?; // Send dialog turn started event diff --git a/src/crates/core/src/agentic/core/message.rs b/src/crates/core/src/agentic/core/message.rs index 853574e8..59d75ade 100644 --- a/src/crates/core/src/agentic/core/message.rs +++ b/src/crates/core/src/agentic/core/message.rs @@ -1,3 +1,4 @@ +use crate::agentic::image_analysis::ImageContextData; use crate::util::types::{Message as AIMessage, ToolCall as AIToolCall}; use crate::util::TokenCounter; use log::warn; @@ -27,6 +28,10 @@ pub enum MessageRole { #[derive(Debug, Clone, Serialize, Deserialize)] pub enum MessageContent { Text(String), + Multimodal { + text: String, + images: Vec, + }, ToolResult { tool_id: String, tool_name: String, @@ -92,6 +97,42 @@ impl From for AIMessage { name: None, } } + MessageContent::Multimodal { text, images } => { + let mut content = text; + if !images.is_empty() { + content.push_str("\n\n[Attached image(s):\n"); + for image in images { + let name = image + .metadata + .as_ref() + .and_then(|m| m.get("name")) + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .or_else(|| { + image + .image_path + .as_ref() + .filter(|s| !s.is_empty()) + .cloned() + }) + .unwrap_or_else(|| image.id.clone()); + + content.push_str(&format!("- {} ({})\n", name, image.mime_type)); + } + content.push(']'); + } + + Self { + role: "user".to_string(), + content: Some(content), + reasoning_content: None, + thinking_signature: None, + tool_calls: None, + tool_call_id: None, + name: None, + } + } MessageContent::Mixed { reasoning_content, text, @@ -213,6 +254,16 @@ impl Message { } } + pub fn user_multimodal(text: String, images: Vec) -> Self { + Self { + id: Uuid::new_v4().to_string(), + role: MessageRole::User, + content: MessageContent::Multimodal { text, images }, + timestamp: SystemTime::now(), + metadata: MessageMetadata::default(), + } + } + pub fn assistant(text: String) -> Self { Self { id: Uuid::new_v4().to_string(), @@ -277,10 +328,13 @@ impl Message { if self.role != MessageRole::User { return false; } - if let MessageContent::Text(text) = &self.content { - if text.starts_with("") { - return false; - } + let text = match &self.content { + MessageContent::Text(text) => Some(text.as_str()), + MessageContent::Multimodal { text, .. } => Some(text.as_str()), + _ => None, + }; + if text.is_some_and(|t| t.starts_with("")) { + return false; } true } @@ -308,16 +362,92 @@ impl Message { if let Some(tokens) = self.metadata.tokens { return tokens; } - let tokens = TokenCounter::estimate_message_tokens(&AIMessage::from(&*self)); + let tokens = self.estimate_tokens(); self.metadata.tokens = Some(tokens); tokens } + + fn estimate_image_tokens(metadata: Option<&serde_json::Value>) -> usize { + let (width, height) = metadata + .and_then(|m| { + let w = m.get("width").and_then(|v| v.as_u64()); + let h = m.get("height").and_then(|v| v.as_u64()); + match (w, h) { + (Some(w), Some(h)) if w > 0 && h > 0 => Some((w as u32, h as u32)), + _ => None, + } + }) + .unwrap_or((1024, 1024)); + + let tiles_w = (width + 511) / 512; + let tiles_h = (height + 511) / 512; + let tiles = (tiles_w.max(1) * tiles_h.max(1)) as usize; + 50 + tiles * 200 + } + + fn estimate_tokens(&self) -> usize { + let mut total = 0usize; + total += 4; + + match &self.content { + MessageContent::Text(text) => { + total += TokenCounter::estimate_tokens(text); + } + MessageContent::Multimodal { text, images } => { + total += TokenCounter::estimate_tokens(text); + for image in images { + total += Self::estimate_image_tokens(image.metadata.as_ref()); + } + } + MessageContent::Mixed { + reasoning_content, + text, + tool_calls, + } => { + if self.metadata.keep_thinking { + if let Some(reasoning) = reasoning_content.as_ref() { + total += TokenCounter::estimate_tokens(reasoning); + } + } + total += TokenCounter::estimate_tokens(text); + + for tool_call in tool_calls { + total += TokenCounter::estimate_tokens(&tool_call.tool_name); + if let Ok(json_str) = serde_json::to_string(&tool_call.arguments) { + total += TokenCounter::estimate_tokens(&json_str); + } + total += 10; + } + } + MessageContent::ToolResult { + tool_name, + result, + result_for_assistant, + .. + } => { + if let Some(text) = result_for_assistant.as_ref().filter(|s| !s.is_empty()) { + total += TokenCounter::estimate_tokens(text); + } else if let Ok(json_str) = serde_json::to_string(result) { + total += TokenCounter::estimate_tokens(&json_str); + } else { + total += TokenCounter::estimate_tokens(tool_name); + } + } + } + + total + } } impl ToString for MessageContent { fn to_string(&self) -> String { match self { MessageContent::Text(text) => text.clone(), + MessageContent::Multimodal { text, images } => format!( + "Multimodal: text_length={}, images={}", + text.len(), + images.len() + ), MessageContent::ToolResult { tool_id, tool_name, diff --git a/src/crates/core/src/agentic/core/messages_helper.rs b/src/crates/core/src/agentic/core/messages_helper.rs index 203701e1..1d4c5fc1 100644 --- a/src/crates/core/src/agentic/core/messages_helper.rs +++ b/src/crates/core/src/agentic/core/messages_helper.rs @@ -13,22 +13,32 @@ impl MessageHelper { return; } if !enable_thinking { - messages - .iter_mut() - .for_each(|m| m.metadata.keep_thinking = false); + messages.iter_mut().for_each(|m| { + if m.metadata.keep_thinking { + m.metadata.keep_thinking = false; + m.metadata.tokens = None; + } + }); } else if support_preserved_thinking { - messages - .iter_mut() - .for_each(|m| m.metadata.keep_thinking = true); + messages.iter_mut().for_each(|m| { + if !m.metadata.keep_thinking { + m.metadata.keep_thinking = true; + m.metadata.tokens = None; + } + }); } else { let last_message_turn_id = messages.last().and_then(|m| m.metadata.turn_id.clone()); if let Some(last_turn_id) = last_message_turn_id { messages.iter_mut().for_each(|m| { - m.metadata.keep_thinking = m + let keep_thinking = m .metadata .turn_id .as_ref() .is_some_and(|cur_turn_id| cur_turn_id == &last_turn_id); + if m.metadata.keep_thinking != keep_thinking { + m.metadata.keep_thinking = keep_thinking; + m.metadata.tokens = None; + } }) } else { // Find the index of the last user message (role is user and not ) from back to front @@ -38,15 +48,21 @@ impl MessageHelper { // Messages from the last user message onwards are messages for this turn messages.iter_mut().enumerate().for_each(|(index, m)| { let keep_thinking = index >= last_user_message_index; - m.metadata.keep_thinking = keep_thinking; + if m.metadata.keep_thinking != keep_thinking { + m.metadata.keep_thinking = keep_thinking; + m.metadata.tokens = None; + } }) } else { // No user message found, should not reach here in practice warn!("compute_keep_thinking_flags: no user message found"); - messages - .iter_mut() - .for_each(|m| m.metadata.keep_thinking = false); + messages.iter_mut().for_each(|m| { + if m.metadata.keep_thinking { + m.metadata.keep_thinking = false; + m.metadata.tokens = None; + } + }); } } } diff --git a/src/crates/core/src/agentic/execution/execution_engine.rs b/src/crates/core/src/agentic/execution/execution_engine.rs index 70512430..61adeb16 100644 --- a/src/crates/core/src/agentic/execution/execution_engine.rs +++ b/src/crates/core/src/agentic/execution/execution_engine.rs @@ -5,18 +5,25 @@ use super::round_executor::RoundExecutor; use super::types::{ExecutionContext, ExecutionResult, RoundContext}; use crate::agentic::agents::get_agent_registry; -use crate::agentic::core::{Message, MessageHelper}; +use crate::agentic::core::{Message, MessageContent, MessageHelper}; use crate::agentic::events::{AgenticEvent, EventPriority, EventQueue}; +use crate::agentic::image_analysis::{ + build_multimodal_message_with_images, process_image_contexts_for_provider, ImageContextData, + ImageLimits, +}; use crate::agentic::session::SessionManager; use crate::agentic::tools::{get_all_registered_tools, SubagentParentInfo}; use crate::infrastructure::ai::get_global_ai_client_factory; use crate::infrastructure::get_workspace_path; +use crate::service::config::get_global_config_service; +use crate::service::config::types::{ModelCapability, ModelCategory}; use crate::util::errors::{BitFunError, BitFunResult}; use crate::util::token_counter::TokenCounter; use crate::util::types::Message as AIMessage; use crate::util::types::ToolDefinition; use log::{debug, error, info, trace, warn}; use std::collections::HashMap; +use std::path::Path; use std::sync::Arc; use tokio_util::sync::CancellationToken; @@ -55,6 +62,146 @@ impl ExecutionEngine { } } + fn estimate_request_tokens_internal( + messages: &mut [Message], + tools: Option<&[ToolDefinition]>, + ) -> usize { + let mut total: usize = messages.iter_mut().map(|m| m.get_tokens()).sum(); + total += 3; + + if let Some(tool_defs) = tools { + total += TokenCounter::estimate_tool_definitions_tokens(tool_defs); + } + + total + } + + fn is_redacted_image_context(image: &ImageContextData) -> bool { + let missing_path = image + .image_path + .as_ref() + .map(|s| s.trim().is_empty()) + .unwrap_or(true); + let missing_data_url = image + .data_url + .as_ref() + .map(|s| s.trim().is_empty()) + .unwrap_or(true); + let has_redaction_hint = image + .metadata + .as_ref() + .and_then(|m| m.get("has_data_url")) + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + missing_path && missing_data_url && has_redaction_hint + } + + fn is_recoverable_historical_image_error(err: &BitFunError) -> bool { + match err { + BitFunError::Io(_) | BitFunError::Deserialization(_) => true, + BitFunError::Validation(msg) => { + msg.starts_with("Failed to decode image data") + || msg.starts_with("Unsupported or unrecognized image format") + || msg.starts_with("Invalid data URL format") + || msg.starts_with("Data URL format error") + } + _ => false, + } + } + + fn can_fallback_to_text_only( + images: &[ImageContextData], + err: &BitFunError, + is_current_turn_message: bool, + ) -> bool { + let is_redacted_payload_error = matches!( + err, + BitFunError::Validation(msg) if msg.starts_with("Image context missing image_path/data_url") + ) && !images.is_empty() + && images.iter().all(Self::is_redacted_image_context); + + if is_redacted_payload_error { + return true; + } + + if is_current_turn_message { + return false; + } + + Self::is_recoverable_historical_image_error(err) + } + + async fn build_ai_messages_for_send( + messages: &[Message], + provider: &str, + workspace_path: Option<&Path>, + current_turn_id: &str, + ) -> BitFunResult> { + let limits = ImageLimits::for_provider(provider); + + let mut result = Vec::with_capacity(messages.len()); + let mut attached_image_count = 0usize; + + for msg in messages { + match &msg.content { + MessageContent::Multimodal { text, images } => { + let prompt = if text.trim().is_empty() { + "(image attached)".to_string() + } else { + text.clone() + }; + + match process_image_contexts_for_provider(images, provider, workspace_path) + .await + { + Ok(processed) => { + let next_count = attached_image_count + processed.len(); + if next_count > limits.max_images_per_request { + return Err(BitFunError::validation(format!( + "Too many images in one request: {} > {}", + next_count, limits.max_images_per_request + ))); + } + attached_image_count = next_count; + + let multimodal = build_multimodal_message_with_images( + &prompt, &processed, provider, + )?; + result.extend(multimodal); + } + Err(err) => { + if matches!(&err, BitFunError::Validation(msg) if msg.starts_with("Too many images in one request")) + { + return Err(err); + } + let is_current_turn_message = + msg.metadata.turn_id.as_deref() == Some(current_turn_id); + if Self::can_fallback_to_text_only( + images, + &err, + is_current_turn_message, + ) { + // Degrade only for historical multimodal messages. Current-turn + // image failures should still surface to users. + warn!( + "Failed to rebuild multimodal payload, falling back to text-only message: message_id={}, provider={}, turn_id={:?}, current_turn_id={}, error={}", + msg.id, provider, msg.metadata.turn_id, current_turn_id, err + ); + result.push(AIMessage::from(msg)); + } else { + return Err(err); + } + } + } + } + _ => result.push(AIMessage::from(msg)), + } + } + + Ok(result) + } + /// Compress context, will emit compression events (Started, Completed, and Failed) pub async fn compress_messages( &self, @@ -66,7 +213,7 @@ impl ExecutionEngine { context_window: usize, tool_definitions: &Option>, system_prompt_message: Message, - ) -> BitFunResult, Vec)>> { + ) -> BitFunResult)>> { let event_subagent_parent_info = subagent_parent_info.map(|info| info.clone().into()); let mut session = self .session_manager @@ -134,10 +281,8 @@ impl ExecutionEngine { let duration_ms = start_time.elapsed().as_millis() as u64; // Recalculate tokens after compression - let new_ai_messages: Vec = - MessageHelper::convert_messages(&new_messages); - let compressed_tokens = TokenCounter::estimate_request_tokens( - &new_ai_messages, + let compressed_tokens = Self::estimate_request_tokens_internal( + &mut new_messages, tool_definitions.as_deref(), ); @@ -159,7 +304,7 @@ impl ExecutionEngine { ) .await; - Ok(Some((compressed_tokens, new_messages, new_ai_messages))) + Ok(Some((compressed_tokens, new_messages))) } Err(e) => { // Emit compression failed event @@ -353,6 +498,83 @@ impl ExecutionEngine { let support_preserved_thinking = ai_client.config.support_preserved_thinking; let context_window = ai_client.config.context_window as usize; + // Detect whether the primary model supports multimodal image inputs. + // This is used by tools like `view_image` to decide between: + // - attaching image content for the primary model to analyze directly, or + // - using a dedicated vision model to pre-analyze into text. + let (resolved_primary_model_id, primary_supports_image_understanding) = { + let config_service = get_global_config_service().await.ok(); + if let Some(service) = config_service { + let ai_config: crate::service::config::types::AIConfig = + service.get_config(Some("ai")).await.unwrap_or_default(); + + let resolved_id = match model_id.as_str() { + "primary" => ai_config + .default_models + .primary + .clone() + .unwrap_or_else(|| model_id.clone()), + "fast" => ai_config + .default_models + .fast + .clone() + .or_else(|| ai_config.default_models.primary.clone()) + .unwrap_or_else(|| model_id.clone()), + _ => model_id.clone(), + }; + + let model_cfg = ai_config + .models + .iter() + .find(|m| m.id == resolved_id) + .or_else(|| ai_config.models.iter().find(|m| m.name == resolved_id)) + .or_else(|| { + ai_config + .models + .iter() + .find(|m| m.model_name == resolved_id) + }) + .or_else(|| { + ai_config.models.iter().find(|m| { + m.model_name == ai_client.config.model + && m.provider == ai_client.config.format + }) + }); + + let supports = model_cfg.is_some_and(|m| { + m.capabilities + .iter() + .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding)) + || matches!(m.category, ModelCategory::Multimodal) + }); + + (resolved_id, supports) + } else { + warn!( + "Config service unavailable, assuming primary model is text-only for image input gating" + ); + (model_id.clone(), false) + } + }; + + let mut execution_context_vars = context.context.clone(); + execution_context_vars.insert( + "primary_model_id".to_string(), + resolved_primary_model_id.clone(), + ); + execution_context_vars.insert( + "primary_model_name".to_string(), + ai_client.config.model.clone(), + ); + execution_context_vars.insert( + "primary_model_provider".to_string(), + ai_client.config.format.clone(), + ); + execution_context_vars.insert( + "primary_model_supports_image_understanding".to_string(), + primary_supports_image_understanding.to_string(), + ); + // Loop to execute model rounds loop { // Check round limit @@ -369,11 +591,10 @@ impl ExecutionEngine { enable_thinking, support_preserved_thinking, ); - let mut ai_messages = MessageHelper::convert_messages(&messages); // Check and compress before sending AI request let current_tokens = - TokenCounter::estimate_request_tokens(&ai_messages, tool_definitions.as_deref()); + Self::estimate_request_tokens_internal(&mut messages, tool_definitions.as_deref()); debug!( "Round {} token usage before send: {} / {} tokens ({:.1}%)", round_index, @@ -414,7 +635,7 @@ impl ExecutionEngine { ) .await { - Ok(Some((compressed_tokens, compressed_messages, compressed_ai_messages))) => { + Ok(Some((compressed_tokens, compressed_messages))) => { info!( "Round {} compression completed: messages {} -> {}, tokens {} -> {}", round_index, @@ -425,7 +646,6 @@ impl ExecutionEngine { ); messages = compressed_messages; - ai_messages = compressed_ai_messages; } Ok(None) => { debug!("All turns need to be kept, no compression performed"); @@ -440,7 +660,7 @@ impl ExecutionEngine { } // Create round context - let mut round_context_vars = context.context.clone(); + let mut round_context_vars = execution_context_vars.clone(); if context.skip_tool_confirmation { round_context_vars.insert("skip_tool_confirmation".to_string(), "true".to_string()); } @@ -452,11 +672,7 @@ impl ExecutionEngine { round_number: round_index, messages: messages.clone(), available_tools: available_tools.clone(), - model_name: context - .context - .get("model_name") - .cloned() - .unwrap_or_else(|| "default".to_string()), + model_name: ai_client.config.model.clone(), agent_type: agent_type.clone(), context_vars: round_context_vars, cancellation_token: CancellationToken::new(), @@ -469,6 +685,15 @@ impl ExecutionEngine { messages.len() ); + let workspace_path = get_workspace_path(); + let ai_messages = Self::build_ai_messages_for_send( + &messages, + &ai_client.config.format, + workspace_path.as_deref(), + &context.dialog_turn_id, + ) + .await?; + let round_result = self .round_executor .execute_round( diff --git a/src/crates/core/src/agentic/execution/round_executor.rs b/src/crates/core/src/agentic/execution/round_executor.rs index 9a0a8c84..951d31d5 100644 --- a/src/crates/core/src/agentic/execution/round_executor.rs +++ b/src/crates/core/src/agentic/execution/round_executor.rs @@ -6,6 +6,7 @@ use super::stream_processor::StreamProcessor; use super::types::{FinishReason, RoundContext, RoundResult}; use crate::agentic::core::Message; use crate::agentic::events::{AgenticEvent, EventPriority, EventQueue}; +use crate::agentic::image_analysis::ImageContextData as ModelImageContextData; use crate::agentic::tools::pipeline::{ToolExecutionContext, ToolExecutionOptions, ToolPipeline}; use crate::agentic::tools::registry::get_global_tool_registry; use crate::agentic::MessageContent; @@ -16,6 +17,7 @@ use crate::util::types::Message as AIMessage; use crate::util::types::ToolDefinition; use dashmap::DashMap; use log::{debug, error, warn}; +use serde_json::Value as JsonValue; use std::sync::Arc; use std::time::Duration; use tokio_util::sync::CancellationToken; @@ -455,7 +457,32 @@ impl RoundExecutor { // Create tool result messages (also need to set turn_id and round_id) let dialog_turn_id = context.dialog_turn_id.clone(); let round_id_clone = round_id.clone(); - let tool_result_messages: Vec = tool_results + let primary_supports_images = context + .context_vars + .get("primary_model_supports_image_understanding") + .and_then(|v| v.parse::().ok()) + .unwrap_or(false); + let extract_attached_image = |result: &JsonValue| -> Option { + if !primary_supports_images { + return None; + } + let mode = result.get("mode").and_then(|v| v.as_str())?; + if mode != "attached_to_primary_model" { + return None; + } + let image_value = result.get("image")?; + serde_json::from_value::(image_value.clone()).ok() + }; + let mut injected_images = Vec::new(); + for result in &tool_results { + if result.tool_name == "view_image" && !result.is_error { + if let Some(image_ctx) = extract_attached_image(&result.result) { + injected_images.push(image_ctx); + } + } + } + + let mut tool_result_messages: Vec = tool_results .into_iter() .map(|result| { Message::tool_result(result) @@ -464,6 +491,18 @@ impl RoundExecutor { }) .collect(); + if !injected_images.is_empty() { + let reminder_text = format!( + "\nAttached {} image(s) from view_image tool.\n", + injected_images.len() + ); + tool_result_messages.push( + Message::user_multimodal(reminder_text, injected_images) + .with_turn_id(dialog_turn_id.clone()) + .with_round_id(round_id_clone.clone()), + ); + } + let has_more_rounds = !has_end_turn_tool && !tool_result_messages.is_empty(); debug!( diff --git a/src/crates/core/src/agentic/image_analysis/image_processing.rs b/src/crates/core/src/agentic/image_analysis/image_processing.rs index 88d2184d..d4cd763e 100644 --- a/src/crates/core/src/agentic/image_analysis/image_processing.rs +++ b/src/crates/core/src/agentic/image_analysis/image_processing.rs @@ -1,8 +1,10 @@ //! Shared image processing utilities used by both API-side image analysis and tool-driven image analysis. -use super::types::ImageLimits; +use super::types::{ImageContextData, ImageLimits}; use crate::service::config::get_global_config_service; -use crate::service::config::types::{AIConfig as ServiceAIConfig, AIModelConfig, ModelCapability}; +use crate::service::config::types::{ + AIConfig as ServiceAIConfig, AIModelConfig, ModelCapability, ModelCategory, +}; use crate::util::errors::{BitFunError, BitFunResult}; use crate::util::types::Message; use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; @@ -31,34 +33,41 @@ pub fn resolve_vision_model_from_ai_config( let target_model_id = ai_config .default_models .image_understanding - .as_ref() + .as_deref() + .map(str::trim) .filter(|id| !id.is_empty()); - if let Some(id) = target_model_id { - return ai_config - .models - .iter() - .find(|m| m.id == *id) - .cloned() - .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id))); - } + let Some(id) = target_model_id else { + return Err(BitFunError::service( + "Image understanding model is not configured.\nPlease select a model in Settings." + .to_string(), + )); + }; - ai_config + let model = ai_config .models .iter() - .find(|m| { - m.enabled - && m.capabilities - .iter() - .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding)) - }) + .find(|m| m.id == id) .cloned() - .ok_or_else(|| { - BitFunError::service( - "No image understanding model found.\nPlease configure an image understanding model in settings" - .to_string(), - ) - }) + .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id)))?; + + if !model.enabled { + return Err(BitFunError::service(format!("Model is disabled: {}", id))); + } + + let supports_image_understanding = model + .capabilities + .iter() + .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding)) + || matches!(model.category, ModelCategory::Multimodal); + if !supports_image_understanding { + return Err(BitFunError::service(format!( + "Model does not support image understanding: {}", + id + ))); + } + + Ok(model) } pub async fn resolve_vision_model_from_global_config() -> BitFunResult { @@ -275,6 +284,105 @@ pub fn build_multimodal_message( Ok(vec![message]) } +pub async fn process_image_contexts_for_provider( + image_contexts: &[ImageContextData], + provider: &str, + workspace_path: Option<&Path>, +) -> BitFunResult> { + let limits = ImageLimits::for_provider(provider); + + if image_contexts.len() > limits.max_images_per_request { + return Err(BitFunError::validation(format!( + "Too many images in one request: {} > {}", + image_contexts.len(), + limits.max_images_per_request + ))); + } + + let mut results = Vec::with_capacity(image_contexts.len()); + + for ctx in image_contexts { + let (image_data, fallback_mime) = if let Some(data_url) = &ctx.data_url { + let (data, data_url_mime) = decode_data_url(data_url)?; + (data, data_url_mime.or_else(|| Some(ctx.mime_type.clone()))) + } else if let Some(path_str) = &ctx.image_path { + let path = resolve_image_path(path_str, workspace_path)?; + let data = load_image_from_path(&path, workspace_path).await?; + let detected_mime = detect_mime_type_from_bytes(&data, Some(&ctx.mime_type)).ok(); + (data, detected_mime.or_else(|| Some(ctx.mime_type.clone()))) + } else { + return Err(BitFunError::validation(format!( + "Image context missing image_path/data_url: id={}", + ctx.id + ))); + }; + + let processed = + optimize_image_for_provider(image_data, provider, fallback_mime.as_deref())?; + results.push(processed); + } + + Ok(results) +} + +pub fn build_multimodal_message_with_images( + prompt: &str, + images: &[ProcessedImage], + provider: &str, +) -> BitFunResult> { + if images.is_empty() { + return Ok(vec![Message::user(prompt.to_string())]); + } + + let provider_lower = provider.to_lowercase(); + + let content_json = if provider_lower.contains("anthropic") { + let mut blocks = Vec::with_capacity(images.len() + 1); + for img in images { + let base64_data = BASE64.encode(&img.data); + blocks.push(json!({ + "type": "image", + "source": { + "type": "base64", + "media_type": img.mime_type, + "data": base64_data + } + })); + } + blocks.push(json!({ + "type": "text", + "text": prompt + })); + json!(blocks) + } else { + let mut blocks = Vec::with_capacity(images.len() + 1); + for img in images { + let base64_data = BASE64.encode(&img.data); + blocks.push(json!({ + "type": "image_url", + "image_url": { + "url": format!("data:{};base64,{}", img.mime_type, base64_data) + } + })); + } + blocks.push(json!({ + "type": "text", + "text": prompt + })); + json!(blocks) + }; + + Ok(vec![Message { + role: "user".to_string(), + content: Some(serde_json::to_string(&content_json)?), + reasoning_content: None, + thinking_signature: None, + tool_calls: None, + tool_call_id: None, + name: None, + }]) +} + fn image_format_to_mime(format: ImageFormat) -> Option<&'static str> { match format { ImageFormat::Png => Some("image/png"), diff --git a/src/crates/core/src/agentic/image_analysis/mod.rs b/src/crates/core/src/agentic/image_analysis/mod.rs index 814afb66..4ba156f8 100644 --- a/src/crates/core/src/agentic/image_analysis/mod.rs +++ b/src/crates/core/src/agentic/image_analysis/mod.rs @@ -10,8 +10,9 @@ pub mod types; pub use enhancer::MessageEnhancer; pub use image_processing::{ build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path, - optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_ai_config, - resolve_vision_model_from_global_config, ProcessedImage, + optimize_image_for_provider, process_image_contexts_for_provider, resolve_image_path, + resolve_vision_model_from_ai_config, resolve_vision_model_from_global_config, + build_multimodal_message_with_images, ProcessedImage, }; pub use processor::ImageAnalyzer; pub use types::*; diff --git a/src/crates/core/src/agentic/persistence/manager.rs b/src/crates/core/src/agentic/persistence/manager.rs index 57718554..7fd79015 100644 --- a/src/crates/core/src/agentic/persistence/manager.rs +++ b/src/crates/core/src/agentic/persistence/manager.rs @@ -2,7 +2,7 @@ //! //! Responsible for persistent storage of sessions, messages, and tool states -use crate::agentic::core::{DialogTurn, Message, Session, SessionState, SessionSummary}; +use crate::agentic::core::{DialogTurn, Message, MessageContent, Session, SessionState, SessionSummary}; use crate::infrastructure::PathManager; use crate::util::errors::{BitFunError, BitFunResult}; use log::{debug, info, warn}; @@ -46,6 +46,65 @@ impl PersistenceManager { Ok(dir) } + fn sanitize_messages_for_persistence(messages: &[Message]) -> Vec { + messages + .iter() + .map(Self::sanitize_message_for_persistence) + .collect() + } + + fn sanitize_message_for_persistence(message: &Message) -> Message { + let mut sanitized = message.clone(); + + match &mut sanitized.content { + MessageContent::Multimodal { images, .. } => { + for image in images.iter_mut() { + if image.data_url.as_ref().is_some_and(|v| !v.is_empty()) { + image.data_url = None; + + let mut metadata = image + .metadata + .take() + .unwrap_or_else(|| serde_json::json!({})); + if !metadata.is_object() { + metadata = serde_json::json!({ "raw_metadata": metadata }); + } + if let Some(obj) = metadata.as_object_mut() { + obj.insert("has_data_url".to_string(), serde_json::json!(true)); + } + image.metadata = Some(metadata); + } + } + } + MessageContent::ToolResult { result, .. } => { + Self::redact_data_url_in_json(result); + } + _ => {} + } + + sanitized + } + + fn redact_data_url_in_json(value: &mut serde_json::Value) { + match value { + serde_json::Value::Object(map) => { + let had_data_url = map.remove("data_url").is_some(); + if had_data_url { + map.insert("has_data_url".to_string(), serde_json::json!(true)); + } + for child in map.values_mut() { + Self::redact_data_url_in_json(child); + } + } + serde_json::Value::Array(arr) => { + for child in arr { + Self::redact_data_url_in_json(child); + } + } + _ => {} + } + } + // ============ Turn context snapshot (sent to model)============ fn context_snapshots_dir(&self, session_id: &str) -> PathBuf { @@ -70,7 +129,8 @@ impl PersistenceManager { .map_err(|e| BitFunError::io(format!("Failed to create context_snapshots directory: {}", e)))?; let snapshot_path = self.context_snapshot_path(session_id, turn_index); - let json = serde_json::to_string(messages).map_err(|e| { + let sanitized_messages = Self::sanitize_messages_for_persistence(messages); + let json = serde_json::to_string(&sanitized_messages).map_err(|e| { BitFunError::serialization(format!("Failed to serialize turn context snapshot: {}", e)) })?; fs::write(&snapshot_path, json) @@ -312,7 +372,8 @@ impl PersistenceManager { let dir = self.ensure_session_dir(session_id).await?; let messages_path = dir.join("messages.jsonl"); - let json = serde_json::to_string(message) + let sanitized_message = Self::sanitize_message_for_persistence(message); + let json = serde_json::to_string(&sanitized_message) .map_err(|e| BitFunError::serialization(format!("Failed to serialize message: {}", e)))?; let mut file = fs::OpenOptions::new() @@ -397,7 +458,8 @@ impl PersistenceManager { let dir = self.ensure_session_dir(session_id).await?; let compressed_path = dir.join("compressed_messages.jsonl"); - let json = serde_json::to_string(message) + let sanitized_message = Self::sanitize_message_for_persistence(message); + let json = serde_json::to_string(&sanitized_message) .map_err(|e| BitFunError::serialization(format!("Failed to serialize compressed message: {}", e)))?; let mut file = fs::OpenOptions::new() @@ -435,8 +497,10 @@ impl PersistenceManager { .await .map_err(|e| BitFunError::io(format!("Failed to open compressed message file: {}", e)))?; + let sanitized_messages = Self::sanitize_messages_for_persistence(messages); + // Write all messages - for message in messages { + for message in &sanitized_messages { let json = serde_json::to_string(message) .map_err(|e| BitFunError::serialization(format!("Failed to serialize compressed message: {}", e)))?; diff --git a/src/crates/core/src/agentic/session/session_manager.rs b/src/crates/core/src/agentic/session/session_manager.rs index f26042ce..689d8b02 100644 --- a/src/crates/core/src/agentic/session/session_manager.rs +++ b/src/crates/core/src/agentic/session/session_manager.rs @@ -6,6 +6,7 @@ use crate::agentic::core::{ CompressionState, DialogTurn, DialogTurnState, Message, ProcessingPhase, Session, SessionConfig, SessionState, SessionSummary, TurnStats, }; +use crate::agentic::image_analysis::ImageContextData; use crate::agentic::persistence::PersistenceManager; use crate::agentic::session::{CompressionManager, MessageHistoryManager}; use crate::infrastructure::ai::get_global_ai_client_factory; @@ -463,6 +464,7 @@ impl SessionManager { session_id: &str, user_input: String, turn_id: Option, + image_contexts: Option>, ) -> BitFunResult { // Check if session exists let session = self @@ -491,7 +493,13 @@ impl SessionManager { } // 2. Add user message to history and compression managers - let user_message = Message::user(user_input).with_turn_id(turn_id.clone()); + let user_message = if let Some(images) = + image_contexts.as_ref().filter(|v| !v.is_empty()).cloned() + { + Message::user_multimodal(user_input, images).with_turn_id(turn_id.clone()) + } else { + Message::user(user_input).with_turn_id(turn_id.clone()) + }; self.history_manager .add_message(session_id, user_message.clone()) .await?; diff --git a/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs index cbd59b2f..20ff31bc 100644 --- a/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs @@ -5,13 +5,17 @@ //! that can evolve toward direct multimodal attachment in the future. use async_trait::async_trait; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; +use image::GenericImageView; use log::{debug, info, trace}; use serde::Deserialize; use serde_json::{json, Value}; +use uuid::Uuid; use crate::agentic::image_analysis::{ build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path, optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_global_config, + ImageContextData as ModelImageContextData, }; use crate::agentic::tools::framework::{ Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult, @@ -43,6 +47,26 @@ impl ViewImageTool { Self } + fn primary_model_supports_images(context: &ToolUseContext) -> bool { + context + .options + .as_ref() + .and_then(|o| o.custom_data.as_ref()) + .and_then(|m| m.get("primary_model_supports_image_understanding")) + .and_then(|v| v.as_bool()) + .unwrap_or(false) + } + + fn primary_model_provider(context: &ToolUseContext) -> Option<&str> { + context + .options + .as_ref() + .and_then(|o| o.custom_data.as_ref()) + .and_then(|m| m.get("primary_model_provider")) + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + } + fn build_prompt( &self, analysis_prompt: Option<&str>, @@ -80,6 +104,181 @@ impl ViewImageTool { prompt } + async fn build_attachment_image_context( + &self, + input_data: &ViewImageInput, + context: &ToolUseContext, + primary_provider: &str, + ) -> BitFunResult<(ModelImageContextData, String)> { + let workspace_path = get_workspace_path(); + + if let Some(image_id) = &input_data.image_id { + let provider = context.image_context_provider.as_ref().ok_or_else(|| { + BitFunError::tool( + "image_id mode requires ImageContextProvider support, but no provider was injected.\n\ + Please inject image_context_provider when calling the tool, or use image_path/data_url mode." + .to_string(), + ) + })?; + + let ctx = provider.get_image(image_id).ok_or_else(|| { + BitFunError::tool(format!( + "Image context not found: image_id={}. Image may have expired (5-minute validity) or was never uploaded.", + image_id + )) + })?; + + let crate::agentic::tools::image_context::ImageContextData { + id: ctx_id, + image_path: ctx_image_path, + data_url: ctx_data_url, + mime_type: ctx_mime_type, + image_name: ctx_image_name, + file_size: ctx_file_size, + width: ctx_width, + height: ctx_height, + source: ctx_source, + } = ctx; + + let description = format!("{} (clipboard)", ctx_image_name); + + if let Some(path_str) = ctx_image_path.as_ref().filter(|s| !s.is_empty()) { + let path = resolve_image_path(path_str, workspace_path.as_deref())?; + let metadata = json!({ + "name": ctx_image_name, + "width": ctx_width, + "height": ctx_height, + "file_size": ctx_file_size, + "source": ctx_source, + "origin": "image_id", + "image_id": ctx_id.clone(), + }); + + return Ok(( + ModelImageContextData { + id: ctx_id, + image_path: Some(path.display().to_string()), + data_url: None, + mime_type: ctx_mime_type, + metadata: Some(metadata), + }, + description, + )); + } + + if let Some(data_url) = ctx_data_url.as_ref().filter(|s| !s.is_empty()) { + let (data, data_url_mime) = decode_data_url(data_url)?; + let fallback_mime = data_url_mime + .as_deref() + .or_else(|| Some(ctx_mime_type.as_str())); + let processed = + optimize_image_for_provider(data, primary_provider, fallback_mime)?; + let optimized_data_url = format!( + "data:{};base64,{}", + processed.mime_type, + BASE64.encode(&processed.data) + ); + + let metadata = json!({ + "name": ctx_image_name, + "width": processed.width, + "height": processed.height, + "file_size": processed.data.len(), + "source": ctx_source, + "origin": "image_id", + "image_id": ctx_id.clone(), + }); + + return Ok(( + ModelImageContextData { + id: ctx_id, + image_path: None, + data_url: Some(optimized_data_url), + mime_type: processed.mime_type, + metadata: Some(metadata), + }, + description, + )); + } + + return Err(BitFunError::tool(format!( + "Image context {} has neither data_url nor image_path", + image_id + ))); + } + + if let Some(data_url) = &input_data.data_url { + let (data, data_url_mime) = decode_data_url(data_url)?; + let processed = + optimize_image_for_provider(data, primary_provider, data_url_mime.as_deref())?; + let optimized_data_url = format!( + "data:{};base64,{}", + processed.mime_type, + BASE64.encode(&processed.data) + ); + let metadata = json!({ + "name": "clipboard_image", + "width": processed.width, + "height": processed.height, + "file_size": processed.data.len(), + "source": "data_url", + "origin": "data_url" + }); + + return Ok(( + ModelImageContextData { + id: format!("img-view-{}", Uuid::new_v4()), + image_path: None, + data_url: Some(optimized_data_url), + mime_type: processed.mime_type, + metadata: Some(metadata), + }, + "clipboard_image".to_string(), + )); + } + + if let Some(image_path_str) = &input_data.image_path { + let abs_path = resolve_image_path(image_path_str, workspace_path.as_deref())?; + let data = load_image_from_path(&abs_path, workspace_path.as_deref()).await?; + + let mime_type = detect_mime_type_from_bytes(&data, None)?; + let dynamic = image::load_from_memory(&data).map_err(|e| { + BitFunError::validation(format!("Failed to decode image data: {}", e)) + })?; + let (width, height) = dynamic.dimensions(); + + let name = abs_path + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or("image") + .to_string(); + + let metadata = json!({ + "name": name, + "width": width, + "height": height, + "file_size": data.len(), + "source": "local_path", + "origin": "image_path" + }); + + return Ok(( + ModelImageContextData { + id: format!("img-view-{}", Uuid::new_v4()), + image_path: Some(abs_path.display().to_string()), + data_url: None, + mime_type, + metadata: Some(metadata), + }, + abs_path.display().to_string(), + )); + } + + Err(BitFunError::validation( + "Must provide one of image_path, data_url, or image_id", + )) + } + async fn load_source( &self, input_data: &ViewImageInput, @@ -156,8 +355,8 @@ impl Tool for ViewImageTool { Use this tool when the user provides an image (file path, data URL, or uploaded clipboard image_id) and asks questions about it. Current behavior: -- For text-only primary models, this tool converts image content to structured text. -- For multimodal-capable setups, this interface can be extended to direct image attachment in future. +- For text-only primary models, this tool converts image content to structured text (uses the configured image understanding model). +- For multimodal primary models, this tool attaches the image for the primary model to analyze directly. Parameters: - image_path / data_url / image_id: provide one image source @@ -319,6 +518,28 @@ Parameters: let input_data: ViewImageInput = serde_json::from_value(input.clone()) .map_err(|e| BitFunError::parse(format!("Failed to parse input: {}", e)))?; + let primary_provider = Self::primary_model_provider(context).unwrap_or("openai"); + if Self::primary_model_supports_images(context) { + let (image, image_source_description) = self + .build_attachment_image_context(&input_data, context, primary_provider) + .await?; + + let result_for_assistant = format!( + "Image attached for primary model analysis ({})", + image_source_description + ); + + return Ok(vec![ToolResult::Result { + data: json!({ + "success": true, + "mode": "attached_to_primary_model", + "image_source": image_source_description, + "image": image, + }), + result_for_assistant: Some(result_for_assistant), + }]); + } + let (image_data, fallback_mime, image_source_description) = self.load_source(&input_data, context).await?; diff --git a/src/crates/core/src/agentic/tools/pipeline/state_manager.rs b/src/crates/core/src/agentic/tools/pipeline/state_manager.rs index 67d99022..c5d52854 100644 --- a/src/crates/core/src/agentic/tools/pipeline/state_manager.rs +++ b/src/crates/core/src/agentic/tools/pipeline/state_manager.rs @@ -19,6 +19,32 @@ pub struct ToolStateManager { } impl ToolStateManager { + fn sanitize_tool_result_for_event(result: &serde_json::Value) -> serde_json::Value { + let mut sanitized = result.clone(); + Self::redact_data_url_in_json(&mut sanitized); + sanitized + } + + fn redact_data_url_in_json(value: &mut serde_json::Value) { + match value { + serde_json::Value::Object(map) => { + let had_data_url = map.remove("data_url").is_some(); + if had_data_url { + map.insert("has_data_url".to_string(), serde_json::json!(true)); + } + for child in map.values_mut() { + Self::redact_data_url_in_json(child); + } + } + serde_json::Value::Array(arr) => { + for child in arr { + Self::redact_data_url_in_json(child); + } + } + _ => {} + } + } + pub fn new(event_queue: Arc) -> Self { Self { tasks: Arc::new(DashMap::new()), @@ -156,7 +182,7 @@ impl ToolStateManager { ToolExecutionState::Completed { result, duration_ms } => ToolEventData::Completed { tool_id: task.tool_call.tool_id.clone(), tool_name: task.tool_call.tool_name.clone(), - result: result.content(), + result: Self::sanitize_tool_result_for_event(&result.content()), duration_ms: *duration_ms, }, diff --git a/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs b/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs index 3ea85d7f..79db4dd7 100644 --- a/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs +++ b/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs @@ -699,6 +699,40 @@ impl ToolPipeline { map.insert("turn_index".to_string(), serde_json::json!(n)); } } + + if let Some(provider) = task.context.context_vars.get("primary_model_provider") { + if !provider.is_empty() { + map.insert( + "primary_model_provider".to_string(), + serde_json::json!(provider), + ); + } + } + if let Some(model_id) = task.context.context_vars.get("primary_model_id") { + if !model_id.is_empty() { + map.insert("primary_model_id".to_string(), serde_json::json!(model_id)); + } + } + if let Some(model_name) = task.context.context_vars.get("primary_model_name") { + if !model_name.is_empty() { + map.insert( + "primary_model_name".to_string(), + serde_json::json!(model_name), + ); + } + } + if let Some(supports_images) = task + .context + .context_vars + .get("primary_model_supports_image_understanding") + { + if let Ok(flag) = supports_images.parse::() { + map.insert( + "primary_model_supports_image_understanding".to_string(), + serde_json::json!(flag), + ); + } + } map }), @@ -887,4 +921,3 @@ impl ToolPipeline { } } } - diff --git a/src/crates/core/src/infrastructure/ai/client.rs b/src/crates/core/src/infrastructure/ai/client.rs index f883f48d..c0c25d84 100644 --- a/src/crates/core/src/infrastructure/ai/client.rs +++ b/src/crates/core/src/infrastructure/ai/client.rs @@ -30,6 +30,71 @@ pub struct AIClient { } impl AIClient { + const TEST_IMAGE_EXPECTED_CODE: &'static str = "BYGR"; + const TEST_IMAGE_PNG_BASE64: &'static str = + "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAiklEQVR4nNXZwQkAQQzDQEX995wr4giLpgBj8NMDy6XdOc2XOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTuDm+Bzi+B8gvIHESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3G+LvDXB5LJBXz4d6CTAAAAAElFTkSuQmCC"; + + fn image_test_response_matches_expected(response: &str) -> bool { + let upper = response.to_ascii_uppercase(); + + // Accept contiguous letters even when separated by spaces/punctuation. + let letters_only: String = upper.chars().filter(|c| c.is_ascii_alphabetic()).collect(); + if letters_only.contains(Self::TEST_IMAGE_EXPECTED_CODE) { + return true; + } + + let tokens: Vec<&str> = upper + .split(|c: char| !c.is_ascii_alphabetic()) + .filter(|s| !s.is_empty()) + .collect(); + + if tokens + .iter() + .any(|token| *token == Self::TEST_IMAGE_EXPECTED_CODE) + { + return true; + } + + // Accept outputs like: "B Y G R". + let single_letter_stream: String = tokens + .iter() + .filter_map(|token| { + if token.len() == 1 { + let ch = token.chars().next()?; + if matches!(ch, 'R' | 'G' | 'B' | 'Y') { + return Some(ch); + } + } + None + }) + .collect(); + if single_letter_stream.contains(Self::TEST_IMAGE_EXPECTED_CODE) { + return true; + } + + // Accept outputs like: "Blue, Yellow, Green, Red". + let color_word_stream: String = tokens + .iter() + .filter_map(|token| match *token { + "RED" => Some('R'), + "GREEN" => Some('G'), + "BLUE" => Some('B'), + "YELLOW" => Some('Y'), + _ => None, + }) + .collect(); + if color_word_stream.contains(Self::TEST_IMAGE_EXPECTED_CODE) { + return true; + } + + // Last fallback: keep only RGBY letters and search code. + let color_letter_stream: String = upper + .chars() + .filter(|c| matches!(*c, 'R' | 'G' | 'B' | 'Y')) + .collect(); + color_letter_stream.contains(Self::TEST_IMAGE_EXPECTED_CODE) + } + /// Create an AIClient without proxy (backward compatible) pub fn new(config: AIConfig) -> Self { let skip_ssl_verify = config.skip_ssl_verify; @@ -931,4 +996,87 @@ impl AIClient { } } } + + pub async fn test_image_input_connection(&self) -> Result { + let start_time = std::time::Instant::now(); + let provider = self.config.format.to_ascii_lowercase(); + let prompt = "Inspect the attached image and reply with exactly one 4-letter code for quadrant colors in TL,TR,BL,BR order using letters R,G,B,Y (R=red, G=green, B=blue, Y=yellow)."; + + let content = if provider == "anthropic" { + serde_json::json!([ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": Self::TEST_IMAGE_PNG_BASE64 + } + }, + { + "type": "text", + "text": prompt + } + ]) + } else { + serde_json::json!([ + { + "type": "image_url", + "image_url": { + "url": format!("data:image/png;base64,{}", Self::TEST_IMAGE_PNG_BASE64) + } + }, + { + "type": "text", + "text": prompt + } + ]) + }; + + let test_messages = vec![Message { + role: "user".to_string(), + content: Some(content.to_string()), + reasoning_content: None, + thinking_signature: None, + tool_calls: None, + tool_call_id: None, + name: None, + }]; + + match self.send_message(test_messages, None).await { + Ok(response) => { + let matched = Self::image_test_response_matches_expected(&response.text); + + if matched { + Ok(ConnectionTestResult { + success: true, + response_time_ms: start_time.elapsed().as_millis() as u64, + model_response: Some(response.text), + error_details: None, + }) + } else { + let detail = format!( + "Image understanding verification failed: expected code '{}', got response '{}'", + Self::TEST_IMAGE_EXPECTED_CODE, response.text + ); + debug!("test image input connection failed: {}", detail); + Ok(ConnectionTestResult { + success: false, + response_time_ms: start_time.elapsed().as_millis() as u64, + model_response: Some(response.text), + error_details: Some(detail), + }) + } + } + Err(e) => { + let error_msg = format!("{}", e); + debug!("test image input connection failed: {}", error_msg); + Ok(ConnectionTestResult { + success: false, + response_time_ms: start_time.elapsed().as_millis() as u64, + model_response: None, + error_details: Some(error_msg), + }) + } + } + } } diff --git a/src/web-ui/src/flow_chat/components/ModelSelector.tsx b/src/web-ui/src/flow_chat/components/ModelSelector.tsx index 17030fd5..bae057c4 100644 --- a/src/web-ui/src/flow_chat/components/ModelSelector.tsx +++ b/src/web-ui/src/flow_chat/components/ModelSelector.tsx @@ -158,8 +158,9 @@ export const ModelSelector: React.FC = ({ return allModels .filter(m => { if (!m.enabled) return false; - // Text-only models for general chat. - return m.category === 'general_chat'; + // Only show chat-capable models (exclude embeddings / image-gen / speech, etc.). + const capabilities = Array.isArray(m.capabilities) ? m.capabilities : []; + return capabilities.includes('text_chat'); }) .map(m => ({ id: m.id || '', diff --git a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts index 71968fb5..dc74fe44 100644 --- a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts +++ b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts @@ -51,20 +51,46 @@ interface ImageAnalysisResult { analysis_time_ms: number; } -// Keep this off for now: transport currently accepts text-only `userInput`. -// When backend supports multimodal turn input, this can be flipped (or moved to config). -const ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED = false; +const ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED = true; async function resolveSessionModelId( flowChatManager: FlowChatManager, - sessionId: string | undefined + sessionId: string | undefined, + agentType?: string ): Promise { const state = flowChatManager.getFlowChatState(); const session = sessionId ? state.sessions.get(sessionId) : undefined; - const configuredModel = session?.config?.modelName; + const configuredModel = session?.config?.modelName || null; + const { configManager } = await import('@/infrastructure/config/services/ConfigManager'); + const defaultModels = await configManager.getConfig>('ai.default_models') || {}; + const agentModels = await configManager.getConfig>('ai.agent_models') || {}; + + const resolveAlias = (modelId: string | null): string | null => { + if (!modelId) return null; + if (modelId === 'primary') { + return defaultModels.primary || null; + } + if (modelId === 'fast') { + return defaultModels.fast || defaultModels.primary || null; + } + if (modelId === 'default') { + return defaultModels.primary || null; + } + return modelId; + }; - if (configuredModel && configuredModel !== 'default') { - return configuredModel; + const effectiveAgentType = (agentType || session?.mode || 'agentic').trim(); + const configuredFromAgentModels = resolveAlias( + effectiveAgentType ? (agentModels[effectiveAgentType] ?? null) : null + ); + if (configuredFromAgentModels) { + return configuredFromAgentModels; + } + + // Backward-compatibility fallback for historical sessions. + const resolvedConfigured = resolveAlias(configuredModel); + if (resolvedConfigured) { + return resolvedConfigured; } const { getDefaultPrimaryModel } = await import('@/infrastructure/config/utils/modelConfigHelpers'); @@ -76,16 +102,22 @@ async function modelSupportsImageUnderstanding(modelId: string | null): Promise< const { configManager } = await import('@/infrastructure/config/services/ConfigManager'); const allModels = await configManager.getConfig('ai.models') || []; - const model = allModels.find(m => m.id === modelId || m.name === modelId); + const model = allModels.find( + m => m.id === modelId || m.name === modelId || m.model_name === modelId + ); + if (!model || model.enabled === false) return false; + const capabilities = Array.isArray(model?.capabilities) ? model.capabilities : []; - return capabilities.includes('image_understanding'); + const category = typeof model?.category === 'string' ? model.category : ''; + return capabilities.includes('image_understanding') || category === 'multimodal'; } async function chooseImageInputStrategy( flowChatManager: FlowChatManager, - sessionId: string | undefined + sessionId: string | undefined, + agentType?: string ): Promise { - const modelId = await resolveSessionModelId(flowChatManager, sessionId); + const modelId = await resolveSessionModelId(flowChatManager, sessionId, agentType); const supportsImageUnderstanding = await modelSupportsImageUnderstanding(modelId); if (supportsImageUnderstanding && ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED) { @@ -102,7 +134,7 @@ async function chooseImageInputStrategy( modelId, supportsImageUnderstanding, reason: supportsImageUnderstanding - ? 'direct_attach_disabled_until_multimodal_turn_input_is_available' + ? 'direct_attach_disabled_by_feature_flag' : 'primary_model_is_text_only', }; } @@ -136,7 +168,8 @@ async function analyzeImagesBeforeSend( function formatImageContextLine( ctx: ImageContext, - analysis?: ImageAnalysisResult + analysis?: ImageAnalysisResult, + strategy?: ImageInputStrategy ): string { const imgName = ctx.imageName || 'Untitled image'; const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : ''; @@ -144,6 +177,10 @@ function formatImageContextLine( ? `Path: ${ctx.imagePath}` : `Image ID: ${ctx.id}`; + if (strategy === 'direct-attach') { + return `[Image: ${imgName}${imgSize}]\n${sourceLine}\nAttached as multimodal image input.`; + } + if (!analysis) { return `[Image: ${imgName}${imgSize}]\n${sourceLine}\nTip: You can use the view_image tool (${ctx.isLocal ? 'image_path' : 'image_id'}).`; } @@ -237,7 +274,11 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender reason: 'fallback_default_preanalysis', }; try { - strategyDecision = await chooseImageInputStrategy(flowChatManager, sessionId); + strategyDecision = await chooseImageInputStrategy( + flowChatManager, + sessionId, + currentAgentType || undefined + ); } catch (error) { log.warn('Failed to resolve image input strategy, using pre-analysis fallback', { sessionId, @@ -255,28 +296,21 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender let imageAnalyses: ImageAnalysisResult[] = []; if (imageContexts.length > 0) { - if (strategyDecision.strategy === 'direct-attach') { - // Future extensibility hook: - // once start_dialog_turn supports multimodal payloads, this branch can send image items directly. - log.info('Direct image attach strategy is selected but transport is still text-only; using pre-analysis fallback', { - sessionId, - modelId: strategyDecision.modelId, - }); - } - - try { - imageAnalyses = await analyzeImagesBeforeSend(imageContexts, sessionId!, trimmedMessage); - log.debug('Image pre-analysis completed', { - sessionId, - imageCount: imageContexts.length, - analysisCount: imageAnalyses.length, - }); - } catch (error) { - log.warn('Image pre-analysis failed, continuing with context hints only', { - sessionId, - imageCount: imageContexts.length, - error: (error as Error)?.message ?? 'unknown', - }); + if (strategyDecision.strategy === 'vision-preanalysis') { + try { + imageAnalyses = await analyzeImagesBeforeSend(imageContexts, sessionId!, trimmedMessage); + log.debug('Image pre-analysis completed', { + sessionId, + imageCount: imageContexts.length, + analysisCount: imageAnalyses.length, + }); + } catch (error) { + log.warn('Image pre-analysis failed, continuing with context hints only', { + sessionId, + imageCount: imageContexts.length, + error: (error as Error)?.message ?? 'unknown', + }); + } } } @@ -295,7 +329,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender case 'code-snippet': return `[Code Snippet: ${ctx.filePath}:${ctx.startLine}-${ctx.endLine}]`; case 'image': { - return formatImageContextLine(ctx, analysisByImageId.get(ctx.id)); + return formatImageContextLine(ctx, analysisByImageId.get(ctx.id), strategyDecision.strategy); } case 'terminal-command': return `[Command: ${ctx.command}]`; @@ -319,7 +353,27 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender fullMessage, sessionId || undefined, displayMessage, - currentAgentType || 'agentic' + currentAgentType || 'agentic', + undefined, + strategyDecision.strategy === 'direct-attach' + ? { + imageContexts: imageContexts.map(ctx => ({ + id: ctx.id, + image_path: ctx.isLocal ? ctx.imagePath : undefined, + // Clipboard images are uploaded first and referenced by image_id only + // to avoid sending large base64 payloads in the turn request. + data_url: undefined, + mime_type: ctx.mimeType, + metadata: { + name: ctx.imageName, + width: ctx.width, + height: ctx.height, + file_size: ctx.fileSize, + source: ctx.source, + }, + })), + } + : undefined ); onClearContexts(); diff --git a/src/web-ui/src/flow_chat/services/FlowChatManager.ts b/src/web-ui/src/flow_chat/services/FlowChatManager.ts index 13720de3..f2b356d0 100644 --- a/src/web-ui/src/flow_chat/services/FlowChatManager.ts +++ b/src/web-ui/src/flow_chat/services/FlowChatManager.ts @@ -169,7 +169,10 @@ export class FlowChatManager { sessionId?: string, displayMessage?: string, agentType?: string, - switchToMode?: string + switchToMode?: string, + options?: { + imageContexts?: import('@/infrastructure/api/service-api/ImageAnalysisAPI').ImageContextData[]; + } ): Promise { const targetSessionId = sessionId || this.context.flowChatStore.getState().activeSessionId; @@ -177,7 +180,15 @@ export class FlowChatManager { throw new Error('No active session'); } - return sendMessageModule(this.context, message, targetSessionId, displayMessage, agentType, switchToMode); + return sendMessageModule( + this.context, + message, + targetSessionId, + displayMessage, + agentType, + switchToMode, + options + ); } async cancelCurrentTask(): Promise { diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts index 9cc46f82..fa9cf965 100644 --- a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts +++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts @@ -14,6 +14,7 @@ import { createLogger } from '@/shared/utils/logger'; import type { FlowChatContext, DialogTurn } from './types'; import { ensureBackendSession, retryCreateBackendSession } from './SessionModule'; import { cleanupSessionBuffers } from './TextChunkModule'; +import type { ImageContextData as ImageInputContextData } from '@/infrastructure/api/service-api/ImageAnalysisAPI'; const log = createLogger('MessageModule'); @@ -31,7 +32,10 @@ export async function sendMessage( sessionId: string, displayMessage?: string, agentType?: string, - switchToMode?: string + switchToMode?: string, + options?: { + imageContexts?: ImageInputContextData[]; + } ): Promise { const session = context.flowChatStore.getState().sessions.get(sessionId); if (!session) { @@ -105,6 +109,7 @@ export async function sendMessage( userInput: message, turnId: dialogTurnId, agentType: currentAgentType, + imageContexts: options?.imageContexts, }); } catch (error: any) { if (error?.message?.includes('Session does not exist') || error?.message?.includes('Not found')) { @@ -120,6 +125,7 @@ export async function sendMessage( userInput: message, turnId: dialogTurnId, agentType: currentAgentType, + imageContexts: options?.imageContexts, }); } else { throw error; diff --git a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts index 807ea431..5540f4a8 100644 --- a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts +++ b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts @@ -2,6 +2,7 @@ import { api } from './ApiClient'; import { createTauriCommandError } from '../errors/TauriCommandError'; +import type { ImageContextData as ImageInputContextData } from './ImageAnalysisAPI'; @@ -44,6 +45,8 @@ export interface StartDialogTurnRequest { userInput: string; turnId?: string; agentType: string; + /** Optional multimodal image contexts (snake_case fields, aligned with backend ImageContextData). */ + imageContexts?: ImageInputContextData[]; } @@ -349,4 +352,4 @@ export class AgentAPI { } -export const agentAPI = new AgentAPI(); \ No newline at end of file +export const agentAPI = new AgentAPI(); diff --git a/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts b/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts index fb6cbc98..e047864a 100644 --- a/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts +++ b/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts @@ -16,6 +16,71 @@ import { import { createLogger } from '@/shared/utils/logger'; const log = createLogger('ApiClient'); +const SENSITIVE_KEY_PATTERNS = [ + 'api_key', + 'apikey', + 'token', + 'secret', + 'password', + 'authorization' +]; + +function isSensitiveKey(key: string): boolean { + const normalized = key.toLowerCase(); + return SENSITIVE_KEY_PATTERNS.some(pattern => normalized.includes(pattern)); +} + +function maskSensitiveValue(value: unknown): string { + if (typeof value !== 'string') { + return '***'; + } + if (value.length <= 8) { + return '***'; + } + return `${value.slice(0, 4)}***${value.slice(-4)}`; +} + +function sanitizeForLog(value: unknown, parentKey?: string): unknown { + if (value === null || value === undefined) { + return value; + } + + if (Array.isArray(value)) { + return value.map(item => sanitizeForLog(item, parentKey)); + } + + if (typeof value !== 'object') { + if (parentKey && isSensitiveKey(parentKey)) { + return maskSensitiveValue(value); + } + return value; + } + + const obj = value as Record; + const sanitized: Record = {}; + + for (const [key, rawVal] of Object.entries(obj)) { + if (isSensitiveKey(key)) { + sanitized[key] = maskSensitiveValue(rawVal); + continue; + } + + // For HTTP header maps, mask sensitive header values by header name. + if ((key === 'headers' || key === 'custom_headers') && rawVal && typeof rawVal === 'object') { + const headerObj = rawVal as Record; + const maskedHeaders: Record = {}; + for (const [hKey, hVal] of Object.entries(headerObj)) { + maskedHeaders[hKey] = isSensitiveKey(hKey) ? maskSensitiveValue(hVal) : hVal; + } + sanitized[key] = maskedHeaders; + continue; + } + + sanitized[key] = sanitizeForLog(rawVal, key); + } + + return sanitized; +} export class ApiClient implements IApiClient { private config: ApiConfig; @@ -159,7 +224,11 @@ export class ApiClient implements IApiClient { if (this.config.enableLogging) { - log.debug('Request completed', { type: request.type, responseTime, config: request.config }); + log.debug('Request completed', { + type: request.type, + responseTime, + config: sanitizeForLog(request.config) + }); } return response.data; @@ -191,7 +260,12 @@ export class ApiClient implements IApiClient { if (this.config.enableLogging) { - log.error('Request failed after retries', { requestId: request.id, retryCount: request.retryCount, error }); + log.error('Request failed after retries', { + requestId: request.id, + retryCount: request.retryCount, + config: sanitizeForLog(request.config), + error + }); } throw this.normalizeError(error as Error); @@ -226,7 +300,7 @@ export class ApiClient implements IApiClient { } else { log.error('Command failed', { command: config.command, - args: config.args, + args: sanitizeForLog(config.args), error: errorMessage, rawError: error }); @@ -400,7 +474,11 @@ export function createLoggingMiddleware(): ApiMiddleware { try { const response = await next(request); const duration = Date.now() - startTime; - middlewareLog.debug('Request completed', { type: request.type, duration, config: request.config }); + middlewareLog.debug('Request completed', { + type: request.type, + duration, + config: sanitizeForLog(request.config) + }); return response; } catch (error) { const duration = Date.now() - startTime; diff --git a/src/web-ui/src/locales/zh-CN/settings.json b/src/web-ui/src/locales/zh-CN/settings.json index 6263dfbc..7ed4a16e 100644 --- a/src/web-ui/src/locales/zh-CN/settings.json +++ b/src/web-ui/src/locales/zh-CN/settings.json @@ -291,7 +291,7 @@ }, "capabilities": { "text_chat": "对话", - "image_understanding": "识图", + "image_understanding": "多模态", "image_generation": "绘图", "search": "搜索", "function_calling": "工具", @@ -322,7 +322,7 @@ }, "capabilityDescs": { "text_chat": "处理所有文本对话、代码生成、工具调用等任务", - "image_understanding": "分析和理解图片内容,支持图文混合对话", + "image_understanding": "当主模型不支持图片输入时,用于分析和理解图片内容", "image_generation": "根据文字描述生成图片(如 DALL-E、Stable Diffusion)", "search": "实时搜索网络信息,提供最新数据支持", "speech_recognition": "将语音转换为文字,支持语音输入功能(如智谱 GLM-ASR)" diff --git a/src/web-ui/src/locales/zh-CN/settings/ai-model.json b/src/web-ui/src/locales/zh-CN/settings/ai-model.json index bb7b8a55..f402c789 100644 --- a/src/web-ui/src/locales/zh-CN/settings/ai-model.json +++ b/src/web-ui/src/locales/zh-CN/settings/ai-model.json @@ -63,28 +63,28 @@ "categories": { "all": "全部", "text": "文本", - "multimodal": "图像", + "multimodal": "多模态", "other": "辅助" }, "category": { "label": "模型分类", "placeholder": "选择模型分类", "general_chat": "文本生成", - "multimodal": "图像理解", + "multimodal": "多模态", "image_generation": "图像生成", "search_enhanced": "信息检索", "speech_recognition": "语音识别" }, "categoryIcons": { "general_chat": "文本", - "multimodal": "视觉", + "multimodal": "多模态", "image_generation": "绘图", "search_enhanced": "检索", "speech_recognition": "语音" }, "categoryHints": { "general_chat": "文本生成:生成文本回复、代码等,适用于大多数对话场景", - "multimodal": "图像理解:理解图片内容并进行图文混合对话", + "multimodal": "多模态:理解图片内容并进行图文混合对话", "image_generation": "图像生成:根据文字描述生成图片", "search_enhanced": "信息检索:搜索网络获取实时信息,只需配置名称、API地址和密钥", "speech_recognition": "语音识别:将语音转换为文字(如智谱 GLM-ASR)" @@ -140,7 +140,7 @@ }, "capabilities": { "text_chat": "对话", - "image_understanding": "识图", + "image_understanding": "多模态", "image_generation": "绘图", "search": "搜索", "function_calling": "工具", diff --git a/src/web-ui/src/locales/zh-CN/settings/default-model.json b/src/web-ui/src/locales/zh-CN/settings/default-model.json index 53b74852..6ae7d1f4 100644 --- a/src/web-ui/src/locales/zh-CN/settings/default-model.json +++ b/src/web-ui/src/locales/zh-CN/settings/default-model.json @@ -25,11 +25,11 @@ } }, "optional": { - "title": "多模态模型配置", + "title": "扩展能力模型配置", "capabilities": { "image_understanding": { - "label": "图像理解", - "description": "分析图片、截图内容,支持图文混合对话" + "label": "图片理解模型", + "description": "当主模型不支持图片输入时,用于分析图片和截图内容" }, "image_generation": { "label": "图像生成",