From 650935e7ab68cc61b9166bf3bbc52925982b0b59 Mon Sep 17 00:00:00 2001
From: wgqqqqq <wgq0617@gmail.com>
Date: Thu, 5 Mar 2026 22:40:00 +0800
Subject: [PATCH 1/2] feat: migrate image tool to view_image and harden image
 flow

---
 Cargo.toml                                    |   1 +
 .../desktop/src/api/image_analysis_api.rs     |  74 +-
 src/apps/desktop/src/api/tool_api.rs          |   6 +-
 src/crates/core/Cargo.toml                    |   1 +
 .../core/src/agentic/agents/agentic_mode.rs   |   2 +-
 .../image_analysis/image_processing.rs        | 328 +++++++++
 .../core/src/agentic/image_analysis/mod.rs    |  17 +-
 .../src/agentic/image_analysis/processor.rs   | 304 +++-----
 .../implementations/analyze_image_tool.rs     | 687 ------------------
 .../src/agentic/tools/implementations/mod.rs  |  62 +-
 .../tools/implementations/view_image_tool.rs  | 396 ++++++++++
 src/crates/core/src/agentic/tools/registry.rs |  12 +-
 .../component-library/components/registry.tsx |   6 +-
 .../src/flow_chat/hooks/useMessageSender.ts   | 228 +++++-
 .../tool-cards/ImageAnalysisCard.tsx          |  40 +-
 src/web-ui/src/flow_chat/tool-cards/index.ts  |   6 +-
 src/web-ui/src/locales/en-US/flow-chat.json   |   9 +-
 src/web-ui/src/locales/zh-CN/flow-chat.json   |   9 +-
 18 files changed, 1131 insertions(+), 1057 deletions(-)
 create mode 100644 src/crates/core/src/agentic/image_analysis/image_processing.rs
 delete mode 100644 src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs
 create mode 100644 src/crates/core/src/agentic/tools/implementations/view_image_tool.rs

diff --git a/Cargo.toml b/Cargo.toml
index accfe9ec..cfbbf464 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,6 +51,7 @@ uuid = { version = "1.0", features = ["v4", "serde"] }
 chrono = { version = "0.4", features = ["serde", "clock"] }
 regex = "1.10"
 base64 = "0.21"
+image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp", "bmp"] }
 md5 = "0.7"
 once_cell = "1.19.0"
 lazy_static = "1.4"
diff --git a/src/apps/desktop/src/api/image_analysis_api.rs b/src/apps/desktop/src/api/image_analysis_api.rs
index 09035c0b..369272ca 100644
--- a/src/apps/desktop/src/api/image_analysis_api.rs
+++ b/src/apps/desktop/src/api/image_analysis_api.rs
@@ -2,7 +2,10 @@
 
 use crate::api::app_state::AppState;
 use bitfun_core::agentic::coordination::ConversationCoordinator;
-use bitfun_core::agentic::image_analysis::*;
+use bitfun_core::agentic::image_analysis::{
+    resolve_vision_model_from_ai_config, AnalyzeImagesRequest, ImageAnalysisResult, ImageAnalyzer,
+    MessageEnhancer, SendEnhancedMessageRequest,
+};
 use log::error;
 use std::sync::Arc;
 use tauri::State;
@@ -21,65 +24,26 @@ pub async fn analyze_images(
             format!("Failed to get AI config: {}", e)
         })?;
 
-    let image_model_id = ai_config
-        .default_models
-        .image_understanding
-        .ok_or_else(|| {
-            error!("Image understanding model not configured");
-            "Image understanding model not configured".to_string()
-        })?;
-
-    let image_model_id = if image_model_id.is_empty() {
-        let vision_model = ai_config
-            .models
-            .iter()
-            .find(|m| {
-                m.enabled
-                    && m.capabilities.iter().any(|cap| {
-                        matches!(
-                        cap,
-                        bitfun_core::service::config::types::ModelCapability::ImageUnderstanding
-                    )
-                    })
-            })
-            .map(|m| m.id.as_str());
-
-        match vision_model {
-            Some(model_id) => model_id,
-            None => {
-                error!("No image understanding model found");
-                return Err(
-                    "Image understanding model not configured and no compatible model found.\n\n\
-                    Please add a model that supports image understanding\
-                    in [Settings → AI Model Config], enable 'image_understanding' capability, \
-                    and assign it in [Settings → Super Agent]."
-                        .to_string(),
-                );
-            }
-        }
-    } else {
-        &image_model_id
-    };
-
-    let image_model = ai_config
-        .models
-        .iter()
-        .find(|m| &m.id == image_model_id)
-        .ok_or_else(|| {
-            error!(
-                "Model not found: model_id={}, available_models={:?}",
-                image_model_id,
-                ai_config.models.iter().map(|m| &m.id).collect::<Vec<_>>()
-            );
-            format!("Model not found: {}", image_model_id)
-        })?
-        .clone();
+    let image_model = resolve_vision_model_from_ai_config(&ai_config).map_err(|e| {
+        error!(
+            "No image understanding model available: available_models={:?}, error={}",
+            ai_config.models.iter().map(|m| &m.id).collect::<Vec<_>>(),
+            e
+        );
+        format!(
+            "Image understanding model not configured and no compatible model found.\n\n\
+             Please add a model that supports image understanding \
+             in [Settings → AI Model Config], enable 'image_understanding' capability, \
+             and assign it in [Settings → Super Agent].\n\nDetails: {}",
+            e
+        )
+    })?;
 
     let workspace_path = state.workspace_path.read().await.clone();
 
     let ai_client = state
         .ai_client_factory
-        .get_client_by_id(image_model_id)
+        .get_client_by_id(&image_model.id)
         .await
         .map_err(|e| format!("Failed to create AI client: {}", e))?;
 
diff --git a/src/apps/desktop/src/api/tool_api.rs b/src/apps/desktop/src/api/tool_api.rs
index 3cca80df..86fa2928 100644
--- a/src/apps/desktop/src/api/tool_api.rs
+++ b/src/apps/desktop/src/api/tool_api.rs
@@ -3,7 +3,9 @@
 use log::error;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
+use std::sync::Arc;
 
+use crate::api::context_upload_api::create_image_context_provider;
 use bitfun_core::agentic::{
     tools::framework::ToolUseContext,
     tools::{get_all_tools, get_readonly_tools},
@@ -171,7 +173,7 @@ pub async fn validate_tool_input(
                 read_file_timestamps: HashMap::new(),
                 options: None,
                 response_state: None,
-                image_context_provider: None,
+                image_context_provider: Some(Arc::new(create_image_context_provider())),
                 subagent_parent_info: None,
                 cancellation_token: None,
             };
@@ -210,7 +212,7 @@ pub async fn execute_tool(request: ToolExecutionRequest) -> Result<ToolExecution
                 read_file_timestamps: HashMap::new(),
                 options: None,
                 response_state: None,
-                image_context_provider: None,
+                image_context_provider: Some(Arc::new(create_image_context_provider())),
                 subagent_parent_info: None,
                 cancellation_token: None,
             };
diff --git a/src/crates/core/Cargo.toml b/src/crates/core/Cargo.toml
index d0078efe..8d91673f 100644
--- a/src/crates/core/Cargo.toml
+++ b/src/crates/core/Cargo.toml
@@ -31,6 +31,7 @@ uuid = { workspace = true }
 chrono = { workspace = true }
 regex = { workspace = true }
 base64 = { workspace = true }
+image = { workspace = true }
 md5 = { workspace = true }
 once_cell = { workspace = true }
 lazy_static = { workspace = true }
diff --git a/src/crates/core/src/agentic/agents/agentic_mode.rs b/src/crates/core/src/agentic/agents/agentic_mode.rs
index 103c415b..dc82dbed 100644
--- a/src/crates/core/src/agentic/agents/agentic_mode.rs
+++ b/src/crates/core/src/agentic/agents/agentic_mode.rs
@@ -23,7 +23,7 @@ impl AgenticMode {
                 "IdeControl".to_string(),
                 "MermaidInteractive".to_string(),
                 "ReadLints".to_string(),
-                "AnalyzeImage".to_string(),
+                "view_image".to_string(),
                 "Skill".to_string(),
                 "AskUserQuestion".to_string(),
                 "Git".to_string(),
diff --git a/src/crates/core/src/agentic/image_analysis/image_processing.rs b/src/crates/core/src/agentic/image_analysis/image_processing.rs
new file mode 100644
index 00000000..88d2184d
--- /dev/null
+++ b/src/crates/core/src/agentic/image_analysis/image_processing.rs
@@ -0,0 +1,328 @@
+//! Shared image processing utilities used by both API-side image analysis and tool-driven image analysis.
+
+use super::types::ImageLimits;
+use crate::service::config::get_global_config_service;
+use crate::service::config::types::{AIConfig as ServiceAIConfig, AIModelConfig, ModelCapability};
+use crate::util::errors::{BitFunError, BitFunResult};
+use crate::util::types::Message;
+use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _};
+use image::codecs::jpeg::JpegEncoder;
+use image::codecs::png::PngEncoder;
+use image::imageops::FilterType;
+use image::ColorType;
+use image::DynamicImage;
+use image::ImageEncoder;
+use image::ImageFormat;
+use serde_json::json;
+use std::path::{Path, PathBuf};
+use tokio::fs;
+
+#[derive(Debug, Clone)]
+pub struct ProcessedImage { // result type of `optimize_image_for_provider`
+    pub data: Vec<u8>, // encoded image bytes
+    pub mime_type: String, // MIME type describing `data`, e.g. "image/png"
+    pub width: u32, // width in pixels
+    pub height: u32, // height in pixels
+}
+
+/// Picks the model used for image understanding from `ai_config`.
+///
+/// A non-empty `default_models.image_understanding` id must name an entry in `models`,
+/// otherwise a "Model not found" error is returned. When no id is configured, the first
+/// enabled model advertising `ModelCapability::ImageUnderstanding` is chosen.
+pub fn resolve_vision_model_from_ai_config(
+    ai_config: &ServiceAIConfig,
+) -> BitFunResult<AIModelConfig> {
+    let configured = &ai_config.default_models.image_understanding;
+
+    match configured.as_ref().filter(|id| !id.is_empty()) {
+        // An explicitly configured id is authoritative: no capability fallback if it is missing.
+        Some(wanted) => ai_config
+            .models
+            .iter()
+            .find(|candidate| candidate.id == *wanted)
+            .cloned()
+            .ok_or_else(|| BitFunError::service(format!("Model not found: {}", wanted))),
+        None => ai_config
+            .models
+            .iter()
+            .find(|candidate| {
+                let mut caps = candidate.capabilities.iter();
+                candidate.enabled
+                    && caps.any(|cap| matches!(cap, ModelCapability::ImageUnderstanding))
+            })
+            .cloned()
+            .ok_or_else(|| {
+                BitFunError::service(
+                    "No image understanding model found.\nPlease configure an image understanding model in settings"
+                        .to_string(),
+                )
+            }),
+    }
+}
+
+/// Reads the "ai" section of the global configuration and resolves the vision model from it.
+pub async fn resolve_vision_model_from_global_config() -> BitFunResult<AIModelConfig> {
+    let service = get_global_config_service().await?;
+    let loaded: Result<ServiceAIConfig, _> = service.get_config(Some("ai")).await;
+    // A failed lookup is reported as a service error carrying the underlying cause.
+    let settings =
+        loaded.map_err(|e| BitFunError::service(format!("Failed to get AI config: {}", e)))?;
+    resolve_vision_model_from_ai_config(&settings)
+}
+
+/// Resolves `path` to the location to read: a relative path is joined onto `workspace_path`
+/// when one is given, while absolute paths (or a missing workspace) are returned unchanged.
+/// NOTE(review): nothing here checks that the result stays inside the workspace.
+pub fn resolve_image_path(path: &str, workspace_path: Option<&Path>) -> BitFunResult<PathBuf> {
+    let requested = Path::new(path);
+    let resolved = match workspace_path {
+        Some(root) if !requested.is_absolute() => root.join(requested),
+        _ => requested.to_path_buf(),
+    };
+    Ok(resolved)
+}
+
+/// Reads the image at `path`. NOTE(review): `_workspace_path` is unused — reads are not confined.
+pub async fn load_image_from_path(
+    path: &Path,
+    _workspace_path: Option<&Path>,
+) -> BitFunResult<Vec<u8>> {
+    let read_outcome = fs::read(path).await;
+    read_outcome.map_err(|e| BitFunError::io(format!("Failed to read image: {}", e)))
+}
+
+/// Splits a `data:<mime>[;params],<payload>` URL into decoded bytes and its declared MIME type.
+///
+/// The MIME type is `None` when the header names none. The payload is always decoded as
+/// standard base64; the header's `;base64` marker is not inspected.
+pub fn decode_data_url(data_url: &str) -> BitFunResult<(Vec<u8>, Option<String>)> {
+    let after_scheme = data_url
+        .strip_prefix("data:")
+        .ok_or_else(|| BitFunError::validation("Invalid data URL format"))?;
+    // The first comma separates the header from the payload.
+    let (header, payload) = after_scheme
+        .split_once(',')
+        .ok_or_else(|| BitFunError::validation("Data URL format error"))?;
+
+    let declared_mime = header
+        .split(';')
+        .next()
+        .map(str::trim)
+        .filter(|s| !s.is_empty())
+        .map(ToString::to_string);
+
+    let bytes = BASE64
+        .decode(payload)
+        .map_err(|e| BitFunError::parse(format!("Base64 decode failed: {}", e)))?;
+    Ok((bytes, declared_mime))
+}
+
+/// Determines the MIME type of `image_data` by sniffing its leading bytes.
+///
+/// When the bytes are not recognised as PNG, JPEG, GIF, WebP or BMP, `fallback_mime` is
+/// accepted as long as it starts with `image/`.
+///
+/// # Errors
+/// Returns a validation error when neither source yields an image MIME type.
+pub fn detect_mime_type_from_bytes(
+    image_data: &[u8],
+    fallback_mime: Option<&str>,
+) -> BitFunResult<String> {
+    let sniffed = image::guess_format(image_data)
+        .ok()
+        .and_then(image_format_to_mime);
+    // Sniffed content always wins over the caller-declared type.
+    sniffed
+        .or_else(|| fallback_mime.filter(|declared| declared.starts_with("image/")))
+        .map(str::to_string)
+        .ok_or_else(|| BitFunError::validation("Unsupported or unrecognized image format"))
+}
+
+/// Fits an image within the byte-size and dimension limits configured for `provider`.
+///
+/// Images already inside the limits are returned byte-for-byte. Otherwise the image is scaled
+/// to fit the maximum dimensions (if needed) and re-encoded — JPEG sources stay JPEG, anything
+/// else becomes PNG — then retried as JPEG at falling quality and finally shrunk in up to three
+/// rounds. NOTE(review): if every attempt is still too large, the last encoding is returned.
+pub fn optimize_image_for_provider(
+    image_data: Vec<u8>,
+    provider: &str,
+    fallback_mime: Option<&str>,
+) -> BitFunResult<ProcessedImage> {
+    let limits = ImageLimits::for_provider(provider);
+    let max_bytes = limits.max_size;
+    let source_format = image::guess_format(&image_data).ok();
+    let decoded = image::load_from_memory(&image_data)
+        .map_err(|e| BitFunError::validation(format!("Failed to decode image data: {}", e)))?;
+
+    let (source_w, source_h) = (decoded.width(), decoded.height());
+    let too_large_dims = source_w > limits.max_width || source_h > limits.max_height;
+
+    // Fast path: already within limits, so hand the caller's bytes back untouched.
+    if !too_large_dims && image_data.len() <= max_bytes {
+        return Ok(ProcessedImage {
+            mime_type: detect_mime_type_from_bytes(&image_data, fallback_mime)?,
+            data: image_data,
+            width: source_w,
+            height: source_h,
+        });
+    }
+
+    let mut working = if too_large_dims {
+        decoded.resize(limits.max_width, limits.max_height, FilterType::Triangle)
+    } else {
+        decoded
+    };
+
+    // Stage 1: keep JPEG sources as JPEG, encode everything else as PNG.
+    let first_format = source_format
+        .filter(|format| *format == ImageFormat::Jpeg)
+        .unwrap_or(ImageFormat::Png);
+    let (mut bytes, mut mime) = encode_dynamic_image(&working, first_format, 85)?;
+
+    // Stage 2: same dimensions, JPEG at decreasing quality until the result fits.
+    for quality in [80u8, 65, 50, 35] {
+        if bytes.len() <= max_bytes {
+            break;
+        }
+        (bytes, mime) = encode_dynamic_image(&working, ImageFormat::Jpeg, quality)?;
+    }
+
+    // Stage 3: up to three rounds targeting 85% of each side (never below 64).
+    for _ in 0..3 {
+        if bytes.len() <= max_bytes {
+            break;
+        }
+        let next_w = ((working.width() as f32) * 0.85).round().max(64.0) as u32;
+        let next_h = ((working.height() as f32) * 0.85).round().max(64.0) as u32;
+        if next_w == working.width() && next_h == working.height() {
+            break;
+        }
+        working = working.resize(next_w, next_h, FilterType::Triangle);
+        for quality in [70u8, 55, 40] {
+            (bytes, mime) = encode_dynamic_image(&working, ImageFormat::Jpeg, quality)?;
+            if bytes.len() <= max_bytes {
+                break;
+            }
+        }
+    }
+
+    Ok(ProcessedImage {
+        data: bytes,
+        mime_type: mime,
+        width: working.width(),
+        height: working.height(),
+    })
+}
+
+/// Builds the single user message that carries `image_data` and `prompt` to a vision model.
+///
+/// The image is base64-encoded and placed before the text part. Providers whose name contains
+/// "anthropic" (case-insensitive) receive an `image`/`source` part; every other provider gets
+/// the OpenAI-compatible `image_url` data-URL part. The two-part array is serialized to a JSON
+/// string and stored in the message's `content`.
+///
+/// # Errors
+/// Propagates JSON serialization failures.
+pub fn build_multimodal_message(
+    prompt: &str,
+    image_data: &[u8],
+    mime_type: &str,
+    provider: &str,
+) -> BitFunResult<Vec<Message>> {
+    let encoded_image = BASE64.encode(image_data);
+    let targets_anthropic = provider.to_lowercase().contains("anthropic");
+
+    // Only the image part differs between the two payload shapes.
+    let image_part = if targets_anthropic {
+        json!({
+            "type": "image",
+            "source": {
+                "type": "base64",
+                "media_type": mime_type,
+                "data": encoded_image
+            }
+        })
+    } else {
+        // Default to OpenAI-compatible payload shape for OpenAI and most OpenAI-compatible providers.
+        json!({
+            "type": "image_url",
+            "image_url": {
+                "url": format!("data:{};base64,{}", mime_type, encoded_image)
+            }
+        })
+    };
+
+    let text_part = json!({
+        "type": "text",
+        "text": prompt
+    });
+
+    // Image first, text second — the same order for both providers.
+    let content = serde_json::to_string(&json!([image_part, text_part]))?;
+
+    let user_message = Message {
+        role: "user".to_string(),
+        content: Some(content),
+        reasoning_content: None,
+        thinking_signature: None,
+        tool_calls: None,
+        tool_call_id: None,
+        name: None,
+    };
+
+    Ok(vec![user_message])
+}
+
+fn image_format_to_mime(format: ImageFormat) -> Option<&'static str> {
+    match format { // MIME label for each format this module knows how to describe
+        ImageFormat::Png => Some("image/png"),
+        ImageFormat::Jpeg => Some("image/jpeg"),
+        ImageFormat::Gif => Some("image/gif"),
+        ImageFormat::WebP => Some("image/webp"),
+        ImageFormat::Bmp => Some("image/bmp"),
+        _ => None, // any other `ImageFormat` has no label here
+    }
+}
+
+/// Encodes `image` as JPEG when `format` is `ImageFormat::Jpeg`, and as 8-bit RGBA PNG for
+/// every other requested format, returning the bytes with the matching MIME type.
+///
+/// `jpeg_quality` only affects JPEG output.
+/// NOTE(review): JPEG cannot carry alpha; confirm transparent sources are acceptable here.
+///
+/// # Errors
+/// Encoder failures are reported as `BitFunError::tool`.
+fn encode_dynamic_image(
+    image: &DynamicImage,
+    format: ImageFormat,
+    jpeg_quality: u8,
+) -> BitFunResult<(Vec<u8>, String)> {
+    let mut bytes = Vec::new();
+
+    let written_format = if matches!(format, ImageFormat::Jpeg) {
+        JpegEncoder::new_with_quality(&mut bytes, jpeg_quality)
+            .encode_image(image)
+            .map_err(|e| BitFunError::tool(format!("JPEG encode failed: {}", e)))?;
+        ImageFormat::Jpeg
+    } else {
+        // Anything that is not JPEG is normalized to PNG.
+        let rgba = image.to_rgba8();
+        PngEncoder::new(&mut bytes)
+            .write_image(
+                rgba.as_raw(),
+                image.width(),
+                image.height(),
+                ColorType::Rgba8.into(),
+            )
+            .map_err(|e| BitFunError::tool(format!("PNG encode failed: {}", e)))?;
+        ImageFormat::Png
+    };
+
+    let mime = image_format_to_mime(written_format)
+        .unwrap_or("image/png")
+        .to_string();
+
+    Ok((bytes, mime))
+}
diff --git a/src/crates/core/src/agentic/image_analysis/mod.rs b/src/crates/core/src/agentic/image_analysis/mod.rs
index 2b02ebf4..814afb66 100644
--- a/src/crates/core/src/agentic/image_analysis/mod.rs
+++ b/src/crates/core/src/agentic/image_analysis/mod.rs
@@ -1,12 +1,17 @@
 //! Image Analysis Module
-//! 
+//!
 //! Implements image pre-understanding functionality, converting image content to text descriptions
 
-pub mod types;
-pub mod processor;
 pub mod enhancer;
+pub mod image_processing;
+pub mod processor;
+pub mod types;
 
-pub use types::*;
-pub use processor::ImageAnalyzer;
 pub use enhancer::MessageEnhancer;
-
+pub use image_processing::{
+    build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path,
+    optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_ai_config,
+    resolve_vision_model_from_global_config, ProcessedImage,
+};
+pub use processor::ImageAnalyzer;
+pub use types::*;
diff --git a/src/crates/core/src/agentic/image_analysis/processor.rs b/src/crates/core/src/agentic/image_analysis/processor.rs
index 145b0ae1..2363738d 100644
--- a/src/crates/core/src/agentic/image_analysis/processor.rs
+++ b/src/crates/core/src/agentic/image_analysis/processor.rs
@@ -1,18 +1,18 @@
 //! Image Processor
 //!
-//! Handles image loading, compression, format conversion, and other operations
+//! Handles image loading, preprocessing, multimodal message construction, and response parsing.
 
-use super::types::{AnalyzeImagesRequest, ImageAnalysisResult, ImageContextData, ImageLimits};
+use super::image_processing::{
+    build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path,
+    optimize_image_for_provider, resolve_image_path,
+};
+use super::types::{AnalyzeImagesRequest, ImageAnalysisResult, ImageContextData};
 use crate::infrastructure::ai::AIClient;
 use crate::service::config::types::AIModelConfig;
 use crate::util::errors::*;
-use crate::util::types::Message;
-use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _};
-use log::{debug, error, info};
-use serde_json::json;
-use std::path::{Path, PathBuf};
+use log::{debug, error, info, warn};
+use std::path::PathBuf;
 use std::sync::Arc;
-use tokio::fs;
 
 /// Image Analyzer
 pub struct ImageAnalyzer {
@@ -36,7 +36,6 @@ impl ImageAnalyzer {
     ) -> BitFunResult<Vec<ImageAnalysisResult>> {
         info!("Starting analysis of {} images", request.images.len());
 
-        // Process multiple images in parallel
         let mut tasks = vec![];
 
         for img_ctx in request.images {
@@ -59,7 +58,6 @@ impl ImageAnalyzer {
             tasks.push(task);
         }
 
-        // Wait for all analyses to complete
         let mut results = vec![];
         for task in tasks {
             match task.await {
@@ -70,7 +68,10 @@ impl ImageAnalyzer {
                 }
                 Err(e) => {
                     error!("Image analysis task failed: {:?}", e);
-                    return Err(BitFunError::service(format!("Image analysis task failed: {}", e)));
+                    return Err(BitFunError::service(format!(
+                        "Image analysis task failed: {}",
+                        e
+                    )));
                 }
             }
         }
@@ -79,7 +80,6 @@ impl ImageAnalyzer {
         Ok(results)
     }
 
-    /// Analyze a single image
     async fn analyze_single_image(
         image_ctx: ImageContextData,
         model: &AIModelConfig,
@@ -91,42 +91,35 @@ impl ImageAnalyzer {
 
         debug!("Analyzing image: {}", image_ctx.id);
 
-        // 1. Load image
-        let image_data =
+        let (image_data, fallback_mime) =
             Self::load_image_from_context(&image_ctx, workspace_path.as_deref()).await?;
 
-        // 2. Image preprocessing (compression, format conversion)
-        let (optimized_data, mime_type) =
-            Self::optimize_image_for_model(image_data, &image_ctx.mime_type, model)?;
-
-        // 3. Convert to Base64
-        let base64_data = BASE64.encode(&optimized_data);
+        let processed =
+            optimize_image_for_provider(image_data, &model.provider, fallback_mime.as_deref())?;
 
         debug!(
-            "Image processing completed: original_type={}, optimized_type={}, size={}KB",
-            image_ctx.mime_type,
-            mime_type,
-            optimized_data.len() / 1024
+            "Image processing completed: mime={}, size={}KB, dimensions={}x{}",
+            processed.mime_type,
+            processed.data.len() / 1024,
+            processed.width,
+            processed.height
         );
 
-        // 4. Build analysis prompt
         let analysis_prompt = Self::build_image_analysis_prompt(user_context);
 
-        // 5. Build multimodal message
-        let messages = Self::build_multimodal_message(
+        let messages = build_multimodal_message(
             &analysis_prompt,
-            &base64_data,
-            &mime_type,
+            &processed.data,
+            &processed.mime_type,
             &model.provider,
         )?;
 
-        // Save complete multimodal message to AI log
         debug!(target: "ai::image_analysis_request",
             "Complete multimodal message:\n{}",
-            serde_json::to_string_pretty(&messages).unwrap_or_else(|_| "Serialization failed".to_string())
+            serde_json::to_string_pretty(&messages)
+                .unwrap_or_else(|_| "Serialization failed".to_string())
         );
 
-        // 6. Call AI model for image analysis
         debug!(
             "Calling vision model: image_id={}, model={}",
             image_ctx.id, model.model_name
@@ -138,100 +131,38 @@ impl ImageAnalyzer {
 
         debug!("AI response content: {}", ai_response.text);
 
-        // 7. Parse response into structured result
-        let mut analysis_result = Self::parse_analysis_response(&ai_response.text, &image_ctx.id)?;
-
-        let elapsed = start.elapsed().as_millis() as u64;
-        analysis_result.analysis_time_ms = elapsed;
+        let mut analysis_result = Self::parse_analysis_response(&ai_response.text, &image_ctx.id);
+        analysis_result.analysis_time_ms = start.elapsed().as_millis() as u64;
 
         info!(
             "Image analysis completed: image_id={}, duration={}ms",
-            image_ctx.id, elapsed
+            image_ctx.id, analysis_result.analysis_time_ms
         );
 
         Ok(analysis_result)
     }
 
-    /// Load image from context
     async fn load_image_from_context(
         ctx: &ImageContextData,
-        workspace_path: Option<&Path>,
-    ) -> BitFunResult<Vec<u8>> {
+        workspace_path: Option<&std::path::Path>,
+    ) -> BitFunResult<(Vec<u8>, Option<String>)> {
         if let Some(data_url) = &ctx.data_url {
-            // Parse from data URL
-            Self::decode_data_url(data_url)
-        } else if let Some(path_str) = &ctx.image_path {
-            // Load from file path
-            let path = PathBuf::from(path_str);
-
-            // Security check: ensure path is within workspace
-            if let Some(workspace) = workspace_path {
-                let canonical_path = tokio::fs::canonicalize(&path)
-                    .await
-                    .map_err(|e| BitFunError::io(format!("Image file does not exist: {}", e)))?;
-                let canonical_workspace = tokio::fs::canonicalize(workspace)
-                    .await
-                    .map_err(|e| BitFunError::io(format!("Invalid workspace path: {}", e)))?;
-
-                if !canonical_path.starts_with(&canonical_workspace) {
-                    return Err(BitFunError::validation("Image path must be within workspace"));
-                }
-            }
-
-            fs::read(&path)
-                .await
-                .map_err(|e| BitFunError::io(format!("Failed to read image: {}", e)))
-        } else {
-            Err(BitFunError::validation("Image context missing path or data"))
-        }
-    }
-
-    /// Decode data URL
-    fn decode_data_url(data_url: &str) -> BitFunResult<Vec<u8>> {
-        // data:image/png;base64,iVBORw0KG...
-        if !data_url.starts_with("data:") {
-            return Err(BitFunError::validation("Invalid data URL format"));
+            let (data, mime) = decode_data_url(data_url)?;
+            return Ok((data, mime.or_else(|| Some(ctx.mime_type.clone()))));
         }
 
-        let parts: Vec<&str> = data_url.splitn(2, ',').collect();
-        if parts.len() != 2 {
-            return Err(BitFunError::validation("Data URL format error"));
+        if let Some(path_str) = &ctx.image_path {
+            let path = resolve_image_path(path_str, workspace_path)?;
+            let data = load_image_from_path(&path, workspace_path).await?;
+            let detected_mime = detect_mime_type_from_bytes(&data, Some(&ctx.mime_type)).ok();
+            return Ok((data, detected_mime.or_else(|| Some(ctx.mime_type.clone()))));
         }
 
-        let base64_data = parts[1];
-        BASE64
-            .decode(base64_data)
-            .map_err(|e| BitFunError::parse(format!("Base64 decoding failed: {}", e)))
-    }
-
-    /// Optimize image (compression, format conversion)
-    fn optimize_image_for_model(
-        image_data: Vec<u8>,
-        original_mime: &str,
-        model: &AIModelConfig,
-    ) -> BitFunResult<(Vec<u8>, String)> {
-        // Get model limits
-        let limits = ImageLimits::for_provider(&model.provider);
-
-        // If image size is within limit, return directly
-        if image_data.len() <= limits.max_size {
-            debug!("Image size within limit, no compression needed");
-            return Ok((image_data, original_mime.to_string()));
-        }
-
-        info!(
-            "Image size {}KB exceeds limit {}KB, compression needed",
-            image_data.len() / 1024,
-            limits.max_size / 1024
-        );
-
-        // TODO: Use image crate for actual compression
-
-        // Temporarily return original image, compression logic to be implemented later
-        Ok((image_data, original_mime.to_string()))
+        Err(BitFunError::validation(
+            "Image context missing path or data",
+        ))
     }
 
-    /// Build image analysis prompt
     fn build_image_analysis_prompt(user_context: Option<&str>) -> String {
         let mut prompt = String::from(
             "Please analyze the content of this image in detail. Output in the following JSON format:\n\n\
@@ -261,119 +192,63 @@ impl ImageAnalyzer {
         prompt
     }
 
-    /// Build multimodal message
-    fn build_multimodal_message(
-        prompt: &str,
-        base64_data: &str,
-        mime_type: &str,
-        provider: &str,
-    ) -> BitFunResult<Vec<Message>> {
-        let message = match provider.to_lowercase().as_str() {
-            "openai" => {
-                // OpenAI format (Zhipu AI compatible)
-                // Note:
-                // 1. Zhipu AI only supports url field, does not support detail parameter
-                // 2. Image must come first, text after (consistent with official examples)
-                Message {
-                    role: "user".to_string(),
-                    content: Some(serde_json::to_string(&json!([
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": format!("data:{};base64,{}", mime_type, base64_data)
-                            }
-                        },
-                        {
-                            "type": "text",
-                            "text": prompt
-                        }
-                    ]))?),
-                    reasoning_content: None,
-                    thinking_signature: None,
-                    tool_calls: None,
-                    tool_call_id: None,
-                    name: None,
-                }
-            }
-            "anthropic" => {
-                // Anthropic format (content is an array)
-                Message {
-                    role: "user".to_string(),
-                    content: Some(serde_json::to_string(&json!([
-                        {
-                            "type": "image",
-                            "source": {
-                                "type": "base64",
-                                "media_type": mime_type,
-                                "data": base64_data
-                            }
-                        },
-                        {
-                            "type": "text",
-                            "text": prompt
-                        }
-                    ]))?),
-                    reasoning_content: None,
-                    thinking_signature: None,
-                    tool_calls: None,
-                    tool_call_id: None,
-                    name: None,
-                }
-            }
-            _ => {
-                return Err(BitFunError::validation(format!(
-                    "Unsupported provider: {}",
-                    provider
-                )));
-            }
-        };
+    fn parse_analysis_response(response: &str, image_id: &str) -> ImageAnalysisResult {
+        let json_str = Self::extract_json_from_markdown(response).unwrap_or(response);
 
-        Ok(vec![message])
-    }
+        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(json_str) {
+            return ImageAnalysisResult {
+                image_id: image_id.to_string(),
+                summary: parsed["summary"]
+                    .as_str()
+                    .unwrap_or("Image analysis completed")
+                    .to_string(),
+                detailed_description: parsed["detailed_description"]
+                    .as_str()
+                    .unwrap_or(response)
+                    .to_string(),
+                detected_elements: parsed["detected_elements"]
+                    .as_array()
+                    .map(|arr| {
+                        arr.iter()
+                            .filter_map(|v| v.as_str())
+                            .map(String::from)
+                            .collect()
+                    })
+                    .unwrap_or_default(),
+                confidence: parsed["confidence"].as_f64().unwrap_or(0.8) as f32,
+                analysis_time_ms: 0,
+            };
+        }
 
-    /// Parse AI response into structured result
-    fn parse_analysis_response(
-        response: &str,
-        image_id: &str,
-    ) -> BitFunResult<ImageAnalysisResult> {
-        // Extract JSON
-        let json_str = Self::extract_json_from_markdown(response).unwrap_or(response);
+        warn!(
+            "Image analysis response is not valid JSON, falling back to plain text: image_id={}",
+            image_id
+        );
 
-        // Parse JSON
-        let parsed: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
-            BitFunError::parse(format!(
-                "Failed to parse image analysis result: {}. Original response: {}",
-                e, response
-            ))
-        })?;
+        let cleaned = response.trim();
+        let summary = if cleaned.is_empty() {
+            "Image analysis completed".to_string()
+        } else {
+            cleaned
+                .lines()
+                .next()
+                .unwrap_or("Image analysis completed")
+                .chars()
+                .take(140)
+                .collect()
+        };
 
-        Ok(ImageAnalysisResult {
+        ImageAnalysisResult {
             image_id: image_id.to_string(),
-            summary: parsed["summary"]
-                .as_str()
-                .unwrap_or("Image analysis completed")
-                .to_string(),
-            detailed_description: parsed["detailed_description"]
-                .as_str()
-                .unwrap_or("")
-                .to_string(),
-            detected_elements: parsed["detected_elements"]
-                .as_array()
-                .map(|arr| {
-                    arr.iter()
-                        .filter_map(|v| v.as_str())
-                        .map(String::from)
-                        .collect()
-                })
-                .unwrap_or_default(),
-            confidence: parsed["confidence"].as_f64().unwrap_or(0.8) as f32,
-            analysis_time_ms: 0, // Will be filled externally
-        })
+            summary,
+            detailed_description: cleaned.to_string(),
+            detected_elements: Vec::new(),
+            confidence: 0.5,
+            analysis_time_ms: 0,
+        }
     }
 
-    /// Extract JSON from Markdown code block
     fn extract_json_from_markdown(text: &str) -> Option<&str> {
-        // 1. Try to extract Zhipu AI's special marker format <|begin_of_box|>...<|end_of_box|>
         if let Some(start_idx) = text.find("<|begin_of_box|>") {
             let content_start = start_idx + "<|begin_of_box|>".len();
             if let Some(end_idx) = text[content_start..].find("<|end_of_box|>") {
@@ -383,7 +258,6 @@ impl ImageAnalyzer {
             }
         }
 
-        // 2. Try to extract Markdown code block format ```json ... ``` or ``` ... ```
         let start_markers = ["```json\n", "```\n"];
 
         for marker in &start_markers {
diff --git a/src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs b/src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs
deleted file mode 100644
index 4e4475fe..00000000
--- a/src/crates/core/src/agentic/tools/implementations/analyze_image_tool.rs
+++ /dev/null
@@ -1,687 +0,0 @@
-//! Image analysis tool - allows Agent to analyze image content on demand
-//!
-//! Provides flexible image analysis capabilities, Agent can customize analysis prompts and focus areas
-
-use async_trait::async_trait;
-use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _};
-use log::{debug, info, trace};
-use serde::Deserialize;
-use serde_json::{json, Value};
-use std::path::{Path, PathBuf};
-use std::sync::Arc;
-use tokio::fs;
-
-use crate::agentic::tools::framework::{
-    Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult,
-};
-use crate::infrastructure::ai::AIClient;
-use crate::infrastructure::{get_path_manager_arc, get_workspace_path};
-use crate::service::config::types::{AIConfig as ServiceAIConfig, AIModelConfig, GlobalConfig};
-use crate::util::errors::{BitFunError, BitFunResult};
-use crate::util::types::{AIConfig as ModelConfig, Message};
-
-/// Image analysis tool input
-#[derive(Debug, Deserialize)]
-struct AnalyzeImageInput {
-    /// Image path (relative to workspace or absolute path)
-    #[serde(default)]
-    image_path: Option<String>,
-    /// Base64-encoded image data (clipboard image)
-    #[serde(default)]
-    data_url: Option<String>,
-    /// Image ID (retrieved from temporary storage, for clipboard images)
-    #[serde(default)]
-    image_id: Option<String>,
-    /// Analysis prompt
-    analysis_prompt: String,
-    /// Focus areas (optional)
-    #[serde(default)]
-    focus_areas: Option<Vec<String>>,
-    /// Detail level (optional)
-    #[serde(default)]
-    detail_level: Option<String>,
-}
-
-/// Image analysis tool
-pub struct AnalyzeImageTool;
-
-impl AnalyzeImageTool {
-    pub fn new() -> Self {
-        Self
-    }
-
-    /// Resolve image path (supports relative and absolute paths)
-    fn resolve_image_path(&self, path: &str) -> BitFunResult<PathBuf> {
-        let path_buf = PathBuf::from(path);
-
-        if path_buf.is_absolute() {
-            Ok(path_buf)
-        } else {
-            let workspace_path = get_workspace_path()
-                .ok_or_else(|| BitFunError::tool("Workspace path not set".to_string()))?;
-            Ok(workspace_path.join(path))
-        }
-    }
-
-    /// Load image file
-    async fn load_image(&self, path: &Path) -> BitFunResult<Vec<u8>> {
-        // Security check: ensure path is within workspace
-        if let Some(workspace_path) = get_workspace_path() {
-            let canonical_path = tokio::fs::canonicalize(path)
-                .await
-                .map_err(|e| BitFunError::io(format!("Image file does not exist: {}", e)))?;
-            let canonical_workspace = tokio::fs::canonicalize(&workspace_path)
-                .await
-                .map_err(|e| BitFunError::io(format!("Invalid workspace path: {}", e)))?;
-
-            if !canonical_path.starts_with(&canonical_workspace) {
-                return Err(BitFunError::validation(
-                    "Image path must be within workspace",
-                ));
-            }
-        }
-
-        fs::read(path)
-            .await
-            .map_err(|e| BitFunError::io(format!("Failed to read image: {}", e)))
-    }
-
-    /// Detect image MIME type
-    fn detect_mime_type(&self, path: &Path) -> BitFunResult<String> {
-        let extension = path
-            .extension()
-            .and_then(|e| e.to_str())
-            .ok_or_else(|| BitFunError::validation("Unable to determine image format"))?
-            .to_lowercase();
-
-        let mime_type = match extension.as_str() {
-            "png" => "image/png",
-            "jpg" | "jpeg" => "image/jpeg",
-            "gif" => "image/gif",
-            "webp" => "image/webp",
-            "bmp" => "image/bmp",
-            _ => {
-                return Err(BitFunError::validation(format!(
-                    "Unsupported image format: {}",
-                    extension
-                )))
-            }
-        };
-
-        Ok(mime_type.to_string())
-    }
-
-    /// Get image dimensions (simple implementation)
-    fn get_image_dimensions(&self, _data: &[u8]) -> (u32, u32) {
-        // TODO: Implement real image dimension detection
-        (0, 0)
-    }
-
-    /// Decode data URL
-    fn decode_data_url(&self, data_url: &str) -> BitFunResult<(Vec<u8>, String)> {
-        // data:image/png;base64,iVBORw0KG...
-        if !data_url.starts_with("data:") {
-            return Err(BitFunError::validation("Invalid data URL format"));
-        }
-
-        let parts: Vec<&str> = data_url.splitn(2, ',').collect();
-        if parts.len() != 2 {
-            return Err(BitFunError::validation("Data URL format error"));
-        }
-
-        // Extract MIME type
-        let header = parts[0];
-        let mime_type = header
-            .strip_prefix("data:")
-            .and_then(|s| s.split(';').next())
-            .unwrap_or("image/png")
-            .to_string();
-
-        // Decode base64
-        let base64_data = parts[1];
-        let image_data = BASE64
-            .decode(base64_data)
-            .map_err(|e| BitFunError::parse(format!("Base64 decode failed: {}", e)))?;
-
-        debug!(
-            "Decoded image from data URL: mime={}, size_kb={}",
-            mime_type,
-            image_data.len() / 1024
-        );
-
-        Ok((image_data, mime_type))
-    }
-
-    /// Load AI configuration from config file
-    async fn load_ai_config(&self) -> BitFunResult<ServiceAIConfig> {
-        let path_manager = get_path_manager_arc();
-        let config_file = path_manager.app_config_file();
-
-        if !config_file.exists() {
-            return Err(BitFunError::tool("Config file does not exist".to_string()));
-        }
-
-        let config_content = tokio::fs::read_to_string(&config_file)
-            .await
-            .map_err(|e| BitFunError::tool(format!("Failed to read config file: {}", e)))?;
-
-        let global_config: GlobalConfig = serde_json::from_str(&config_content)
-            .map_err(|e| BitFunError::tool(format!("Failed to parse config file: {}", e)))?;
-
-        Ok(global_config.ai)
-    }
-
-    /// Get vision model configuration
-    async fn get_vision_model(&self) -> BitFunResult<AIModelConfig> {
-        let ai_config = self.load_ai_config().await?;
-
-        let target_model_id = ai_config
-            .default_models
-            .image_understanding
-            .as_ref()
-            .filter(|id| !id.is_empty());
-
-        let model = if let Some(id) = target_model_id {
-            ai_config
-                .models
-                .iter()
-                .find(|m| m.id == *id)
-                .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id)))?
-                .clone()
-        } else {
-            ai_config
-                .models
-                .iter()
-                .find(|m| {
-                    m.enabled
-                        && m.capabilities.iter().any(|cap| {
-                            matches!(
-                                cap,
-                                crate::service::config::types::ModelCapability::ImageUnderstanding
-                            )
-                        })
-                })
-                .ok_or_else(|| {
-                    BitFunError::service(
-                        "No image understanding model found.\n\
-                     Please configure an image understanding model in settings"
-                            .to_string(),
-                    )
-                })?
-                .clone()
-        };
-
-        Ok(model)
-    }
-
-    /// Build analysis prompt
-    fn build_prompt(
-        &self,
-        analysis_prompt: &str,
-        focus_areas: &Option<Vec<String>>,
-        detail_level: &Option<String>,
-    ) -> String {
-        let mut prompt = String::new();
-
-        // 1. User's analysis prompt
-        prompt.push_str(analysis_prompt);
-        prompt.push_str("\n\n");
-
-        if let Some(areas) = focus_areas {
-            if !areas.is_empty() {
-                prompt.push_str("Please pay special attention to the following aspects:\n");
-                for area in areas {
-                    prompt.push_str(&format!("- {}\n", area));
-                }
-                prompt.push_str("\n");
-            }
-        }
-
-        let detail_guide = match detail_level.as_deref() {
-            Some("brief") => "Please answer concisely in 1-2 sentences.",
-            Some("detailed") => {
-                "Please provide a detailed analysis including all relevant details."
-            }
-            _ => "Please provide a moderate level of analysis detail.",
-        };
-        prompt.push_str(detail_guide);
-
-        prompt
-    }
-
-    /// Build multimodal message
-    fn build_multimodal_message(
-        &self,
-        prompt: &str,
-        base64_data: &str,
-        mime_type: &str,
-        provider: &str,
-    ) -> BitFunResult<Vec<Message>> {
-        let message = match provider.to_lowercase().as_str() {
-            "openai" => Message {
-                role: "user".to_string(),
-                content: Some(serde_json::to_string(&json!([
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": format!("data:{};base64,{}", mime_type, base64_data)
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": prompt
-                    }
-                ]))?),
-                reasoning_content: None,
-                thinking_signature: None,
-                tool_calls: None,
-                tool_call_id: None,
-                name: None,
-            },
-            "anthropic" => Message {
-                role: "user".to_string(),
-                content: Some(serde_json::to_string(&json!([
-                    {
-                        "type": "image",
-                        "source": {
-                            "type": "base64",
-                            "media_type": mime_type,
-                            "data": base64_data
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": prompt
-                    }
-                ]))?),
-                reasoning_content: None,
-                thinking_signature: None,
-                tool_calls: None,
-                tool_call_id: None,
-                name: None,
-            },
-            _ => {
-                return Err(BitFunError::validation(format!(
-                    "Unsupported provider: {}",
-                    provider
-                )));
-            }
-        };
-
-        Ok(vec![message])
-    }
-}
-
-#[async_trait]
-impl Tool for AnalyzeImageTool {
-    fn name(&self) -> &str {
-        "AnalyzeImage"
-    }
-
-    async fn description(&self) -> BitFunResult<String> {
-        Ok(r#"Analyzes image content and returns detailed descriptions. Use this tool when the user uploads images and asks related questions.
-
-Core Capabilities:
-- Identify objects, text, structures and other content in images
-- Understand technical diagrams (architecture diagrams, flowcharts, UML diagrams, etc.)
-- Extract code and error messages from code screenshots
-- Analyze UI designs and interface layouts
-- Recognize data, tables, and charts in images
-
-Usage Scenarios:
-1. User uploads architecture diagram and asks architecture questions → Analyze components and relationships
-2. User uploads error screenshot → Extract error messages and stack traces
-3. User uploads code screenshot → Identify code content
-4. User uploads UI design → Analyze design elements and layout
-5. User uploads data charts → Interpret data and trends
-
-Important Notes:
-- You can customize analysis_prompt to precisely control the analysis angle and focus
-- Use focus_areas parameter to specify aspects to emphasize
-- Choose detail_level as needed (brief/normal/detailed)
-- The same image can be analyzed multiple times for different aspects"#.to_string())
-    }
-
-    fn input_schema(&self) -> Value {
-        json!({
-            "type": "object",
-            "properties": {
-                "image_path": {
-                    "type": "string",
-                    "description": "Path to the image file (relative to workspace or absolute path).\nExamples: 'screenshot.png' or 'docs/architecture.png'\nNote: Provide ONE of: image_path, data_url, or (image_id + session_id)."
-                },
-                "data_url": {
-                    "type": "string",
-                    "description": "Base64-encoded image data.\nFormat: 'data:image/png;base64,iVBORw0KG...'\nNot recommended for large images due to token cost."
-                },
-                "image_id": {
-                    "type": "string",
-                    "description": "Image ID for clipboard images stored in temporary cache.\nExample: 'img-clipboard-1234567890-abc123'"
-                },
-                "analysis_prompt": {
-                    "type": "string",
-                    "description": "Analysis prompt describing what information you want to extract from the image.\n\
-                                   Examples:\n\
-                                   - 'What is this architecture diagram? What components and connections does it contain?'\n\
-                                   - 'Extract all error messages and stack traces from this screenshot'\n\
-                                   - 'Describe the layout structure and interactive elements of this UI'"
-                },
-                "focus_areas": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    },
-                    "description": "Optional. List of aspects to focus on.\nExamples: ['technical architecture', 'data flow'] or ['UI layout', 'color scheme']"
-                },
-                "detail_level": {
-                    "type": "string",
-                    "enum": ["brief", "normal", "detailed"],
-                    "description": "Optional. Level of analysis detail.\n- brief: Brief summary (1-2 sentences)\n- normal: Normal detail (default)\n- detailed: Detailed analysis (includes all relevant details)"
-                }
-            },
-            "required": ["analysis_prompt"]
-        })
-    }
-
-    fn is_readonly(&self) -> bool {
-        true
-    }
-
-    fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool {
-        true
-    }
-
-    fn needs_permissions(&self, _input: Option<&Value>) -> bool {
-        false
-    }
-
-    async fn validate_input(
-        &self,
-        input: &Value,
-        _context: Option<&ToolUseContext>,
-    ) -> ValidationResult {
-        // Check if image_path, data_url, or (image_id + session_id) is provided
-        let has_path = input
-            .get("image_path")
-            .and_then(|v| v.as_str())
-            .filter(|s| !s.is_empty())
-            .is_some();
-        let has_data_url = input
-            .get("data_url")
-            .and_then(|v| v.as_str())
-            .filter(|s| !s.is_empty())
-            .is_some();
-        let has_image_id = input
-            .get("image_id")
-            .and_then(|v| v.as_str())
-            .filter(|s| !s.is_empty())
-            .is_some();
-
-        if !has_path && !has_data_url && !has_image_id {
-            return ValidationResult {
-                result: false,
-                message: Some("Must provide one of image_path, data_url, or image_id".to_string()),
-                error_code: Some(400),
-                meta: None,
-            };
-        }
-
-        if let Some(prompt) = input.get("analysis_prompt").and_then(|v| v.as_str()) {
-            if prompt.is_empty() {
-                return ValidationResult {
-                    result: false,
-                    message: Some("analysis_prompt cannot be empty".to_string()),
-                    error_code: Some(400),
-                    meta: None,
-                };
-            }
-        } else {
-            return ValidationResult {
-                result: false,
-                message: Some("analysis_prompt is required".to_string()),
-                error_code: Some(400),
-                meta: None,
-            };
-        }
-
-        if let Some(image_path) = input.get("image_path").and_then(|v| v.as_str()) {
-            if !image_path.is_empty() {
-                match self.resolve_image_path(image_path) {
-                    Ok(path) => {
-                        if !path.exists() {
-                            return ValidationResult {
-                                result: false,
-                                message: Some(format!("Image file does not exist: {}", image_path)),
-                                error_code: Some(404),
-                                meta: None,
-                            };
-                        }
-
-                        if !path.is_file() {
-                            return ValidationResult {
-                                result: false,
-                                message: Some(format!("Path is not a file: {}", image_path)),
-                                error_code: Some(400),
-                                meta: None,
-                            };
-                        }
-                    }
-                    Err(e) => {
-                        return ValidationResult {
-                            result: false,
-                            message: Some(format!("Path parsing failed: {}", e)),
-                            error_code: Some(400),
-                            meta: None,
-                        };
-                    }
-                }
-            }
-        }
-
-        ValidationResult {
-            result: true,
-            message: None,
-            error_code: None,
-            meta: None,
-        }
-    }
-
-    fn render_tool_use_message(&self, input: &Value, options: &ToolRenderOptions) -> String {
-        // Determine image source
-        let image_source = if let Some(path) = input.get("image_path").and_then(|v| v.as_str()) {
-            if !path.is_empty() {
-                path.to_string()
-            } else {
-                "Clipboard image".to_string()
-            }
-        } else if input.get("data_url").is_some() {
-            "Clipboard image".to_string()
-        } else {
-            "unknown".to_string()
-        };
-
-        if options.verbose {
-            let prompt = input
-                .get("analysis_prompt")
-                .and_then(|v| v.as_str())
-                .unwrap_or("...");
-            format!(
-                "Analyzing image: {} (prompt: {})",
-                image_source,
-                if prompt.len() > 50 {
-                    // Safe truncation: find the maximum character boundary not exceeding 50 bytes
-                    let pos = prompt
-                        .char_indices()
-                        .take_while(|(i, _)| *i < 50)
-                        .last()
-                        .map(|(i, c)| i + c.len_utf8())
-                        .unwrap_or(0);
-                    format!("{}...", &prompt[..pos])
-                } else {
-                    prompt.to_string()
-                }
-            )
-        } else {
-            format!("Analyzing image: {}", image_source)
-        }
-    }
-
-    async fn call_impl(
-        &self,
-        input: &Value,
-        _context: &ToolUseContext,
-    ) -> BitFunResult<Vec<ToolResult>> {
-        let start = std::time::Instant::now();
-
-        // Parse input
-        let input_data: AnalyzeImageInput = serde_json::from_value(input.clone())
-            .map_err(|e| BitFunError::parse(format!("Failed to parse input: {}", e)))?;
-
-        let has_data_url = input_data.data_url.is_some();
-        let has_path = input_data.image_path.is_some();
-        let has_image_id = input_data.image_id.is_some();
-
-        if !has_data_url && !has_path && !has_image_id {
-            return Err(BitFunError::validation(
-                "Must provide one of image_path, data_url, or image_id",
-            ));
-        }
-
-        debug!(
-            "Starting image analysis: source={}",
-            if has_image_id {
-                "temporary_storage(image_id)"
-            } else if has_data_url {
-                "direct_input(data_url)"
-            } else {
-                "file_path(image_path)"
-            }
-        );
-        debug!("Analysis prompt: {}", input_data.analysis_prompt);
-
-        let (image_data, mime_type, image_source_description) = if let Some(image_id) =
-            &input_data.image_id
-        {
-            let provider = _context.image_context_provider.as_ref()
-                .ok_or_else(|| BitFunError::tool(
-                    "image_id mode requires ImageContextProvider support, but no provider was injected.\n\
-                     Please inject image_context_provider when calling the tool, or use image_path/data_url mode.".to_string()
-                ))?;
-
-            let image_context = provider.get_image(image_id)
-                .ok_or_else(|| BitFunError::tool(format!(
-                    "Image context not found: image_id={}. Image may have expired (5-minute validity) or was never uploaded.",
-                    image_id
-                )))?;
-
-            debug!(
-                "Retrieved image from context provider: name={}, source={}",
-                image_context.image_name, image_context.mime_type
-            );
-
-            if let Some(data_url) = &image_context.data_url {
-                let (data, mime) = self.decode_data_url(data_url)?;
-                (
-                    data,
-                    mime,
-                    format!("{} (clipboard)", image_context.image_name),
-                )
-            } else if let Some(image_path_str) = &image_context.image_path {
-                let image_path = self.resolve_image_path(image_path_str)?;
-                let data = self.load_image(&image_path).await?;
-                let mime = self.detect_mime_type(&image_path)?;
-                (data, mime, image_path.display().to_string())
-            } else {
-                return Err(BitFunError::tool(format!(
-                    "Image context {} has neither data_url nor image_path",
-                    image_id
-                )));
-            }
-        } else if let Some(data_url) = &input_data.data_url {
-            // Decode from data URL
-            let (data, mime) = self.decode_data_url(data_url)?;
-            (data, mime, "clipboard_image".to_string())
-        } else if let Some(image_path_str) = &input_data.image_path {
-            // Load from file path
-            let image_path = self.resolve_image_path(image_path_str)?;
-            debug!("Parsed image path: {}", image_path.display());
-
-            let data = self.load_image(&image_path).await?;
-            let mime = self.detect_mime_type(&image_path)?;
-
-            debug!("Image size: {} KB, mime: {}", data.len() / 1024, mime);
-
-            (data, mime, image_path.display().to_string())
-        } else {
-            unreachable!("Input already checked above")
-        };
-
-        let base64_data = BASE64.encode(&image_data);
-
-        let vision_model = self.get_vision_model().await?;
-        debug!(
-            "Using vision model: name={}, model={}",
-            vision_model.name, vision_model.model_name
-        );
-
-        let prompt = self.build_prompt(
-            &input_data.analysis_prompt,
-            &input_data.focus_areas,
-            &input_data.detail_level,
-        );
-        trace!("Full analysis prompt: {}", prompt);
-
-        let messages = self.build_multimodal_message(
-            &prompt,
-            &base64_data,
-            &mime_type,
-            &vision_model.provider,
-        )?;
-
-        // Vision models cannot set max_tokens (e.g., glm-4v doesn't support this parameter)
-        // and should never use the thinking process.
-        let mut model_config = ModelConfig::try_from(vision_model.clone())
-            .map_err(|e| BitFunError::parse(format!("Config conversion failed for vision model {}: {}", vision_model.name, e)))?;
-        model_config.max_tokens = None;
-        model_config.enable_thinking_process = false;
-        model_config.support_preserved_thinking = false;
-
-        let ai_client = Arc::new(AIClient::new(model_config));
-
-        debug!("Calling vision model for analysis...");
-        let ai_response = ai_client
-            .send_message(messages, None)
-            .await
-            .map_err(|e| BitFunError::service(format!("AI call failed: {}", e)))?;
-
-        let elapsed = start.elapsed();
-        info!("Image analysis completed: duration={:?}", elapsed);
-
-        let (width, height) = self.get_image_dimensions(&image_data);
-
-        let result_for_assistant = format!(
-            "Image analysis result ({})\n\n{}",
-            image_source_description, ai_response.text
-        );
-
-        let result = ToolResult::Result {
-            data: json!({
-                "success": true,
-                "image_source": image_source_description,
-                "analysis": ai_response.text,
-                "metadata": {
-                    "mime_type": mime_type,
-                    "file_size": image_data.len(),
-                    "width": width,
-                    "height": height,
-                    "analysis_time_ms": elapsed.as_millis() as u64,
-                    "model_used": vision_model.name,
-                    "prompt_used": input_data.analysis_prompt,
-                }
-            }),
-            result_for_assistant: Some(result_for_assistant),
-        };
-
-        Ok(vec![result])
-    }
-}
diff --git a/src/crates/core/src/agentic/tools/implementations/mod.rs b/src/crates/core/src/agentic/tools/implementations/mod.rs
index 4912528b..f6d2f6c0 100644
--- a/src/crates/core/src/agentic/tools/implementations/mod.rs
+++ b/src/crates/core/src/agentic/tools/implementations/mod.rs
@@ -1,51 +1,51 @@
 //! Tool implementation module
 
+pub mod ask_user_question_tool;
+pub mod bash_tool;
+pub mod code_review_tool;
+pub mod create_plan_tool;
+pub mod delete_file_tool;
+pub mod file_edit_tool;
 pub mod file_read_tool;
 pub mod file_write_tool;
-pub mod file_edit_tool;
-pub mod delete_file_tool;
-pub mod bash_tool;
-pub mod grep_tool;
+pub mod get_file_diff_tool;
+pub mod git_tool;
 pub mod glob_tool;
-pub mod web_tools;
-pub mod todo_write_tool;
+pub mod grep_tool;
 pub mod ide_control_tool;
-pub mod mermaid_interactive_tool;
-pub mod log_tool;
 pub mod linter_tool;
-pub mod analyze_image_tool;
+pub mod log_tool;
+pub mod ls_tool;
+pub mod mermaid_interactive_tool;
 pub mod skill_tool;
 pub mod skills;
-pub mod ask_user_question_tool;
-pub mod ls_tool;
 pub mod task_tool;
-pub mod git_tool;
-pub mod create_plan_tool;
-pub mod get_file_diff_tool;
-pub mod code_review_tool;
 pub mod terminal_control_tool;
+pub mod todo_write_tool;
 pub mod util;
+pub mod view_image_tool;
+pub mod web_tools;
 
+pub use ask_user_question_tool::AskUserQuestionTool;
+pub use bash_tool::BashTool;
+pub use code_review_tool::CodeReviewTool;
+pub use create_plan_tool::CreatePlanTool;
+pub use delete_file_tool::DeleteFileTool;
+pub use file_edit_tool::FileEditTool;
 pub use file_read_tool::FileReadTool;
 pub use file_write_tool::FileWriteTool;
-pub use file_edit_tool::FileEditTool;
-pub use delete_file_tool::DeleteFileTool;
-pub use bash_tool::BashTool;
-pub use grep_tool::GrepTool;
+pub use get_file_diff_tool::GetFileDiffTool;
+pub use git_tool::GitTool;
 pub use glob_tool::GlobTool;
-pub use web_tools::{WebSearchTool, WebFetchTool};
-pub use todo_write_tool::TodoWriteTool;
+pub use grep_tool::GrepTool;
 pub use ide_control_tool::IdeControlTool;
-pub use mermaid_interactive_tool::MermaidInteractiveTool;
-pub use log_tool::LogTool;
 pub use linter_tool::ReadLintsTool;
-pub use analyze_image_tool::AnalyzeImageTool;
-pub use skill_tool::SkillTool;
-pub use ask_user_question_tool::AskUserQuestionTool;
+pub use log_tool::LogTool;
 pub use ls_tool::LSTool;
+pub use mermaid_interactive_tool::MermaidInteractiveTool;
+pub use skill_tool::SkillTool;
 pub use task_tool::TaskTool;
-pub use git_tool::GitTool;
-pub use create_plan_tool::CreatePlanTool;
-pub use get_file_diff_tool::GetFileDiffTool;
-pub use code_review_tool::CodeReviewTool;
-pub use terminal_control_tool::TerminalControlTool;
\ No newline at end of file
+pub use terminal_control_tool::TerminalControlTool;
+pub use todo_write_tool::TodoWriteTool;
+pub use view_image_tool::ViewImageTool;
+pub use web_tools::{WebFetchTool, WebSearchTool};
diff --git a/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs
new file mode 100644
index 00000000..cbd59b2f
--- /dev/null
+++ b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs
@@ -0,0 +1,396 @@
+//! view_image tool - analyzes image content for text-only or multimodal main models.
+//!
+//! Current default behavior is to convert image content into structured text analysis.
+//! This keeps the tool useful for text-only primary models while preserving an interface
+//! that can evolve toward direct multimodal attachment in the future.
+
+use async_trait::async_trait;
+use log::{debug, info, trace};
+use serde::Deserialize;
+use serde_json::{json, Value};
+
+use crate::agentic::image_analysis::{
+    build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path,
+    optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_global_config,
+};
+use crate::agentic::tools::framework::{
+    Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult,
+};
+use crate::infrastructure::ai::get_global_ai_client_factory;
+use crate::infrastructure::get_workspace_path;
+use crate::util::errors::{BitFunError, BitFunResult};
+
+#[derive(Debug, Deserialize)]
+struct ViewImageInput { // parsed tool arguments; serde treats absent Option fields as None
+    /// Image file path; resolved via `resolve_image_path` with the workspace path.
+    image_path: Option<String>,
+    /// Image supplied inline as a data URL.
+    data_url: Option<String>,
+    /// Key used to look the image up in the context's image provider.
+    image_id: Option<String>,
+    /// Custom analysis goal; when absent or blank a stock prompt is used instead.
+    analysis_prompt: Option<String>,
+    /// Aspects that are appended to the prompt as a bullet list.
+    focus_areas: Option<Vec<String>>,
+    /// `brief` or `detailed`; any other value selects the moderate default.
+    detail_level: Option<String>,
+}
+
+pub struct ViewImageTool; // unit struct: the view_image tool keeps no state of its own
+
+impl ViewImageTool {
+    pub fn new() -> Self { // nothing to configure: returns the unit value
+        Self
+    }
+
+    /// Assembles the instruction text sent to the vision model: the caller's prompt
+    /// (or a stock one when it is absent or blank), an optional bullet list of focus
+    /// areas, and a closing sentence that fixes the level of detail.
+    fn build_prompt(
+        &self,
+        analysis_prompt: Option<&str>,
+        focus_areas: &Option<Vec<String>>,
+        detail_level: &Option<String>,
+    ) -> String {
+        let opening = match analysis_prompt {
+            Some(custom) if !custom.trim().is_empty() => custom,
+            _ => "Please analyze this image and describe the relevant details.",
+        };
+        let mut text = format!("{}\n\n", opening);
+
+        // An empty list is treated like a missing one: no heading, no bullets.
+        let bullets = focus_areas.as_deref().unwrap_or(&[]);
+        if !bullets.is_empty() {
+            text.push_str("Please pay special attention to the following aspects:\n");
+            for item in bullets {
+                text.push_str("- ");
+                text.push_str(item);
+                text.push('\n');
+            }
+            text.push('\n');
+        }
+
+        text.push_str(match detail_level.as_deref() {
+            Some("brief") => "Please answer concisely in 1-2 sentences.",
+            Some("detailed") => {
+                "Please provide a detailed analysis including all relevant details."
+            }
+            _ => "Please provide a moderate level of analysis detail.",
+        });
+        text
+    }
+
+    /// Loads the raw image bytes for a `view_image` call.
+    ///
+    /// Sources are tried in the order `image_id`, `data_url`, `image_path`. Empty strings
+    /// count as "not provided" (mirroring `validate_input`), so a blank `image_id` no
+    /// longer shadows a usable `data_url` or `image_path`.
+    ///
+    /// Returns `(bytes, MIME type hint if known, human-readable source label)`.
+    async fn load_source(
+        &self,
+        input_data: &ViewImageInput,
+        context: &ToolUseContext,
+    ) -> BitFunResult<(Vec<u8>, Option<String>, String)> {
+        let workspace_path = get_workspace_path();
+        // Same "present" rule as validate_input: an empty string is not a source.
+        let image_id = input_data.image_id.as_ref().filter(|s| !s.is_empty());
+        let data_url = input_data.data_url.as_ref().filter(|s| !s.is_empty());
+        let image_path = input_data.image_path.as_ref().filter(|s| !s.is_empty());
+
+        if let Some(image_id) = image_id {
+            let provider = context.image_context_provider.as_ref().ok_or_else(|| {
+                BitFunError::tool(
+                    "image_id mode requires ImageContextProvider support, but no provider was injected.\n\
+                     Please inject image_context_provider when calling the tool, or use image_path/data_url mode.".to_string()
+                )
+            })?;
+            let image_context = provider.get_image(image_id).ok_or_else(|| {
+                BitFunError::tool(format!(
+                    "Image context not found: image_id={}. Image may have expired (5-minute validity) or was never uploaded.",
+                    image_id
+                ))
+            })?;
+            if let Some(stored_url) = &image_context.data_url {
+                let (data, url_mime) = decode_data_url(stored_url)?;
+                // Prefer the MIME type embedded in the URL; fall back to the stored one.
+                let mime = url_mime.or_else(|| Some(image_context.mime_type.clone()));
+                let label = format!("{} (clipboard)", image_context.image_name);
+                return Ok((data, mime, label));
+            }
+            if let Some(stored_path) = &image_context.image_path {
+                let resolved = resolve_image_path(stored_path, workspace_path.as_deref())?;
+                let data = load_image_from_path(&resolved, workspace_path.as_deref()).await?;
+                let mime = detect_mime_type_from_bytes(&data, Some(&image_context.mime_type)).ok();
+                return Ok((data, mime, resolved.display().to_string()));
+            }
+            return Err(BitFunError::tool(format!(
+                "Image context {} has neither data_url nor image_path",
+                image_id
+            )));
+        }
+
+        if let Some(data_url) = data_url {
+            let (data, url_mime) = decode_data_url(data_url)?;
+            return Ok((data, url_mime, "clipboard_image".to_string()));
+        }
+        if let Some(raw_path) = image_path {
+            let resolved = resolve_image_path(raw_path, workspace_path.as_deref())?;
+            let data = load_image_from_path(&resolved, workspace_path.as_deref()).await?;
+            let mime = detect_mime_type_from_bytes(&data, None).ok();
+            return Ok((data, mime, resolved.display().to_string()));
+        }
+        Err(BitFunError::validation("Must provide one of image_path, data_url, or image_id"))
+    }
+}
+
+#[async_trait]
+impl Tool for ViewImageTool {
+    fn name(&self) -> &str {
+        "view_image" // identifier this tool reports through the `Tool` trait
+    }
+
+    async fn description(&self) -> BitFunResult<String> { // static usage text; always returns Ok
+        Ok(r#"Analyzes image content and returns detailed text descriptions.
+
+Use this tool when the user provides an image (file path, data URL, or uploaded clipboard image_id) and asks questions about it.
+
+Current behavior:
+- For text-only primary models, this tool converts image content to structured text.
+- For multimodal-capable setups, this interface can be extended to direct image attachment in future.
+
+Parameters:
+- image_path / data_url / image_id: provide one image source
+- analysis_prompt: optional custom analysis goal
+- focus_areas: optional analysis focus list
+- detail_level: brief / normal / detailed"#.to_string())
+    }
+
+    fn input_schema(&self) -> Value { // JSON Schema of the arguments; no field is marked required
+        json!({
+            "type": "object",
+            "properties": {
+                "image_path": {
+                    "type": "string",
+                    "description": "Path to image file (relative to workspace or absolute path). Example: 'screenshot.png'"
+                },
+                "data_url": {
+                    "type": "string",
+                    "description": "Base64-encoded image data URL. Example: 'data:image/png;base64,...'"
+                },
+                "image_id": {
+                    "type": "string",
+                    "description": "Temporary image ID from clipboard upload. Example: 'img-clipboard-1234567890-abc123'"
+                },
+                "analysis_prompt": {
+                    "type": "string",
+                    "description": "Optional custom prompt describing what to extract from the image"
+                },
+                "focus_areas": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Optional list of aspects to emphasize"
+                },
+                "detail_level": {
+                    "type": "string",
+                    "enum": ["brief", "normal", "detailed"],
+                    "description": "Optional detail level"
+                }
+            }
+        })
+    }
+
+    fn is_readonly(&self) -> bool { // always reports the tool as read-only
+        true
+    }
+
+    fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool { // input is ignored
+        true // reported as safe to run concurrently for every call
+    }
+
+    fn needs_permissions(&self, _input: Option<&Value>) -> bool { // never asks, whatever the input
+        false // NOTE(review): image_path may be absolute; confirm reads are restricted downstream
+    }
+
+    /// Pre-flight check of the raw tool arguments.
+    ///
+    /// A call is accepted when at least one of `image_path`, `data_url` or `image_id` is a
+    /// non-empty string. When `image_path` is given it must additionally resolve (via
+    /// `resolve_image_path`, using the current workspace path) to an existing regular file;
+    /// that check runs even if another source is supplied as well.
+    ///
+    /// Rejections set `result: false` and one of these codes:
+    /// - `400`: no source given, the path could not be resolved, or it is not a file
+    /// - `404`: the resolved path does not exist
+    async fn validate_input(
+        &self,
+        input: &Value,
+        _context: Option<&ToolUseContext>,
+    ) -> ValidationResult {
+        // Reads a string argument; a missing, non-string or empty value counts as absent.
+        let text_arg = |key: &str| {
+            input
+                .get(key)
+                .and_then(|v| v.as_str())
+                .filter(|s| !s.is_empty())
+        };
+
+        // Builds a failed result carrying the given code and message.
+        let reject = |code, message: String| ValidationResult {
+            result: false,
+            message: Some(message),
+            error_code: Some(code),
+            meta: None,
+        };
+
+        let image_path = text_arg("image_path");
+        let has_any_source = image_path.is_some()
+            || text_arg("data_url").is_some()
+            || text_arg("image_id").is_some();
+        if !has_any_source {
+            return reject(
+                400,
+                "Must provide one of image_path, data_url, or image_id".to_string(),
+            );
+        }
+
+        // Only a path can be verified up front; the other sources are resolved when the
+        // tool actually runs.
+        let Some(image_path) = image_path else {
+            return ValidationResult::default();
+        };
+
+        let workspace_path = get_workspace_path();
+        match resolve_image_path(image_path, workspace_path.as_deref()) {
+            // Resolution itself failed.
+            Err(e) => reject(400, format!("Path parsing failed: {}", e)),
+            // Resolved, but nothing exists at that location.
+            Ok(path) if !path.exists() => {
+                reject(404, format!("Image file does not exist: {}", image_path))
+            }
+            // Exists, but is not a regular file (for example a directory).
+            Ok(path) if !path.is_file() => {
+                reject(400, format!("Path is not a file: {}", image_path))
+            }
+            // An existing file: accept.
+            Ok(_) => ValidationResult::default(),
+        }
+    }
+
+    /// Builds the one-line description of a `view_image` call.
+    ///
+    /// The source is named in the order `load_source` consumes it (`image_id`, `data_url`,
+    /// `image_path`) and empty strings are skipped, so the label matches the image that is
+    /// loaded. With `options.verbose` the prompt (or `default analysis`) is appended.
+    fn render_tool_use_message(&self, input: &Value, options: &ToolRenderOptions) -> String {
+        let text_arg = |key: &str| {
+            input
+                .get(key)
+                .and_then(|v| v.as_str())
+                .filter(|s| !s.is_empty())
+        };
+
+        let image_source = if text_arg("image_id").is_some() {
+            "Clipboard image (image_id)".to_string()
+        } else if text_arg("data_url").is_some() {
+            "Clipboard image".to_string()
+        } else if let Some(path) = text_arg("image_path") {
+            path.to_string()
+        } else {
+            "unknown".to_string()
+        };
+
+        if !options.verbose {
+            return format!("Viewing image: {}", image_source);
+        }
+        let prompt = text_arg("analysis_prompt").unwrap_or("default analysis");
+        format!("Viewing image: {} (prompt: {})", image_source, prompt)
+    }
+
+    /// Runs one image analysis and returns the vision model's answer.
+    ///
+    /// Steps: parse the arguments, load the image bytes, resolve the vision model from the
+    /// global config, optimize the image for that model's provider, send prompt plus image
+    /// to the model, and wrap the reply with metadata about the processed image.
+    ///
+    /// `metadata.prompt_used` echoes the caller's prompt, or `"default"` when it is absent
+    /// or blank, i.e. exactly when `build_prompt` falls back to its stock prompt.
+    async fn call_impl(
+        &self,
+        input: &Value,
+        context: &ToolUseContext,
+    ) -> BitFunResult<Vec<ToolResult>> {
+        let start = std::time::Instant::now();
+
+        let input_data: ViewImageInput = serde_json::from_value(input.clone())
+            .map_err(|e| BitFunError::parse(format!("Failed to parse input: {}", e)))?;
+        let (bytes, mime_hint, source_label) = self.load_source(&input_data, context).await?;
+
+        let vision_model = resolve_vision_model_from_global_config().await?;
+        debug!(
+            "Using image understanding model: id={}, name={}, provider={}",
+            vision_model.id, vision_model.name, vision_model.provider
+        );
+        let provider = &vision_model.provider;
+        let processed = optimize_image_for_provider(bytes, provider, mime_hint.as_deref())?;
+
+        let prompt = self.build_prompt(
+            input_data.analysis_prompt.as_deref(),
+            &input_data.focus_areas,
+            &input_data.detail_level,
+        );
+        trace!("Full view_image prompt: {}", prompt);
+        let messages =
+            build_multimodal_message(&prompt, &processed.data, &processed.mime_type, provider)?;
+
+        let ai_client_factory = get_global_ai_client_factory()
+            .await
+            .map_err(|e| BitFunError::service(format!("Failed to get AI client factory: {}", e)))?;
+        let ai_client = ai_client_factory
+            .get_client_by_id(&vision_model.id)
+            .await
+            .map_err(|e| {
+                BitFunError::service(format!(
+                    "Failed to create vision model client for {}: {}",
+                    vision_model.id, e
+                ))
+            })?;
+
+        debug!("Calling vision model for image analysis...");
+        let ai_response = ai_client
+            .send_message(messages, None)
+            .await
+            .map_err(|e| BitFunError::service(format!("AI call failed: {}", e)))?;
+
+        let elapsed = start.elapsed();
+        info!("view_image completed: duration={:?}", elapsed);
+
+        // Report "default" whenever build_prompt ignored the caller's prompt (absent or blank).
+        let prompt_used = input_data
+            .analysis_prompt
+            .filter(|p| !p.trim().is_empty())
+            .unwrap_or_else(|| "default".to_string());
+        let result_for_assistant =
+            format!("Image analysis result ({})\n\n{}", source_label, ai_response.text);
+
+        Ok(vec![ToolResult::Result {
+            data: json!({
+                "success": true,
+                "image_source": source_label,
+                "analysis": ai_response.text,
+                "metadata": {
+                    "mime_type": processed.mime_type,
+                    "file_size": processed.data.len(),
+                    "width": processed.width,
+                    "height": processed.height,
+                    "analysis_time_ms": elapsed.as_millis() as u64,
+                    "model_used": vision_model.name,
+                    "prompt_used": prompt_used,
+                }
+            }),
+            result_for_assistant: Some(result_for_assistant),
+        }])
+    }
+}
diff --git a/src/crates/core/src/agentic/tools/registry.rs b/src/crates/core/src/agentic/tools/registry.rs
index 1eefdb51..6f822601 100644
--- a/src/crates/core/src/agentic/tools/registry.rs
+++ b/src/crates/core/src/agentic/tools/registry.rs
@@ -122,8 +122,8 @@ impl ToolRegistry {
         // Linter tool (LSP diagnosis)
         self.register_tool(Arc::new(ReadLintsTool::new()));
 
-        // Image analysis tool
-        self.register_tool(Arc::new(AnalyzeImageTool::new()));
+        // Image analysis / viewing tool
+        self.register_tool(Arc::new(ViewImageTool::new()));
 
         // Git version control tool
         self.register_tool(Arc::new(GitTool::new()));
@@ -173,11 +173,11 @@ mod tests {
 }
 
 /// Get all tools
-/// - Snapshot initialized: 
+/// - Snapshot initialized:
 /// return tools only in the snapshot manager (wrapped file tools + built-in non-file tools)
 /// **not containing** dynamically registered MCP tools.
-/// - Snapshot not initialized: 
-/// return all tools in the global registry, 
+/// - Snapshot not initialized:
+/// return all tools in the global registry,
 /// **containing** MCP tools.
 /// If you need **always include** MCP tools, use [get_all_registered_tools]
 pub async fn get_all_tools() -> Vec<Arc<dyn Tool>> {
@@ -234,7 +234,7 @@ pub fn get_global_tool_registry() -> Arc<TokioRwLock<ToolRegistry>> {
 }
 
 /// Get all registered tools (**always include** dynamically registered MCP tools)
-/// - Snapshot initialized: 
+/// - Snapshot initialized:
 /// return wrapped file tools + other tools in the global registry (containing MCP tools)
 /// - Snapshot not initialized: return all tools in the global registry.
 pub async fn get_all_registered_tools() -> Vec<Arc<dyn Tool>> {
diff --git a/src/web-ui/src/component-library/components/registry.tsx b/src/web-ui/src/component-library/components/registry.tsx
index abf4a77e..b4df6113 100644
--- a/src/web-ui/src/component-library/components/registry.tsx
+++ b/src/web-ui/src/component-library/components/registry.tsx
@@ -1587,14 +1587,14 @@ All requirements met`,
       },
       {
         id: 'image-analysis-card',
-        name: 'AnalyzeImage - ????',
+        name: 'view_image - ????',
         description: '????????',
         category: 'flowchat-cards',
         component: () => (
           <div style={{ display: 'flex', flexDirection: 'column', gap: '16px', padding: '20px' }}>
             <h3 style={{ color: '#ffffff', marginBottom: '8px' }}>Read - Success</h3>
             <ImageAnalysisCard
-              toolItem={createMockToolItem('AnalyzeImage',
+              toolItem={createMockToolItem('view_image',
                 {
                   image_path: '/path/to/screenshot.png',
                   analysis_prompt: 'Analyze the UI components',
@@ -1607,7 +1607,7 @@ All requirements met`,
                 },
                 'completed'
               )}
-              config={TOOL_CARD_CONFIGS['AnalyzeImage']}
+              config={TOOL_CARD_CONFIGS['view_image']}
               sessionId="preview-session"
             />
           </div>
diff --git a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
index 1a39275f..71968fb5 100644
--- a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
+++ b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
@@ -33,6 +33,127 @@ interface UseMessageSenderReturn {
   isSending: boolean;
 }
 
+type ImageInputStrategy = 'vision-preanalysis' | 'direct-attach';
+
+interface StrategyDecision { // outcome of chooseImageInputStrategy
+  strategy: ImageInputStrategy; // how attached images are delivered to the model
+  modelId: string | null; // model the decision was made for, if one could be resolved
+  supportsImageUnderstanding: boolean; // whether that model lists 'image_understanding'
+  reason: string; // machine-readable explanation, included in the debug log
+}
+
+interface ImageAnalysisResult { // one entry of the array returned by imageAnalysisAPI.analyzeImages
+  image_id: string; // matched against ImageContext.id when results are attached to contexts
+  summary: string; // rendered as the "Pre-analysis summary" line
+  detailed_description: string;
+  detected_elements: string[]; // the first five are rendered as "key elements"
+  confidence: number;
+  analysis_time_ms: number;
+}
+
+// Keep this off for now: transport currently accepts text-only `userInput`.
+// When backend supports multimodal turn input, this can be flipped (or moved to config).
+const ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED = false;
+
+/** Picks the model for a session: its own configured model, else the default primary model. */
+async function resolveSessionModelId(
+  flowChatManager: FlowChatManager,
+  sessionId: string | undefined
+): Promise<string | null> {
+  const state = flowChatManager.getFlowChatState();
+  const modelName = sessionId ? state.sessions.get(sessionId)?.config?.modelName : undefined;
+
+  // A missing, empty or literal 'default' name defers to getDefaultPrimaryModel().
+  if (!modelName || modelName === 'default') {
+    const { getDefaultPrimaryModel } = await import('@/infrastructure/config/utils/modelConfigHelpers');
+    return getDefaultPrimaryModel();
+  }
+  return modelName;
+}
+
+/** True when the configured model (matched by id or name) lists 'image_understanding'. */
+async function modelSupportsImageUnderstanding(modelId: string | null): Promise<boolean> {
+  if (!modelId) return false;
+  const { configManager } = await import('@/infrastructure/config/services/ConfigManager');
+  const configured = (await configManager.getConfig<any[]>('ai.models')) || [];
+  const match = configured.find(entry => entry.id === modelId || entry.name === modelId);
+  // Anything other than an array of capabilities counts as "no capabilities".
+  return Array.isArray(match?.capabilities) && match.capabilities.includes('image_understanding');
+}
+
+/**
+ * Decides how images attached to a message reach the model. Direct attachment requires both
+ * a model with image understanding and ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED; otherwise the
+ * images are pre-analysed, and `reason` records which of the two conditions was missing.
+ */
+async function chooseImageInputStrategy(
+  flowChatManager: FlowChatManager,
+  sessionId: string | undefined
+): Promise<StrategyDecision> {
+  const modelId = await resolveSessionModelId(flowChatManager, sessionId);
+  const supportsImageUnderstanding = await modelSupportsImageUnderstanding(modelId);
+  const base = { modelId, supportsImageUnderstanding };
+
+  if (!supportsImageUnderstanding) {
+    return { ...base, strategy: 'vision-preanalysis', reason: 'primary_model_is_text_only' };
+  }
+  if (!ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED) {
+    return {
+      ...base,
+      strategy: 'vision-preanalysis',
+      reason: 'direct_attach_disabled_until_multimodal_turn_input_is_available',
+    };
+  }
+  return { ...base, strategy: 'direct-attach', reason: 'model_supports_image_understanding' };
+}
+
+/**
+ * Sends the attached images to the image analysis API and returns its results.
+ * Local images are passed by path and all others by data URL; an empty list skips the call.
+ */
+async function analyzeImagesBeforeSend(
+  imageContexts: ImageContext[],
+  sessionId: string,
+  userMessage: string
+): Promise<ImageAnalysisResult[]> {
+  if (imageContexts.length === 0) return [];
+  const { imageAnalysisAPI } = await import('@/infrastructure/api/service-api/ImageAnalysisAPI');
+  const images = imageContexts.map(image => ({
+    id: image.id,
+    image_path: image.isLocal ? image.imagePath : undefined,
+    data_url: image.isLocal ? undefined : image.dataUrl,
+    mime_type: image.mimeType,
+    metadata: {
+      name: image.imageName,
+      width: image.width,
+      height: image.height,
+      file_size: image.fileSize,
+      source: image.source,
+    },
+  }));
+  return imageAnalysisAPI.analyzeImages({ session_id: sessionId, user_message: userMessage, images });
+}
+
+/**
+ * Renders the text block describing one attached image. With a pre-analysis result its
+ * summary and up to five detected elements are included; otherwise a view_image tip is.
+ */
+function formatImageContextLine(ctx: ImageContext, analysis?: ImageAnalysisResult): string {
+  const imgName = ctx.imageName || 'Untitled image';
+  const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : '';
+  // A local image without a path must not render "Path: undefined"; fall back to its id.
+  const hasPath = Boolean(ctx.isLocal && ctx.imagePath);
+  const sourceLine = hasPath ? `Path: ${ctx.imagePath}` : `Image ID: ${ctx.id}`;
+  const header = `[Image: ${imgName}${imgSize}]\n${sourceLine}`;
+
+  if (!analysis) {
+    return `${header}\nTip: You can use the view_image tool (${hasPath ? 'image_path' : 'image_id'}).`;
+  }
+  const topElements = (analysis.detected_elements || []).slice(0, 5).join(', ');
+  const keyElementsLine = topElements ? `\nPre-analysis key elements: ${topElements}` : '';
+  return `${header}\nPre-analysis summary: ${analysis.summary}${keyElementsLine}`;
+}
+
 export function useMessageSender(props: UseMessageSenderProps): UseMessageSenderReturn {
   const {
     currentSessionId,
@@ -56,14 +177,14 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
       hasSession: !!sessionId,
       agentType: currentAgentType || 'agentic',
     });
-    
+
     try {
       const flowChatManager = FlowChatManager.getInstance();
-      
+
       if (!sessionId) {
         const { getDefaultPrimaryModel } = await import('@/infrastructure/config/utils/modelConfigHelpers');
         const modelId = await getDefaultPrimaryModel();
-        
+
         sessionId = await flowChatManager.createChatSession({
           modelName: modelId || undefined
         }, currentAgentType || 'agentic');
@@ -71,12 +192,10 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
       } else {
         log.debug('Reusing existing session', { sessionId });
       }
-      
-      // Upload clipboard images to temporary backend storage first.
-      const clipboardImages = contexts.filter(ctx => 
-        ctx.type === 'image' && !ctx.isLocal && ctx.dataUrl
-      ) as ImageContext[];
-      
+
+      const imageContexts = contexts.filter(ctx => ctx.type === 'image') as ImageContext[];
+      const clipboardImages = imageContexts.filter(ctx => !ctx.isLocal && ctx.dataUrl);
+
       if (clipboardImages.length > 0) {
         try {
           const { api } = await import('@/infrastructure/api/service-api/ApiClient');
@@ -95,7 +214,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
               }))
             }
           };
-          
+
           await api.invoke('upload_image_contexts', uploadData);
           log.debug('Clipboard images uploaded', {
             imageCount: clipboardImages.length,
@@ -110,13 +229,63 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
           throw error;
         }
       }
-      
-      // Build both backend and display versions of the message.
+
+      let strategyDecision: StrategyDecision = {
+        strategy: 'vision-preanalysis',
+        modelId: null,
+        supportsImageUnderstanding: false,
+        reason: 'fallback_default_preanalysis',
+      };
+      try {
+        strategyDecision = await chooseImageInputStrategy(flowChatManager, sessionId);
+      } catch (error) {
+        log.warn('Failed to resolve image input strategy, using pre-analysis fallback', {
+          sessionId,
+          error: (error as Error)?.message ?? 'unknown',
+        });
+      }
+
+      log.debug('Image input strategy selected', {
+        sessionId,
+        strategy: strategyDecision.strategy,
+        modelId: strategyDecision.modelId,
+        supportsImageUnderstanding: strategyDecision.supportsImageUnderstanding,
+        reason: strategyDecision.reason,
+      });
+
+      let imageAnalyses: ImageAnalysisResult[] = [];
+      if (imageContexts.length > 0) {
+        if (strategyDecision.strategy === 'direct-attach') {
+          // Future extensibility hook:
+          // once start_dialog_turn supports multimodal payloads, this branch can send image items directly.
+          log.info('Direct image attach strategy is selected but transport is still text-only; using pre-analysis fallback', {
+            sessionId,
+            modelId: strategyDecision.modelId,
+          });
+        }
+
+        try {
+          imageAnalyses = await analyzeImagesBeforeSend(imageContexts, sessionId!, trimmedMessage);
+          log.debug('Image pre-analysis completed', {
+            sessionId,
+            imageCount: imageContexts.length,
+            analysisCount: imageAnalyses.length,
+          });
+        } catch (error) {
+          log.warn('Image pre-analysis failed, continuing with context hints only', {
+            sessionId,
+            imageCount: imageContexts.length,
+            error: (error as Error)?.message ?? 'unknown',
+          });
+        }
+      }
+
       let fullMessage = trimmedMessage;
       const displayMessage = trimmedMessage;
-      
+
       if (contexts.length > 0) {
-        // Full version includes absolute details for the backend.
+        const analysisByImageId = new Map(imageAnalyses.map(result => [result.image_id, result]));
+
         const fullContextSection = contexts.map(ctx => {
           switch (ctx.type) {
             case 'file':
@@ -126,20 +295,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
             case 'code-snippet':
               return `[Code Snippet: ${ctx.filePath}:${ctx.startLine}-${ctx.endLine}]`;
             case 'image': {
-              const imgName = ctx.imageName || 'Untitled image';
-              const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : '';
-              
-              // Distinguish local files and clipboard images.
-              if (ctx.isLocal && ctx.imagePath) {
-                return `[Image: ${imgName}${imgSize}]\n` +
-                       `Path: ${ctx.imagePath}\n` +
-                       `Tip: You can use the AnalyzeImage tool with the image_path parameter.`;
-              } else {
-                return `[Image: ${imgName}${imgSize} (from clipboard)]\n` +
-                       `Image ID: ${ctx.id}\n` +
-                       `Tip: You can use the AnalyzeImage tool.\n` +
-                       `Parameter: image_id="${ctx.id}"`;
-              }
+              return formatImageContextLine(ctx, analysisByImageId.get(ctx.id));
             }
             case 'terminal-command':
               return `[Command: ${ctx.command}]`;
@@ -155,21 +311,21 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
               return '';
           }
         }).filter(Boolean).join('\n');
-        
+
         fullMessage = `${fullContextSection}\n\n${trimmedMessage}`;
       }
-      
+
       await flowChatManager.sendMessage(
-        fullMessage, 
-        sessionId || undefined, 
+        fullMessage,
+        sessionId || undefined,
         displayMessage,
         currentAgentType || 'agentic'
       );
-      
+
       onClearContexts();
-      
+
       onExitTemplateMode?.();
-      
+
       onSuccess?.(trimmedMessage);
       log.info('Message sent successfully', {
         sessionId,
@@ -185,7 +341,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
       });
       throw error;
     }
-  }, [currentSessionId, contexts, onClearContexts, onSuccess, onExitTemplateMode]);
+  }, [currentSessionId, contexts, onClearContexts, onSuccess, onExitTemplateMode, currentAgentType]);
 
   return {
     sendMessage,
diff --git a/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx b/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx
index a286ee2e..1204d058 100644
--- a/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx
+++ b/src/web-ui/src/flow_chat/tool-cards/ImageAnalysisCard.tsx
@@ -1,23 +1,32 @@
 /**
  * Image analysis tool card - compact mode
- * Used for AnalyzeImage tool
+ * Used for view_image tool
  */
 
-import React, { useState, useMemo } from 'react';
+import React, { useState, useMemo, useEffect, useCallback } from 'react';
 import { Loader2, Clock, Check } from 'lucide-react';
 import { useTranslation } from 'react-i18next';
 import type { ToolCardProps } from '../types/flow-chat';
 import { CompactToolCard, CompactToolCardHeader } from './CompactToolCard';
 import './ImageAnalysisCard.scss';
 
+const imageAnalysisExpandedStateCache = new Map<string, boolean>();
+
 export const ImageAnalysisCard: React.FC<ToolCardProps> = ({
   toolItem,
   onExpand
 }) => {
   const { t } = useTranslation('flow-chat');
   const { toolCall, toolResult, status } = toolItem;
+  const toolId = toolItem.id || toolCall?.id;
   const [isExpanded, setIsExpanded] = useState(false);
 
+  useEffect(() => {
+    if (!toolId) return;
+    const cached = imageAnalysisExpandedStateCache.get(toolId);
+    setIsExpanded(cached ?? false);
+  }, [toolId]);
+
   const getStatusIcon = () => {
     switch (status) {
       case 'running':
@@ -66,10 +75,15 @@ export const ImageAnalysisCard: React.FC<ToolCardProps> = ({
 
   const getAnalysisResult = () => {
     if (!toolResult?.result) return null;
-    
-    const result = toolResult.result;
-    
-    if (result.analysis || result.description || result.content) {
+
+    const raw = toolResult.result;
+    const result =
+      (raw?.analysis || raw?.description || raw?.content) ? raw :
+      (raw?.result?.analysis || raw?.result?.description || raw?.result?.content) ? raw.result :
+      (raw?.data?.analysis || raw?.data?.description || raw?.data?.content) ? raw.data :
+      null;
+
+    if (result) {
       return {
         analysis: result.analysis || result.description || result.content,
         modelUsed: result.model_used || result.model,
@@ -80,10 +94,17 @@ export const ImageAnalysisCard: React.FC<ToolCardProps> = ({
     return null;
   };
 
-  const handleToggleExpand = () => {
-    setIsExpanded(!isExpanded);
+  const handleToggleExpand = useCallback(() => {
+    window.dispatchEvent(new CustomEvent('tool-card-toggle'));
+    setIsExpanded(prev => {
+      const next = !prev;
+      if (toolId) {
+        imageAnalysisExpandedStateCache.set(toolId, next);
+      }
+      return next;
+    });
     onExpand?.();
-  };
+  }, [onExpand, toolId]);
 
   const analysisInfo = useMemo(() => getAnalysisInfo(), [toolCall?.input]);
   const analysisResult = useMemo(() => getAnalysisResult(), [toolResult?.result]);
@@ -181,4 +202,3 @@ export const ImageAnalysisCard: React.FC<ToolCardProps> = ({
     />
   );
 };
-
diff --git a/src/web-ui/src/flow_chat/tool-cards/index.ts b/src/web-ui/src/flow_chat/tool-cards/index.ts
index f32facba..f3724216 100644
--- a/src/web-ui/src/flow_chat/tool-cards/index.ts
+++ b/src/web-ui/src/flow_chat/tool-cards/index.ts
@@ -191,8 +191,8 @@ export const TOOL_CARD_CONFIGS: Record<string, ToolCardConfig> = {
     displayMode: 'compact',
     primaryColor: '#8b5cf6'
   },
-  'AnalyzeImage': {
-    toolName: 'AnalyzeImage',
+  'view_image': {
+    toolName: 'view_image',
     displayName: 'Image Analysis',
     icon: 'IMG',
     requiresConfirmation: false,
@@ -329,7 +329,7 @@ export const TOOL_CARD_COMPONENTS = {
   'submit_code_review': CodeReviewToolCard,
   
   // Image analysis tools
-  'AnalyzeImage': ImageAnalysisCard,
+  'view_image': ImageAnalysisCard,
   
   // Context compression
   'ContextCompression': ContextCompressionDisplay,
diff --git a/src/web-ui/src/locales/en-US/flow-chat.json b/src/web-ui/src/locales/en-US/flow-chat.json
index f272e4b9..6e12e5e3 100644
--- a/src/web-ui/src/locales/en-US/flow-chat.json
+++ b/src/web-ui/src/locales/en-US/flow-chat.json
@@ -582,7 +582,14 @@
       "parsingAnalysisInfo": "Parsing analysis info...",
       "unknownImage": "Unknown image",
       "clipboardImage": "Clipboard image",
-      "analyzeImageContent": "Analyze image content"
+      "analyzeImageContent": "Analyze image content",
+      "imageAnalysis": "Image analysis",
+      "completed": "Completed",
+      "analyzing": "Analyzing",
+      "preparing": "Preparing",
+      "analysisPrompt": "Analysis prompt",
+      "focusAreas": "Focus areas",
+      "analysisResult": "Analysis result"
     },
     "contextCompression": {
       "beforeUserMessage": "Before user message",
diff --git a/src/web-ui/src/locales/zh-CN/flow-chat.json b/src/web-ui/src/locales/zh-CN/flow-chat.json
index e8053306..f3ed8632 100644
--- a/src/web-ui/src/locales/zh-CN/flow-chat.json
+++ b/src/web-ui/src/locales/zh-CN/flow-chat.json
@@ -582,7 +582,14 @@
       "parsingAnalysisInfo": "解析分析信息中...",
       "unknownImage": "未知图片",
       "clipboardImage": "剪贴板图片",
-      "analyzeImageContent": "分析图片内容"
+      "analyzeImageContent": "分析图片内容",
+      "imageAnalysis": "图片分析",
+      "completed": "已完成",
+      "analyzing": "正在分析",
+      "preparing": "准备分析",
+      "analysisPrompt": "分析提示",
+      "focusAreas": "关注点",
+      "analysisResult": "分析结果"
     },
     "contextCompression": {
       "beforeUserMessage": "用户消息前",

From b1ce49dd12c557c7a44f88e472d08c8dd7fe1634 Mon Sep 17 00:00:00 2001
From: wgqqqqq <wgq0617@gmail.com>
Date: Thu, 5 Mar 2026 22:50:00 +0800
Subject: [PATCH 2/2] feat: support multimodal image turn flow with persistence
 redaction

---
 src/apps/desktop/src/api/agentic_api.rs       | 152 +++++++++-
 src/apps/desktop/src/api/commands.rs          |  73 +++++
 .../desktop/src/api/image_analysis_api.rs     |   9 +-
 .../src/agentic/coordination/coordinator.rs   |  38 ++-
 src/crates/core/src/agentic/core/message.rs   | 140 +++++++++-
 .../core/src/agentic/core/messages_helper.rs  |  38 ++-
 .../src/agentic/execution/execution_engine.rs | 259 ++++++++++++++++--
 .../src/agentic/execution/round_executor.rs   |  41 ++-
 .../image_analysis/image_processing.rs        | 156 +++++++++--
 .../core/src/agentic/image_analysis/mod.rs    |   5 +-
 .../core/src/agentic/persistence/manager.rs   |  74 ++++-
 .../src/agentic/session/session_manager.rs    |  10 +-
 .../tools/implementations/view_image_tool.rs  | 225 ++++++++++++++-
 .../agentic/tools/pipeline/state_manager.rs   |  28 +-
 .../agentic/tools/pipeline/tool_pipeline.rs   |  35 ++-
 .../core/src/infrastructure/ai/client.rs      | 148 ++++++++++
 .../flow_chat/components/ModelSelector.tsx    |   5 +-
 .../src/flow_chat/hooks/useMessageSender.ts   | 130 ++++++---
 .../src/flow_chat/services/FlowChatManager.ts |  15 +-
 .../flow-chat-manager/MessageModule.ts        |   8 +-
 .../api/service-api/AgentAPI.ts               |   5 +-
 .../api/service-api/ApiClient.ts              |  86 +++++-
 src/web-ui/src/locales/zh-CN/settings.json    |   4 +-
 .../src/locales/zh-CN/settings/ai-model.json  |  10 +-
 .../locales/zh-CN/settings/default-model.json |   6 +-
 25 files changed, 1556 insertions(+), 144 deletions(-)

diff --git a/src/apps/desktop/src/api/agentic_api.rs b/src/apps/desktop/src/api/agentic_api.rs
index 10fa5c9d..1143c4a4 100644
--- a/src/apps/desktop/src/api/agentic_api.rs
+++ b/src/apps/desktop/src/api/agentic_api.rs
@@ -6,8 +6,10 @@ use std::sync::Arc;
 use tauri::{AppHandle, State};
 
 use crate::api::app_state::AppState;
+use crate::api::context_upload_api::get_image_context;
 use bitfun_core::agentic::coordination::ConversationCoordinator;
 use bitfun_core::agentic::core::*;
+use bitfun_core::agentic::image_analysis::ImageContextData;
 
 #[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
@@ -45,6 +47,8 @@ pub struct StartDialogTurnRequest {
     pub user_input: String,
     pub agent_type: String,
     pub turn_id: Option<String>,
+    #[serde(default)]
+    pub image_contexts: Option<Vec<ImageContextData>>,
 }
 
 #[derive(Debug, Serialize)]
@@ -179,16 +183,42 @@ pub async fn start_dialog_turn(
     coordinator: State<'_, Arc<ConversationCoordinator>>,
     request: StartDialogTurnRequest,
 ) -> Result<StartDialogTurnResponse, String> {
-    let _stream = coordinator
-        .start_dialog_turn(
-            request.session_id,
-            request.user_input,
-            request.turn_id,
-            request.agent_type,
-            false,
-        )
-        .await
-        .map_err(|e| format!("Failed to start dialog turn: {}", e))?;
+    let StartDialogTurnRequest {
+        session_id,
+        user_input,
+        agent_type,
+        turn_id,
+        image_contexts,
+    } = request;
+
+    if let Some(image_contexts) = image_contexts
+        .as_ref()
+        .filter(|images| !images.is_empty())
+        .cloned()
+    {
+        let resolved_image_contexts = resolve_missing_image_payloads(image_contexts)?;
+        coordinator
+            .start_dialog_turn_with_image_contexts(
+                session_id,
+                user_input,
+                resolved_image_contexts,
+                turn_id,
+                agent_type,
+            )
+            .await
+            .map_err(|e| format!("Failed to start dialog turn: {}", e))?;
+    } else {
+        coordinator
+            .start_dialog_turn(
+                session_id,
+                user_input,
+                turn_id,
+                agent_type,
+                false,
+            )
+            .await
+            .map_err(|e| format!("Failed to start dialog turn: {}", e))?;
+    }
 
     Ok(StartDialogTurnResponse {
         success: true,
@@ -196,6 +226,88 @@ pub async fn start_dialog_turn(
     })
 }
 
+fn is_blank_text(value: Option<&String>) -> bool {
+    value.map_or(true, |text| text.trim().is_empty()) // `None` counts as blank
+}
+
+/// Fills in the payload of each image context that arrived with neither an
+/// `image_path` nor a `data_url`, using the entry `get_image_context` returns for
+/// the same id. Contexts that already carry a payload pass through unchanged.
+///
+/// # Errors
+/// Returns a message when no stored context exists for an id, or when the stored
+/// context has no non-blank `image_path`/`data_url` either.
+fn resolve_missing_image_payloads(
+    image_contexts: Vec<ImageContextData>,
+) -> Result<Vec<ImageContextData>, String> {
+    let mut resolved = Vec::with_capacity(image_contexts.len());
+
+    for mut image in image_contexts {
+        let has_payload =
+            !is_blank_text(image.image_path.as_ref()) || !is_blank_text(image.data_url.as_ref());
+        if has_payload {
+            resolved.push(image);
+            continue;
+        }
+
+        // Neither field was supplied: look the image up by id instead.
+        let stored = get_image_context(&image.id).ok_or_else(|| {
+            format!(
+                "Image context not found for image_id={}. It may have expired. Please re-attach the image and retry.",
+                image.id
+            )
+        })?;
+
+        // Both payload fields are blank at this point, so take the stored values,
+        // treating whitespace-only strings as absent.
+        image.image_path = stored
+            .image_path
+            .clone()
+            .filter(|s| !s.trim().is_empty());
+        image.data_url = stored
+            .data_url
+            .clone()
+            .filter(|s| !s.trim().is_empty());
+        if image.mime_type.trim().is_empty() {
+            image.mime_type = stored.mime_type.clone();
+        }
+
+        // Keep metadata a JSON object; any other value is preserved under `raw_metadata`.
+        let mut metadata = match image.metadata.take() {
+            Some(value) if value.is_object() => value,
+            Some(value) => serde_json::json!({ "raw_metadata": value }),
+            None => serde_json::json!({}),
+        };
+        if let Some(obj) = metadata.as_object_mut() {
+            // Keys already present win; stored values only fill the gaps.
+            obj.entry("name").or_insert_with(|| serde_json::json!(stored.image_name));
+            obj.entry("width").or_insert_with(|| serde_json::json!(stored.width));
+            obj.entry("height").or_insert_with(|| serde_json::json!(stored.height));
+            obj.entry("file_size").or_insert_with(|| serde_json::json!(stored.file_size));
+            obj.entry("source").or_insert_with(|| serde_json::json!(stored.source));
+            obj.insert(
+                "resolved_from_upload_cache".to_string(),
+                serde_json::json!(true),
+            );
+        }
+        image.metadata = Some(metadata);
+
+        // The stored entry may itself lack a usable payload.
+        let unresolved =
+            is_blank_text(image.image_path.as_ref()) && is_blank_text(image.data_url.as_ref());
+        if unresolved {
+            return Err(format!(
+                "Image context {} is missing image_path/data_url after cache resolution",
+                image.id
+            ));
+        }
+
+        resolved.push(image);
+    }
+
+    Ok(resolved)
+}
+
 #[tauri::command]
 pub async fn cancel_dialog_turn(
     coordinator: State<'_, Arc<ConversationCoordinator>>,
@@ -394,6 +506,26 @@ fn message_to_dto(message: Message) -> MessageDTO {
 
     let content = match message.content {
         MessageContent::Text(text) => serde_json::json!({ "type": "text", "text": text }),
+        MessageContent::Multimodal { text, images } => {
+            let images: Vec<serde_json::Value> = images
+                .into_iter()
+                .map(|img| {
+                    serde_json::json!({
+                        "id": img.id,
+                        "image_path": img.image_path,
+                        "mime_type": img.mime_type,
+                        "metadata": img.metadata,
+                        "has_data_url": img.data_url.as_ref().is_some_and(|s| !s.is_empty()),
+                    })
+                })
+                .collect();
+
+            serde_json::json!({
+                "type": "multimodal",
+                "text": text,
+                "images": images,
+            })
+        }
         MessageContent::ToolResult {
             tool_id,
             tool_name,
diff --git a/src/apps/desktop/src/api/commands.rs b/src/apps/desktop/src/api/commands.rs
index eedb6d57..3e61f9f9 100644
--- a/src/apps/desktop/src/api/commands.rs
+++ b/src/apps/desktop/src/api/commands.rs
@@ -197,6 +197,21 @@ pub async fn test_ai_config_connection(
     request: TestAIConfigConnectionRequest,
 ) -> Result<bitfun_core::util::types::ConnectionTestResult, String> {
     let model_name = request.config.name.clone();
+    let supports_image_input = request
+        .config
+        .capabilities
+        .iter()
+        .any(|cap| {
+            matches!(
+                cap,
+                bitfun_core::service::config::types::ModelCapability::ImageUnderstanding
+            )
+        })
+        || matches!(
+            request.config.category,
+            bitfun_core::service::config::types::ModelCategory::Multimodal
+        );
+
     let ai_config = match request.config.try_into() {
         Ok(config) => config,
         Err(e) => {
@@ -209,6 +224,64 @@ pub async fn test_ai_config_connection(
 
     match ai_client.test_connection().await {
         Ok(result) => {
+            if !result.success {
+                info!(
+                    "AI config connection test completed: model={}, success={}, response_time={}ms",
+                    model_name, result.success, result.response_time_ms
+                );
+                return Ok(result);
+            }
+
+            if supports_image_input {
+                match ai_client.test_image_input_connection().await {
+                    Ok(image_result) => {
+                        let response_time_ms =
+                            result.response_time_ms + image_result.response_time_ms;
+
+                        if !image_result.success {
+                            let image_error = image_result
+                                .error_details
+                                .unwrap_or_else(|| "Unknown image input test error".to_string());
+                            let merged = bitfun_core::util::types::ConnectionTestResult {
+                                success: false,
+                                response_time_ms,
+                                model_response: image_result.model_response.or(result.model_response),
+                                error_details: Some(format!(
+                                    "Basic connection passed, but multimodal image input test failed: {}",
+                                    image_error
+                                )),
+                            };
+                            info!(
+                                "AI config connection test completed: model={}, success={}, response_time={}ms",
+                                model_name, merged.success, merged.response_time_ms
+                            );
+                            return Ok(merged);
+                        }
+
+                        let merged = bitfun_core::util::types::ConnectionTestResult {
+                            success: true,
+                            response_time_ms,
+                            model_response: image_result
+                                .model_response
+                                .or(result.model_response),
+                            error_details: None,
+                        };
+                        info!(
+                            "AI config connection test completed: model={}, success={}, response_time={}ms",
+                            model_name, merged.success, merged.response_time_ms
+                        );
+                        return Ok(merged);
+                    }
+                    Err(e) => {
+                        error!(
+                            "AI config multimodal image input test failed unexpectedly: model={}, error={}",
+                            model_name, e
+                        );
+                        return Err(format!("Connection test failed: {}", e));
+                    }
+                }
+            }
+
             info!(
                 "AI config connection test completed: model={}, success={}, response_time={}ms",
                 model_name, result.success, result.response_time_ms
diff --git a/src/apps/desktop/src/api/image_analysis_api.rs b/src/apps/desktop/src/api/image_analysis_api.rs
index 369272ca..25438837 100644
--- a/src/apps/desktop/src/api/image_analysis_api.rs
+++ b/src/apps/desktop/src/api/image_analysis_api.rs
@@ -26,15 +26,14 @@ pub async fn analyze_images(
 
     let image_model = resolve_vision_model_from_ai_config(&ai_config).map_err(|e| {
         error!(
-            "No image understanding model available: available_models={:?}, error={}",
+            "Image understanding model resolution failed: available_models={:?}, error={}",
             ai_config.models.iter().map(|m| &m.id).collect::<Vec<_>>(),
             e
         );
         format!(
-            "Image understanding model not configured and no compatible model found.\n\n\
-             Please add a model that supports image understanding \
-             in [Settings → AI Model Config], enable 'image_understanding' capability, \
-             and assign it in [Settings → Super Agent].\n\nDetails: {}",
+            "Image understanding model is not configured.\n\n\
+             Please select a model for [Settings → Default Model Config → Image Understanding Model].\n\n\
+             Details: {}",
             e
         )
     })?;
diff --git a/src/crates/core/src/agentic/coordination/coordinator.rs b/src/crates/core/src/agentic/coordination/coordinator.rs
index f84b8cf4..811bc567 100644
--- a/src/crates/core/src/agentic/coordination/coordinator.rs
+++ b/src/crates/core/src/agentic/coordination/coordinator.rs
@@ -13,6 +13,7 @@ use crate::agentic::events::{
 use crate::agentic::execution::{ExecutionContext, ExecutionEngine};
 use crate::agentic::session::SessionManager;
 use crate::agentic::tools::pipeline::{SubagentParentInfo, ToolPipeline};
+use crate::agentic::image_analysis::ImageContextData;
 use crate::util::errors::{BitFunError, BitFunResult};
 use log::{debug, error, info, warn};
 use std::sync::Arc;
@@ -171,6 +172,36 @@ impl ConversationCoordinator {
         turn_id: Option<String>,
         agent_type: String,
         skip_tool_confirmation: bool,
+    ) -> BitFunResult<()> {
+        self.start_dialog_turn_internal(session_id, user_input, None, turn_id, agent_type)
+            .await
+    }
+
+    /// Starts a dialog turn that has image contexts attached.
+    ///
+    /// Forwards every argument to `start_dialog_turn_internal`, passing
+    /// `image_contexts` wrapped in `Some`; `start_dialog_turn` takes the same
+    /// internal path with `None`.
+    pub async fn start_dialog_turn_with_image_contexts(
+        &self,
+        session_id: String,
+        user_input: String,
+        image_contexts: Vec<ImageContextData>,
+        turn_id: Option<String>,
+        agent_type: String,
+    ) -> BitFunResult<()> {
+        let images = Some(image_contexts);
+        self.start_dialog_turn_internal(session_id, user_input, images, turn_id, agent_type)
+            .await
+    }
+
+    async fn start_dialog_turn_internal(
+        &self,
+        session_id: String,
+        user_input: String,
+        image_contexts: Option<Vec<ImageContextData>>,
+        turn_id: Option<String>,
+        agent_type: String,
     ) -> BitFunResult<()> {
         // Get latest session (re-fetch each time to ensure latest state)
         let session = self
@@ -286,7 +317,12 @@ impl ConversationCoordinator {
         // Pass frontend turnId, generate if not provided
         let turn_id = self
             .session_manager
-            .start_dialog_turn(&session_id, wrapped_user_input.clone(), turn_id)
+            .start_dialog_turn(
+                &session_id,
+                wrapped_user_input.clone(),
+                turn_id,
+                image_contexts,
+            )
             .await?;
 
         // Send dialog turn started event
diff --git a/src/crates/core/src/agentic/core/message.rs b/src/crates/core/src/agentic/core/message.rs
index 853574e8..59d75ade 100644
--- a/src/crates/core/src/agentic/core/message.rs
+++ b/src/crates/core/src/agentic/core/message.rs
@@ -1,3 +1,4 @@
+use crate::agentic::image_analysis::ImageContextData;
 use crate::util::types::{Message as AIMessage, ToolCall as AIToolCall};
 use crate::util::TokenCounter;
 use log::warn;
@@ -27,6 +28,10 @@ pub enum MessageRole {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum MessageContent {
     Text(String),
+    Multimodal {
+        text: String,
+        images: Vec<ImageContextData>,
+    },
     ToolResult {
         tool_id: String,
         tool_name: String,
@@ -92,6 +97,42 @@ impl From<Message> for AIMessage {
                     name: None,
                 }
             }
+            MessageContent::Multimodal { text, images } => {
+                let mut content = text;
+                if !images.is_empty() {
+                    content.push_str("\n\n[Attached image(s):\n");
+                    for image in images {
+                        let name = image
+                            .metadata
+                            .as_ref()
+                            .and_then(|m| m.get("name"))
+                            .and_then(|v| v.as_str())
+                            .filter(|s| !s.is_empty())
+                            .map(str::to_string)
+                            .or_else(|| {
+                                image
+                                    .image_path
+                                    .as_ref()
+                                    .filter(|s| !s.is_empty())
+                                    .cloned()
+                            })
+                            .unwrap_or_else(|| image.id.clone());
+
+                        content.push_str(&format!("- {} ({})\n", name, image.mime_type));
+                    }
+                    content.push(']');
+                }
+
+                Self {
+                    role: "user".to_string(),
+                    content: Some(content),
+                    reasoning_content: None,
+                    thinking_signature: None,
+                    tool_calls: None,
+                    tool_call_id: None,
+                    name: None,
+                }
+            }
             MessageContent::Mixed {
                 reasoning_content,
                 text,
@@ -213,6 +254,16 @@ impl Message {
         }
     }
 
+    /// Builds a user-role message whose content is `text` plus the attached `images`.
+    pub fn user_multimodal(text: String, images: Vec<ImageContextData>) -> Self {
+        let id = Uuid::new_v4().to_string();
+        let role = MessageRole::User;
+        let content = MessageContent::Multimodal { text, images };
+        let timestamp = SystemTime::now();
+        let metadata = MessageMetadata::default();
+        Self { id, role, content, timestamp, metadata }
+    }
+
     pub fn assistant(text: String) -> Self {
         Self {
             id: Uuid::new_v4().to_string(),
@@ -277,10 +328,13 @@ impl Message {
         if self.role != MessageRole::User {
             return false;
         }
-        if let MessageContent::Text(text) = &self.content {
-            if text.starts_with("<system-reminder>") {
-                return false;
-            }
+        let text = match &self.content {
+            MessageContent::Text(text) => Some(text.as_str()),
+            MessageContent::Multimodal { text, .. } => Some(text.as_str()),
+            _ => None,
+        };
+        if text.is_some_and(|t| t.starts_with("<system-reminder>")) {
+            return false;
         }
         true
     }
@@ -308,16 +362,92 @@ impl Message {
         if let Some(tokens) = self.metadata.tokens {
             return tokens;
         }
-        let tokens = TokenCounter::estimate_message_tokens(&AIMessage::from(&*self));
+        let tokens = self.estimate_tokens();
         self.metadata.tokens = Some(tokens);
         tokens
     }
+
+    /// Estimates the token cost of one image: 50 base tokens plus 200 per 512x512
+    /// tile. Missing, zero, or non-`u64` `width`/`height` metadata falls back to
+    /// 1024x1024. Sizes stay `u64` and the math saturates, so huge values cannot overflow.
+    fn estimate_image_tokens(metadata: Option<&serde_json::Value>) -> usize {
+        let dimension = |key: &str| {
+            metadata
+                .and_then(|m| m.get(key))
+                .and_then(|v| v.as_u64())
+                .filter(|&px| px > 0)
+        };
+        let (width, height) = dimension("width").zip(dimension("height")).unwrap_or((1024, 1024));
+        // Ceiling division: a partially covered tile still counts as a whole tile.
+        let tiles_per_axis = |px: u64| px.saturating_add(511) / 512;
+        let tiles = tiles_per_axis(width).saturating_mul(tiles_per_axis(height));
+        let tiles = tiles.min(usize::MAX as u64) as usize; // lossless after the clamp
+        tiles.saturating_mul(200).saturating_add(50)
+    }
+
+    /// Estimates this message's token count without using the cached value.
+    ///
+    /// Every message is charged a fixed 4 tokens on top of its content: text is
+    /// measured with `TokenCounter::estimate_tokens`, and each attached image
+    /// with `estimate_image_tokens`.
+    fn estimate_tokens(&self) -> usize {
+        let content_tokens = match &self.content {
+            MessageContent::Text(text) => TokenCounter::estimate_tokens(text),
+            MessageContent::Multimodal { text, images } => {
+                let image_tokens: usize = images
+                    .iter()
+                    .map(|image| Self::estimate_image_tokens(image.metadata.as_ref()))
+                    .sum();
+                TokenCounter::estimate_tokens(text) + image_tokens
+            }
+            MessageContent::Mixed {
+                reasoning_content,
+                text,
+                tool_calls,
+            } => {
+                // Reasoning only counts when it is kept for this message.
+                let reasoning_tokens = reasoning_content
+                    .as_ref()
+                    .filter(|_| self.metadata.keep_thinking)
+                    .map_or(0, |reasoning| TokenCounter::estimate_tokens(reasoning));
+                let tool_call_tokens: usize = tool_calls
+                    .iter()
+                    .map(|call| {
+                        // Tool name + serialized arguments (when serializable) + flat 10.
+                        let args = serde_json::to_string(&call.arguments)
+                            .map_or(0, |json| TokenCounter::estimate_tokens(&json));
+                        TokenCounter::estimate_tokens(&call.tool_name) + args + 10
+                    })
+                    .sum();
+                reasoning_tokens + TokenCounter::estimate_tokens(text) + tool_call_tokens
+            }
+            MessageContent::ToolResult {
+                tool_name,
+                result,
+                result_for_assistant,
+                ..
+            } => match result_for_assistant.as_ref().filter(|s| !s.is_empty()) {
+                // Prefer the assistant-facing text, then the JSON result, then just the name.
+                Some(text) => TokenCounter::estimate_tokens(text),
+                None => serde_json::to_string(result)
+                    .map(|json| TokenCounter::estimate_tokens(&json))
+                    .unwrap_or_else(|_| TokenCounter::estimate_tokens(tool_name)),
+            },
+        };
+
+        4 + content_tokens
+    }
 }
 
 impl ToString for MessageContent {
     fn to_string(&self) -> String {
         match self {
             MessageContent::Text(text) => text.clone(),
+            MessageContent::Multimodal { text, images } => format!(
+                "Multimodal: text_length={}, images={}",
+                text.len(),
+                images.len()
+            ),
             MessageContent::ToolResult {
                 tool_id,
                 tool_name,
diff --git a/src/crates/core/src/agentic/core/messages_helper.rs b/src/crates/core/src/agentic/core/messages_helper.rs
index 203701e1..1d4c5fc1 100644
--- a/src/crates/core/src/agentic/core/messages_helper.rs
+++ b/src/crates/core/src/agentic/core/messages_helper.rs
@@ -13,22 +13,32 @@ impl MessageHelper {
             return;
         }
         if !enable_thinking {
-            messages
-                .iter_mut()
-                .for_each(|m| m.metadata.keep_thinking = false);
+            messages.iter_mut().for_each(|m| {
+                if m.metadata.keep_thinking {
+                    m.metadata.keep_thinking = false;
+                    m.metadata.tokens = None;
+                }
+            });
         } else if support_preserved_thinking {
-            messages
-                .iter_mut()
-                .for_each(|m| m.metadata.keep_thinking = true);
+            messages.iter_mut().for_each(|m| {
+                if !m.metadata.keep_thinking {
+                    m.metadata.keep_thinking = true;
+                    m.metadata.tokens = None;
+                }
+            });
         } else {
             let last_message_turn_id = messages.last().and_then(|m| m.metadata.turn_id.clone());
             if let Some(last_turn_id) = last_message_turn_id {
                 messages.iter_mut().for_each(|m| {
-                    m.metadata.keep_thinking = m
+                    let keep_thinking = m
                         .metadata
                         .turn_id
                         .as_ref()
                         .is_some_and(|cur_turn_id| cur_turn_id == &last_turn_id);
+                    if m.metadata.keep_thinking != keep_thinking {
+                        m.metadata.keep_thinking = keep_thinking;
+                        m.metadata.tokens = None;
+                    }
                 })
             } else {
                 // Find the index of the last user message (role is user and not <system-reminder>) from back to front
@@ -38,15 +48,21 @@ impl MessageHelper {
                     // Messages from the last user message onwards are messages for this turn
                     messages.iter_mut().enumerate().for_each(|(index, m)| {
                         let keep_thinking = index >= last_user_message_index;
-                        m.metadata.keep_thinking = keep_thinking;
+                        if m.metadata.keep_thinking != keep_thinking {
+                            m.metadata.keep_thinking = keep_thinking;
+                            m.metadata.tokens = None;
+                        }
                     })
                 } else {
                     // No user message found, should not reach here in practice
                     warn!("compute_keep_thinking_flags: no user message found");
 
-                    messages
-                        .iter_mut()
-                        .for_each(|m| m.metadata.keep_thinking = false);
+                    messages.iter_mut().for_each(|m| {
+                        if m.metadata.keep_thinking {
+                            m.metadata.keep_thinking = false;
+                            m.metadata.tokens = None;
+                        }
+                    });
                 }
             }
         }
diff --git a/src/crates/core/src/agentic/execution/execution_engine.rs b/src/crates/core/src/agentic/execution/execution_engine.rs
index 70512430..61adeb16 100644
--- a/src/crates/core/src/agentic/execution/execution_engine.rs
+++ b/src/crates/core/src/agentic/execution/execution_engine.rs
@@ -5,18 +5,25 @@
 use super::round_executor::RoundExecutor;
 use super::types::{ExecutionContext, ExecutionResult, RoundContext};
 use crate::agentic::agents::get_agent_registry;
-use crate::agentic::core::{Message, MessageHelper};
+use crate::agentic::core::{Message, MessageContent, MessageHelper};
 use crate::agentic::events::{AgenticEvent, EventPriority, EventQueue};
+use crate::agentic::image_analysis::{
+    build_multimodal_message_with_images, process_image_contexts_for_provider, ImageContextData,
+    ImageLimits,
+};
 use crate::agentic::session::SessionManager;
 use crate::agentic::tools::{get_all_registered_tools, SubagentParentInfo};
 use crate::infrastructure::ai::get_global_ai_client_factory;
 use crate::infrastructure::get_workspace_path;
+use crate::service::config::get_global_config_service;
+use crate::service::config::types::{ModelCapability, ModelCategory};
 use crate::util::errors::{BitFunError, BitFunResult};
 use crate::util::token_counter::TokenCounter;
 use crate::util::types::Message as AIMessage;
 use crate::util::types::ToolDefinition;
 use log::{debug, error, info, trace, warn};
 use std::collections::HashMap;
+use std::path::Path;
 use std::sync::Arc;
 use tokio_util::sync::CancellationToken;
 
@@ -55,6 +62,146 @@ impl ExecutionEngine {
         }
     }
 
+    /// Estimates request size: the sum of every message's `get_tokens()`, a
+    /// fixed 3-token allowance, and the tool-definition estimate when tools exist.
+    fn estimate_request_tokens_internal(
+        messages: &mut [Message],
+        tools: Option<&[ToolDefinition]>,
+    ) -> usize {
+        let message_tokens: usize = messages.iter_mut().map(|m| m.get_tokens()).sum();
+        let tool_tokens = match tools {
+            Some(defs) => TokenCounter::estimate_tool_definitions_tokens(defs),
+            None => 0,
+        };
+        message_tokens + 3 + tool_tokens
+    }
+
+    /// Returns true when an image context has no usable path and no usable data
+    /// URL, yet its metadata carries `has_data_url == true` (the marker written
+    /// when an inline data URL is stripped before persistence).
+    fn is_redacted_image_context(image: &ImageContextData) -> bool {
+        // Absent and whitespace-only values are both treated as missing.
+        let path_missing = image
+            .image_path
+            .as_deref()
+            .map_or(true, |p| p.trim().is_empty());
+        let data_url_missing = image
+            .data_url
+            .as_deref()
+            .map_or(true, |u| u.trim().is_empty());
+        let redaction_flag = image
+            .metadata
+            .as_ref()
+            .and_then(|meta| meta.get("has_data_url"))
+            .and_then(|flag| flag.as_bool());
+        path_missing && data_url_missing && redaction_flag == Some(true)
+    }
+
+    /// True for I/O and deserialization errors, and for validation errors whose
+    /// message starts with one of the known image decode/format prefixes.
+    fn is_recoverable_historical_image_error(err: &BitFunError) -> bool {
+        const PREFIXES: [&str; 4] = [
+            "Failed to decode image data",
+            "Unsupported or unrecognized image format",
+            "Invalid data URL format",
+            "Data URL format error",
+        ];
+        matches!(err, BitFunError::Io(_) | BitFunError::Deserialization(_))
+            || matches!(err, BitFunError::Validation(msg) if PREFIXES.iter().any(|p| msg.starts_with(*p)))
+    }
+
+    /// Decides whether a multimodal message whose images failed to process may be
+    /// sent as plain text instead of failing the request.
+    ///
+    /// Allowed when every image is a redacted context and the error is the
+    /// "missing image_path/data_url" validation error (regardless of turn), or
+    /// when the message is not from the current turn and the error is recoverable.
+    fn can_fallback_to_text_only(
+        images: &[ImageContextData],
+        err: &BitFunError,
+        is_current_turn_message: bool,
+    ) -> bool {
+        let missing_payload = matches!(
+            err,
+            BitFunError::Validation(msg) if msg.starts_with("Image context missing image_path/data_url")
+        );
+        let all_redacted = !images.is_empty() && images.iter().all(Self::is_redacted_image_context);
+
+        // Redacted contexts have no path or data URL left, so they degrade in any turn.
+        (missing_payload && all_redacted)
+            || (!is_current_turn_message && Self::is_recoverable_historical_image_error(err))
+    }
+
+    /// Converts internal messages into the AI messages sent to `provider`.
+    ///
+    /// Multimodal messages have their images processed and are emitted through
+    /// `build_multimodal_message_with_images`; all others use `AIMessage::from`.
+    /// The image total across the whole request is capped by `ImageLimits`.
+    ///
+    /// # Errors
+    /// Image failures are returned unless `can_fallback_to_text_only` allows the
+    /// message to be sent as text; "Too many images" errors are always returned.
+    async fn build_ai_messages_for_send(
+        messages: &[Message],
+        provider: &str,
+        workspace_path: Option<&Path>,
+        current_turn_id: &str,
+    ) -> BitFunResult<Vec<AIMessage>> {
+        let limits = ImageLimits::for_provider(provider);
+        let mut result = Vec::with_capacity(messages.len());
+        let mut attached_image_count = 0usize;
+
+        for msg in messages {
+            let MessageContent::Multimodal { text, images } = &msg.content else {
+                result.push(AIMessage::from(msg));
+                continue;
+            };
+            // Image-only input still needs some prompt text.
+            let prompt = if text.trim().is_empty() {
+                "(image attached)".to_string()
+            } else {
+                text.clone()
+            };
+            let outcome =
+                process_image_contexts_for_provider(images, provider, workspace_path).await;
+            let processed = match outcome {
+                Ok(processed) => processed,
+                Err(err) => {
+                    let too_many_images = matches!(
+                        &err,
+                        BitFunError::Validation(m) if m.starts_with("Too many images in one request")
+                    );
+                    let is_current_turn_message =
+                        msg.metadata.turn_id.as_deref() == Some(current_turn_id);
+                    if too_many_images
+                        || !Self::can_fallback_to_text_only(images, &err, is_current_turn_message)
+                    {
+                        return Err(err);
+                    }
+                    // Degrade only when the fallback policy allows it (normally historical
+                    // messages). Current-turn image failures should still surface to users.
+                    warn!(
+                        "Failed to rebuild multimodal payload, falling back to text-only message: message_id={}, provider={}, turn_id={:?}, current_turn_id={}, error={}",
+                        msg.id, provider, msg.metadata.turn_id, current_turn_id, err
+                    );
+                    result.push(AIMessage::from(msg));
+                    continue;
+                }
+            };
+            let next_count = attached_image_count + processed.len();
+            if next_count > limits.max_images_per_request {
+                return Err(BitFunError::validation(format!(
+                    "Too many images in one request: {} > {}",
+                    next_count, limits.max_images_per_request
+                )));
+            }
+            attached_image_count = next_count;
+            result.extend(build_multimodal_message_with_images(&prompt, &processed, provider)?);
+        }
+
+        Ok(result)
+    }
+
     /// Compress context, will emit compression events (Started, Completed, and Failed)
     pub async fn compress_messages(
         &self,
@@ -66,7 +213,7 @@ impl ExecutionEngine {
         context_window: usize,
         tool_definitions: &Option<Vec<ToolDefinition>>,
         system_prompt_message: Message,
-    ) -> BitFunResult<Option<(usize, Vec<Message>, Vec<AIMessage>)>> {
+    ) -> BitFunResult<Option<(usize, Vec<Message>)>> {
         let event_subagent_parent_info = subagent_parent_info.map(|info| info.clone().into());
         let mut session = self
             .session_manager
@@ -134,10 +281,8 @@ impl ExecutionEngine {
                 let duration_ms = start_time.elapsed().as_millis() as u64;
 
                 // Recalculate tokens after compression
-                let new_ai_messages: Vec<AIMessage> =
-                    MessageHelper::convert_messages(&new_messages);
-                let compressed_tokens = TokenCounter::estimate_request_tokens(
-                    &new_ai_messages,
+                let compressed_tokens = Self::estimate_request_tokens_internal(
+                    &mut new_messages,
                     tool_definitions.as_deref(),
                 );
 
@@ -159,7 +304,7 @@ impl ExecutionEngine {
                 )
                 .await;
 
-                Ok(Some((compressed_tokens, new_messages, new_ai_messages)))
+                Ok(Some((compressed_tokens, new_messages)))
             }
             Err(e) => {
                 // Emit compression failed event
@@ -353,6 +498,83 @@ impl ExecutionEngine {
         let support_preserved_thinking = ai_client.config.support_preserved_thinking;
         let context_window = ai_client.config.context_window as usize;
 
+        // Detect whether the primary model supports multimodal image inputs.
+        // This is used by tools like `view_image` to decide between:
+        // - attaching image content for the primary model to analyze directly, or
+        // - using a dedicated vision model to pre-analyze into text.
+        let (resolved_primary_model_id, primary_supports_image_understanding) = {
+            let config_service = get_global_config_service().await.ok();
+            if let Some(service) = config_service {
+                let ai_config: crate::service::config::types::AIConfig =
+                    service.get_config(Some("ai")).await.unwrap_or_default();
+
+                let resolved_id = match model_id.as_str() {
+                    "primary" => ai_config
+                        .default_models
+                        .primary
+                        .clone()
+                        .unwrap_or_else(|| model_id.clone()),
+                    "fast" => ai_config
+                        .default_models
+                        .fast
+                        .clone()
+                        .or_else(|| ai_config.default_models.primary.clone())
+                        .unwrap_or_else(|| model_id.clone()),
+                    _ => model_id.clone(),
+                };
+
+                let model_cfg = ai_config
+                    .models
+                    .iter()
+                    .find(|m| m.id == resolved_id)
+                    .or_else(|| ai_config.models.iter().find(|m| m.name == resolved_id))
+                    .or_else(|| {
+                        ai_config
+                            .models
+                            .iter()
+                            .find(|m| m.model_name == resolved_id)
+                    })
+                    .or_else(|| {
+                        ai_config.models.iter().find(|m| {
+                            m.model_name == ai_client.config.model
+                                && m.provider == ai_client.config.format
+                        })
+                    });
+
+                let supports = model_cfg.is_some_and(|m| {
+                    m.capabilities
+                        .iter()
+                        .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding))
+                        || matches!(m.category, ModelCategory::Multimodal)
+                });
+
+                (resolved_id, supports)
+            } else {
+                warn!(
+                    "Config service unavailable, assuming primary model is text-only for image input gating"
+                );
+                (model_id.clone(), false)
+            }
+        };
+
+        let mut execution_context_vars = context.context.clone();
+        execution_context_vars.insert(
+            "primary_model_id".to_string(),
+            resolved_primary_model_id.clone(),
+        );
+        execution_context_vars.insert(
+            "primary_model_name".to_string(),
+            ai_client.config.model.clone(),
+        );
+        execution_context_vars.insert(
+            "primary_model_provider".to_string(),
+            ai_client.config.format.clone(),
+        );
+        execution_context_vars.insert(
+            "primary_model_supports_image_understanding".to_string(),
+            primary_supports_image_understanding.to_string(),
+        );
+
         // Loop to execute model rounds
         loop {
             // Check round limit
@@ -369,11 +591,10 @@ impl ExecutionEngine {
                 enable_thinking,
                 support_preserved_thinking,
             );
-            let mut ai_messages = MessageHelper::convert_messages(&messages);
 
             // Check and compress before sending AI request
             let current_tokens =
-                TokenCounter::estimate_request_tokens(&ai_messages, tool_definitions.as_deref());
+                Self::estimate_request_tokens_internal(&mut messages, tool_definitions.as_deref());
             debug!(
                 "Round {} token usage before send: {} / {} tokens ({:.1}%)",
                 round_index,
@@ -414,7 +635,7 @@ impl ExecutionEngine {
                     )
                     .await
                 {
-                    Ok(Some((compressed_tokens, compressed_messages, compressed_ai_messages))) => {
+                    Ok(Some((compressed_tokens, compressed_messages))) => {
                         info!(
                             "Round {} compression completed: messages {} -> {}, tokens {} -> {}",
                             round_index,
@@ -425,7 +646,6 @@ impl ExecutionEngine {
                         );
 
                         messages = compressed_messages;
-                        ai_messages = compressed_ai_messages;
                     }
                     Ok(None) => {
                         debug!("All turns need to be kept, no compression performed");
@@ -440,7 +660,7 @@ impl ExecutionEngine {
             }
 
             // Create round context
-            let mut round_context_vars = context.context.clone();
+            let mut round_context_vars = execution_context_vars.clone();
             if context.skip_tool_confirmation {
                 round_context_vars.insert("skip_tool_confirmation".to_string(), "true".to_string());
             }
@@ -452,11 +672,7 @@ impl ExecutionEngine {
                 round_number: round_index,
                 messages: messages.clone(),
                 available_tools: available_tools.clone(),
-                model_name: context
-                    .context
-                    .get("model_name")
-                    .cloned()
-                    .unwrap_or_else(|| "default".to_string()),
+                model_name: ai_client.config.model.clone(),
                 agent_type: agent_type.clone(),
                 context_vars: round_context_vars,
                 cancellation_token: CancellationToken::new(),
@@ -469,6 +685,15 @@ impl ExecutionEngine {
                 messages.len()
             );
 
+            let workspace_path = get_workspace_path();
+            let ai_messages = Self::build_ai_messages_for_send(
+                &messages,
+                &ai_client.config.format,
+                workspace_path.as_deref(),
+                &context.dialog_turn_id,
+            )
+            .await?;
+
             let round_result = self
                 .round_executor
                 .execute_round(
diff --git a/src/crates/core/src/agentic/execution/round_executor.rs b/src/crates/core/src/agentic/execution/round_executor.rs
index 9a0a8c84..951d31d5 100644
--- a/src/crates/core/src/agentic/execution/round_executor.rs
+++ b/src/crates/core/src/agentic/execution/round_executor.rs
@@ -6,6 +6,7 @@ use super::stream_processor::StreamProcessor;
 use super::types::{FinishReason, RoundContext, RoundResult};
 use crate::agentic::core::Message;
 use crate::agentic::events::{AgenticEvent, EventPriority, EventQueue};
+use crate::agentic::image_analysis::ImageContextData as ModelImageContextData;
 use crate::agentic::tools::pipeline::{ToolExecutionContext, ToolExecutionOptions, ToolPipeline};
 use crate::agentic::tools::registry::get_global_tool_registry;
 use crate::agentic::MessageContent;
@@ -16,6 +17,7 @@ use crate::util::types::Message as AIMessage;
 use crate::util::types::ToolDefinition;
 use dashmap::DashMap;
 use log::{debug, error, warn};
+use serde_json::Value as JsonValue;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
@@ -455,7 +457,32 @@ impl RoundExecutor {
         // Create tool result messages (also need to set turn_id and round_id)
         let dialog_turn_id = context.dialog_turn_id.clone();
         let round_id_clone = round_id.clone();
-        let tool_result_messages: Vec<Message> = tool_results
+        let primary_supports_images = context
+            .context_vars
+            .get("primary_model_supports_image_understanding")
+            .and_then(|v| v.parse::<bool>().ok())
+            .unwrap_or(false);
+        let extract_attached_image = |result: &JsonValue| -> Option<ModelImageContextData> {
+            if !primary_supports_images {
+                return None;
+            }
+            let mode = result.get("mode").and_then(|v| v.as_str())?;
+            if mode != "attached_to_primary_model" {
+                return None;
+            }
+            let image_value = result.get("image")?;
+            serde_json::from_value::<ModelImageContextData>(image_value.clone()).ok()
+        };
+        let mut injected_images = Vec::new();
+        for result in &tool_results {
+            if result.tool_name == "view_image" && !result.is_error {
+                if let Some(image_ctx) = extract_attached_image(&result.result) {
+                    injected_images.push(image_ctx);
+                }
+            }
+        }
+
+        let mut tool_result_messages: Vec<Message> = tool_results
             .into_iter()
             .map(|result| {
                 Message::tool_result(result)
@@ -464,6 +491,18 @@ impl RoundExecutor {
             })
             .collect();
 
+        if !injected_images.is_empty() {
+            let reminder_text = format!(
+                "<system-reminder>\nAttached {} image(s) from view_image tool.\n</system-reminder>",
+                injected_images.len()
+            );
+            tool_result_messages.push(
+                Message::user_multimodal(reminder_text, injected_images)
+                    .with_turn_id(dialog_turn_id.clone())
+                    .with_round_id(round_id_clone.clone()),
+            );
+        }
+
         let has_more_rounds = !has_end_turn_tool && !tool_result_messages.is_empty();
 
         debug!(
diff --git a/src/crates/core/src/agentic/image_analysis/image_processing.rs b/src/crates/core/src/agentic/image_analysis/image_processing.rs
index 88d2184d..d4cd763e 100644
--- a/src/crates/core/src/agentic/image_analysis/image_processing.rs
+++ b/src/crates/core/src/agentic/image_analysis/image_processing.rs
@@ -1,8 +1,10 @@
 //! Shared image processing utilities used by both API-side image analysis and tool-driven image analysis.
 
-use super::types::ImageLimits;
+use super::types::{ImageContextData, ImageLimits};
 use crate::service::config::get_global_config_service;
-use crate::service::config::types::{AIConfig as ServiceAIConfig, AIModelConfig, ModelCapability};
+use crate::service::config::types::{
+    AIConfig as ServiceAIConfig, AIModelConfig, ModelCapability, ModelCategory,
+};
 use crate::util::errors::{BitFunError, BitFunResult};
 use crate::util::types::Message;
 use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _};
@@ -31,34 +33,41 @@ pub fn resolve_vision_model_from_ai_config(
     let target_model_id = ai_config
         .default_models
         .image_understanding
-        .as_ref()
+        .as_deref()
+        .map(str::trim)
         .filter(|id| !id.is_empty());
 
-    if let Some(id) = target_model_id {
-        return ai_config
-            .models
-            .iter()
-            .find(|m| m.id == *id)
-            .cloned()
-            .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id)));
-    }
+    let Some(id) = target_model_id else {
+        return Err(BitFunError::service(
+            "Image understanding model is not configured.\nPlease select a model in Settings."
+                .to_string(),
+        ));
+    };
 
-    ai_config
+    let model = ai_config
         .models
         .iter()
-        .find(|m| {
-            m.enabled
-                && m.capabilities
-                    .iter()
-                    .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding))
-        })
+        .find(|m| m.id == id)
         .cloned()
-        .ok_or_else(|| {
-            BitFunError::service(
-                "No image understanding model found.\nPlease configure an image understanding model in settings"
-                    .to_string(),
-            )
-        })
+        .ok_or_else(|| BitFunError::service(format!("Model not found: {}", id)))?;
+
+    if !model.enabled {
+        return Err(BitFunError::service(format!("Model is disabled: {}", id)));
+    }
+
+    let supports_image_understanding = model
+        .capabilities
+        .iter()
+        .any(|cap| matches!(cap, ModelCapability::ImageUnderstanding))
+        || matches!(model.category, ModelCategory::Multimodal);
+    if !supports_image_understanding {
+        return Err(BitFunError::service(format!(
+            "Model does not support image understanding: {}",
+            id
+        )));
+    }
+
+    Ok(model)
 }
 
 pub async fn resolve_vision_model_from_global_config() -> BitFunResult<AIModelConfig> {
@@ -275,6 +284,105 @@ pub fn build_multimodal_message(
     Ok(vec![message])
 }
 
+/// Loads and optimizes each image context for `provider`.
+///
+/// A non-blank `data_url` takes precedence over `image_path`; blank values are
+/// treated as absent so an empty `data_url` cannot shadow a valid path.
+pub async fn process_image_contexts_for_provider(
+    image_contexts: &[ImageContextData],
+    provider: &str,
+    workspace_path: Option<&Path>,
+) -> BitFunResult<Vec<ProcessedImage>> {
+    let limits = ImageLimits::for_provider(provider);
+    if image_contexts.len() > limits.max_images_per_request {
+        return Err(BitFunError::validation(format!(
+            "Too many images in one request: {} > {}",
+            image_contexts.len(),
+            limits.max_images_per_request
+        )));
+    }
+
+    let mut results = Vec::with_capacity(image_contexts.len());
+    for ctx in image_contexts {
+        let data_url = ctx.data_url.as_ref().filter(|s| !s.trim().is_empty());
+        let image_path = ctx.image_path.as_ref().filter(|s| !s.trim().is_empty());
+        let (image_data, fallback_mime) = if let Some(data_url) = data_url {
+            let (data, data_url_mime) = decode_data_url(data_url)?;
+            (data, data_url_mime.or_else(|| Some(ctx.mime_type.clone())))
+        } else if let Some(path_str) = image_path {
+            let path = resolve_image_path(path_str, workspace_path)?;
+            let data = load_image_from_path(&path, workspace_path).await?;
+            let detected_mime = detect_mime_type_from_bytes(&data, Some(&ctx.mime_type)).ok();
+            (data, detected_mime.or_else(|| Some(ctx.mime_type.clone())))
+        } else {
+            return Err(BitFunError::validation(format!(
+                "Image context missing image_path/data_url: id={}",
+                ctx.id
+            )));
+        };
+        results.push(optimize_image_for_provider(image_data, provider, fallback_mime.as_deref())?);
+    }
+    Ok(results)
+}
+
+/// Builds one user message carrying `images` followed by `prompt`.
+///
+/// With no images the result is a plain text user message. Otherwise the
+/// content is a JSON array serialized to a string: one block per image, then a
+/// text block. Providers whose name contains "anthropic" (case-insensitive)
+/// get `image` blocks with a base64 source; every other provider gets
+/// `image_url` blocks holding a `data:` URL.
+///
+/// # Errors
+/// Propagates a failure to serialize the content blocks.
+pub fn build_multimodal_message_with_images(
+    prompt: &str,
+    images: &[ProcessedImage],
+    provider: &str,
+) -> BitFunResult<Vec<Message>> {
+    if images.is_empty() {
+        return Ok(vec![Message::user(prompt.to_string())]);
+    }
+
+    let is_anthropic = provider.to_lowercase().contains("anthropic");
+    let image_block = |img: &ProcessedImage| {
+        let base64_data = BASE64.encode(&img.data);
+        if is_anthropic {
+            json!({
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": img.mime_type,
+                    "data": base64_data
+                }
+            })
+        } else {
+            json!({
+                "type": "image_url",
+                "image_url": {
+                    "url": format!("data:{};base64,{}", img.mime_type, base64_data)
+                }
+            })
+        }
+    };
+
+    // Image blocks first, then the prompt as the trailing text block.
+    let mut blocks = Vec::with_capacity(images.len() + 1);
+    blocks.extend(images.iter().map(image_block));
+    blocks.push(json!({ "type": "text", "text": prompt }));
+    let content_json = json!(blocks);
+
+    Ok(vec![Message {
+        role: "user".to_string(),
+        content: Some(serde_json::to_string(&content_json)?),
+        reasoning_content: None,
+        thinking_signature: None,
+        tool_calls: None,
+        tool_call_id: None,
+        name: None,
+    }])
+}
+
 fn image_format_to_mime(format: ImageFormat) -> Option<&'static str> {
     match format {
         ImageFormat::Png => Some("image/png"),
diff --git a/src/crates/core/src/agentic/image_analysis/mod.rs b/src/crates/core/src/agentic/image_analysis/mod.rs
index 814afb66..4ba156f8 100644
--- a/src/crates/core/src/agentic/image_analysis/mod.rs
+++ b/src/crates/core/src/agentic/image_analysis/mod.rs
@@ -10,8 +10,9 @@ pub mod types;
 pub use enhancer::MessageEnhancer;
 pub use image_processing::{
     build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path,
-    optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_ai_config,
-    resolve_vision_model_from_global_config, ProcessedImage,
+    optimize_image_for_provider, process_image_contexts_for_provider, resolve_image_path,
+    resolve_vision_model_from_ai_config, resolve_vision_model_from_global_config,
+    build_multimodal_message_with_images, ProcessedImage,
 };
 pub use processor::ImageAnalyzer;
 pub use types::*;
diff --git a/src/crates/core/src/agentic/persistence/manager.rs b/src/crates/core/src/agentic/persistence/manager.rs
index 57718554..7fd79015 100644
--- a/src/crates/core/src/agentic/persistence/manager.rs
+++ b/src/crates/core/src/agentic/persistence/manager.rs
@@ -2,7 +2,7 @@
 //!
 //! Responsible for persistent storage of sessions, messages, and tool states
 
-use crate::agentic::core::{DialogTurn, Message, Session, SessionState, SessionSummary};
+use crate::agentic::core::{DialogTurn, Message, MessageContent, Session, SessionState, SessionSummary};
 use crate::infrastructure::PathManager;
 use crate::util::errors::{BitFunError, BitFunResult};
 use log::{debug, info, warn};
@@ -46,6 +46,65 @@ impl PersistenceManager {
         Ok(dir)
     }
 
+    /// Returns sanitized copies of `messages`, in the same order.
+    fn sanitize_messages_for_persistence(messages: &[Message]) -> Vec<Message> {
+        let mut sanitized = Vec::with_capacity(messages.len());
+        sanitized.extend(messages.iter().map(Self::sanitize_message_for_persistence));
+        sanitized
+    }
+
+    /// Clones `message` and strips inline image payloads from the copy.
+    ///
+    /// Multimodal images lose any non-empty `data_url` and gain
+    /// `"has_data_url": true` in their metadata; tool results are scrubbed
+    /// through `redact_data_url_in_json`. Other content is cloned unchanged.
+    fn sanitize_message_for_persistence(message: &Message) -> Message {
+        let mut sanitized = message.clone();
+        match &mut sanitized.content {
+            MessageContent::Multimodal { images, .. } => {
+                for image in images.iter_mut() {
+                    if !image.data_url.as_ref().is_some_and(|v| !v.is_empty()) {
+                        continue;
+                    }
+                    image.data_url = None;
+                    // Existing non-object metadata is preserved under "raw_metadata".
+                    let mut fields = match image.metadata.take() {
+                        Some(serde_json::Value::Object(map)) => map,
+                        Some(other) => {
+                            std::iter::once(("raw_metadata".to_string(), other)).collect()
+                        }
+                        None => serde_json::Map::new(),
+                    };
+                    fields.insert("has_data_url".to_string(), serde_json::json!(true));
+                    image.metadata = Some(serde_json::Value::Object(fields));
+                }
+            }
+            MessageContent::ToolResult { result, .. } => Self::redact_data_url_in_json(result),
+            _ => {}
+        }
+        sanitized
+    }
+
+    /// Recursively removes every `"data_url"` key from a JSON tree.
+    ///
+    /// Each object that had the key gets `"has_data_url": true` instead; arrays
+    /// and nested objects are walked in full, scalars are left untouched.
+    fn redact_data_url_in_json(value: &mut serde_json::Value) {
+        if let Some(items) = value.as_array_mut() {
+            items.iter_mut().for_each(Self::redact_data_url_in_json);
+            return;
+        }
+        let Some(fields) = value.as_object_mut() else {
+            return;
+        };
+
+        if fields.remove("data_url").is_some() {
+            fields.insert("has_data_url".to_string(), serde_json::json!(true));
+        }
+        // The inserted marker is a boolean, so recursing into it changes nothing.
+        fields.values_mut().for_each(Self::redact_data_url_in_json);
+    }
+
     // ============ Turn context snapshot (sent to model)============
 
     fn context_snapshots_dir(&self, session_id: &str) -> PathBuf {
@@ -70,7 +129,8 @@ impl PersistenceManager {
             .map_err(|e| BitFunError::io(format!("Failed to create context_snapshots directory: {}", e)))?;
 
         let snapshot_path = self.context_snapshot_path(session_id, turn_index);
-        let json = serde_json::to_string(messages).map_err(|e| {
+        let sanitized_messages = Self::sanitize_messages_for_persistence(messages);
+        let json = serde_json::to_string(&sanitized_messages).map_err(|e| {
             BitFunError::serialization(format!("Failed to serialize turn context snapshot: {}", e))
         })?;
         fs::write(&snapshot_path, json)
@@ -312,7 +372,8 @@ impl PersistenceManager {
         let dir = self.ensure_session_dir(session_id).await?;
         let messages_path = dir.join("messages.jsonl");
 
-        let json = serde_json::to_string(message)
+        let sanitized_message = Self::sanitize_message_for_persistence(message);
+        let json = serde_json::to_string(&sanitized_message)
             .map_err(|e| BitFunError::serialization(format!("Failed to serialize message: {}", e)))?;
 
         let mut file = fs::OpenOptions::new()
@@ -397,7 +458,8 @@ impl PersistenceManager {
         let dir = self.ensure_session_dir(session_id).await?;
         let compressed_path = dir.join("compressed_messages.jsonl");
 
-        let json = serde_json::to_string(message)
+        let sanitized_message = Self::sanitize_message_for_persistence(message);
+        let json = serde_json::to_string(&sanitized_message)
             .map_err(|e| BitFunError::serialization(format!("Failed to serialize compressed message: {}", e)))?;
 
         let mut file = fs::OpenOptions::new()
@@ -435,8 +497,10 @@ impl PersistenceManager {
             .await
             .map_err(|e| BitFunError::io(format!("Failed to open compressed message file: {}", e)))?;
 
+        let sanitized_messages = Self::sanitize_messages_for_persistence(messages);
+
         // Write all messages
-        for message in messages {
+        for message in &sanitized_messages {
             let json = serde_json::to_string(message)
                 .map_err(|e| BitFunError::serialization(format!("Failed to serialize compressed message: {}", e)))?;
 
diff --git a/src/crates/core/src/agentic/session/session_manager.rs b/src/crates/core/src/agentic/session/session_manager.rs
index f26042ce..689d8b02 100644
--- a/src/crates/core/src/agentic/session/session_manager.rs
+++ b/src/crates/core/src/agentic/session/session_manager.rs
@@ -6,6 +6,7 @@ use crate::agentic::core::{
     CompressionState, DialogTurn, DialogTurnState, Message, ProcessingPhase, Session,
     SessionConfig, SessionState, SessionSummary, TurnStats,
 };
+use crate::agentic::image_analysis::ImageContextData;
 use crate::agentic::persistence::PersistenceManager;
 use crate::agentic::session::{CompressionManager, MessageHistoryManager};
 use crate::infrastructure::ai::get_global_ai_client_factory;
@@ -463,6 +464,7 @@ impl SessionManager {
         session_id: &str,
         user_input: String,
         turn_id: Option<String>,
+        image_contexts: Option<Vec<ImageContextData>>,
     ) -> BitFunResult<String> {
         // Check if session exists
         let session = self
@@ -491,7 +493,13 @@ impl SessionManager {
         }
 
         // 2. Add user message to history and compression managers
-        let user_message = Message::user(user_input).with_turn_id(turn_id.clone());
+        let user_message = if let Some(images) =
+            image_contexts.as_ref().filter(|v| !v.is_empty()).cloned()
+        {
+            Message::user_multimodal(user_input, images).with_turn_id(turn_id.clone())
+        } else {
+            Message::user(user_input).with_turn_id(turn_id.clone())
+        };
         self.history_manager
             .add_message(session_id, user_message.clone())
             .await?;
diff --git a/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs
index cbd59b2f..20ff31bc 100644
--- a/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs
+++ b/src/crates/core/src/agentic/tools/implementations/view_image_tool.rs
@@ -5,13 +5,17 @@
 //! that can evolve toward direct multimodal attachment in the future.
 
 use async_trait::async_trait;
+use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _};
+use image::GenericImageView;
 use log::{debug, info, trace};
 use serde::Deserialize;
 use serde_json::{json, Value};
+use uuid::Uuid;
 
 use crate::agentic::image_analysis::{
     build_multimodal_message, decode_data_url, detect_mime_type_from_bytes, load_image_from_path,
     optimize_image_for_provider, resolve_image_path, resolve_vision_model_from_global_config,
+    ImageContextData as ModelImageContextData,
 };
 use crate::agentic::tools::framework::{
     Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult,
@@ -43,6 +47,26 @@ impl ViewImageTool {
         Self
     }
 
+    /// Reads the primary model's image-understanding flag from the tool options' custom data.
+    /// Falls back to `false` when the options, the custom data, or a boolean flag are missing.
+    fn primary_model_supports_images(context: &ToolUseContext) -> bool {
+        let custom_data = context.options.as_ref().and_then(|o| o.custom_data.as_ref());
+        let flag = custom_data
+            .and_then(|m| m.get("primary_model_supports_image_understanding"))
+            .and_then(|v| v.as_bool());
+        flag.unwrap_or(false)
+    }
+
+    /// Returns the primary model's provider name from the tool options' custom data.
+    ///
+    /// Yields `None` when the entry is missing, is not a string, or is the empty string.
+    fn primary_model_provider(context: &ToolUseContext) -> Option<&str> {
+        let custom_data = context.options.as_ref().and_then(|o| o.custom_data.as_ref())?;
+        let provider = custom_data.get("primary_model_provider")?.as_str()?;
+        // An empty provider string is treated the same as a missing one.
+        Some(provider).filter(|name| !name.is_empty())
+    }
+
     fn build_prompt(
         &self,
         analysis_prompt: Option<&str>,
@@ -80,6 +104,181 @@ impl ViewImageTool {
         prompt
     }
 
+    async fn build_attachment_image_context( // builds the image payload for direct attachment plus a source description
+        &self,
+        input_data: &ViewImageInput,
+        context: &ToolUseContext, // only consulted for its image_context_provider (image_id mode)
+        primary_provider: &str, // forwarded to optimize_image_for_provider for data-URL sources
+    ) -> BitFunResult<(ModelImageContextData, String)> {
+        let workspace_path = get_workspace_path();
+
+        if let Some(image_id) = &input_data.image_id { // source 1: image referenced by id
+            let provider = context.image_context_provider.as_ref().ok_or_else(|| {
+                BitFunError::tool(
+                    "image_id mode requires ImageContextProvider support, but no provider was injected.\n\
+                     Please inject image_context_provider when calling the tool, or use image_path/data_url mode."
+                        .to_string(),
+                )
+            })?;
+
+            let ctx = provider.get_image(image_id).ok_or_else(|| {
+                BitFunError::tool(format!(
+                    "Image context not found: image_id={}. Image may have expired (5-minute validity) or was never uploaded.",
+                    image_id
+                ))
+            })?;
+
+            let crate::agentic::tools::image_context::ImageContextData { // destructure so fields can be moved individually
+                id: ctx_id,
+                image_path: ctx_image_path,
+                data_url: ctx_data_url,
+                mime_type: ctx_mime_type,
+                image_name: ctx_image_name,
+                file_size: ctx_file_size,
+                width: ctx_width,
+                height: ctx_height,
+                source: ctx_source,
+            } = ctx;
+
+            let description = format!("{} (clipboard)", ctx_image_name);
+
+            if let Some(path_str) = ctx_image_path.as_ref().filter(|s| !s.is_empty()) { // a non-empty stored path wins over a stored data URL
+                let path = resolve_image_path(path_str, workspace_path.as_deref())?;
+                let metadata = json!({
+                    "name": ctx_image_name,
+                    "width": ctx_width,
+                    "height": ctx_height,
+                    "file_size": ctx_file_size,
+                    "source": ctx_source,
+                    "origin": "image_id",
+                    "image_id": ctx_id.clone(),
+                });
+
+                return Ok((
+                    ModelImageContextData {
+                        id: ctx_id,
+                        image_path: Some(path.display().to_string()), // resolved path is forwarded; no optimization step on this branch
+                        data_url: None,
+                        mime_type: ctx_mime_type,
+                        metadata: Some(metadata),
+                    },
+                    description,
+                ));
+            }
+
+            if let Some(data_url) = ctx_data_url.as_ref().filter(|s| !s.is_empty()) { // otherwise use the stored data URL
+                let (data, data_url_mime) = decode_data_url(data_url)?;
+                let fallback_mime = data_url_mime
+                    .as_deref()
+                    .or_else(|| Some(ctx_mime_type.as_str())); // MIME from the URL first, stored context MIME second
+                let processed =
+                    optimize_image_for_provider(data, primary_provider, fallback_mime)?;
+                let optimized_data_url = format!( // rebuild the data URL from the optimized bytes
+                    "data:{};base64,{}",
+                    processed.mime_type,
+                    BASE64.encode(&processed.data)
+                );
+
+                let metadata = json!({ // width/height/file_size describe the optimized image
+                    "name": ctx_image_name,
+                    "width": processed.width,
+                    "height": processed.height,
+                    "file_size": processed.data.len(),
+                    "source": ctx_source,
+                    "origin": "image_id",
+                    "image_id": ctx_id.clone(),
+                });
+
+                return Ok((
+                    ModelImageContextData {
+                        id: ctx_id,
+                        image_path: None,
+                        data_url: Some(optimized_data_url),
+                        mime_type: processed.mime_type,
+                        metadata: Some(metadata),
+                    },
+                    description,
+                ));
+            }
+
+            return Err(BitFunError::tool(format!( // the stored context carries no usable image source
+                "Image context {} has neither data_url nor image_path",
+                image_id
+            )));
+        }
+
+        if let Some(data_url) = &input_data.data_url { // source 2: inline data URL from the tool input
+            let (data, data_url_mime) = decode_data_url(data_url)?;
+            let processed =
+                optimize_image_for_provider(data, primary_provider, data_url_mime.as_deref())?;
+            let optimized_data_url = format!(
+                "data:{};base64,{}",
+                processed.mime_type,
+                BASE64.encode(&processed.data)
+            );
+            let metadata = json!({
+                "name": "clipboard_image",
+                "width": processed.width,
+                "height": processed.height,
+                "file_size": processed.data.len(),
+                "source": "data_url",
+                "origin": "data_url"
+            });
+
+            return Ok((
+                ModelImageContextData {
+                    id: format!("img-view-{}", Uuid::new_v4()), // no existing id for this source, so generate one
+                    image_path: None,
+                    data_url: Some(optimized_data_url),
+                    mime_type: processed.mime_type,
+                    metadata: Some(metadata),
+                },
+                "clipboard_image".to_string(),
+            ));
+        }
+
+        if let Some(image_path_str) = &input_data.image_path { // source 3: image file path from the tool input
+            let abs_path = resolve_image_path(image_path_str, workspace_path.as_deref())?;
+            let data = load_image_from_path(&abs_path, workspace_path.as_deref()).await?;
+
+            let mime_type = detect_mime_type_from_bytes(&data, None)?;
+            let dynamic = image::load_from_memory(&data).map_err(|e| { // decoded here only to obtain the dimensions
+                BitFunError::validation(format!("Failed to decode image data: {}", e))
+            })?;
+            let (width, height) = dynamic.dimensions();
+
+            let name = abs_path
+                .file_name()
+                .and_then(|s| s.to_str())
+                .unwrap_or("image") // used when there is no file name or it is not valid UTF-8
+                .to_string();
+
+            let metadata = json!({
+                "name": name,
+                "width": width,
+                "height": height,
+                "file_size": data.len(),
+                "source": "local_path",
+                "origin": "image_path"
+            });
+
+            return Ok((
+                ModelImageContextData {
+                    id: format!("img-view-{}", Uuid::new_v4()),
+                    image_path: Some(abs_path.display().to_string()), // the path is attached; the loaded bytes are not embedded
+                    data_url: None,
+                    mime_type,
+                    metadata: Some(metadata),
+                },
+                abs_path.display().to_string(),
+            ));
+        }
+
+        Err(BitFunError::validation( // none of the three image sources was supplied
+            "Must provide one of image_path, data_url, or image_id",
+        ))
+    }
+
     async fn load_source(
         &self,
         input_data: &ViewImageInput,
@@ -156,8 +355,8 @@ impl Tool for ViewImageTool {
 Use this tool when the user provides an image (file path, data URL, or uploaded clipboard image_id) and asks questions about it.
 
 Current behavior:
-- For text-only primary models, this tool converts image content to structured text.
-- For multimodal-capable setups, this interface can be extended to direct image attachment in future.
+- For text-only primary models, this tool converts image content to structured text (uses the configured image understanding model).
+- For multimodal primary models, this tool attaches the image for the primary model to analyze directly.
 
 Parameters:
 - image_path / data_url / image_id: provide one image source
@@ -319,6 +518,28 @@ Parameters:
         let input_data: ViewImageInput = serde_json::from_value(input.clone())
             .map_err(|e| BitFunError::parse(format!("Failed to parse input: {}", e)))?;
 
+        let primary_provider = Self::primary_model_provider(context).unwrap_or("openai");
+        if Self::primary_model_supports_images(context) {
+            let (image, image_source_description) = self
+                .build_attachment_image_context(&input_data, context, primary_provider)
+                .await?;
+
+            let result_for_assistant = format!(
+                "Image attached for primary model analysis ({})",
+                image_source_description
+            );
+
+            return Ok(vec![ToolResult::Result {
+                data: json!({
+                    "success": true,
+                    "mode": "attached_to_primary_model",
+                    "image_source": image_source_description,
+                    "image": image,
+                }),
+                result_for_assistant: Some(result_for_assistant),
+            }]);
+        }
+
         let (image_data, fallback_mime, image_source_description) =
             self.load_source(&input_data, context).await?;
 
diff --git a/src/crates/core/src/agentic/tools/pipeline/state_manager.rs b/src/crates/core/src/agentic/tools/pipeline/state_manager.rs
index 67d99022..c5d52854 100644
--- a/src/crates/core/src/agentic/tools/pipeline/state_manager.rs
+++ b/src/crates/core/src/agentic/tools/pipeline/state_manager.rs
@@ -19,6 +19,32 @@ pub struct ToolStateManager {
 }
 
 impl ToolStateManager {
+    fn sanitize_tool_result_for_event(result: &serde_json::Value) -> serde_json::Value {
+        let mut event_safe = result.to_owned(); // redact a copy; the caller's value is untouched
+        Self::redact_data_url_in_json(&mut event_safe); // strips `data_url` at every nesting level
+        event_safe
+    }
+
+    /// Recursively removes every `data_url` entry from `value`, marking each object that
+    /// carried a real payload with `has_data_url: true`. A `null` `data_url` (for instance
+    /// an unset optional field) is dropped without the marker: nothing was redacted.
+    fn redact_data_url_in_json(value: &mut serde_json::Value) {
+        match value {
+            serde_json::Value::Object(map) => {
+                let removed_payload =
+                    matches!(map.remove("data_url"), Some(v) if !v.is_null());
+                if removed_payload {
+                    map.insert("has_data_url".to_string(), serde_json::json!(true));
+                }
+                map.values_mut().for_each(Self::redact_data_url_in_json);
+            }
+            serde_json::Value::Array(items) => {
+                items.iter_mut().for_each(Self::redact_data_url_in_json);
+            }
+            _ => {}
+        }
+    }
+
     pub fn new(event_queue: Arc<EventQueue>) -> Self {
         Self {
             tasks: Arc::new(DashMap::new()),
@@ -156,7 +182,7 @@ impl ToolStateManager {
             ToolExecutionState::Completed { result, duration_ms } => ToolEventData::Completed {
                 tool_id: task.tool_call.tool_id.clone(),
                 tool_name: task.tool_call.tool_name.clone(),
-                result: result.content(),
+                result: Self::sanitize_tool_result_for_event(&result.content()),
                 duration_ms: *duration_ms,
             },
             
diff --git a/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs b/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs
index 3ea85d7f..79db4dd7 100644
--- a/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs
+++ b/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs
@@ -699,6 +699,40 @@ impl ToolPipeline {
                             map.insert("turn_index".to_string(), serde_json::json!(n));
                         }
                     }
+
+                    if let Some(provider) = task.context.context_vars.get("primary_model_provider") {
+                        if !provider.is_empty() {
+                            map.insert(
+                                "primary_model_provider".to_string(),
+                                serde_json::json!(provider),
+                            );
+                        }
+                    }
+                    if let Some(model_id) = task.context.context_vars.get("primary_model_id") {
+                        if !model_id.is_empty() {
+                            map.insert("primary_model_id".to_string(), serde_json::json!(model_id));
+                        }
+                    }
+                    if let Some(model_name) = task.context.context_vars.get("primary_model_name") {
+                        if !model_name.is_empty() {
+                            map.insert(
+                                "primary_model_name".to_string(),
+                                serde_json::json!(model_name),
+                            );
+                        }
+                    }
+                    if let Some(supports_images) = task
+                        .context
+                        .context_vars
+                        .get("primary_model_supports_image_understanding")
+                    {
+                        if let Ok(flag) = supports_images.parse::<bool>() {
+                            map.insert(
+                                "primary_model_supports_image_understanding".to_string(),
+                                serde_json::json!(flag),
+                            );
+                        }
+                    }
                     
                     map
                 }),
@@ -887,4 +921,3 @@ impl ToolPipeline {
         }
     }
 }
-
diff --git a/src/crates/core/src/infrastructure/ai/client.rs b/src/crates/core/src/infrastructure/ai/client.rs
index f883f48d..c0c25d84 100644
--- a/src/crates/core/src/infrastructure/ai/client.rs
+++ b/src/crates/core/src/infrastructure/ai/client.rs
@@ -30,6 +30,71 @@ pub struct AIClient {
 }
 
 impl AIClient {
+    const TEST_IMAGE_EXPECTED_CODE: &'static str = "BYGR"; // quadrant color code a correct reply must contain
+    const TEST_IMAGE_PNG_BASE64: &'static str = // base64 PNG payload sent as the probe image
+        "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAiklEQVR4nNXZwQkAQQzDQEX995wr4giLpgBj8NMDy6XdOc2XOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTOImTuDm+Bzi+B8gvIHESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3ESJ3G+LvDXB5LJBXz4d6CTAAAAAElFTkSuQmCC";
+
+    /// Decides whether a model reply to the image probe contains the expected quadrant code.
+    ///
+    /// The reply is upper-cased and accepted as soon as any of these readings holds
+    /// `TEST_IMAGE_EXPECTED_CODE`:
+    /// - one alphabetic word of the reply is exactly the code;
+    /// - the reply's ASCII letters, with everything else stripped, contain the code;
+    /// - the one-letter words drawn from R/G/B/Y contain it (e.g. "B Y G R");
+    /// - the initials of the words RED/GREEN/BLUE/YELLOW contain it;
+    /// - the reply's R/G/B/Y characters, with everything else stripped, contain it.
+    ///
+    /// Matching ignores ASCII letter case because the reply is upper-cased before any
+    /// reading is derived from it.
+    fn image_test_response_matches_expected(response: &str) -> bool {
+        let expected = Self::TEST_IMAGE_EXPECTED_CODE;
+        let normalized = response.to_ascii_uppercase();
+        let is_color_letter = |c: &char| matches!(*c, 'R' | 'G' | 'B' | 'Y');
+
+        // Alphabetic runs of the reply; empty pieces between separators are discarded.
+        let words: Vec<&str> = normalized
+            .split(|c: char| !c.is_ascii_alphabetic())
+            .filter(|word| !word.is_empty())
+            .collect();
+        if words.contains(&expected) {
+            return true;
+        }
+
+        // Letters only, so spaces or punctuation inside the code do not matter.
+        let letters_only: String = normalized
+            .chars()
+            .filter(char::is_ascii_alphabetic)
+            .collect();
+
+        // Stand-alone color letters, as in "B Y G R".
+        let lone_letters: String = words
+            .iter()
+            .filter(|word| word.len() == 1)
+            .filter_map(|word| word.chars().next())
+            .filter(is_color_letter)
+            .collect();
+
+        // Spelled-out colors, as in "Blue, Yellow, Green, Red".
+        let color_initials: String = words
+            .iter()
+            .filter_map(|word| match *word {
+                "RED" => Some('R'),
+                "GREEN" => Some('G'),
+                "BLUE" => Some('B'),
+                "YELLOW" => Some('Y'),
+                _ => None,
+            })
+            .collect();
+
+        // Most lenient reading: only the R/G/B/Y characters of the whole reply.
+        let color_letters: String = normalized.chars().filter(is_color_letter).collect();
+
+        // Any single matching reading is enough.
+        [letters_only, lone_letters, color_initials, color_letters]
+            .iter()
+            .any(|reading| reading.contains(expected))
+    }
+
     /// Create an AIClient without proxy (backward compatible)
     pub fn new(config: AIConfig) -> Self {
         let skip_ssl_verify = config.skip_ssl_verify;
@@ -931,4 +996,87 @@ impl AIClient {
             }
         }
     }
+
+    /// Verifies that the configured model can actually read image input.
+    ///
+    /// Sends the embedded test PNG with a prompt asking for a 4-letter quadrant color code
+    /// and validates the reply with `image_test_response_matches_expected`. The image part
+    /// uses the base64 `source` shape when the configured format is "anthropic" (compared
+    /// case-insensitively) and the `image_url` data-URL shape for every other format.
+    ///
+    /// Every path returns `Ok`: a wrong answer or a failed request is reported through
+    /// `success: false` plus `error_details`, never as an `Err`.
+    pub async fn test_image_input_connection(&self) -> Result<ConnectionTestResult> {
+        let started_at = std::time::Instant::now();
+        let prompt = "Inspect the attached image and reply with exactly one 4-letter code for quadrant colors in TL,TR,BL,BR order using letters R,G,B,Y (R=red, G=green, B=blue, Y=yellow).";
+
+        // The configured format decides how the image part is encoded.
+        let uses_anthropic_format = self.config.format.to_ascii_lowercase() == "anthropic";
+        let image_part = if uses_anthropic_format {
+            serde_json::json!({
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": "image/png",
+                    "data": Self::TEST_IMAGE_PNG_BASE64
+                }
+            })
+        } else {
+            serde_json::json!({
+                "type": "image_url",
+                "image_url": {
+                    "url": format!("data:image/png;base64,{}", Self::TEST_IMAGE_PNG_BASE64)
+                }
+            })
+        };
+
+        let text_part = serde_json::json!({
+            "type": "text",
+            "text": prompt
+        });
+
+        // The content array is image first, prompt second, and is sent in its serialized form.
+        let content = serde_json::json!([image_part, text_part]);
+        let test_messages = vec![Message {
+            role: "user".to_string(),
+            content: Some(content.to_string()),
+            reasoning_content: None,
+            thinking_signature: None,
+            tool_calls: None,
+            tool_call_id: None,
+            name: None,
+        }];
+
+        // Collapse each outcome into (success, model_response, error_details).
+        let (success, model_response, error_details) =
+            match self.send_message(test_messages, None).await {
+                // The model replied and the reply contains the expected code.
+                Ok(response) if Self::image_test_response_matches_expected(&response.text) => {
+                    (true, Some(response.text), None)
+                }
+                // The model replied, but the reply does not contain the expected code.
+                Ok(response) => {
+                    let detail = format!(
+                        "Image understanding verification failed: expected code '{}', got response '{}'",
+                        Self::TEST_IMAGE_EXPECTED_CODE, response.text
+                    );
+                    debug!("test image input connection failed: {}", detail);
+                    (false, Some(response.text), Some(detail))
+                }
+                // The request itself failed.
+                Err(e) => {
+                    let error_msg = e.to_string();
+                    debug!("test image input connection failed: {}", error_msg);
+                    (false, None, Some(error_msg))
+                }
+            };
+
+        // Elapsed time is sampled once, after any failure has been logged.
+        Ok(ConnectionTestResult {
+            success,
+            response_time_ms: started_at.elapsed().as_millis() as u64,
+            model_response,
+            error_details,
+        })
+    }
 }
diff --git a/src/web-ui/src/flow_chat/components/ModelSelector.tsx b/src/web-ui/src/flow_chat/components/ModelSelector.tsx
index 17030fd5..bae057c4 100644
--- a/src/web-ui/src/flow_chat/components/ModelSelector.tsx
+++ b/src/web-ui/src/flow_chat/components/ModelSelector.tsx
@@ -158,8 +158,9 @@ export const ModelSelector: React.FC<ModelSelectorProps> = ({
     return allModels
       .filter(m => {
         if (!m.enabled) return false;
-        // Text-only models for general chat.
-        return m.category === 'general_chat';
+        // Only show chat-capable models (exclude embeddings / image-gen / speech, etc.).
+        const capabilities = Array.isArray(m.capabilities) ? m.capabilities : [];
+        return capabilities.includes('text_chat');
       })
       .map(m => ({
         id: m.id || '',
diff --git a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
index 71968fb5..dc74fe44 100644
--- a/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
+++ b/src/web-ui/src/flow_chat/hooks/useMessageSender.ts
@@ -51,20 +51,46 @@ interface ImageAnalysisResult {
   analysis_time_ms: number;
 }
 
-// Keep this off for now: transport currently accepts text-only `userInput`.
-// When backend supports multimodal turn input, this can be flipped (or moved to config).
-const ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED = false;
+const ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED = true;
 
 async function resolveSessionModelId(
   flowChatManager: FlowChatManager,
-  sessionId: string | undefined
+  sessionId: string | undefined,
+  agentType?: string
 ): Promise<string | null> {
   const state = flowChatManager.getFlowChatState();
   const session = sessionId ? state.sessions.get(sessionId) : undefined;
-  const configuredModel = session?.config?.modelName;
+  const configuredModel = session?.config?.modelName || null;
+  const { configManager } = await import('@/infrastructure/config/services/ConfigManager');
+  const defaultModels = await configManager.getConfig<Record<string, string>>('ai.default_models') || {};
+  const agentModels = await configManager.getConfig<Record<string, string>>('ai.agent_models') || {};
+
+  const resolveAlias = (modelId: string | null): string | null => {
+    if (!modelId) return null;
+    if (modelId === 'primary') {
+      return defaultModels.primary || null;
+    }
+    if (modelId === 'fast') {
+      return defaultModels.fast || defaultModels.primary || null;
+    }
+    if (modelId === 'default') {
+      return defaultModels.primary || null;
+    }
+    return modelId;
+  };
 
-  if (configuredModel && configuredModel !== 'default') {
-    return configuredModel;
+  const effectiveAgentType = (agentType || session?.mode || 'agentic').trim();
+  const configuredFromAgentModels = resolveAlias(
+    effectiveAgentType ? (agentModels[effectiveAgentType] ?? null) : null
+  );
+  if (configuredFromAgentModels) {
+    return configuredFromAgentModels;
+  }
+
+  // Backward-compatibility fallback for historical sessions.
+  const resolvedConfigured = resolveAlias(configuredModel);
+  if (resolvedConfigured) {
+    return resolvedConfigured;
   }
 
   const { getDefaultPrimaryModel } = await import('@/infrastructure/config/utils/modelConfigHelpers');
@@ -76,16 +102,22 @@ async function modelSupportsImageUnderstanding(modelId: string | null): Promise<
 
   const { configManager } = await import('@/infrastructure/config/services/ConfigManager');
   const allModels = await configManager.getConfig<any[]>('ai.models') || [];
-  const model = allModels.find(m => m.id === modelId || m.name === modelId);
+  const model = allModels.find(
+    m => m.id === modelId || m.name === modelId || m.model_name === modelId
+  );
+  if (!model || model.enabled === false) return false;
+
   const capabilities = Array.isArray(model?.capabilities) ? model.capabilities : [];
-  return capabilities.includes('image_understanding');
+  const category = typeof model?.category === 'string' ? model.category : '';
+  return capabilities.includes('image_understanding') || category === 'multimodal';
 }
 
 async function chooseImageInputStrategy(
   flowChatManager: FlowChatManager,
-  sessionId: string | undefined
+  sessionId: string | undefined,
+  agentType?: string
 ): Promise<StrategyDecision> {
-  const modelId = await resolveSessionModelId(flowChatManager, sessionId);
+  const modelId = await resolveSessionModelId(flowChatManager, sessionId, agentType);
   const supportsImageUnderstanding = await modelSupportsImageUnderstanding(modelId);
 
   if (supportsImageUnderstanding && ENABLE_DIRECT_ATTACH_WHEN_SUPPORTED) {
@@ -102,7 +134,7 @@ async function chooseImageInputStrategy(
     modelId,
     supportsImageUnderstanding,
     reason: supportsImageUnderstanding
-      ? 'direct_attach_disabled_until_multimodal_turn_input_is_available'
+      ? 'direct_attach_disabled_by_feature_flag'
       : 'primary_model_is_text_only',
   };
 }
@@ -136,7 +168,8 @@ async function analyzeImagesBeforeSend(
 
 function formatImageContextLine(
   ctx: ImageContext,
-  analysis?: ImageAnalysisResult
+  analysis?: ImageAnalysisResult,
+  strategy?: ImageInputStrategy
 ): string {
   const imgName = ctx.imageName || 'Untitled image';
   const imgSize = ctx.fileSize ? ` (${(ctx.fileSize / 1024).toFixed(1)}KB)` : '';
@@ -144,6 +177,10 @@ function formatImageContextLine(
     ? `Path: ${ctx.imagePath}`
     : `Image ID: ${ctx.id}`;
 
+  if (strategy === 'direct-attach') {
+    return `[Image: ${imgName}${imgSize}]\n${sourceLine}\nAttached as multimodal image input.`;
+  }
+
   if (!analysis) {
     return `[Image: ${imgName}${imgSize}]\n${sourceLine}\nTip: You can use the view_image tool (${ctx.isLocal ? 'image_path' : 'image_id'}).`;
   }
@@ -237,7 +274,11 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
         reason: 'fallback_default_preanalysis',
       };
       try {
-        strategyDecision = await chooseImageInputStrategy(flowChatManager, sessionId);
+        strategyDecision = await chooseImageInputStrategy(
+          flowChatManager,
+          sessionId,
+          currentAgentType || undefined
+        );
       } catch (error) {
         log.warn('Failed to resolve image input strategy, using pre-analysis fallback', {
           sessionId,
@@ -255,28 +296,21 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
 
       let imageAnalyses: ImageAnalysisResult[] = [];
       if (imageContexts.length > 0) {
-        if (strategyDecision.strategy === 'direct-attach') {
-          // Future extensibility hook:
-          // once start_dialog_turn supports multimodal payloads, this branch can send image items directly.
-          log.info('Direct image attach strategy is selected but transport is still text-only; using pre-analysis fallback', {
-            sessionId,
-            modelId: strategyDecision.modelId,
-          });
-        }
-
-        try {
-          imageAnalyses = await analyzeImagesBeforeSend(imageContexts, sessionId!, trimmedMessage);
-          log.debug('Image pre-analysis completed', {
-            sessionId,
-            imageCount: imageContexts.length,
-            analysisCount: imageAnalyses.length,
-          });
-        } catch (error) {
-          log.warn('Image pre-analysis failed, continuing with context hints only', {
-            sessionId,
-            imageCount: imageContexts.length,
-            error: (error as Error)?.message ?? 'unknown',
-          });
+        if (strategyDecision.strategy === 'vision-preanalysis') {
+          try {
+            imageAnalyses = await analyzeImagesBeforeSend(imageContexts, sessionId!, trimmedMessage);
+            log.debug('Image pre-analysis completed', {
+              sessionId,
+              imageCount: imageContexts.length,
+              analysisCount: imageAnalyses.length,
+            });
+          } catch (error) {
+            log.warn('Image pre-analysis failed, continuing with context hints only', {
+              sessionId,
+              imageCount: imageContexts.length,
+              error: (error as Error)?.message ?? 'unknown',
+            });
+          }
         }
       }
 
@@ -295,7 +329,7 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
             case 'code-snippet':
               return `[Code Snippet: ${ctx.filePath}:${ctx.startLine}-${ctx.endLine}]`;
             case 'image': {
-              return formatImageContextLine(ctx, analysisByImageId.get(ctx.id));
+              return formatImageContextLine(ctx, analysisByImageId.get(ctx.id), strategyDecision.strategy);
             }
             case 'terminal-command':
               return `[Command: ${ctx.command}]`;
@@ -319,7 +353,27 @@ export function useMessageSender(props: UseMessageSenderProps): UseMessageSender
         fullMessage,
         sessionId || undefined,
         displayMessage,
-        currentAgentType || 'agentic'
+        currentAgentType || 'agentic',
+        undefined,
+        strategyDecision.strategy === 'direct-attach'
+          ? {
+              imageContexts: imageContexts.map(ctx => ({
+                id: ctx.id,
+                image_path: ctx.isLocal ? ctx.imagePath : undefined,
+                // Clipboard images are uploaded first and referenced by image_id only
+                // to avoid sending large base64 payloads in the turn request.
+                data_url: undefined,
+                mime_type: ctx.mimeType,
+                metadata: {
+                  name: ctx.imageName,
+                  width: ctx.width,
+                  height: ctx.height,
+                  file_size: ctx.fileSize,
+                  source: ctx.source,
+                },
+              })),
+            }
+          : undefined
       );
 
       onClearContexts();
diff --git a/src/web-ui/src/flow_chat/services/FlowChatManager.ts b/src/web-ui/src/flow_chat/services/FlowChatManager.ts
index 13720de3..f2b356d0 100644
--- a/src/web-ui/src/flow_chat/services/FlowChatManager.ts
+++ b/src/web-ui/src/flow_chat/services/FlowChatManager.ts
@@ -169,7 +169,10 @@ export class FlowChatManager {
     sessionId?: string,
     displayMessage?: string,
     agentType?: string,
-    switchToMode?: string
+    switchToMode?: string,
+    options?: {
+      imageContexts?: import('@/infrastructure/api/service-api/ImageAnalysisAPI').ImageContextData[];
+    }
   ): Promise<void> {
     const targetSessionId = sessionId || this.context.flowChatStore.getState().activeSessionId;
     
@@ -177,7 +180,15 @@ export class FlowChatManager {
       throw new Error('No active session');
     }
 
-    return sendMessageModule(this.context, message, targetSessionId, displayMessage, agentType, switchToMode);
+    return sendMessageModule(
+      this.context,
+      message,
+      targetSessionId,
+      displayMessage,
+      agentType,
+      switchToMode,
+      options
+    );
   }
 
   async cancelCurrentTask(): Promise<boolean> {
diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts
index 9cc46f82..fa9cf965 100644
--- a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts
+++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts
@@ -14,6 +14,7 @@ import { createLogger } from '@/shared/utils/logger';
 import type { FlowChatContext, DialogTurn } from './types';
 import { ensureBackendSession, retryCreateBackendSession } from './SessionModule';
 import { cleanupSessionBuffers } from './TextChunkModule';
+import type { ImageContextData as ImageInputContextData } from '@/infrastructure/api/service-api/ImageAnalysisAPI';
 
 const log = createLogger('MessageModule');
 
@@ -31,7 +32,10 @@ export async function sendMessage(
   sessionId: string,
   displayMessage?: string,
   agentType?: string,
-  switchToMode?: string
+  switchToMode?: string,
+  options?: {
+    imageContexts?: ImageInputContextData[];
+  }
 ): Promise<void> {
   const session = context.flowChatStore.getState().sessions.get(sessionId);
   if (!session) {
@@ -105,6 +109,7 @@ export async function sendMessage(
         userInput: message,
         turnId: dialogTurnId,
         agentType: currentAgentType,
+        imageContexts: options?.imageContexts,
       });
     } catch (error: any) {
       if (error?.message?.includes('Session does not exist') || error?.message?.includes('Not found')) {
@@ -120,6 +125,7 @@ export async function sendMessage(
           userInput: message,
           turnId: dialogTurnId,
           agentType: currentAgentType,
+          imageContexts: options?.imageContexts,
         });
       } else {
         throw error;
diff --git a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts
index 807ea431..5540f4a8 100644
--- a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts
+++ b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts
@@ -2,6 +2,7 @@
 
 import { api } from './ApiClient';
 import { createTauriCommandError } from '../errors/TauriCommandError';
+import type { ImageContextData as ImageInputContextData } from './ImageAnalysisAPI';
 
 
 
@@ -44,6 +45,8 @@ export interface StartDialogTurnRequest {
   userInput: string;
   turnId?: string; 
   agentType: string; 
+  /** Optional multimodal image contexts (snake_case fields, aligned with backend ImageContextData). */
+  imageContexts?: ImageInputContextData[];
 }
 
  
@@ -349,4 +352,4 @@ export class AgentAPI {
 }
 
 
-export const agentAPI = new AgentAPI();
\ No newline at end of file
+export const agentAPI = new AgentAPI();
diff --git a/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts b/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts
index fb6cbc98..e047864a 100644
--- a/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts
+++ b/src/web-ui/src/infrastructure/api/service-api/ApiClient.ts
@@ -16,6 +16,71 @@ import {
 import { createLogger } from '@/shared/utils/logger';
 
 const log = createLogger('ApiClient');
+// Lowercase fragments; a key containing any of them is treated as secret-bearing.
+const SENSITIVE_KEY_PATTERNS: readonly string[] = [
+  'api_key', 'apikey',
+  'token',
+  'secret',
+  'password',
+  'authorization'
+];
+
+function isSensitiveKey(key: string): boolean {
+  const normalized = key.toLowerCase().replace(/[-\s]+/g, '_'); // header-style 'x-api-key' -> 'x_api_key', so it matches 'api_key'
+  return SENSITIVE_KEY_PATTERNS.some(pattern => normalized.includes(pattern));
+}
+
+function maskSensitiveValue(value: unknown): string {
+  // Mask a secret for logging. A 4+4 character hint is kept only for long
+  // strings (>= 24 chars, so at most a third is visible); shorter strings such
+  // as passwords, and all non-string values, are fully redacted.
+  if (typeof value !== 'string' || value.length < 24) {
+    return '***';
+  }
+  return `${value.slice(0, 4)}***${value.slice(-4)}`;
+}
+
+function sanitizeForLog(value: unknown, parentKey?: string): unknown {
+  // Build a log-safe copy of `value`. Entries stored under a sensitive-looking
+  // key are masked; everything else is copied recursively. `parentKey` is the
+  // key the current value was found under, if any.
+  if (value === null || value === undefined) {
+    return value;
+  }
+
+  if (Array.isArray(value)) {
+    // Elements inherit the key the array itself was stored under.
+    return value.map(element => sanitizeForLog(element, parentKey));
+  }
+
+  if (typeof value !== 'object') {
+    const underSensitiveKey = parentKey ? isSensitiveKey(parentKey) : false;
+    return underSensitiveKey ? maskSensitiveValue(value) : value;
+  }
+
+  const result: Record<string, unknown> = {};
+  Object.entries(value as Record<string, unknown>).forEach(([name, entry]) => {
+    if (isSensitiveKey(name)) {
+      // Sensitive key: mask the whole entry without descending into it.
+      result[name] = maskSensitiveValue(entry);
+      return;
+    }
+
+    const isHeaderMap = name === 'headers' || name === 'custom_headers';
+    if (!isHeaderMap || !entry || typeof entry !== 'object') {
+      result[name] = sanitizeForLog(entry, name);
+      return;
+    }
+
+    // Header map: mask by header name; other header values are copied as-is.
+    const headers: Record<string, unknown> = {};
+    Object.entries(entry as Record<string, unknown>).forEach(([header, headerValue]) => {
+      headers[header] = isSensitiveKey(header) ? maskSensitiveValue(headerValue) : headerValue;
+    });
+    result[name] = headers;
+  });
+  return result;
+}
 
 export class ApiClient implements IApiClient {
   private config: ApiConfig;
@@ -159,7 +224,11 @@ export class ApiClient implements IApiClient {
 
 
         if (this.config.enableLogging) {
-          log.debug('Request completed', { type: request.type, responseTime, config: request.config });
+          log.debug('Request completed', {
+            type: request.type,
+            responseTime,
+            config: sanitizeForLog(request.config)
+          });
         }
 
         return response.data;
@@ -191,7 +260,12 @@ export class ApiClient implements IApiClient {
 
 
       if (this.config.enableLogging) {
-        log.error('Request failed after retries', { requestId: request.id, retryCount: request.retryCount, error });
+        log.error('Request failed after retries', {
+          requestId: request.id,
+          retryCount: request.retryCount,
+          config: sanitizeForLog(request.config),
+          error
+        });
       }
 
       throw this.normalizeError(error as Error);
@@ -226,7 +300,7 @@ export class ApiClient implements IApiClient {
       } else {
         log.error('Command failed', {
           command: config.command,
-          args: config.args,
+          args: sanitizeForLog(config.args),
           error: errorMessage,
           rawError: error
         });
@@ -400,7 +474,11 @@ export function createLoggingMiddleware(): ApiMiddleware {
     try {
       const response = await next(request);
       const duration = Date.now() - startTime;
-      middlewareLog.debug('Request completed', { type: request.type, duration, config: request.config });
+      middlewareLog.debug('Request completed', {
+        type: request.type,
+        duration,
+        config: sanitizeForLog(request.config)
+      });
       return response;
     } catch (error) {
       const duration = Date.now() - startTime;
diff --git a/src/web-ui/src/locales/zh-CN/settings.json b/src/web-ui/src/locales/zh-CN/settings.json
index 6263dfbc..7ed4a16e 100644
--- a/src/web-ui/src/locales/zh-CN/settings.json
+++ b/src/web-ui/src/locales/zh-CN/settings.json
@@ -291,7 +291,7 @@
     },
     "capabilities": {
       "text_chat": "对话",
-      "image_understanding": "识图",
+      "image_understanding": "多模态",
       "image_generation": "绘图",
       "search": "搜索",
       "function_calling": "工具",
@@ -322,7 +322,7 @@
     },
     "capabilityDescs": {
       "text_chat": "处理所有文本对话、代码生成、工具调用等任务",
-      "image_understanding": "分析和理解图片内容，支持图文混合对话",
+      "image_understanding": "当主模型不支持图片输入时，用于分析和理解图片内容",
       "image_generation": "根据文字描述生成图片（如 DALL-E、Stable Diffusion）",
       "search": "实时搜索网络信息，提供最新数据支持",
       "speech_recognition": "将语音转换为文字，支持语音输入功能（如智谱 GLM-ASR）"
diff --git a/src/web-ui/src/locales/zh-CN/settings/ai-model.json b/src/web-ui/src/locales/zh-CN/settings/ai-model.json
index bb7b8a55..f402c789 100644
--- a/src/web-ui/src/locales/zh-CN/settings/ai-model.json
+++ b/src/web-ui/src/locales/zh-CN/settings/ai-model.json
@@ -63,28 +63,28 @@
   "categories": {
     "all": "全部",
     "text": "文本",
-    "multimodal": "图像",
+    "multimodal": "多模态",
     "other": "辅助"
   },
   "category": {
     "label": "模型分类",
     "placeholder": "选择模型分类",
     "general_chat": "文本生成",
-    "multimodal": "图像理解",
+    "multimodal": "多模态",
     "image_generation": "图像生成",
     "search_enhanced": "信息检索",
     "speech_recognition": "语音识别"
   },
   "categoryIcons": {
     "general_chat": "文本",
-    "multimodal": "视觉",
+    "multimodal": "多模态",
     "image_generation": "绘图",
     "search_enhanced": "检索",
     "speech_recognition": "语音"
   },
   "categoryHints": {
     "general_chat": "文本生成：生成文本回复、代码等，适用于大多数对话场景",
-    "multimodal": "图像理解：理解图片内容并进行图文混合对话",
+    "multimodal": "多模态：理解图片内容并进行图文混合对话",
     "image_generation": "图像生成：根据文字描述生成图片",
     "search_enhanced": "信息检索：搜索网络获取实时信息，只需配置名称、API地址和密钥",
     "speech_recognition": "语音识别：将语音转换为文字（如智谱 GLM-ASR）"
@@ -140,7 +140,7 @@
   },
   "capabilities": {
     "text_chat": "对话",
-    "image_understanding": "识图",
+    "image_understanding": "多模态",
     "image_generation": "绘图",
     "search": "搜索",
     "function_calling": "工具",
diff --git a/src/web-ui/src/locales/zh-CN/settings/default-model.json b/src/web-ui/src/locales/zh-CN/settings/default-model.json
index 53b74852..6ae7d1f4 100644
--- a/src/web-ui/src/locales/zh-CN/settings/default-model.json
+++ b/src/web-ui/src/locales/zh-CN/settings/default-model.json
@@ -25,11 +25,11 @@
     }
   },
   "optional": {
-    "title": "多模态模型配置",
+    "title": "扩展能力模型配置",
     "capabilities": {
       "image_understanding": {
-        "label": "图像理解",
-        "description": "分析图片、截图内容，支持图文混合对话"
+        "label": "图片理解模型",
+        "description": "当主模型不支持图片输入时，用于分析图片和截图内容"
       },
       "image_generation": {
         "label": "图像生成",