Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ uuid = { version = "1.0", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde", "clock"] }
regex = "1.10"
base64 = "0.21"
image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp", "bmp"] }
md5 = "0.7"
once_cell = "1.19.0"
lazy_static = "1.4"
Expand Down
152 changes: 142 additions & 10 deletions src/apps/desktop/src/api/agentic_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ use std::sync::Arc;
use tauri::{AppHandle, State};

use crate::api::app_state::AppState;
use crate::api::context_upload_api::get_image_context;
use bitfun_core::agentic::coordination::ConversationCoordinator;
use bitfun_core::agentic::core::*;
use bitfun_core::agentic::image_analysis::ImageContextData;

#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
Expand Down Expand Up @@ -45,6 +47,8 @@ pub struct StartDialogTurnRequest {
pub user_input: String,
pub agent_type: String,
pub turn_id: Option<String>,
#[serde(default)]
pub image_contexts: Option<Vec<ImageContextData>>,
}

#[derive(Debug, Serialize)]
Expand Down Expand Up @@ -179,23 +183,131 @@ pub async fn start_dialog_turn(
coordinator: State<'_, Arc<ConversationCoordinator>>,
request: StartDialogTurnRequest,
) -> Result<StartDialogTurnResponse, String> {
let _stream = coordinator
.start_dialog_turn(
request.session_id,
request.user_input,
request.turn_id,
request.agent_type,
false,
)
.await
.map_err(|e| format!("Failed to start dialog turn: {}", e))?;
let StartDialogTurnRequest {
session_id,
user_input,
agent_type,
turn_id,
image_contexts,
} = request;

if let Some(image_contexts) = image_contexts
.as_ref()
.filter(|images| !images.is_empty())
.cloned()
{
let resolved_image_contexts = resolve_missing_image_payloads(image_contexts)?;
coordinator
.start_dialog_turn_with_image_contexts(
session_id,
user_input,
resolved_image_contexts,
turn_id,
agent_type,
)
.await
.map_err(|e| format!("Failed to start dialog turn: {}", e))?;
} else {
coordinator
.start_dialog_turn(
session_id,
user_input,
turn_id,
agent_type,
false,
)
.await
.map_err(|e| format!("Failed to start dialog turn: {}", e))?;
}

Ok(StartDialogTurnResponse {
success: true,
message: "Dialog turn started".to_string(),
})
}

/// Returns `true` when the optional string is absent or contains nothing but
/// whitespace — i.e. it carries no usable text.
fn is_blank_text(value: Option<&String>) -> bool {
    match value {
        Some(text) => text.trim().is_empty(),
        None => true,
    }
}

/// Completes image contexts whose payload the frontend omitted.
///
/// An entry that already has an `image_path` or `data_url` passes through
/// untouched. For the rest, the upload cache is consulted (keyed by the
/// entry's `id`): blank payload fields and an empty `mime_type` are filled
/// from the cached record, and metadata defaults (`name`, `width`, `height`,
/// `file_size`, `source`) are back-filled without overwriting caller-supplied
/// keys. Resolved entries are tagged `resolved_from_upload_cache: true`.
///
/// Errors when the cache has no record for an entry, or when an entry still
/// has neither payload field after resolution.
fn resolve_missing_image_payloads(
    image_contexts: Vec<ImageContextData>,
) -> Result<Vec<ImageContextData>, String> {
    let mut output = Vec::with_capacity(image_contexts.len());

    for mut ctx in image_contexts {
        // Only entries with BOTH payload fields blank need cache resolution.
        let needs_resolution =
            is_blank_text(ctx.image_path.as_ref()) && is_blank_text(ctx.data_url.as_ref());
        if !needs_resolution {
            output.push(ctx);
            continue;
        }

        let cached = match get_image_context(&ctx.id) {
            Some(record) => record,
            None => {
                return Err(format!(
                    "Image context not found for image_id={}. It may have expired. Please re-attach the image and retry.",
                    ctx.id
                ))
            }
        };

        // Copy over whichever payload fields are still blank; a cached value
        // that is itself blank is treated as absent.
        if is_blank_text(ctx.image_path.as_ref()) {
            ctx.image_path = cached.image_path.clone().filter(|p| !p.trim().is_empty());
        }
        if is_blank_text(ctx.data_url.as_ref()) {
            ctx.data_url = cached.data_url.clone().filter(|u| !u.trim().is_empty());
        }
        if ctx.mime_type.trim().is_empty() {
            ctx.mime_type = cached.mime_type.clone();
        }

        // Normalize metadata into a JSON object; a non-object value is
        // preserved under "raw_metadata" rather than discarded.
        let mut meta = ctx.metadata.take().unwrap_or_else(|| serde_json::json!({}));
        if !meta.is_object() {
            meta = serde_json::json!({ "raw_metadata": meta });
        }
        if let Some(map) = meta.as_object_mut() {
            // Back-fill cached defaults only where the caller left a gap.
            let defaults: [(&str, serde_json::Value); 5] = [
                ("name", serde_json::json!(cached.image_name)),
                ("width", serde_json::json!(cached.width)),
                ("height", serde_json::json!(cached.height)),
                ("file_size", serde_json::json!(cached.file_size)),
                ("source", serde_json::json!(cached.source)),
            ];
            for (key, value) in defaults {
                if !map.contains_key(key) {
                    map.insert(key.to_string(), value);
                }
            }
            // Audit marker: this entry was completed from the upload cache.
            map.insert(
                "resolved_from_upload_cache".to_string(),
                serde_json::json!(true),
            );
        }
        ctx.metadata = Some(meta);

        // The cache must have produced at least one payload field.
        if is_blank_text(ctx.image_path.as_ref()) && is_blank_text(ctx.data_url.as_ref()) {
            return Err(format!(
                "Image context {} is missing image_path/data_url after cache resolution",
                ctx.id
            ));
        }

        output.push(ctx);
    }

    Ok(output)
}

#[tauri::command]
pub async fn cancel_dialog_turn(
coordinator: State<'_, Arc<ConversationCoordinator>>,
Expand Down Expand Up @@ -394,6 +506,26 @@ fn message_to_dto(message: Message) -> MessageDTO {

let content = match message.content {
MessageContent::Text(text) => serde_json::json!({ "type": "text", "text": text }),
MessageContent::Multimodal { text, images } => {
let images: Vec<serde_json::Value> = images
.into_iter()
.map(|img| {
serde_json::json!({
"id": img.id,
"image_path": img.image_path,
"mime_type": img.mime_type,
"metadata": img.metadata,
"has_data_url": img.data_url.as_ref().is_some_and(|s| !s.is_empty()),
})
})
.collect();

serde_json::json!({
"type": "multimodal",
"text": text,
"images": images,
})
}
MessageContent::ToolResult {
tool_id,
tool_name,
Expand Down
73 changes: 73 additions & 0 deletions src/apps/desktop/src/api/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,21 @@ pub async fn test_ai_config_connection(
request: TestAIConfigConnectionRequest,
) -> Result<bitfun_core::util::types::ConnectionTestResult, String> {
let model_name = request.config.name.clone();
let supports_image_input = request
.config
.capabilities
.iter()
.any(|cap| {
matches!(
cap,
bitfun_core::service::config::types::ModelCapability::ImageUnderstanding
)
})
|| matches!(
request.config.category,
bitfun_core::service::config::types::ModelCategory::Multimodal
);

let ai_config = match request.config.try_into() {
Ok(config) => config,
Err(e) => {
Expand All @@ -209,6 +224,64 @@ pub async fn test_ai_config_connection(

match ai_client.test_connection().await {
Ok(result) => {
if !result.success {
info!(
"AI config connection test completed: model={}, success={}, response_time={}ms",
model_name, result.success, result.response_time_ms
);
return Ok(result);
}

if supports_image_input {
match ai_client.test_image_input_connection().await {
Ok(image_result) => {
let response_time_ms =
result.response_time_ms + image_result.response_time_ms;

if !image_result.success {
let image_error = image_result
.error_details
.unwrap_or_else(|| "Unknown image input test error".to_string());
let merged = bitfun_core::util::types::ConnectionTestResult {
success: false,
response_time_ms,
model_response: image_result.model_response.or(result.model_response),
error_details: Some(format!(
"Basic connection passed, but multimodal image input test failed: {}",
image_error
)),
};
info!(
"AI config connection test completed: model={}, success={}, response_time={}ms",
model_name, merged.success, merged.response_time_ms
);
return Ok(merged);
}

let merged = bitfun_core::util::types::ConnectionTestResult {
success: true,
response_time_ms,
model_response: image_result
.model_response
.or(result.model_response),
error_details: None,
};
info!(
"AI config connection test completed: model={}, success={}, response_time={}ms",
model_name, merged.success, merged.response_time_ms
);
return Ok(merged);
}
Err(e) => {
error!(
"AI config multimodal image input test failed unexpectedly: model={}, error={}",
model_name, e
);
return Err(format!("Connection test failed: {}", e));
}
}
}

info!(
"AI config connection test completed: model={}, success={}, response_time={}ms",
model_name, result.success, result.response_time_ms
Expand Down
73 changes: 18 additions & 55 deletions src/apps/desktop/src/api/image_analysis_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

use crate::api::app_state::AppState;
use bitfun_core::agentic::coordination::ConversationCoordinator;
use bitfun_core::agentic::image_analysis::*;
use bitfun_core::agentic::image_analysis::{
resolve_vision_model_from_ai_config, AnalyzeImagesRequest, ImageAnalysisResult, ImageAnalyzer,
MessageEnhancer, SendEnhancedMessageRequest,
};
use log::error;
use std::sync::Arc;
use tauri::State;
Expand All @@ -21,65 +24,25 @@ pub async fn analyze_images(
format!("Failed to get AI config: {}", e)
})?;

let image_model_id = ai_config
.default_models
.image_understanding
.ok_or_else(|| {
error!("Image understanding model not configured");
"Image understanding model not configured".to_string()
})?;

let image_model_id = if image_model_id.is_empty() {
let vision_model = ai_config
.models
.iter()
.find(|m| {
m.enabled
&& m.capabilities.iter().any(|cap| {
matches!(
cap,
bitfun_core::service::config::types::ModelCapability::ImageUnderstanding
)
})
})
.map(|m| m.id.as_str());

match vision_model {
Some(model_id) => model_id,
None => {
error!("No image understanding model found");
return Err(
"Image understanding model not configured and no compatible model found.\n\n\
Please add a model that supports image understanding\
in [Settings → AI Model Config], enable 'image_understanding' capability, \
and assign it in [Settings → Super Agent]."
.to_string(),
);
}
}
} else {
&image_model_id
};

let image_model = ai_config
.models
.iter()
.find(|m| &m.id == image_model_id)
.ok_or_else(|| {
error!(
"Model not found: model_id={}, available_models={:?}",
image_model_id,
ai_config.models.iter().map(|m| &m.id).collect::<Vec<_>>()
);
format!("Model not found: {}", image_model_id)
})?
.clone();
let image_model = resolve_vision_model_from_ai_config(&ai_config).map_err(|e| {
error!(
"Image understanding model resolution failed: available_models={:?}, error={}",
ai_config.models.iter().map(|m| &m.id).collect::<Vec<_>>(),
e
);
format!(
"Image understanding model is not configured.\n\n\
Please select a model for [Settings → Default Model Config → Image Understanding Model].\n\n\
Details: {}",
e
)
})?;

let workspace_path = state.workspace_path.read().await.clone();

let ai_client = state
.ai_client_factory
.get_client_by_id(image_model_id)
.get_client_by_id(&image_model.id)
.await
.map_err(|e| format!("Failed to create AI client: {}", e))?;

Expand Down
Loading
Loading