From 5d6cb4b0363266a63bab82c55b4f33b0234791d6 Mon Sep 17 00:00:00 2001 From: NoiAI Date: Tue, 12 May 2026 12:50:58 +0800 Subject: [PATCH] feat(agui): Add multimodal input support (image/video/audio) to AguiMessage and AguiMessageConverter Adds support for multimodal input (image, video, audio, document) in the AG-UI extension, aligning with the AG-UI Protocol InputContent specification. - AguiMessage.content: String -> Object (backward compatible) - AguiMessageConverter: InputContent[] -> ContentBlock conversion - Supports both url and data (base64) source types --- .../agui/converter/AguiMessageConverter.java | 118 ++++++++- .../core/agui/model/AguiMessage.java | 54 +++- .../converter/AguiMessageConverterTest.java | 242 ++++++++++++++++++ .../core/agui/model/AguiModelTest.java | 57 +++++ 4 files changed, 465 insertions(+), 6 deletions(-) diff --git a/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/converter/AguiMessageConverter.java b/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/converter/AguiMessageConverter.java index 78ee08a22..54aaab50e 100644 --- a/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/converter/AguiMessageConverter.java +++ b/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/converter/AguiMessageConverter.java @@ -19,12 +19,18 @@ import io.agentscope.core.agui.model.AguiFunctionCall; import io.agentscope.core.agui.model.AguiMessage; import io.agentscope.core.agui.model.AguiToolCall; +import io.agentscope.core.message.AudioBlock; +import io.agentscope.core.message.Base64Source; import io.agentscope.core.message.ContentBlock; +import io.agentscope.core.message.ImageBlock; import io.agentscope.core.message.Msg; import io.agentscope.core.message.MsgRole; +import io.agentscope.core.message.Source; import io.agentscope.core.message.TextBlock; import io.agentscope.core.message.ToolResultBlock; import 
io.agentscope.core.message.ToolUseBlock; +import io.agentscope.core.message.URLSource; +import io.agentscope.core.message.VideoBlock; import io.agentscope.core.util.JsonException; import io.agentscope.core.util.JsonUtils; import java.util.ArrayList; @@ -37,6 +43,17 @@ * *

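<p>This class handles the bidirectional conversion between the AG-UI protocol's
  * message format and AgentScope's internal message format.
+ *
+ * <p>Supports multimodal input per AG-UI protocol:
+ * <ul>
+ *   <li>{@code text} - converted to a {@code TextBlock}</li>
+ *   <li>{@code image} - converted to an {@code ImageBlock}</li>
+ *   <li>{@code video} - converted to a {@code VideoBlock}</li>
+ *   <li>{@code audio} - converted to an {@code AudioBlock}</li>
+ *   <li>{@code document} - converted to a {@code TextBlock} placeholder describing the document</li>
+ * </ul>
+ *
+ * <p>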
See https://docs.ag-ui.com/concepts/messages.md for AG-UI InputContent spec. */ public class AguiMessageConverter { /** @@ -54,8 +71,20 @@ public Msg toMsg(AguiMessage aguiMessage) { MsgRole role = convertRole(aguiMessage.getRole()); List blocks = new ArrayList<>(); - // Add text content if present - if (aguiMessage.getContent() != null && !aguiMessage.getContent().isEmpty()) { + // Handle multimodal content (InputContent array per AG-UI protocol) + if (aguiMessage.isMultimodalContent()) { + List> parts = aguiMessage.getMultimodalContent(); + if (parts != null) { + for (Map part : parts) { + ContentBlock block = convertInputContent(part); + if (block != null) { + blocks.add(block); + } + } + } + } + // Handle simple text content (backward compatible) + else if (aguiMessage.getContent() != null && !aguiMessage.getContent().isEmpty()) { if (aguiMessage.isToolMessage() && aguiMessage.getToolCallId() != null) { // For tool messages, wrap content in ToolResultBlock blocks.add( @@ -78,6 +107,91 @@ public Msg toMsg(AguiMessage aguiMessage) { return Msg.builder().id(aguiMessage.getId()).role(role).content(blocks).build(); } + /** + * Convert a single AG-UI InputContent part to an AgentScope ContentBlock. + * + * @param part The InputContent map from AG-UI protocol + * @return The converted ContentBlock, or null if type is unrecognized + */ + @SuppressWarnings("unchecked") + private ContentBlock convertInputContent(Map part) { + String type = (String) part.get("type"); + if (type == null) { + return null; + } + + switch (type) { + case "text": + String text = (String) part.get("text"); + return text != null ? TextBlock.builder().text(text).build() : null; + + case "image": + Source source = extractSource(part); + return source != null ? ImageBlock.builder().source(source).build() : null; + + case "video": + Source videoSource = extractSource(part); + return videoSource != null + ? 
VideoBlock.builder().source(videoSource).build() + : null; + + case "audio": + Source audioSource = extractSource(part); + return audioSource != null + ? AudioBlock.builder().source(audioSource).build() + : null; + + case "document": + // Convert document to TextBlock with description + Source docSource = extractSource(part); + if (docSource != null) { + String docDesc = "[Document: " + extractMimeType(part) + "]"; + return TextBlock.builder().text(docDesc).build(); + } + return null; + + default: + return null; + } + } + + /** + * Extract Source from an InputContent part. + * Supports both 'url' and 'data' (base64) source types. + */ + @SuppressWarnings("unchecked") + private Source extractSource(Map part) { + Map sourceMap = (Map) part.get("source"); + if (sourceMap == null) { + return null; + } + + String sourceType = (String) sourceMap.get("type"); + if ("url".equals(sourceType)) { + String url = (String) sourceMap.get("value"); + return url != null ? new URLSource(url) : null; + } else if ("data".equals(sourceType)) { + String data = (String) sourceMap.get("value"); + String mimeType = (String) sourceMap.get("mimeType"); + if (data != null && mimeType != null) { + return new Base64Source(data, mimeType); + } + } + return null; + } + + /** + * Extract mimeType from an InputContent part (for document type). + */ + private String extractMimeType(Map part) { + @SuppressWarnings("unchecked") + Map sourceMap = (Map) part.get("source"); + if (sourceMap != null) { + return (String) sourceMap.get("mimeType"); + } + return null; + } + /** * Convert an AgentScope message to an AG-UI message. 
* diff --git a/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/model/AguiMessage.java b/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/model/AguiMessage.java index 91ae8f98a..8058e10f1 100644 --- a/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/model/AguiMessage.java +++ b/agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/model/AguiMessage.java @@ -19,6 +19,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Objects; /** @@ -34,12 +35,24 @@ *

  <li>system - System instructions</li>
  *   <li>tool - Tool execution results</li>
  * </ul>
+ *
+ * <p>Content can be a simple string or a multimodal array of
+ * {@code InputContent} objects (per AG-UI protocol).
+ * See https://docs.ag-ui.com/concepts/messages.md for details.
+ *
+ * <p>InputContent array element structure:
+ * <pre>{@code
+ * { "type": "text", "text": "Hello" }
+ * { "type": "image", "source": { "type": "url", "value": "https://...", "mimeType": "image/png" } }
+ * { "type": "video", "source": { "type": "url", "value": "https://...", "mimeType": "video/mp4" } }
+ * { "type": "audio", "source": { "type": "url", "value": "https://...", "mimeType": "audio/wav" } }
+ * }</pre>
    */ public class AguiMessage { private final String id; private final String role; - private final String content; + private final Object content; // String or List> for multimodal private final List toolCalls; private final String toolCallId; @@ -48,7 +61,8 @@ public class AguiMessage { * * @param id The unique message ID * @param role The message role (user, assistant, system, tool) - * @param content The message content + * @param content The message content - may be a String or a List of InputContent objects + * (multimodal input per AG-UI protocol) * @param toolCalls Tool calls for assistant messages (optional) * @param toolCallId Tool call ID for tool messages (optional) */ @@ -56,7 +70,7 @@ public class AguiMessage { public AguiMessage( @JsonProperty("id") String id, @JsonProperty("role") String role, - @JsonProperty("content") String content, + @JsonProperty("content") Object content, @JsonProperty("toolCalls") List toolCalls, @JsonProperty("toolCallId") String toolCallId) { this.id = Objects.requireNonNull(id, "id cannot be null"); @@ -135,12 +149,44 @@ public String getRole() { /** * Get the message content. * - * @return The content, may be null + * @return The content as a String if it is a simple text message, or null if + * the content is multimodal (InputContent array). Use {@link #getContentObject()} + * for full multimodal support. */ public String getContent() { + return content instanceof String ? (String) content : null; + } + + /** + * Get the raw content object. + * + * @return The content as an Object - either a String for simple text messages + * or a List of InputContent maps for multimodal messages. + */ + public Object getContentObject() { return content; } + /** + * Check if this message contains multimodal content (InputContent array). 
+ * + * @return true if content is a List (multimodal), false if it's a String or null + */ + public boolean isMultimodalContent() { + return content instanceof List; + } + + /** + * Get the multimodal content as a list of InputContent objects. + * Each item is a Map with keys: type, text/source/etc. + * + * @return The content as a List if it is multimodal, or null if it's a simple String + */ + @SuppressWarnings("unchecked") + public List> getMultimodalContent() { + return content instanceof List ? (List>) content : null; + } + /** * Get the tool calls (for assistant messages). * diff --git a/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/converter/AguiMessageConverterTest.java b/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/converter/AguiMessageConverterTest.java index e0a8da8f1..c5cb66d99 100644 --- a/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/converter/AguiMessageConverterTest.java +++ b/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/converter/AguiMessageConverterTest.java @@ -17,6 +17,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -24,11 +25,17 @@ import io.agentscope.core.agui.model.AguiFunctionCall; import io.agentscope.core.agui.model.AguiMessage; import io.agentscope.core.agui.model.AguiToolCall; +import io.agentscope.core.message.AudioBlock; +import io.agentscope.core.message.Base64Source; +import io.agentscope.core.message.ContentBlock; +import io.agentscope.core.message.ImageBlock; import io.agentscope.core.message.Msg; import io.agentscope.core.message.MsgRole; import 
io.agentscope.core.message.TextBlock; import io.agentscope.core.message.ToolResultBlock; import io.agentscope.core.message.ToolUseBlock; +import io.agentscope.core.message.URLSource; +import io.agentscope.core.message.VideoBlock; import java.util.Collections; import java.util.List; import java.util.Map; @@ -370,4 +377,239 @@ void testConvertToolCallWithInvalidJson() { // Invalid JSON should result in empty map assertTrue(tub.getInput().isEmpty()); } + + // ===== Multimodal content conversion tests ===== + + @Test + void testConvertMultimodalTextContent() { + List> parts = + List.of(Map.of("type", "text", "text", "Hello multimodal")); + AguiMessage aguiMsg = new AguiMessage("msg-mm-1", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertEquals("msg-mm-1", msg.getId()); + assertEquals(MsgRole.USER, msg.getRole()); + assertTrue(msg.hasContentBlocks(TextBlock.class)); + assertEquals("Hello multimodal", msg.getTextContent()); + } + + @Test + void testConvertMultimodalImageWithUrl() { + Map source = Map.of("type", "url", "value", "https://example.com/img.png"); + List> parts = List.of(Map.of("type", "image", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-2", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(ImageBlock.class)); + ImageBlock imageBlock = msg.getFirstContentBlock(ImageBlock.class); + assertNotNull(imageBlock); + assertInstanceOf(URLSource.class, imageBlock.getSource()); + assertEquals("https://example.com/img.png", ((URLSource) imageBlock.getSource()).getUrl()); + } + + @Test + void testConvertMultimodalImageWithBase64() { + Map source = + Map.of("type", "data", "value", "iVBORw0KGgo=", "mimeType", "image/png"); + List> parts = List.of(Map.of("type", "image", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-3", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(ImageBlock.class)); + 
ImageBlock imageBlock = msg.getFirstContentBlock(ImageBlock.class); + assertNotNull(imageBlock); + assertInstanceOf(Base64Source.class, imageBlock.getSource()); + } + + @Test + void testConvertMultimodalVideoWithUrl() { + Map source = + Map.of("type", "url", "value", "https://example.com/video.mp4"); + List> parts = List.of(Map.of("type", "video", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-4", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(VideoBlock.class)); + VideoBlock videoBlock = msg.getFirstContentBlock(VideoBlock.class); + assertNotNull(videoBlock); + assertInstanceOf(URLSource.class, videoBlock.getSource()); + assertEquals( + "https://example.com/video.mp4", ((URLSource) videoBlock.getSource()).getUrl()); + } + + @Test + void testConvertMultimodalAudioWithUrl() { + Map source = + Map.of("type", "url", "value", "https://example.com/audio.wav"); + List> parts = List.of(Map.of("type", "audio", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-5", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(AudioBlock.class)); + AudioBlock audioBlock = msg.getFirstContentBlock(AudioBlock.class); + assertNotNull(audioBlock); + assertInstanceOf(URLSource.class, audioBlock.getSource()); + assertEquals( + "https://example.com/audio.wav", ((URLSource) audioBlock.getSource()).getUrl()); + } + + @Test + void testConvertMultimodalDocument() { + Map source = + Map.of( + "type", + "url", + "value", + "https://example.com/doc.pdf", + "mimeType", + "application/pdf"); + List> parts = List.of(Map.of("type", "document", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-6", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(TextBlock.class)); + assertEquals("[Document: application/pdf]", msg.getTextContent()); + } + + @Test + void testConvertMultimodalMixedContent() 
{ + Map imgSource = + Map.of("type", "url", "value", "https://example.com/img.jpg"); + List> parts = + List.of( + Map.of("type", "text", "text", "Look at this image:"), + Map.of("type", "image", "source", imgSource)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-7", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + List blocks = msg.getContent(); + assertEquals(2, blocks.size()); + assertInstanceOf(TextBlock.class, blocks.get(0)); + assertInstanceOf(ImageBlock.class, blocks.get(1)); + } + + @Test + void testConvertMultimodalUnknownType() { + List> parts = List.of(Map.of("type", "unknown_type", "data", "xyz")); + AguiMessage aguiMsg = new AguiMessage("msg-mm-8", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // Unknown type should be skipped, resulting in no content blocks + assertTrue(msg.getContent().isEmpty()); + } + + @Test + void testConvertMultimodalNullType() { + List> parts = List.of(Map.of("data", "xyz")); + AguiMessage aguiMsg = new AguiMessage("msg-mm-9", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.getContent().isEmpty()); + } + + @Test + void testConvertMultimodalImageWithNullSource() { + List> parts = List.of(Map.of("type", "image")); + AguiMessage aguiMsg = new AguiMessage("msg-mm-10", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // Image without source should be skipped + assertFalse(msg.hasContentBlocks(ImageBlock.class)); + } + + @Test + void testConvertMultimodalTextWithNullText() { + List> parts = List.of(Map.of("type", "text")); + AguiMessage aguiMsg = new AguiMessage("msg-mm-11", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // Text without text value should be skipped + assertFalse(msg.hasContentBlocks(TextBlock.class)); + } + + @Test + void testConvertMultimodalBase64SourceMissingMimeType() { + Map source = Map.of("type", "data", "value", "iVBORw0KGgo="); + List> parts = List.of(Map.of("type", 
"image", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-12", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // Base64 source without mimeType should be skipped + assertFalse(msg.hasContentBlocks(ImageBlock.class)); + } + + @Test + void testConvertMultimodalUrlSourceMissingValue() { + Map source = Map.of("type", "url"); + List> parts = List.of(Map.of("type", "image", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-13", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // URL source without value should be skipped + assertFalse(msg.hasContentBlocks(ImageBlock.class)); + } + + @Test + void testConvertMultimodalDocumentWithoutSource() { + List> parts = List.of(Map.of("type", "document")); + AguiMessage aguiMsg = new AguiMessage("msg-mm-14", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // Document without source should be skipped + assertFalse(msg.hasContentBlocks(TextBlock.class)); + } + + @Test + void testConvertMultimodalSourceWithUnknownSourceType() { + Map source = Map.of("type", "ftp", "value", "ftp://file.dat"); + List> parts = List.of(Map.of("type", "image", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-15", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + // Unknown source type should be skipped + assertFalse(msg.hasContentBlocks(ImageBlock.class)); + } + + @Test + void testConvertMultimodalAudioWithBase64() { + Map source = + Map.of("type", "data", "value", "AAAA", "mimeType", "audio/wav"); + List> parts = List.of(Map.of("type", "audio", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-16", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(AudioBlock.class)); + AudioBlock audioBlock = msg.getFirstContentBlock(AudioBlock.class); + assertInstanceOf(Base64Source.class, audioBlock.getSource()); + } + + @Test + void 
testConvertMultimodalVideoWithBase64() { + Map source = + Map.of("type", "data", "value", "AAAA", "mimeType", "video/mp4"); + List> parts = List.of(Map.of("type", "video", "source", source)); + AguiMessage aguiMsg = new AguiMessage("msg-mm-17", "user", parts, null, null); + + Msg msg = converter.toMsg(aguiMsg); + + assertTrue(msg.hasContentBlocks(VideoBlock.class)); + VideoBlock videoBlock = msg.getFirstContentBlock(VideoBlock.class); + assertInstanceOf(Base64Source.class, videoBlock.getSource()); + } } diff --git a/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/model/AguiModelTest.java b/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/model/AguiModelTest.java index 98d03c973..b32e2e558 100644 --- a/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/model/AguiModelTest.java +++ b/agentscope-extensions/agentscope-extensions-agui/src/test/java/io/agentscope/core/agui/model/AguiModelTest.java @@ -176,6 +176,63 @@ void testNullContent() { assertNull(msg.getContent()); } + + @Test + void testMultimodalContentDetection() { + List> parts = List.of(Map.of("type", "text", "text", "Hello")); + AguiMessage msg = new AguiMessage("msg-1", "user", parts, null, null); + + assertTrue(msg.isMultimodalContent()); + assertNull(msg.getContent()); // String content returns null for multimodal + assertNotNull(msg.getMultimodalContent()); + assertEquals(1, msg.getMultimodalContent().size()); + assertEquals(parts, msg.getContentObject()); + } + + @Test + void testSimpleTextContentIsNotMultimodal() { + AguiMessage msg = AguiMessage.userMessage("msg-1", "Hello world"); + + assertFalse(msg.isMultimodalContent()); + assertEquals("Hello world", msg.getContent()); + assertNull(msg.getMultimodalContent()); + assertEquals("Hello world", msg.getContentObject()); + } + + @Test + void testNullContentIsNotMultimodal() { + AguiMessage msg = new AguiMessage("msg-1", "user", null, null, 
null); + + assertFalse(msg.isMultimodalContent()); + assertNull(msg.getContent()); + assertNull(msg.getMultimodalContent()); + assertNull(msg.getContentObject()); + } + + @Test + void testMultimodalJsonDeserialization() throws JsonProcessingException { + String json = + "{\"id\":\"msg-1\",\"role\":\"user\",\"content\":" + + "[{\"type\":\"text\",\"text\":\"Hello\"}]}"; + + AguiMessage msg = JsonUtils.getJsonCodec().fromJson(json, AguiMessage.class); + + assertEquals("msg-1", msg.getId()); + assertEquals("user", msg.getRole()); + assertTrue(msg.isMultimodalContent()); + assertNotNull(msg.getMultimodalContent()); + assertEquals(1, msg.getMultimodalContent().size()); + } + + @Test + void testMultimodalContentEquals() { + List> parts = List.of(Map.of("type", "text", "text", "Hello")); + AguiMessage msg1 = new AguiMessage("msg-1", "user", parts, null, null); + AguiMessage msg2 = new AguiMessage("msg-1", "user", parts, null, null); + + assertEquals(msg1, msg2); + assertEquals(msg1.hashCode(), msg2.hashCode()); + } } @Nested