diff --git a/sdk/go/agent/agent_test.go b/sdk/go/agent/agent_test.go index 354f5ded..aa14c0c2 100644 --- a/sdk/go/agent/agent_test.go +++ b/sdk/go/agent/agent_test.go @@ -512,7 +512,9 @@ func TestAI(t *testing.T) { Choices: []ai.Choice{ { Message: ai.Message{ - Content: "AI response", + Content: []ai.ContentPart{ + {Type: "text", Text: "AI response"}, + }, }, }, }, diff --git a/sdk/go/ai/README.md b/sdk/go/ai/README.md index dc80d364..c8b4e216 100644 --- a/sdk/go/ai/README.md +++ b/sdk/go/ai/README.md @@ -146,6 +146,32 @@ Functional options for customizing AI requests: - `ai.WithStream()` - Enable streaming - `ai.WithJSONMode()` - Enable JSON object mode - `ai.WithSchema(schema interface{})` - Enable structured outputs with schema +##### Multimodal +- `ai.WithImageFile(path string)` - Attach an image from a local file +- `ai.WithImageURL(url string)` - Attach an image from a remote URL +- `ai.WithImageBytes(data []byte, mimeType string)` - Attach an image from raw bytes (SDK encodes automatically) + +### Multimodal Inputs (Images) + +You can attach images to AI requests from a local file, a remote URL, or raw bytes.
+ +```go +// Image from file +response, _ := agent.AI(ctx, "Describe this image", + ai.WithImageFile("./photo.jpg"), +) + +// Image from URL +response, _ = agent.AI(ctx, "Describe this image", + ai.WithImageURL("https://example.com/image.jpg"), +) + +// Image from bytes +data, _ := os.ReadFile("image.png") +response, _ = agent.AI(ctx, "What's in this image?", + ai.WithImageBytes(data, "image/png"), +) +``` ### Response Methods diff --git a/sdk/go/ai/client.go b/sdk/go/ai/client.go index 076dfad3..76764dc4 100644 --- a/sdk/go/ai/client.go +++ b/sdk/go/ai/client.go @@ -39,7 +39,12 @@ func (c *Client) Complete(ctx context.Context, prompt string, opts ...Option) (* // Build base request req := &Request{ Messages: []Message{ - {Role: "user", Content: prompt}, + { + Role: "user", + Content: []ContentPart{ + {Type: "text", Text: prompt}, + }, + }, }, Model: c.config.Model, Temperature: &c.config.Temperature, @@ -78,7 +83,17 @@ func (c *Client) CompleteWithMessages(ctx context.Context, messages []Message, o func (c *Client) doRequest(ctx context.Context, req *Request) (*Response, error) { // Marshal request - body, err := json.Marshal(req) + + var body []byte + var err error + + if c.config.IsOpenRouter() { + payload := transformForOpenRouter(req) + body, err = json.Marshal(payload) + fmt.Printf("[DEBUG] OpenRouter JSON:\n%s\n", string(body)) + } else { + body, err = json.Marshal(req) + } if err != nil { return nil, fmt.Errorf("marshal request: %w", err) } @@ -155,7 +170,12 @@ func (c *Client) StreamComplete(ctx context.Context, prompt string, opts ...Opti opts = append(opts, WithStream()) req := &Request{ Messages: []Message{ - {Role: "user", Content: prompt}, + { + Role: "user", + Content: []ContentPart{ + {Type: "text", Text: prompt}, + }, + }, }, Model: c.config.Model, Temperature: &c.config.Temperature, diff --git a/sdk/go/ai/client_test.go b/sdk/go/ai/client_test.go index d202c654..cbbbc1ac 100644 --- a/sdk/go/ai/client_test.go +++ b/sdk/go/ai/client_test.go @@ -73,7 
+73,9 @@ func TestComplete(t *testing.T) { assert.NoError(t, err) assert.Len(t, req.Messages, 1) assert.Equal(t, "user", req.Messages[0].Role) - assert.Equal(t, "Hello", req.Messages[0].Content) + assert.Len(t, req.Messages[0].Content, 1) + assert.Equal(t, "text", req.Messages[0].Content[0].Type) + assert.Equal(t, "Hello", req.Messages[0].Content[0].Text) // Send response resp := Response{ @@ -85,8 +87,13 @@ func TestComplete(t *testing.T) { { Index: 0, Message: Message{ - Role: "assistant", - Content: "Hello! How can I help you?", + Role: "assistant", + Content: []ContentPart{ + { + Type: "text", + Text: "Hello! How can I help you?", + }, + }, }, FinishReason: "stop", }, @@ -97,6 +104,7 @@ func TestComplete(t *testing.T) { TotalTokens: 15, }, } + w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode(resp) })) @@ -124,7 +132,17 @@ func TestComplete_WithAPIKeyOverride(t *testing.T) { resp := Response{ Choices: []Choice{ - {Message: Message{Content: "ok"}}, + { + Message: Message{ + Role: "assistant", + Content: []ContentPart{ + { + Type: "text", + Text: "ok", + }, + }, + }, + }, }, } w.WriteHeader(http.StatusOK) @@ -162,9 +180,20 @@ func TestComplete_WithOptions(t *testing.T) { resp := Response{ Choices: []Choice{ - {Message: Message{Content: "Response"}}, + { + Message: Message{ + Role: "assistant", + Content: []ContentPart{ + { + Type: "text", + Text: "Response", + }, + }, + }, + }, }, } + w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode(resp) })) @@ -197,9 +226,20 @@ func TestComplete_WithOpenRouterHeaders(t *testing.T) { receivedHeaders = r.Header resp := Response{ Choices: []Choice{ - {Message: Message{Content: "Response"}}, + { + Message: Message{ + Role: "assistant", + Content: []ContentPart{ + { + Type: "text", + Text: "Reponse", + }, + }, + }, + }, }, } + w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode(resp) })) @@ -325,9 +365,20 @@ func TestCompleteWithMessages(t *testing.T) { resp := Response{ Choices: []Choice{ - {Message: Message{Content: 
"Response"}}, + { + Message: Message{ + Role: "assistant", // optional but recommended + Content: []ContentPart{ + { + Type: "text", + Text: "Response", + }, + }, + }, + }, }, } + w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode(resp) })) @@ -343,8 +394,18 @@ func TestCompleteWithMessages(t *testing.T) { require.NoError(t, err) messages := []Message{ - {Role: "system", Content: "You are helpful"}, - {Role: "user", Content: "Hello"}, + { + Role: "system", + Content: []ContentPart{ + {Type: "text", Text: "You are helpful"}, + }, + }, + { + Role: "user", + Content: []ContentPart{ + {Type: "text", Text: "Hello"}, + }, + }, } resp, err := client.CompleteWithMessages(context.Background(), messages) diff --git a/sdk/go/ai/config.go b/sdk/go/ai/config.go index 73328fb3..b96649a2 100644 --- a/sdk/go/ai/config.go +++ b/sdk/go/ai/config.go @@ -89,3 +89,58 @@ func (c *Config) IsOpenRouter() bool { return c.BaseURL == "https://openrouter.ai/api/v1" || c.BaseURL == "https://openrouter.ai/api/v1/" } + +type OpenRouterRequest struct { + Messages []OpenRouterMessage `json:"messages"` + Model string `json:"model,omitempty"` +} + +type OpenRouterMessage struct { + Role string `json:"role"` + Content []OpenRouterContentPart `json:"content"` +} + +type OpenRouterContentPart struct { + Type string `json:"type"` + Text string `json:"text,omitempty"` + ImageURL *ImageData `json:"image_url,omitempty"` +} + +type ImageData struct { + URL string `json:"url"` + Detail string `json:"detail,omitempty"` +} + +func transformForOpenRouter(req *Request) *OpenRouterRequest { + var messages []OpenRouterMessage + + for _, m := range req.Messages { + msg := OpenRouterMessage{ + Role: m.Role, + } + + for _, c := range m.Content { + part := OpenRouterContentPart{ + Type: c.Type, + Text: c.Text, + } + + // Map Go struct's input_image to OpenRouter's image_url + if c.Type == "input_image" && c.ImageURL != "" { + part.Type = "image_url" + part.ImageURL = &ImageData{ + URL: c.ImageURL, + } + } + + 
// detectMIMEType returns the image MIME type implied by the file extension of
// path. Matching is case-insensitive, so "photo.JPG" and "photo.jpg" both map
// to "image/jpeg". Any extension other than .png, .jpg, .jpeg, .gif or .webp
// yields the generic "application/octet-stream".
func detectMIMEType(path string) string {
	// Cameras and Windows tooling routinely produce upper-case extensions;
	// normalise once instead of enumerating every case variant.
	lower := strings.ToLower(path)

	switch {
	case strings.HasSuffix(lower, ".png"):
		return "image/png"
	case strings.HasSuffix(lower, ".jpg"), strings.HasSuffix(lower, ".jpeg"):
		return "image/jpeg"
	case strings.HasSuffix(lower, ".gif"):
		return "image/gif"
	case strings.HasSuffix(lower, ".webp"):
		return "image/webp"
	default:
		return "application/octet-stream"
	}
}
// Message represents a chat message. Content is always held as a list of
// parts, even when the wire format carried a bare string.
type Message struct {
	Role    string        `json:"role"`
	Content []ContentPart `json:"content"`
}

// ContentPart is one element of a message's content: either a piece of text
// or a reference to an image.
type ContentPart struct {
	Type     string `json:"type"`                // "text" or "input_image"
	Text     string `json:"text,omitempty"`      // set when Type is "text"
	ImageURL string `json:"image_url,omitempty"` // remote URL or data URI, set when Type is "input_image"
}

// UnmarshalJSON decodes a message whose "content" field is either a plain
// string or an array of content parts.
//
//   - A string becomes a single part of type "text".
//   - An array is decoded element-by-element into ContentPart values.
//   - A missing or null "content" leaves Content nil instead of failing.
//
// Any other shape (number, object, ...) is reported as an error.
func (m *Message) UnmarshalJSON(data []byte) error {
	// Alias has Message's fields but none of its methods, which stops
	// json.Unmarshal from recursing back into this function.
	type Alias Message
	aux := &struct {
		// Shadows Alias.Content so the raw bytes can be inspected before
		// deciding how to decode them.
		Content json.RawMessage `json:"content"`
		*Alias
	}{
		Alias: (*Alias)(m),
	}

	if err := json.Unmarshal(data, aux); err != nil {
		return err
	}

	raw := aux.Content

	// An absent key leaves raw empty; an explicit null arrives as "null".
	// Neither carries content, and neither is an error.
	if len(raw) == 0 || string(raw) == "null" {
		m.Content = nil
		return nil
	}

	// Dispatch on the first byte rather than trying each decoder in turn,
	// so the error reported is the one relevant to the actual shape.
	if raw[0] == '"' {
		var text string
		if err := json.Unmarshal(raw, &text); err != nil {
			return fmt.Errorf("decode message content string: %w", err)
		}
		m.Content = []ContentPart{{Type: "text", Text: text}}
		return nil
	}

	var parts []ContentPart
	if err := json.Unmarshal(raw, &parts); err != nil {
		return fmt.Errorf("decode message content parts: %w", err)
	}
	m.Content = parts
	return nil
}
return nil } } @@ -153,6 +196,78 @@ func WithSchema(schema interface{}) Option { } } +// Image options +func WithImageFile(path string) Option { + return func(r *Request) error { + data, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read image file: %w", err) + } + + mimeType := detectMIMEType(path) + encoded := base64.StdEncoding.EncodeToString(data) + + if len(r.Messages) == 0 { + r.Messages = append(r.Messages, Message{ + Role: "user", + Content: []ContentPart{}, + }) + } + + last := &r.Messages[len(r.Messages)-1] + last.Content = append(last.Content, ContentPart{ + Type: "input_image", + ImageURL: "data:" + mimeType + ";base64," + encoded, + }) + + return nil + } +} + +func WithImageURL(url string) Option { + return func(r *Request) error { + if len(r.Messages) == 0 { + r.Messages = append(r.Messages, Message{ + Role: "user", + Content: []ContentPart{}, + }) + } + + last := &r.Messages[len(r.Messages)-1] + last.Content = append(last.Content, ContentPart{ + Type: "input_image", + ImageURL: url, + }) + + return nil + } +} + +func WithImageBytes(data []byte, mimeType string) Option { + return func(r *Request) error { + if len(data) == 0 { + return nil + } + + encoded := base64.StdEncoding.EncodeToString(data) + + if len(r.Messages) == 0 { + r.Messages = append(r.Messages, Message{ + Role: "user", + Content: []ContentPart{}, + }) + } + + last := &r.Messages[len(r.Messages)-1] + last.Content = append(last.Content, ContentPart{ + Type: "input_image", + ImageURL: "data:" + mimeType + ";base64," + encoded, + }) + + return nil + } +} + // structToJSONSchema converts a Go struct to a JSON schema. // This is a simplified version - you may want to use a library like // github.com/invopop/jsonschema for production. 
diff --git a/sdk/go/ai/request_test.go b/sdk/go/ai/request_test.go index 5310aa6f..5c691bf7 100644 --- a/sdk/go/ai/request_test.go +++ b/sdk/go/ai/request_test.go @@ -2,6 +2,7 @@ package ai import ( "encoding/json" + "os" "reflect" "testing" @@ -11,16 +12,30 @@ import ( func TestWithSystem(t *testing.T) { req := &Request{ Messages: []Message{ - {Role: "user", Content: "Hello"}, + { + Role: "user", + Content: []ContentPart{ + {Type: "text", Text: "Hello"}, + }, + }, }, } err := WithSystem("You are a helpful assistant")(req) assert.NoError(t, err) assert.Len(t, req.Messages, 2) - assert.Equal(t, "system", req.Messages[0].Role) - assert.Equal(t, "You are a helpful assistant", req.Messages[0].Content) - assert.Equal(t, "user", req.Messages[1].Role) + + systemMsg := req.Messages[0] + assert.Equal(t, "system", systemMsg.Role) + assert.Len(t, systemMsg.Content, 1) + assert.Equal(t, "text", systemMsg.Content[0].Type) + assert.Equal(t, "You are a helpful assistant", systemMsg.Content[0].Text) + + userMsg := req.Messages[1] + assert.Equal(t, "user", userMsg.Role) + assert.Len(t, userMsg.Content, 1) + assert.Equal(t, "text", userMsg.Content[0].Type) + assert.Equal(t, "Hello", userMsg.Content[0].Text) } func TestWithModel(t *testing.T) { @@ -143,6 +158,123 @@ func TestWithSchema_InvalidType(t *testing.T) { assert.Error(t, err) } +func TestWithImageFile(t *testing.T) { + tempFile, err := os.CreateTemp("", "test_image_*.jpg") + assert.NoError(t, err) + defer os.Remove(tempFile.Name()) + + _, err = tempFile.Write([]byte{0xFF, 0xD8, 0xFF}) + assert.NoError(t, err) + tempFile.Close() + + req := &Request{} + err = WithImageFile(tempFile.Name())(req) + + assert.NoError(t, err) + + assert.Len(t, req.Messages, 1) + assert.Len(t, req.Messages[0].Content, 1) + + part := req.Messages[0].Content[0] + assert.Equal(t, "input_image", part.Type) + assert.Contains(t, part.ImageURL, "data:image/jpeg;base64,") +} + +func TestWithImageURL(t *testing.T) { + req := &Request{} + testURL := 
"https://example.com/image.jpg" + + err := WithImageURL(testURL)(req) + + assert.NoError(t, err) + + assert.Len(t, req.Messages, 1) + assert.Len(t, req.Messages[0].Content, 1) + + part := req.Messages[0].Content[0] + assert.Equal(t, "input_image", part.Type) + assert.Equal(t, testURL, part.ImageURL) +} + +func TestWithImageBytes(t *testing.T) { + req := &Request{} + testBytes := []byte{0xFF, 0xD8, 0xFF} + testMIMEType := "image/jpeg" + + err := WithImageBytes(testBytes, testMIMEType)(req) + + assert.NoError(t, err) + + assert.Len(t, req.Messages, 1) + assert.Len(t, req.Messages[0].Content, 1) + + part := req.Messages[0].Content[0] + assert.Equal(t, "input_image", part.Type) + assert.Contains(t, part.ImageURL, "data:image/jpeg;base64,") +} + +func TestWithImageFile_Error(t *testing.T) { + req := &Request{} + + err := WithImageFile("non_existent_file.jpg")(req) + + assert.Error(t, err) + assert.Len(t, req.Messages, 0) +} + +func TestWithImageBytes_EmptyInput(t *testing.T) { + req := &Request{} + + err := WithImageBytes(nil, "")(req) + + assert.NoError(t, err) + assert.Len(t, req.Messages, 0) +} + +func TestMultipleImages(t *testing.T) { + req := &Request{} + + req.Messages = append(req.Messages, Message{ + Role: "user", + Content: []ContentPart{}, + }) + + // Image via URL + err := WithImageURL("https://example.com/image1.jpg")(req) + assert.NoError(t, err) + + // Image via file + tempFile, err := os.CreateTemp("", "test_image_*.jpg") + assert.NoError(t, err) + defer os.Remove(tempFile.Name()) + + _, err = tempFile.Write([]byte{0xFF, 0xD8, 0xFF}) + assert.NoError(t, err) + tempFile.Close() + + err = WithImageFile(tempFile.Name())(req) + assert.NoError(t, err) + + testBytes := []byte{0x89, 0x50, 0x4E, 0x47} + err = WithImageBytes(testBytes, "image/png")(req) + assert.NoError(t, err) + + assert.Len(t, req.Messages, 1) + assert.Len(t, req.Messages[0].Content, 3) + + part1 := req.Messages[0].Content[0] + assert.Equal(t, "input_image", part1.Type) + assert.Equal(t, 
"https://example.com/image1.jpg", part1.ImageURL) + + part2 := req.Messages[0].Content[1] + assert.Equal(t, "input_image", part2.Type) + assert.Contains(t, part2.ImageURL, "data:image/jpeg;base64,") + + part3 := req.Messages[0].Content[2] + assert.Equal(t, "input_image", part3.Type) + assert.Contains(t, part3.ImageURL, "data:image/png;base64,") +} + func TestStructToJSONSchema(t *testing.T) { type User struct { ID int `json:"id"` @@ -249,7 +381,12 @@ func TestGoTypeToJSONType_WithPointer(t *testing.T) { func TestMultipleOptions(t *testing.T) { req := &Request{ Messages: []Message{ - {Role: "user", Content: "Hello"}, + { + Role: "user", + Content: []ContentPart{ + {Type: "text", Text: "Hello"}, + }, + }, }, } diff --git a/sdk/go/ai/response.go b/sdk/go/ai/response.go index 7e4e634d..045588a0 100644 --- a/sdk/go/ai/response.go +++ b/sdk/go/ai/response.go @@ -3,6 +3,7 @@ package ai import ( "encoding/json" "fmt" + "strings" ) // Response represents the API response from OpenAI/OpenRouter. @@ -65,10 +66,18 @@ type ErrorDetail struct { // Text returns the text content from the first choice. func (r *Response) Text() string { - if len(r.Choices) == 0 { + if len(r.Choices) == 0 || len(r.Choices[0].Message.Content) == 0 { return "" } - return r.Choices[0].Message.Content + + var sb strings.Builder + for _, part := range r.Choices[0].Message.Content { + if part.Type == "text" { + sb.WriteString(part.Text) + } + } + + return sb.String() } // JSON parses the response content as JSON into the provided destination. 
diff --git a/sdk/go/ai/response_test.go b/sdk/go/ai/response_test.go index f6998d99..1b1cda86 100644 --- a/sdk/go/ai/response_test.go +++ b/sdk/go/ai/response_test.go @@ -19,8 +19,10 @@ func TestResponse_Text(t *testing.T) { Choices: []Choice{ { Message: Message{ - Role: "assistant", - Content: "Hello, world!", + Role: "assistant", + Content: []ContentPart{ + {Type: "text", Text: "Hello, world!"}, + }, }, }, }, @@ -47,12 +49,16 @@ func TestResponse_Text(t *testing.T) { Choices: []Choice{ { Message: Message{ - Content: "First", + Content: []ContentPart{ + {Type: "text", Text: "First"}, + }, }, }, { Message: Message{ - Content: "Second", + Content: []ContentPart{ + {Type: "text", Text: "Second"}, + }, }, }, }, @@ -83,7 +89,9 @@ func TestResponse_JSON(t *testing.T) { Choices: []Choice{ { Message: Message{ - Content: `{"name":"John","age":30}`, + Content: []ContentPart{ + {Type: "text", Text: `{"name":"John","age":30}`}, + }, }, }, }, @@ -108,7 +116,9 @@ func TestResponse_JSON(t *testing.T) { Choices: []Choice{ { Message: Message{ - Content: "", + Content: []ContentPart{ + {Type: "text", Text: ""}, + }, }, }, }, @@ -122,7 +132,9 @@ func TestResponse_JSON(t *testing.T) { Choices: []Choice{ { Message: Message{ - Content: "not json", + Content: []ContentPart{ + {Type: "text", Text: "not json"}, + }, }, }, }, @@ -160,7 +172,9 @@ func TestResponse_Into(t *testing.T) { Choices: []Choice{ { Message: Message{ - Content: `{"value":42}`, + Content: []ContentPart{ + {Type: "text", Text: `{"value":42}`}, + }, }, }, }, @@ -258,8 +272,10 @@ func TestResponse_MarshalUnmarshal(t *testing.T) { { Index: 0, Message: Message{ - Role: "assistant", - Content: "Hello!", + Role: "assistant", + Content: []ContentPart{ + {Type: "text", Text: "Hello"}, + }, }, FinishReason: "stop", },