diff --git a/pkg/attachment/attachment.go b/pkg/attachment/attachment.go
new file mode 100644
index 000000000..28125ffb4
--- /dev/null
+++ b/pkg/attachment/attachment.go
@@ -0,0 +1,61 @@
+// Package attachment provides MIME-aware routing for document attachments.
+//
+// It defines how a chat.Document should be sent to a model: either dropped
+// (unsupported), wrapped in a plain-text envelope (StrategyTXT), or encoded
+// as inline base64 data (StrategyB64).
+package attachment
+
+import (
+ "fmt"
+
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// Strategy describes how an attachment should be handled before it is sent to
+// the provider. Decide selects one; the zero value is StrategyDrop.
+type Strategy int
+
+const (
+ // StrategyDrop means the attachment is not supported by the model or has no
+ // inline content; it is skipped, and callers are expected to log a warning.
+ StrategyDrop Strategy = iota
+
+ // StrategyTXT means the attachment should be wrapped in a TXTEnvelope and
+ // sent as plain text. Decide picks it when Source.InlineText is non-empty
+ // and Source.InlineData is empty.
+ StrategyTXT
+
+ // StrategyB64 means the attachment content (Source.InlineData) should be
+ // base64-encoded and sent as a native provider image/document block.
+ StrategyB64
+)
+
+// Decide picks the routing Strategy for doc based on what the model described
+// by mc can accept. The second result is a human-readable reason and is only
+// non-empty when the Strategy is StrategyDrop.
+//
+// Checks in order (first match wins): MIME rejected by mc.Supports → StrategyDrop
+// naming the MIME; InlineData non-empty → StrategyB64 (even if InlineText is
+// also set); InlineText non-empty → StrategyTXT; otherwise → StrategyDrop with
+// reason "no inline content".
+func Decide(doc chat.Document, mc modelcaps.ModelCapabilities) (Strategy, string) {
+	src := doc.Source
+	switch {
+	case !mc.Supports(doc.MimeType):
+		return StrategyDrop, fmt.Sprintf("model does not support MIME type %q", doc.MimeType)
+	case len(src.InlineData) > 0:
+		return StrategyB64, ""
+	case src.InlineText != "":
+		return StrategyTXT, ""
+	}
+	return StrategyDrop, "no inline content"
+}
+
+// TXTEnvelope wraps a text document body in an XML-like <document> tag so a
+// model can treat it as a named attachment. name and mimeType are quoted with
+// %q; body is inserted verbatim (not escaped). Shape:
+//	<document name="notes.txt" mime-type="text/plain">…body…</document>
+func TXTEnvelope(name, mimeType, body string) string {
+	return fmt.Sprintf("<document name=%q mime-type=%q>%s</document>", name, mimeType, body)
+}
diff --git a/pkg/attachment/decide_test.go b/pkg/attachment/decide_test.go
new file mode 100644
index 000000000..6eee38153
--- /dev/null
+++ b/pkg/attachment/decide_test.go
@@ -0,0 +1,140 @@
+package attachment_test
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/docker/docker-agent/pkg/attachment"
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// visionCaps returns capabilities for a model that accepts images and PDFs.
+func visionCaps() modelcaps.ModelCapabilities {
+ return modelcaps.CapsWith(true, true)
+}
+
+func textOnlyCaps() modelcaps.ModelCapabilities {
+ return modelcaps.CapsWith(false, false) // neither images nor PDFs
+}
+
+func imageNoPDFCaps() modelcaps.ModelCapabilities {
+ return modelcaps.CapsWith(true, false) // images yes, PDFs no
+}
+
+// TestDecide drives Decide through each routing branch; a case with no expected reason requires an empty reason.
+func TestDecide(t *testing.T) {
+	tests := []struct {
+		name          string
+		doc           chat.Document
+		caps          modelcaps.ModelCapabilities
+		wantStrategy  attachment.Strategy
+		wantReasonHas string // substring the reason must contain; empty means the reason must be ""
+	}{
+		{
+			name: "b64 image supported",
+			doc: chat.Document{
+				Name:     "photo.jpg",
+				MimeType: "image/jpeg",
+				Source:   chat.DocumentSource{InlineData: []byte{0xFF, 0xD8}},
+			},
+			caps:         visionCaps(),
+			wantStrategy: attachment.StrategyB64,
+		},
+		{
+			name: "txt text plain",
+			doc: chat.Document{
+				Name:     "notes.txt",
+				MimeType: "text/plain",
+				Source:   chat.DocumentSource{InlineText: "hello world"},
+			},
+			caps:         textOnlyCaps(),
+			wantStrategy: attachment.StrategyTXT,
+		},
+		{
+			name: "drop image when model has no vision",
+			doc: chat.Document{
+				Name:     "photo.jpg",
+				MimeType: "image/jpeg",
+				Source:   chat.DocumentSource{InlineData: []byte{0xFF, 0xD8}},
+			},
+			caps:          textOnlyCaps(),
+			wantStrategy:  attachment.StrategyDrop,
+			wantReasonHas: "does not support MIME type",
+		},
+		{
+			name: "drop pdf when model has no pdf support",
+			doc: chat.Document{
+				Name:     "doc.pdf",
+				MimeType: "application/pdf",
+				Source:   chat.DocumentSource{InlineData: []byte{0x25, 0x50, 0x44, 0x46}},
+			},
+			caps:          imageNoPDFCaps(),
+			wantStrategy:  attachment.StrategyDrop,
+			wantReasonHas: "does not support MIME type",
+		},
+		{
+			name: "drop no inline content",
+			doc: chat.Document{
+				Name:     "empty.md",
+				MimeType: "text/markdown", // Source left at its zero value: nothing inline
+			},
+			caps:          textOnlyCaps(),
+			wantStrategy:  attachment.StrategyDrop,
+			wantReasonHas: "no inline content",
+		},
+		{
+			name: "b64 pdf when pdf supported",
+			doc: chat.Document{
+				Name:     "spec.pdf",
+				MimeType: "application/pdf",
+				Source:   chat.DocumentSource{InlineData: []byte{0x25, 0x50, 0x44, 0x46}},
+			},
+			caps:         visionCaps(),
+			wantStrategy: attachment.StrategyB64,
+		},
+		{
+			name: "drop office doc (DOCX is binary, not supported without models.dev office modality)",
+			doc: chat.Document{
+				Name:     "report.docx",
+				MimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+				Source:   chat.DocumentSource{InlineData: []byte{0x50, 0x4B}}, // ZIP magic bytes
+			},
+			caps:          visionCaps(), // even full caps can't send DOCX — no modality
+			wantStrategy:  attachment.StrategyDrop,
+			wantReasonHas: "does not support MIME type",
+		},
+		{
+			name: "b64 wins over txt when both inline sources present",
+			doc: chat.Document{
+				Name:     "data.txt",
+				MimeType: "text/plain",
+				Source:   chat.DocumentSource{InlineData: []byte("hello"), InlineText: "hello"},
+			},
+			caps:         textOnlyCaps(),
+			wantStrategy: attachment.StrategyB64,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			gotStrategy, gotReason := attachment.Decide(tc.doc, tc.caps)
+			if gotStrategy != tc.wantStrategy {
+				t.Errorf("strategy: got %d, want %d", gotStrategy, tc.wantStrategy)
+			}
+			if tc.wantReasonHas == "" && gotReason != "" {
+				t.Errorf("reason: got %q, want empty", gotReason)
+			} else if !strings.Contains(gotReason, tc.wantReasonHas) {
+				t.Errorf("reason %q does not contain %q", gotReason, tc.wantReasonHas)
+			}
+		})
+	}
+}
+
+// TestTXTEnvelope pins the exact envelope text: name and MIME type as quoted attributes around the verbatim body.
+func TestTXTEnvelope(t *testing.T) {
+	got := attachment.TXTEnvelope("readme.md", "text/markdown", "# Hello")
+	if want := `<document name="readme.md" mime-type="text/markdown"># Hello</document>`; got != want {
+		t.Errorf("TXTEnvelope:\ngot %q\nwant %q", got, want)
+	}
+}
diff --git a/pkg/attachment/modelcaps/modelcaps.go b/pkg/attachment/modelcaps/modelcaps.go
new file mode 100644
index 000000000..bbc1680d5
--- /dev/null
+++ b/pkg/attachment/modelcaps/modelcaps.go
@@ -0,0 +1,157 @@
+// Package modelcaps provides model capability queries for the attachment system.
+// It translates models.dev modality information into MIME-type support decisions
+// used by the attachment routing logic.
+package modelcaps
+
+import (
+ "context"
+ "log/slog"
+ "strings"
+ "time"
+
+ "github.com/docker/docker-agent/pkg/modelsdev"
+)
+
+// ModelCapabilities describes what MIME types a given model can accept as
+// document attachments. The zero value is the conservative text-only set.
+type ModelCapabilities struct {
+ // supportsImage is true when the model accepts image/* MIME types.
+ supportsImage bool
+ // supportsPDF is true when the model accepts application/pdf.
+ supportsPDF bool
+ // modelFound is false when models.dev has no record for this model (or the
+ // lookup failed). Supports does not read it; the fallback is text-only because the other flags stay false.
+ modelFound bool
+}
+
+// isOfficeMIME reports whether mt is one of the Office-style document MIME
+// types this package refuses to route: OOXML (DOCX, XLSX, PPTX), legacy
+// Excel/PowerPoint/Word, and RTF under both of its MIME names.
+//
+// The comparison is exact and case-sensitive, so callers are expected to pass
+// an already lower-cased MIME type. Such documents cannot be naively
+// TXT-enveloped and require explicit model support.
+func isOfficeMIME(mt string) bool {
+	return mt == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
+		mt == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ||
+		mt == "application/vnd.openxmlformats-officedocument.presentationml.presentation" ||
+		mt == "application/vnd.ms-excel" ||
+		mt == "application/vnd.ms-powerpoint" ||
+		mt == "application/msword" ||
+		mt == "application/rtf" ||
+		mt == "text/rtf"
+}
+
+// Supports reports whether the model can accept an attachment with the given
+// MIME type. The comparison is case-insensitive.
+//
+// Resolution rules (first match wins):
+//  1. image/* → requires supportsImage (models.dev "image" modality)
+//  2. application/pdf → requires supportsPDF (models.dev "pdf" modality)
+//  3. Office document MIMEs (DOCX, XLSX, PPTX, RTF, …) → false for all models.
+//     models.dev currently has no "document" or "office" modality field, so
+//     there is nothing to consult until the schema is extended.
+//  4. text/* → always supported (plain text; TXT envelope is universally safe)
+//  5. Everything else (audio/*, video/*, unknown binary) → false
+//
+// Rule 3 deliberately precedes rule 4. isOfficeMIME lists "text/rtf"; were the
+// text/* prefix tested first, that entry could never match and RTF would be
+// accepted as text/rtf while being rejected as application/rtf.
+func (mc ModelCapabilities) Supports(mimeType string) bool {
+	mt := strings.ToLower(mimeType)
+	switch {
+	case strings.HasPrefix(mt, "image/"):
+		return mc.supportsImage
+	case mt == "application/pdf":
+		return mc.supportsPDF
+	case isOfficeMIME(mt):
+		// Rejected for every model, including the RTF alias that lives under
+		// text/; these formats cannot be naively TXT-enveloped.
+		return false
+	case strings.HasPrefix(mt, "text/"):
+		// text/plain, text/markdown, text/html, text/csv, … are actual text and
+		// the TXT envelope works universally.
+		return true
+	default:
+		// audio/*, video/*, and all other unknown binary types are not supported.
+		return false
+	}
+}
+
+// loadTimeout bounds a single models.dev capability lookup made by Load and
+// LoadFromStore; when it expires the caller gets conservative text-only caps.
+const loadTimeout = 10 * time.Second
+
+// Load resolves the capabilities of modelID, given in the "provider/model"
+// form used by models.dev (e.g. "anthropic/claude-3-5-sonnet-20241022").
+//
+// Failures never surface to the caller: if the store cannot be opened, the
+// model lookup fails, or the lookup times out, Load returns the zero-capability
+// (text-only) value. The returned error is always nil.
+func Load(modelID string) (ModelCapabilities, error) {
+	var fallback ModelCapabilities // modelFound stays false: text-only
+
+	ctx, cancel := context.WithTimeout(context.Background(), loadTimeout)
+	defer cancel()
+
+	store, storeErr := modelsdev.NewStore()
+	if storeErr != nil {
+		slog.Warn("modelcaps: failed to load models.dev store, using conservative caps",
+			"error", storeErr, "model", modelID)
+		return fallback, nil
+	}
+
+	model, lookupErr := store.GetModel(ctx, modelID)
+	if lookupErr != nil {
+		// Only an expired or cancelled context is logged; other lookup errors stay silent.
+		if ctx.Err() != nil {
+			slog.Warn("modelcaps: models.dev lookup timed out, using conservative caps",
+				"model", modelID, "timeout", loadTimeout)
+		}
+		return fallback, nil
+	}
+
+	caps := ModelCapabilities{modelFound: true}
+	for _, modality := range model.Modalities.Input {
+		if kind := strings.ToLower(modality); kind == "image" {
+			caps.supportsImage = true
+		} else if kind == "pdf" {
+			caps.supportsPDF = true
+		}
+	}
+	return caps, nil
+}
+
+// CapsWith builds a ModelCapabilities value straight from the two flags,
+// marking the model as found. It exists so tests and provider code can obtain
+// capabilities without a models.dev lookup.
+//
+// supportsImage enables image/* attachments; supportsPDF enables application/pdf.
+func CapsWith(supportsImage, supportsPDF bool) ModelCapabilities {
+	caps := ModelCapabilities{modelFound: true}
+	caps.supportsImage, caps.supportsPDF = supportsImage, supportsPDF
+	return caps
+}
+
+// LoadFromStore performs the same lookup as Load against a caller-supplied
+// *modelsdev.Store, which lets tests inject a pre-populated in-memory store.
+// Any lookup error yields the text-only fallback; the lookup is bounded by loadTimeout.
+func LoadFromStore(store *modelsdev.Store, modelID string) ModelCapabilities {
+	ctx, cancel := context.WithTimeout(context.Background(), loadTimeout)
+	defer cancel()
+
+	model, err := store.GetModel(ctx, modelID)
+	if err != nil {
+		return ModelCapabilities{}
+	}
+
+	caps := ModelCapabilities{modelFound: true}
+	for _, modality := range model.Modalities.Input {
+		if kind := strings.ToLower(modality); kind == "image" {
+			caps.supportsImage = true
+		} else if kind == "pdf" {
+			caps.supportsPDF = true
+		}
+	}
+	return caps
+}
diff --git a/pkg/attachment/modelcaps/modelcaps_test.go b/pkg/attachment/modelcaps/modelcaps_test.go
new file mode 100644
index 000000000..a473887a6
--- /dev/null
+++ b/pkg/attachment/modelcaps/modelcaps_test.go
@@ -0,0 +1,167 @@
+package modelcaps_test
+
+import (
+ "testing"
+
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/modelsdev"
+)
+
+// buildStore wraps providers in a modelsdev.Database and returns the Store that
+// modelsdev.NewDatabaseStore builds from it.
+func buildStore(providers map[string]modelsdev.Provider) *modelsdev.Store {
+	return modelsdev.NewDatabaseStore(&modelsdev.Database{Providers: providers})
+}
+
+// TestLoadFromStore_VisionModel checks that a model declaring "image" and
+// "pdf" input modalities accepts image/* and application/pdf attachments, in
+// addition to text, which is always accepted.
+func TestLoadFromStore_VisionModel(t *testing.T) {
+	sonnet := modelsdev.Model{
+		Name: "Claude 3.5 Sonnet",
+		Modalities: modelsdev.Modalities{
+			Input:  []string{"text", "image", "pdf"},
+			Output: []string{"text"},
+		},
+	}
+
+	store := buildStore(map[string]modelsdev.Provider{
+		"anthropic": {Models: map[string]modelsdev.Model{"claude-3-5-sonnet": sonnet}},
+	})
+	mc := modelcaps.LoadFromStore(store, "anthropic/claude-3-5-sonnet")
+
+	// Every MIME below must be accepted; failure is the message reported otherwise.
+	for _, tc := range []struct{ mime, failure string }{
+		{"image/jpeg", "expected image/jpeg to be supported for vision model"},
+		{"image/png", "expected image/png to be supported for vision model"},
+		{"application/pdf", "expected application/pdf to be supported for pdf model"},
+		{"text/plain", "expected text/plain to always be supported"},
+	} {
+		if !mc.Supports(tc.mime) {
+			t.Error(tc.failure)
+		}
+	}
+}
+
+// TestLoadFromStore_TextOnlyModel checks that a model declaring only "text"
+// input rejects images and PDFs while still accepting text/* attachments.
+func TestLoadFromStore_TextOnlyModel(t *testing.T) {
+	gpt35 := modelsdev.Model{
+		Name: "GPT-3.5 Turbo",
+		Modalities: modelsdev.Modalities{
+			Input:  []string{"text"},
+			Output: []string{"text"},
+		},
+	}
+	store := buildStore(map[string]modelsdev.Provider{
+		"openai": {Models: map[string]modelsdev.Model{"gpt-3.5-turbo": gpt35}},
+	})
+	mc := modelcaps.LoadFromStore(store, "openai/gpt-3.5-turbo")
+
+	for _, tc := range []struct {
+		mime    string
+		want    bool
+		failure string
+	}{
+		{"image/jpeg", false, "expected image/jpeg NOT to be supported for text-only model"},
+		{"application/pdf", false, "expected application/pdf NOT to be supported for text-only model"},
+		// Text MIMEs are always allowed
+		{"text/plain", true, "expected text/plain to always be supported"},
+		{"text/markdown", true, "expected text/markdown to always be supported"},
+	} {
+		if mc.Supports(tc.mime) != tc.want {
+			t.Error(tc.failure)
+		}
+	}
+}
+
+// TestLoadFromStore_ModelNotFound: a model absent from the store falls back to text-only capabilities.
+func TestLoadFromStore_ModelNotFound(t *testing.T) {
+	emptyStore := buildStore(map[string]modelsdev.Provider{})
+	mc := modelcaps.LoadFromStore(emptyStore, "unknown/nonexistent-model")
+	for _, tc := range []struct{ mime, failure string }{
+		{"image/jpeg", "expected image/jpeg NOT to be supported for unknown model"},
+		{"application/pdf", "expected application/pdf NOT to be supported for unknown model"},
+	} {
+		if mc.Supports(tc.mime) {
+			t.Error(tc.failure)
+		}
+	}
+	if !mc.Supports("text/plain") {
+		t.Error("expected text/plain to always be supported even for unknown model")
+	}
+}
+
+// TestLoadFromStore_OfficeDocsNotAllowed checks that Office document MIMEs
+// (DOCX, XLSX, etc.) are rejected even for a model declaring text, image and
+// pdf input. They are binary formats that cannot be naively TXT-enveloped, and
+// models.dev has no "office" or "document" modality to opt in with.
+func TestLoadFromStore_OfficeDocsNotAllowed(t *testing.T) {
+	gpt4o := modelsdev.Model{
+		Name: "GPT-4o",
+		Modalities: modelsdev.Modalities{
+			Input:  []string{"text", "image", "pdf"},
+			Output: []string{"text"},
+		},
+	}
+
+	store := buildStore(map[string]modelsdev.Provider{
+		"openai": {Models: map[string]modelsdev.Model{"gpt-4o": gpt4o}},
+	})
+	mc := modelcaps.LoadFromStore(store, "openai/gpt-4o")
+
+	officeMIMEs := []string{
+		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+		"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+		"application/vnd.openxmlformats-officedocument.presentationml.presentation",
+		"application/msword",
+		"application/vnd.ms-excel",
+		"application/rtf",
+	}
+
+	for _, officeMIME := range officeMIMEs {
+		if mc.Supports(officeMIME) {
+			t.Errorf("expected Office MIME %q NOT to be supported (models.dev has no document modality)", officeMIME)
+		}
+	}
+}
+
+// TestCapsWith checks that each CapsWith flag controls only its own MIME family.
+func TestCapsWith(t *testing.T) {
+	imageOnly := modelcaps.CapsWith(true, false)
+	if got := imageOnly.Supports("image/jpeg"); !got {
+		t.Error("expected image/jpeg to be supported")
+	}
+	if got := imageOnly.Supports("application/pdf"); got {
+		t.Error("expected pdf NOT to be supported")
+	}
+
+	if noCaps := modelcaps.CapsWith(false, false); noCaps.Supports("image/png") {
+		t.Error("expected image/png NOT to be supported")
+	}
+}
+
+// TestSupports_AudioVideoRejected verifies that audio, video, generic binary
+// and Office document MIMEs are rejected even by a model that accepts both
+// images and PDFs; none of them map to a modality that Supports recognises.
+func TestSupports_AudioVideoRejected(t *testing.T) {
+	rejected := []string{
+		"audio/mp3",
+		"audio/wav",
+		"audio/ogg",
+		"video/mp4",
+		"video/webm",
+		"application/octet-stream",
+		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+		"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+		"application/msword",
+	}
+
+	fullCaps := modelcaps.CapsWith(true, true) // images and PDFs both enabled
+	for _, mime := range rejected {
+		if fullCaps.Supports(mime) {
+			t.Errorf("expected %q to NOT be supported (not in Phase 1 allowlist)", mime)
+		}
+	}
+}
diff --git a/pkg/chat/chat.go b/pkg/chat/chat.go
index 9e4a2ced0..519df89f0 100644
--- a/pkg/chat/chat.go
+++ b/pkg/chat/chat.go
@@ -29,9 +29,11 @@ const (
type MessagePartType string
const (
- MessagePartTypeText MessagePartType = "text"
+ MessagePartTypeText MessagePartType = "text"
+ // MessagePartTypeImageURL is superseded by MessagePartTypeDocument. Will be removed in a future release.
MessagePartTypeImageURL MessagePartType = "image_url"
- MessagePartTypeFile MessagePartType = "file"
+ // MessagePartTypeFile is superseded by MessagePartTypeDocument. Will be removed in a future release.
+ MessagePartTypeFile MessagePartType = "file"
)
type ImageURLDetail string
@@ -106,10 +108,14 @@ type MessageFile struct {
}
type MessagePart struct {
- Type MessagePartType `json:"type,omitempty"`
- Text string `json:"text,omitempty"`
+ Type MessagePartType `json:"type,omitempty"`
+ Text string `json:"text,omitempty"`
+ // Note: superseded by Document+MessagePartTypeDocument. Will be removed in a future release.
ImageURL *MessageImageURL `json:"image_url,omitempty"`
- File *MessageFile `json:"file,omitempty"`
+ // Note: superseded by Document+MessagePartTypeDocument. Will be removed in a future release.
+ File *MessageFile `json:"file,omitempty"`
+ // Document is set when Type is MessagePartTypeDocument.
+ Document *Document `json:"document,omitempty"`
}
// FinishReason represents the reason why the model finished generating a response
diff --git a/pkg/chat/document.go b/pkg/chat/document.go
new file mode 100644
index 000000000..b24a85f9c
--- /dev/null
+++ b/pkg/chat/document.go
@@ -0,0 +1,46 @@
+package chat
+
+// MessagePartTypeDocument is the part type for a structured document attachment.
+// Use this type when attaching files (images, PDFs, text, Office docs, etc.) to
+// a message. The MessagePart.Document field must be set when this type is used.
+//
+// This supersedes MessagePartTypeFile and MessagePartTypeImageURL, which are
+// deprecated but remain supported for backward compatibility.
+const MessagePartTypeDocument MessagePartType = "document"
+
+// DocumentSource holds the actual content of a document. Exactly one of the
+// fields should be set; if both are, attachment.Decide routes on InlineData.
+type DocumentSource struct {
+ // InlineText holds the raw text for text/* MIME types (TXT, MD, HTML, CSV, …).
+ // Used for StrategyTXT attachments; omitted from JSON when empty.
+ InlineText string `json:"inline_text,omitempty"`
+
+ // InlineData holds binary content (images, PDFs, …) that is base64-encoded
+ // when sent to the provider. Used for StrategyB64; omitted from JSON when empty.
+ InlineData []byte `json:"inline_data,omitempty"`
+}
+
+// Document represents a file attachment in a message part. It carries
+// the file name, post-processing MIME type, and the actual content via Source.
+//
+// The MimeType field always reflects the final MIME that the attachment system
+// will use when sending to the provider (e.g. "image/jpeg" after image
+// normalisation, never the original "image/bmp").
+type Document struct {
+ // Name is the display name of the document (e.g. "report.pdf").
+ Name string `json:"name"`
+
+ // MimeType is the post-processing MIME type of the document. For images
+ // this is always "image/jpeg" or "image/png" regardless of the original
+ // format. For text files it is the exact MIME (e.g. "text/plain",
+ // "text/markdown", "text/html"). For binary documents it is the original
+ // MIME (e.g. "application/pdf"). Capability checks lower-case it first.
+ MimeType string `json:"mime_type"`
+
+ // Size is the byte length of the document content (InlineData or InlineText).
+ // Optional; zero means unknown and is omitted from JSON.
+ Size int64 `json:"size,omitempty"`
+
+ // Source holds the actual document content (inline text or inline bytes).
+ Source DocumentSource `json:"source"`
+}
diff --git a/pkg/model/provider/anthropic/attachments.go b/pkg/model/provider/anthropic/attachments.go
new file mode 100644
index 000000000..0ee4d73e4
--- /dev/null
+++ b/pkg/model/provider/anthropic/attachments.go
@@ -0,0 +1,91 @@
+package anthropic
+
+import (
+ "context"
+ "encoding/base64"
+ "fmt"
+ "log/slog"
+ "strings"
+
+ "github.com/anthropics/anthropic-sdk-go"
+
+ "github.com/docker/docker-agent/pkg/attachment"
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// convertDocument turns a chat.Document into standard (non-Beta) Anthropic SDK
+// content blocks, using the capabilities modelcaps.Load reports for modelID.
+//
+// Routing (implemented by convertDocumentWithCaps):
+//   - image/* with InlineData → ImageBlockParam (base64 source)
+//   - application/pdf with InlineData → DocumentBlockParam (base64)
+//   - text with InlineText → TextBlockParam with TXTEnvelope
+//   - unsupported / no content → nil blocks and nil error (logged as warning)
+func convertDocument(ctx context.Context, doc chat.Document, modelID string) ([]anthropic.ContentBlockParamUnion, error) {
+	caps, _ := modelcaps.Load(modelID) // Load documents its error as always nil
+	return convertDocumentWithCaps(ctx, doc, caps)
+}
+
+// convertDocumentWithCaps is the caps-injectable variant used by tests. It asks
+// attachment.Decide for a strategy and builds the matching Anthropic blocks:
+//   - StrategyDrop → (nil, nil) after logging a warning
+//   - StrategyTXT → one text block holding a TXTEnvelope of InlineText
+//   - StrategyB64 → image block for image MIMEs, document block for PDF
+//
+// text/* content that arrives as InlineData is enveloped as readable text
+// instead of base64: the bytes already are the text the model should see.
+// An unrecognised strategy value is reported as an error.
+func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelcaps.ModelCapabilities) ([]anthropic.ContentBlockParamUnion, error) {
+	// textBlocks wraps body in a TXT envelope carried by a single text block.
+	textBlocks := func(body string) []anthropic.ContentBlockParamUnion {
+		envelope := attachment.TXTEnvelope(doc.Name, doc.MimeType, body)
+		return []anthropic.ContentBlockParamUnion{{OfText: &anthropic.TextBlockParam{Text: envelope}}}
+	}
+
+	strategy, reason := attachment.Decide(doc, mc)
+	switch strategy {
+	case attachment.StrategyDrop:
+		slog.WarnContext(ctx, "attachment dropped", "reason", reason, "doc", doc.Name)
+		return nil, nil
+
+	case attachment.StrategyTXT:
+		return textBlocks(doc.Source.InlineText), nil
+
+	case attachment.StrategyB64:
+		mime := strings.ToLower(doc.MimeType)
+		if strings.HasPrefix(mime, "text/") {
+			// Decide prefers InlineData over InlineText, so text supplied as bytes
+			// lands here. Base64 would be unreadable to the model; send the text.
+			return textBlocks(string(doc.Source.InlineData)), nil
+		}
+
+		b64Data := base64.StdEncoding.EncodeToString(doc.Source.InlineData)
+		if IsImageMime(mime) {
+			source := anthropic.ImageBlockParamSourceUnion{OfBase64: &anthropic.Base64ImageSourceParam{
+				Data:      b64Data,
+				MediaType: anthropic.Base64ImageSourceMediaType(mime),
+			}}
+			return []anthropic.ContentBlockParamUnion{{OfImage: &anthropic.ImageBlockParam{Source: source}}}, nil
+		}
+
+		if IsAnthropicDocumentMime(mime) {
+			// application/pdf → native document block
+			source := anthropic.DocumentBlockParamSourceUnion{OfBase64: &anthropic.Base64PDFSourceParam{
+				Data:      b64Data,
+				MediaType: "application/pdf",
+			}}
+			return []anthropic.ContentBlockParamUnion{{OfDocument: &anthropic.DocumentBlockParam{Source: source}}}, nil
+		}
+
+		// Other binary with no native block: fall back to a TXT envelope around
+		// the base64 payload.
+		slog.DebugContext(ctx, "anthropic: no native block for MIME, falling back to TXT envelope",
+			"mime", doc.MimeType, "doc", doc.Name)
+		return textBlocks(b64Data), nil
+
+	default:
+		// Guards against a Strategy value added without a matching case here.
+		return nil, fmt.Errorf("unknown attachment strategy %d", strategy)
+	}
+}
diff --git a/pkg/model/provider/anthropic/attachments_test.go b/pkg/model/provider/anthropic/attachments_test.go
new file mode 100644
index 000000000..f5ccd2117
--- /dev/null
+++ b/pkg/model/provider/anthropic/attachments_test.go
@@ -0,0 +1,109 @@
+package anthropic
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// minJPEG is a minimal JPEG magic-byte header (SOI marker FF D8, then FF E0) for use in tests.
+var minJPEG = []byte{0xFF, 0xD8, 0xFF, 0xE0}
+
+// minPDF is a minimal PDF magic-byte header, the ASCII bytes "%PDF-", for use in tests.
+var minPDF = []byte{0x25, 0x50, 0x44, 0x46, 0x2D} // %PDF-
+
+// TestConvertDocumentAnthropic_StrategyB64_Image checks that inline JPEG bytes,
+// sent with image-capable caps, become exactly one native image block and no text block.
+func TestConvertDocumentAnthropic_StrategyB64_Image(t *testing.T) {
+	caps := modelcaps.CapsWith(true, true)
+	jpeg := chat.Document{
+		Name:     "photo.jpg",
+		MimeType: "image/jpeg",
+		Source:   chat.DocumentSource{InlineData: minJPEG},
+	}
+
+	got, err := convertDocumentWithCaps(t.Context(), jpeg, caps)
+	require.NoError(t, err)
+	require.Len(t, got, 1, "expected exactly one block")
+	require.NotNil(t, got[0].OfImage, "expected image block")
+	assert.Nil(t, got[0].OfText, "expected no text block for image")
+}
+
+// TestConvertDocumentAnthropic_StrategyB64_PDF checks that inline PDF bytes, sent
+// with PDF-capable caps, become one native document block (OfDocument) and no text block.
+func TestConvertDocumentAnthropic_StrategyB64_PDF(t *testing.T) {
+	caps := modelcaps.CapsWith(true, true)
+	pdf := chat.Document{
+		Name:     "spec.pdf",
+		MimeType: "application/pdf",
+		Source:   chat.DocumentSource{InlineData: minPDF},
+	}
+
+	got, err := convertDocumentWithCaps(t.Context(), pdf, caps)
+	require.NoError(t, err)
+	require.Len(t, got, 1, "expected exactly one block")
+	require.NotNil(t, got[0].OfDocument, "expected document block for PDF")
+	assert.Nil(t, got[0].OfText, "expected no text block for PDF")
+}
+
+// TestConvertDocumentAnthropic_StrategyTXT checks the TXT envelope content. Caps are injected so no models.dev lookup runs.
+func TestConvertDocumentAnthropic_StrategyTXT(t *testing.T) {
+	doc := chat.Document{
+		Name:     "spec.md",
+		MimeType: "text/markdown",
+		Source:   chat.DocumentSource{InlineText: "## Specification"},
+	}
+	blocks, err := convertDocumentWithCaps(t.Context(), doc, modelcaps.CapsWith(false, false))
+	require.NoError(t, err)
+	require.Len(t, blocks, 1)
+	require.NotNil(t, blocks[0].OfText)
+	for _, want := range []string{"spec.md", "text/markdown", "## Specification"} {
+		assert.Contains(t, blocks[0].OfText.Text, want)
+	}
+}
+
+func TestConvertDocumentAnthropic_StrategyTXT_Envelope(t *testing.T) {
+ doc := chat.Document{
+ Name: "notes.txt",
+ MimeType: "text/plain",
+ Source: chat.DocumentSource{InlineText: "some notes"},
+ }
+
+ blocks, err := convertDocument(t.Context(), doc, "")
+ require.NoError(t, err)
+ require.Len(t, blocks, 1)
+ require.NotNil(t, blocks[0].OfText)
+ text := blocks[0].OfText.Text
+ assert.True(t, strings.HasPrefix(text, "= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '-' {
+ sb.WriteRune(r)
+ } else {
+ sb.WriteRune('-')
+ }
+ }
+ result := sb.String()
+ if result == "" {
+ return "document"
+ }
+ return result
+}
diff --git a/pkg/model/provider/bedrock/attachments_test.go b/pkg/model/provider/bedrock/attachments_test.go
new file mode 100644
index 000000000..fd2bc2516
--- /dev/null
+++ b/pkg/model/provider/bedrock/attachments_test.go
@@ -0,0 +1,118 @@
+package bedrock
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/aws/aws-sdk-go-v2/service/bedrockruntime/types"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// minJPEG is a minimal JPEG magic-byte header (SOI marker FF D8, then FF E0) for use in tests.
+var minJPEG = []byte{0xFF, 0xD8, 0xFF, 0xE0}
+
+// minPDF is a minimal PDF magic-byte header, the ASCII bytes "%PDF-", for use in tests.
+var minPDF = []byte{0x25, 0x50, 0x44, 0x46, 0x2D} // %PDF-
+
+// TestConvertDocumentBedrock_StrategyB64_Image checks that inline JPEG bytes, sent with
+// image-capable caps, become one ContentBlockMemberImage holding the raw bytes as JPEG.
+func TestConvertDocumentBedrock_StrategyB64_Image(t *testing.T) {
+	caps := modelcaps.CapsWith(true, true)
+	jpeg := chat.Document{
+		Name:     "photo.jpg",
+		MimeType: "image/jpeg",
+		Source:   chat.DocumentSource{InlineData: minJPEG},
+	}
+
+	got, err := convertDocumentWithCaps(t.Context(), jpeg, caps)
+	require.NoError(t, err)
+	require.Len(t, got, 1, "expected exactly one block")
+	img, isImage := got[0].(*types.ContentBlockMemberImage)
+	require.True(t, isImage, "expected ContentBlockMemberImage, got %T", got[0])
+	assert.Equal(t, types.ImageFormatJpeg, img.Value.Format)
+	raw, isBytes := img.Value.Source.(*types.ImageSourceMemberBytes)
+	require.True(t, isBytes, "expected ImageSourceMemberBytes")
+	assert.Equal(t, minJPEG, raw.Value)
+}
+
+// TestConvertDocumentBedrock_StrategyB64_PDF checks that inline PDF bytes, sent with
+// PDF-capable caps, become one ContentBlockMemberDocument tagged with the PDF format.
+func TestConvertDocumentBedrock_StrategyB64_PDF(t *testing.T) {
+	caps := modelcaps.CapsWith(true, true)
+	pdf := chat.Document{
+		Name:     "spec.pdf",
+		MimeType: "application/pdf",
+		Source:   chat.DocumentSource{InlineData: minPDF},
+	}
+
+	got, err := convertDocumentWithCaps(t.Context(), pdf, caps)
+	require.NoError(t, err)
+	require.Len(t, got, 1, "expected exactly one block")
+	pdfBlock, isDocument := got[0].(*types.ContentBlockMemberDocument)
+	require.True(t, isDocument, "expected ContentBlockMemberDocument, got %T", got[0])
+	assert.Equal(t, types.DocumentFormatPdf, pdfBlock.Value.Format)
+}
+
+// TestConvertDocumentBedrock_StrategyB64_ImageDropped checks that an image is
+// dropped (nil blocks, nil error) when the caps allow neither images nor PDFs.
+func TestConvertDocumentBedrock_StrategyB64_ImageDropped(t *testing.T) {
+	caps := modelcaps.CapsWith(false, false)
+	jpeg := chat.Document{
+		Name:     "photo.jpg",
+		MimeType: "image/jpeg",
+		Source:   chat.DocumentSource{InlineData: minJPEG},
+	}
+
+	got, err := convertDocumentWithCaps(t.Context(), jpeg, caps)
+	require.NoError(t, err)
+	assert.Nil(t, got, "image should be dropped for text-only model")
+}
+
+// TestConvertDocumentBedrock_StrategyTXT checks the TXT envelope content. Caps are injected so no models.dev lookup runs.
+func TestConvertDocumentBedrock_StrategyTXT(t *testing.T) {
+	doc := chat.Document{
+		Name:     "notes.md",
+		MimeType: "text/markdown",
+		Source:   chat.DocumentSource{InlineText: "## Notes"},
+	}
+	blocks, err := convertDocumentWithCaps(t.Context(), doc, modelcaps.CapsWith(false, false))
+	require.NoError(t, err)
+	require.Len(t, blocks, 1)
+	textBlock, ok := blocks[0].(*types.ContentBlockMemberText)
+	require.True(t, ok, "expected text block for TXT strategy")
+	for _, want := range []string{"notes.md", "text/markdown", "## Notes"} {
+		assert.Contains(t, textBlock.Value, want)
+	}
+}
+
+func TestConvertDocumentBedrock_StrategyTXT_Envelope(t *testing.T) {
+ doc := chat.Document{
+ Name: "data.csv",
+ MimeType: "text/csv",
+ Source: chat.DocumentSource{InlineText: "a,b"},
+ }
+
+ blocks, err := convertDocument(t.Context(), doc, "")
+ require.NoError(t, err)
+ require.Len(t, blocks, 1)
+ textBlock, ok := blocks[0].(*types.ContentBlockMemberText)
+ require.True(t, ok, "expected text block")
+ assert.True(t, strings.HasPrefix(textBlock.Value, " 0 {
bedrockMessages = append(bedrockMessages, types.Message{
Role: types.ConversationRoleUser,
@@ -119,7 +120,7 @@ func applyCachePointsToMessages(messages []types.Message) {
}
}
-func convertUserContent(msg *chat.Message) []types.ContentBlock {
+func convertUserContent(ctx context.Context, msg *chat.Message, modelID string) []types.ContentBlock {
var blocks []types.ContentBlock
if len(msg.MultiContent) > 0 {
@@ -130,11 +131,21 @@ func convertUserContent(msg *chat.Message) []types.ContentBlock {
Value: part.Text,
})
case chat.MessagePartTypeImageURL:
+ // Note: superseded by MessagePartTypeDocument.
if part.ImageURL != nil {
if imageBlock := convertImageURL(part.ImageURL); imageBlock != nil {
blocks = append(blocks, imageBlock)
}
}
+ case chat.MessagePartTypeDocument:
+ if part.Document != nil {
+ docBlocks, err := convertDocument(ctx, *part.Document, modelID)
+ if err != nil {
+ slog.WarnContext(ctx, "failed to convert document attachment", "error", err, "doc", part.Document.Name)
+ continue
+ }
+ blocks = append(blocks, docBlocks...)
+ }
}
}
} else {
diff --git a/pkg/model/provider/dmr/client.go b/pkg/model/provider/dmr/client.go
index bfbe52988..d3693a54b 100644
--- a/pkg/model/provider/dmr/client.go
+++ b/pkg/model/provider/dmr/client.go
@@ -147,8 +147,8 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
// convertMessages converts chat messages to OpenAI format and merges consecutive
// system/user messages, which is needed by some local models run by DMR.
-func convertMessages(messages []chat.Message) []openai.ChatCompletionMessageParamUnion {
- openaiMessages := oaistream.ConvertMessages(messages)
+func (c *Client) convertMessages(ctx context.Context, messages []chat.Message) []openai.ChatCompletionMessageParamUnion {
+ openaiMessages := oaistream.ConvertMessages(ctx, messages, c.ModelConfig.Model) // ctx and model ID are forwarded; presumably for attachment routing and its logging — confirm in oaistream
return oaistream.MergeConsecutiveMessages(openaiMessages)
}
@@ -171,7 +171,7 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
params := openai.ChatCompletionNewParams{
Model: c.ModelConfig.Model,
- Messages: convertMessages(messages),
+ Messages: c.convertMessages(ctx, messages),
StreamOptions: openai.ChatCompletionStreamOptionsParam{
IncludeUsage: openai.Bool(trackUsage),
},
diff --git a/pkg/model/provider/gemini/attachments.go b/pkg/model/provider/gemini/attachments.go
new file mode 100644
index 000000000..9d5b53d00
--- /dev/null
+++ b/pkg/model/provider/gemini/attachments.go
@@ -0,0 +1,46 @@
+package gemini
+
+import (
+ "context"
+ "fmt"
+ "log/slog"
+
+ "google.golang.org/genai"
+
+ "github.com/docker/docker-agent/pkg/attachment"
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// convertDocument converts a chat.Document to a Gemini genai.Part.
+//
+// Routing (decided by attachment.Decide with the caps loaded for modelID):
+// - supported MIME with InlineData → inline blob part (raw bytes + MIME type)
+// - supported MIME with InlineText only → text part wrapped in a TXTEnvelope
+// - unsupported / no content → nil part and nil error (logged as warning)
+func convertDocument(ctx context.Context, doc chat.Document, modelID string) (*genai.Part, error) {
+ mc, _ := modelcaps.Load(modelID) // NOTE(review): Load's second result is discarded; presumably mc is a usable fallback on failure — confirm
+ return convertDocumentWithCaps(ctx, doc, mc)
+}
+
+// convertDocumentWithCaps is the caps-injectable variant: convertDocument delegates here and tests call it directly.
+func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelcaps.ModelCapabilities) (*genai.Part, error) {
+ strategy, reason := attachment.Decide(doc, mc)
+
+ switch strategy {
+ case attachment.StrategyDrop:
+ slog.WarnContext(ctx, "attachment dropped", "reason", reason, "doc", doc.Name)
+ return nil, nil // nil part + nil error: the caller skips the attachment without failing
+
+ case attachment.StrategyB64:
+ // genai.NewPartFromBytes receives the raw bytes and MIME type; no base64 encoding is done here.
+ return genai.NewPartFromBytes(doc.Source.InlineData, doc.MimeType), nil
+
+ case attachment.StrategyTXT:
+ envelope := attachment.TXTEnvelope(doc.Name, doc.MimeType, doc.Source.InlineText)
+ return genai.NewPartFromText(envelope), nil
+
+ default:
+ return nil, fmt.Errorf("unknown attachment strategy %d", strategy) // reached only for a Strategy value not handled above
+ }
+}
diff --git a/pkg/model/provider/gemini/attachments_test.go b/pkg/model/provider/gemini/attachments_test.go
new file mode 100644
index 000000000..ea85f5514
--- /dev/null
+++ b/pkg/model/provider/gemini/attachments_test.go
@@ -0,0 +1,90 @@
+package gemini
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// minJPEG is a 4-byte JPEG header (SOI marker FF D8, then APP0 marker FF E0); an opaque test payload, not a decodable image.
+var minJPEG = []byte{0xFF, 0xD8, 0xFF, 0xE0}
+
+// TestConvertDocumentGemini_StrategyB64_Image verifies that an image document
+// with InlineData and a vision-capable model produces a Blob part (not a text part).
+func TestConvertDocumentGemini_StrategyB64_Image(t *testing.T) {
+ doc := chat.Document{
+ Name: "photo.jpg",
+ MimeType: "image/jpeg",
+ Source: chat.DocumentSource{InlineData: minJPEG},
+ }
+
+ visionCaps := modelcaps.CapsWith(true, true)
+ part, err := convertDocumentWithCaps(t.Context(), doc, visionCaps)
+ require.NoError(t, err)
+ require.NotNil(t, part, "expected a non-nil part for B64 image")
+ assert.Empty(t, part.Text, "expected blob part, not text part") // a blob part leaves Text empty
+ require.NotNil(t, part.InlineData, "expected inline blob on the part") // fail cleanly instead of a nil dereference below
+ assert.Equal(t, minJPEG, part.InlineData.Data, "inline data should match input bytes")
+ assert.Equal(t, "image/jpeg", part.InlineData.MIMEType)
+}
+
+// TestConvertDocumentGemini_StrategyB64_ImageDropped verifies that an image
+// with InlineData yields a nil part and no error under CapsWith(false, false).
+func TestConvertDocumentGemini_StrategyB64_ImageDropped(t *testing.T) {
+ doc := chat.Document{
+ Name: "photo.jpg",
+ MimeType: "image/jpeg",
+ Source: chat.DocumentSource{InlineData: minJPEG},
+ }
+
+ textOnlyCaps := modelcaps.CapsWith(false, false) // presumably vision and PDF support both disabled — confirm against CapsWith
+ part, err := convertDocumentWithCaps(t.Context(), doc, textOnlyCaps)
+ require.NoError(t, err) // StrategyDrop returns nil, nil: a drop is not an error
+ assert.Nil(t, part, "image should be dropped for text-only model")
+}
+
+func TestConvertDocumentGemini_StrategyTXT(t *testing.T) { // text/markdown with InlineText must yield a text part
+ doc := chat.Document{
+ Name: "readme.md",
+ MimeType: "text/markdown",
+ Source: chat.DocumentSource{InlineText: "# Read Me"},
+ }
+
+ part, err := convertDocument(t.Context(), doc, "") // empty model ID: caps come from modelcaps.Load(""); presumably they accept text/* — confirm
+ require.NoError(t, err)
+ require.NotNil(t, part)
+ assert.Contains(t, part.Text, "readme.md") // the file name must appear in the text
+ assert.Contains(t, part.Text, "text/markdown") // the MIME type must appear in the text
+ assert.Contains(t, part.Text, "# Read Me") // the original content must be preserved
+}
+
+func TestConvertDocumentGemini_StrategyTXT_Envelope(t *testing.T) {
+ doc := chat.Document{
+ Name: "data.csv",
+ MimeType: "text/csv",
+ Source: chat.DocumentSource{InlineText: "col1,col2"},
+ }
+
+ part, err := convertDocument(t.Context(), doc, "")
+ require.NoError(t, err)
+ require.NotNil(t, part)
+ assert.True(t, strings.HasPrefix(part.Text, " 0 {
- parts := convertMultiContent(msg.MultiContent, msg.ThoughtSignature)
+ parts := convertMultiContent(ctx, msg.MultiContent, msg.ThoughtSignature, modelID)
if len(parts) > 0 {
contents = append(contents, genai.NewContentFromParts(parts, role))
}
@@ -287,16 +287,28 @@ func newTextPartWithSignature(text string, signature []byte) *genai.Part {
}
// convertMultiContent converts multi-part content to Gemini parts
-func convertMultiContent(multiContent []chat.MessagePart, thoughtSignature []byte) []*genai.Part {
+func convertMultiContent(ctx context.Context, multiContent []chat.MessagePart, thoughtSignature []byte, modelID string) []*genai.Part {
parts := make([]*genai.Part, 0, len(multiContent))
for _, part := range multiContent {
switch part.Type {
case chat.MessagePartTypeText:
parts = append(parts, newTextPartWithSignature(part.Text, thoughtSignature))
case chat.MessagePartTypeImageURL:
+ // Note: superseded by MessagePartTypeDocument.
if imgPart := convertImageURLToPart(part.ImageURL); imgPart != nil {
parts = append(parts, imgPart)
}
+ case chat.MessagePartTypeDocument:
+ if part.Document != nil {
+ docPart, err := convertDocument(ctx, *part.Document, modelID)
+ if err != nil {
+ slog.WarnContext(ctx, "failed to convert document attachment", "error", err, "doc", part.Document.Name)
+ continue
+ }
+ if docPart != nil {
+ parts = append(parts, docPart)
+ }
+ }
}
}
return parts
@@ -589,7 +601,7 @@ func (c *Client) CreateChatCompletionStream(
}
}
- contents := convertMessagesToGemini(messages)
+ contents := convertMessagesToGemini(ctx, messages, c.ModelConfig.Model)
// Debug: Log the messages we're sending
slog.Debug("Gemini messages", "count", len(contents))
diff --git a/pkg/model/provider/gemini/client_test.go b/pkg/model/provider/gemini/client_test.go
index 2f81688bf..8aeb7b205 100644
--- a/pkg/model/provider/gemini/client_test.go
+++ b/pkg/model/provider/gemini/client_test.go
@@ -362,10 +362,10 @@ func TestConvertMessagesToGemini_ThoughtSignature(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
- contents := convertMessagesToGemini([]chat.Message{
+ contents := convertMessagesToGemini(t.Context(), []chat.Message{
{Role: chat.MessageRoleUser, Content: "go"},
tt.message,
- })
+ }, "")
require.Len(t, contents, 2)
assistant := contents[1]
diff --git a/pkg/model/provider/oaistream/attachments.go b/pkg/model/provider/oaistream/attachments.go
new file mode 100644
index 000000000..738120642
--- /dev/null
+++ b/pkg/model/provider/oaistream/attachments.go
@@ -0,0 +1,73 @@
+package oaistream
+
+import (
+ "context"
+ "encoding/base64"
+ "fmt"
+ "log/slog"
+ "strings"
+
+ "github.com/openai/openai-go/v3"
+
+ "github.com/docker/docker-agent/pkg/attachment"
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// convertDocument converts a chat.Document to zero or more
+// ChatCompletionContentPartUnionParam values using the OpenAI Chat Completions
+// format. It is also used by all oaistream-based providers (Mistral, xAI,
+// Ollama, Nebius, MiniMax, GitHub Copilot, Azure, Requesty).
+//
+// Routing (capabilities come from modelcaps.Load(modelID)):
+// - image/* with InlineData → image part carrying a base64 data URI
+// - other MIMEs with InlineData → text part: TXTEnvelope around the base64 of the bytes
+// - InlineText only → text part with TXTEnvelope around the text
+// - unsupported / no content → nil slice and nil error (logged as warning)
+func convertDocument(ctx context.Context, doc chat.Document, modelID string) ([]openai.ChatCompletionContentPartUnionParam, error) {
+ mc, _ := modelcaps.Load(modelID) // NOTE(review): Load's second result is discarded; presumably mc is a usable fallback on failure — confirm
+ return convertDocumentWithCaps(ctx, doc, mc)
+}
+
+// convertDocumentWithCaps is the caps-injectable variant: convertDocument delegates here and tests call it directly.
+func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelcaps.ModelCapabilities) ([]openai.ChatCompletionContentPartUnionParam, error) {
+ strategy, reason := attachment.Decide(doc, mc)
+
+ switch strategy {
+ case attachment.StrategyDrop:
+ slog.WarnContext(ctx, "attachment dropped", "reason", reason, "doc", doc.Name)
+ return nil, nil // nil slice + nil error: nothing to send, but not a failure
+
+ case attachment.StrategyB64:
+ mime := strings.ToLower(doc.MimeType) // lower-cased only for the prefix check; the data URI keeps doc.MimeType verbatim
+ if strings.HasPrefix(mime, "image/") {
+ // Build an OpenAI image part whose URL is a base64 data URI of the bytes.
+ dataURI := fmt.Sprintf("data:%s;base64,%s",
+ doc.MimeType,
+ base64.StdEncoding.EncodeToString(doc.Source.InlineData))
+ return []openai.ChatCompletionContentPartUnionParam{
+ openai.ImageContentPart(openai.ChatCompletionContentPartImageImageURLParam{
+ URL: dataURI,
+ }),
+ }, nil
+ }
+ // Non-image binary (PDF, Office docs…): no native document block is used here, so the bytes
+ // go out as base64 text in a TXT envelope. NOTE(review): ~4/3 size growth; confirm models can use it.
+ slog.DebugContext(ctx, "oaistream: no native block for MIME, falling back to TXT envelope",
+ "mime", doc.MimeType, "doc", doc.Name)
+ envelope := attachment.TXTEnvelope(doc.Name, doc.MimeType,
+ base64.StdEncoding.EncodeToString(doc.Source.InlineData))
+ return []openai.ChatCompletionContentPartUnionParam{
+ openai.TextContentPart(envelope),
+ }, nil
+
+ case attachment.StrategyTXT:
+ envelope := attachment.TXTEnvelope(doc.Name, doc.MimeType, doc.Source.InlineText)
+ return []openai.ChatCompletionContentPartUnionParam{
+ openai.TextContentPart(envelope),
+ }, nil
+
+ default:
+ return nil, fmt.Errorf("unknown attachment strategy %d", strategy) // reached only for a Strategy value not handled above
+ }
+}
diff --git a/pkg/model/provider/oaistream/attachments_test.go b/pkg/model/provider/oaistream/attachments_test.go
new file mode 100644
index 000000000..ea82661e4
--- /dev/null
+++ b/pkg/model/provider/oaistream/attachments_test.go
@@ -0,0 +1,99 @@
+package oaistream
+
+import (
+ "encoding/base64"
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "github.com/docker/docker-agent/pkg/attachment/modelcaps"
+ "github.com/docker/docker-agent/pkg/chat"
+)
+
+// minJPEG is a 4-byte JPEG header (SOI marker FF D8, then APP0 marker FF E0); an opaque test payload, not a decodable image.
+var minJPEG = []byte{0xFF, 0xD8, 0xFF, 0xE0}
+
+// TestConvertDocument_StrategyB64_Image verifies that an image document with
+// InlineData and a vision-capable model produces exactly one image content
+// part whose URL is a base64 data URI, and no text part.
+func TestConvertDocument_StrategyB64_Image(t *testing.T) {
+ doc := chat.Document{
+ Name: "photo.jpg",
+ MimeType: "image/jpeg",
+ Source: chat.DocumentSource{InlineData: minJPEG},
+ }
+
+ visionCaps := modelcaps.CapsWith(true, true) // presumably vision and PDF support both enabled — confirm against CapsWith
+ parts, err := convertDocumentWithCaps(t.Context(), doc, visionCaps)
+ require.NoError(t, err)
+ require.Len(t, parts, 1, "expected exactly one image part")
+ require.NotNil(t, parts[0].OfImageURL, "expected image part, got non-image") // require: the URL checks below dereference OfImageURL
+ assert.Nil(t, parts[0].OfText, "expected no text part for B64 image")
+
+ // The URL must contain the data-URI prefix and the standard-base64 encoding of the payload.
+ wantB64 := base64.StdEncoding.EncodeToString(minJPEG)
+ assert.Contains(t, parts[0].OfImageURL.ImageURL.URL, "data:image/jpeg;base64,")
+ assert.Contains(t, parts[0].OfImageURL.ImageURL.URL, wantB64)
+}
+
+// TestConvertDocument_StrategyB64_ImageDropped verifies that an image with
+// InlineData yields a nil slice and no error under CapsWith(false, false).
+func TestConvertDocument_StrategyB64_ImageDropped(t *testing.T) {
+ doc := chat.Document{
+ Name: "photo.jpg",
+ MimeType: "image/jpeg",
+ Source: chat.DocumentSource{InlineData: minJPEG},
+ }
+
+ textOnlyCaps := modelcaps.CapsWith(false, false) // presumably vision and PDF support both disabled — confirm against CapsWith
+ parts, err := convertDocumentWithCaps(t.Context(), doc, textOnlyCaps)
+ require.NoError(t, err) // StrategyDrop returns nil, nil: a drop is not an error
+ assert.Nil(t, parts, "image should be dropped for text-only model")
+}
+
+func TestConvertDocument_StrategyTXT(t *testing.T) { // text/markdown with InlineText must yield exactly one text part
+ doc := chat.Document{
+ Name: "readme.md",
+ MimeType: "text/markdown",
+ Source: chat.DocumentSource{InlineText: "# Hello World"},
+ }
+
+ parts, err := convertDocument(t.Context(), doc, "") // empty model ID: caps come from modelcaps.Load(""); presumably they accept text/* — confirm
+ require.NoError(t, err)
+ require.Len(t, parts, 1)
+ require.NotNil(t, parts[0].OfText) // require: the checks below dereference OfText
+ assert.Contains(t, parts[0].OfText.Text, "readme.md") // the file name must appear in the text
+ assert.Contains(t, parts[0].OfText.Text, "text/markdown") // the MIME type must appear in the text
+ assert.Contains(t, parts[0].OfText.Text, "# Hello World") // the original content must be preserved
+}
+
+func TestConvertDocument_StrategyTXT_Envelope(t *testing.T) {
+ doc := chat.Document{
+ Name: "data.csv",
+ MimeType: "text/csv",
+ Source: chat.DocumentSource{InlineText: "a,b,c\n1,2,3"},
+ }
+
+ parts, err := convertDocument(t.Context(), doc, "")
+ require.NoError(t, err)
+ require.Len(t, parts, 1)
+ require.NotNil(t, parts[0].OfText)
+ text := parts[0].OfText.Text
+ assert.True(t, strings.HasPrefix(text, "