Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions pkg/attachment/attachment.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Package attachment provides MIME-aware routing for document attachments.
//
// It defines how a chat.Document should be sent to a model: either dropped
// (unsupported), wrapped in a plain-text envelope (StrategyTXT), or encoded
// as inline base64 data (StrategyB64).
package attachment

import (
	"fmt"
	"html"

	"github.com/docker/docker-agent/pkg/attachment/modelcaps"
	"github.com/docker/docker-agent/pkg/chat"
)

// Strategy describes how an attachment should be handled before sending to the
// provider. It is the first value returned by Decide.
type Strategy int

const (
	// StrategyDrop means the attachment is not supported by the model or has no
	// inline content, and should be skipped rather than sent. Callers are
	// expected to log a warning when they skip it. It is the zero value of
	// Strategy.
	StrategyDrop Strategy = iota

	// StrategyTXT means the attachment should be wrapped in a TXTEnvelope and
	// sent as plain text. Used for text/* MIME types whose content is already
	// in Source.InlineText.
	StrategyTXT

	// StrategyB64 means the attachment content (Source.InlineData) should be
	// base64-encoded and sent as a native provider image/document block.
	StrategyB64
)

// Decide picks the routing Strategy for doc under the capabilities mc and
// returns it together with a human-readable reason. The reason is only
// non-empty when the result is StrategyDrop.
//
// The cases are evaluated top to bottom and the first match wins:
//  1. mc does not support doc.MimeType → (StrategyDrop, reason naming the type).
//  2. doc.Source.InlineData is non-empty → (StrategyB64, "").
//  3. doc.Source.InlineText is non-empty → (StrategyTXT, "").
//  4. Neither inline source is present → (StrategyDrop, "no inline content").
//
// Because binary data is checked before text, a document carrying both inline
// sources is routed as StrategyB64.
func Decide(doc chat.Document, mc modelcaps.ModelCapabilities) (Strategy, string) {
	switch {
	case !mc.Supports(doc.MimeType):
		return StrategyDrop, fmt.Sprintf("model does not support MIME type %q", doc.MimeType)
	case len(doc.Source.InlineData) > 0:
		return StrategyB64, ""
	case doc.Source.InlineText != "":
		return StrategyTXT, ""
	default:
		return StrategyDrop, "no inline content"
	}
}

// TXTEnvelope wraps a text document body in an XML-like tag that models can
// parse as a named attachment.
//
//	<document name="report.md" mime-type="text/markdown">…body…</document>
//
// name and mimeType are placed in double-quoted attributes and escaped with
// html.EscapeString, so quotes, ampersands and angle brackets cannot terminate
// the attribute or the tag. Go-style quoting (%q) is deliberately not used: it
// would double backslashes and emit Go escape sequences, altering the name the
// model sees.
//
// body is inserted verbatim so the model receives the document text unchanged.
func TXTEnvelope(name, mimeType, body string) string {
	return fmt.Sprintf(`<document name="%s" mime-type="%s">%s</document>`,
		html.EscapeString(name), html.EscapeString(mimeType), body)
}
140 changes: 140 additions & 0 deletions pkg/attachment/decide_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package attachment_test

import (
"strings"
"testing"

"github.com/docker/docker-agent/pkg/attachment"
"github.com/docker/docker-agent/pkg/attachment/modelcaps"
"github.com/docker/docker-agent/pkg/chat"
)

// visionCaps builds capabilities for a model that accepts both images and
// PDFs.
func visionCaps() modelcaps.ModelCapabilities {
	const image, pdf = true, true
	return modelcaps.CapsWith(image, pdf)
}

// textOnlyCaps builds capabilities for a model that accepts neither images
// nor PDFs.
func textOnlyCaps() modelcaps.ModelCapabilities {
	const image, pdf = false, false
	return modelcaps.CapsWith(image, pdf)
}

// imageNoPDFCaps builds capabilities for a model that accepts images but not
// PDFs.
func imageNoPDFCaps() modelcaps.ModelCapabilities {
	const image, pdf = true, false
	return modelcaps.CapsWith(image, pdf)
}

// TestDecide is a table-driven test of attachment.Decide. Each case fixes a
// document and a capability set and checks the returned strategy and reason.
//
// For cases that set wantReasonHas, the reason must contain that substring.
// For cases that expect a non-drop strategy, the reason must be empty, which
// pins the (B64, "") and (TXT, "") results.
func TestDecide(t *testing.T) {
	tests := []struct {
		name          string
		doc           chat.Document
		caps          modelcaps.ModelCapabilities
		wantStrategy  attachment.Strategy
		wantReasonHas string // non-empty: reason must contain this substring
	}{
		{
			name: "b64 image supported",
			doc: chat.Document{
				Name:     "photo.jpg",
				MimeType: "image/jpeg",
				Source:   chat.DocumentSource{InlineData: []byte{0xFF, 0xD8}},
			},
			caps:         visionCaps(),
			wantStrategy: attachment.StrategyB64,
		},
		{
			name: "txt text plain",
			doc: chat.Document{
				Name:     "notes.txt",
				MimeType: "text/plain",
				Source:   chat.DocumentSource{InlineText: "hello world"},
			},
			caps:         textOnlyCaps(),
			wantStrategy: attachment.StrategyTXT,
		},
		{
			name: "drop image when model has no vision",
			doc: chat.Document{
				Name:     "photo.jpg",
				MimeType: "image/jpeg",
				Source:   chat.DocumentSource{InlineData: []byte{0xFF, 0xD8}},
			},
			caps:          textOnlyCaps(),
			wantStrategy:  attachment.StrategyDrop,
			wantReasonHas: "does not support MIME type",
		},
		{
			name: "drop pdf when model has no pdf support",
			doc: chat.Document{
				Name:     "doc.pdf",
				MimeType: "application/pdf",
				Source:   chat.DocumentSource{InlineData: []byte{0x25, 0x50, 0x44, 0x46}},
			},
			caps:          imageNoPDFCaps(),
			wantStrategy:  attachment.StrategyDrop,
			wantReasonHas: "does not support MIME type",
		},
		{
			name: "drop no inline content",
			doc: chat.Document{
				Name:     "empty.md",
				MimeType: "text/markdown",
				Source:   chat.DocumentSource{},
			},
			caps:          textOnlyCaps(),
			wantStrategy:  attachment.StrategyDrop,
			wantReasonHas: "no inline content",
		},
		{
			name: "b64 pdf when pdf supported",
			doc: chat.Document{
				Name:     "spec.pdf",
				MimeType: "application/pdf",
				Source:   chat.DocumentSource{InlineData: []byte{0x25, 0x50, 0x44, 0x46}},
			},
			caps:         visionCaps(),
			wantStrategy: attachment.StrategyB64,
		},
		{
			name: "drop office doc (DOCX is binary, not supported without models.dev office modality)",
			doc: chat.Document{
				Name:     "report.docx",
				MimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
				Source:   chat.DocumentSource{InlineData: []byte{0x50, 0x4B}}, // ZIP magic bytes
			},
			caps:          visionCaps(), // even full caps can't send DOCX — no modality
			wantStrategy:  attachment.StrategyDrop,
			wantReasonHas: "does not support MIME type",
		},
		{
			name: "b64 wins over txt when both inline sources present",
			doc: chat.Document{
				Name:     "data.txt",
				MimeType: "text/plain",
				Source:   chat.DocumentSource{InlineData: []byte("hello"), InlineText: "hello"},
			},
			caps:         textOnlyCaps(),
			wantStrategy: attachment.StrategyB64,
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			gotStrategy, gotReason := attachment.Decide(tc.doc, tc.caps)
			if gotStrategy != tc.wantStrategy {
				t.Errorf("strategy: got %d, want %d", gotStrategy, tc.wantStrategy)
			}

			switch {
			case tc.wantReasonHas != "":
				if !strings.Contains(gotReason, tc.wantReasonHas) {
					t.Errorf("reason %q does not contain %q", gotReason, tc.wantReasonHas)
				}
			case tc.wantStrategy != attachment.StrategyDrop && gotReason != "":
				// A reason is only meaningful for drops; routed documents
				// must come back with an empty one.
				t.Errorf("reason: got %q, want empty for non-drop strategy", gotReason)
			}
		})
	}
}

// TestTXTEnvelope checks the exact envelope produced for a simple markdown
// document: name and MIME type as quoted attributes, body between the tags.
func TestTXTEnvelope(t *testing.T) {
	const want = `<document name="readme.md" mime-type="text/markdown"># Hello</document>`

	if got := attachment.TXTEnvelope("readme.md", "text/markdown", "# Hello"); got != want {
		t.Errorf("TXTEnvelope:\ngot %q\nwant %q", got, want)
	}
}
157 changes: 157 additions & 0 deletions pkg/attachment/modelcaps/modelcaps.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// Package modelcaps provides model capability queries for the attachment system.
// It translates models.dev modality information into MIME-type support decisions
// used by the attachment routing logic.
package modelcaps

import (
"context"
"log/slog"
"strings"
"time"

"github.com/docker/docker-agent/pkg/modelsdev"
)

// ModelCapabilities describes what MIME types a given model can accept as
// document attachments. The zero value supports plain-text MIME types only.
type ModelCapabilities struct {
	// supportsImage is true when the model accepts image/* MIME types.
	supportsImage bool
	// supportsPDF is true when the model accepts application/pdf.
	supportsPDF bool
	// modelFound is false when no capability record could be obtained for the
	// model. In that case the other flags stay false, which yields the
	// conservative text-only behaviour.
	modelFound bool
}

// isOfficeMIME reports whether mt is an Office-style document format (OOXML,
// legacy Office, RTF). These formats are not plain text content, so they
// cannot be naively TXT-enveloped and require explicit model support.
//
// Matching is exact and case-sensitive; callers pass an already lower-cased
// MIME type.
func isOfficeMIME(mt string) bool {
	switch mt {
	case "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
		"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
		"application/vnd.openxmlformats-officedocument.presentationml.presentation",
		"application/vnd.ms-excel",
		"application/vnd.ms-powerpoint",
		"application/msword",
		"application/rtf",
		"text/rtf":
		return true
	default:
		return false
	}
}

// Supports returns true when the model can accept an attachment with the given
// MIME type. The comparison is case-insensitive.
//
// Resolution rules (in order):
//  1. Office document MIMEs (DOCX, XLSX, PPTX, legacy Office, RTF) → false.
//     models.dev currently has no "document" or "office" modality field, so
//     these are rejected for all models until the schema is extended. This
//     rule comes first so that text/rtf is not mistaken for plain text by the
//     text/* rule below.
//  2. image/* → requires supportsImage (models.dev "image" modality).
//  3. application/pdf → requires supportsPDF (models.dev "pdf" modality).
//  4. text/* → always supported (plain text; TXT envelope is universally safe).
//  5. Everything else (audio/*, video/*, unknown binary) → false.
func (mc ModelCapabilities) Supports(mimeType string) bool {
	mt := strings.ToLower(mimeType)
	switch {
	case isOfficeMIME(mt):
		// Must precede the text/ prefix test: text/rtf is an Office format
		// and would otherwise be accepted as plain text.
		return false
	case strings.HasPrefix(mt, "image/"):
		return mc.supportsImage
	case mt == "application/pdf":
		return mc.supportsPDF
	case strings.HasPrefix(mt, "text/"):
		// text/plain, text/markdown, text/html, text/csv, … are actual text.
		return true
	default:
		// audio/*, video/*, and all other unknown binary types.
		return false
	}
}

// loadTimeout is the maximum time allowed for a models.dev capability lookup.
// Load and LoadFromStore both bound their lookup context with it; if the
// fetch takes longer, they fall back to conservative text-only caps.
const loadTimeout = 10 * time.Second

// Load fetches (or returns from cache) the capability record for the given
// model ID. The model ID should be in "provider/model" format as used by
// models.dev (e.g. "anthropic/claude-3-5-sonnet-20241022").
//
// When the store cannot be created, the lookup fails, or the lookup exceeds
// loadTimeout, Load returns a conservative capability set that only allows
// text MIME types. The returned error is always nil: capability detection
// failures are never propagated to the caller, only logged.
func Load(modelID string) (ModelCapabilities, error) {
	ctx, cancel := context.WithTimeout(context.Background(), loadTimeout)
	defer cancel()

	store, err := modelsdev.NewStore()
	if err != nil {
		slog.Warn("modelcaps: failed to load models.dev store, using conservative caps",
			"error", err, "model", modelID)
		return ModelCapabilities{modelFound: false}, nil
	}

	model, err := store.GetModel(ctx, modelID)
	if err != nil {
		if ctx.Err() != nil {
			slog.Warn("modelcaps: models.dev lookup timed out, using conservative caps",
				"model", modelID, "timeout", loadTimeout)
		} else {
			// Keep the cause discoverable without warning on every lookup of
			// a model that models.dev simply does not know.
			slog.Debug("modelcaps: models.dev lookup failed, using conservative caps",
				"error", err, "model", modelID)
		}
		// Model not found or context expired — conservative: text-only.
		return ModelCapabilities{modelFound: false}, nil
	}

	// Translate the declared input modalities into capability flags;
	// modalities other than "image" and "pdf" are ignored.
	mc := ModelCapabilities{modelFound: true}
	for _, input := range model.Modalities.Input {
		switch strings.ToLower(input) {
		case "image":
			mc.supportsImage = true
		case "pdf":
			mc.supportsPDF = true
		}
	}
	return mc, nil
}

// CapsWith builds a ModelCapabilities value from explicit flags, marking the
// model as found. It performs no lookup, which makes it suitable for tests and
// for provider implementations that already know what the model accepts.
func CapsWith(supportsImage, supportsPDF bool) ModelCapabilities {
	mc := ModelCapabilities{modelFound: true}
	mc.supportsImage = supportsImage
	mc.supportsPDF = supportsPDF
	return mc
}

// LoadFromStore is like Load but accepts an explicit *modelsdev.Store, making
// it convenient for tests that inject a pre-populated in-memory store.
//
// It returns conservative text-only capabilities when store is nil, when the
// lookup fails, or when the lookup exceeds loadTimeout. Failures are logged
// the same way Load logs them and are never returned to the caller.
func LoadFromStore(store *modelsdev.Store, modelID string) ModelCapabilities {
	if store == nil {
		// Avoid dereferencing a missing store; treat it like a failed lookup.
		slog.Warn("modelcaps: nil models.dev store, using conservative caps",
			"model", modelID)
		return ModelCapabilities{modelFound: false}
	}

	ctx, cancel := context.WithTimeout(context.Background(), loadTimeout)
	defer cancel()

	model, err := store.GetModel(ctx, modelID)
	if err != nil {
		if ctx.Err() != nil {
			slog.Warn("modelcaps: models.dev lookup timed out, using conservative caps",
				"model", modelID, "timeout", loadTimeout)
		} else {
			slog.Debug("modelcaps: models.dev lookup failed, using conservative caps",
				"error", err, "model", modelID)
		}
		return ModelCapabilities{modelFound: false}
	}

	// Translate the declared input modalities into capability flags;
	// modalities other than "image" and "pdf" are ignored.
	mc := ModelCapabilities{modelFound: true}
	for _, input := range model.Modalities.Input {
		switch strings.ToLower(input) {
		case "image":
			mc.supportsImage = true
		case "pdf":
			mc.supportsPDF = true
		}
	}
	return mc
}
Loading
Loading