Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require (
golang.org/x/sync v0.19.0
golang.org/x/sys v0.41.0
google.golang.org/grpc v1.80.0
google.golang.org/protobuf v1.36.11
oras.land/oras-go/v2 v2.6.0
)

Expand Down Expand Up @@ -164,7 +165,6 @@ require (
google.golang.org/api v0.214.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20260226221140-a57be14db171 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260226221140-a57be14db171 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
4 changes: 4 additions & 0 deletions pkg/modelfile/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ var (
"*.ftz", // FastText compressed model
"*.ark", // Kaldi ark format (speech/audio models)
"*.db", // Database files (LMDB, etc.)

// TensorFlow SavedModel literal-name files (no extension).
"feature_map", // TF SavedModel feature map definition
"checkpoint", // TF checkpoint pointer file (literal name)
}

// Code file patterns - supported script and notebook files.
Expand Down
9 changes: 9 additions & 0 deletions pkg/modelfile/constants_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,15 @@ func TestInferFileType(t *testing.T) {
{"at threshold", "borderline", WeightFileSizeThreshold, FileTypeCode},
// Just above threshold should be model
{"above threshold", "borderline", WeightFileSizeThreshold + 1, FileTypeModel},

// TF SavedModel literal-name files: must be MODEL even when 0 bytes,
// independent of the size heuristic that would otherwise classify them as CODE.
{"feature_map literal", "feature_map", 0, FileTypeModel},
{"feature_map small", "feature_map", 1024, FileTypeModel},
{"checkpoint literal small", "checkpoint", 32, FileTypeModel},
// Negative: the literal patterns must not match same-stem-different-extension files.
{"feature_map.json is config", "feature_map.json", 1024, FileTypeConfig},
{"checkpoint.bin is model via *.bin", "checkpoint.bin", 1024, FileTypeModel},
}

assert := assert.New(t)
Expand Down
151 changes: 151 additions & 0 deletions pkg/modelfile/modelfile.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,12 @@ func NewModelfileByWorkspace(workspace string, config *configmodelfile.GenerateC
}

mf.generateByConfig(config)

// Best-effort: fill mf.format from MODEL file evidence when the user did not
// pass --format. Failure (no recognizable signal, panic in the loop, etc.)
// MUST NOT abort generation — Format is metadata, not load-bearing.
mf.inferFormat()

return mf, nil
}

Expand Down Expand Up @@ -346,13 +352,158 @@ func (mf *modelfile) generateByWorkspace(config *configmodelfile.GenerateConfig)
return err
}

// ONNX external_data post-processing: any tensor file referenced by an .onnx
// file via external_data.location is unconditionally a model weight file,
// regardless of its name or size. Walker may have classified small external
// tensor files as code/config/doc by extension/size heuristic; reclassify them.
mf.reclassifyONNXExternalData()

if mf.model.Size() == 0 && mf.code.Size() == 0 && mf.dataset.Size() == 0 {
return fmt.Errorf("no model/code/dataset found - you have to create the Modelfile by yourself")
}

return nil
}

// reclassifyONNXExternalData scans every .onnx file already in mf.model,
// extracts external_data.location paths, and moves those paths from whichever
// bucket the walker placed them in into mf.model.
//
// To avoid bypassing the walker's filtering (ExcludePatterns, isSkippable, file
// count / size limits, workspace boundary), this function ONLY reclassifies
// paths that are already present in one of the existing hashsets (config /
// code / doc / model). Paths that the walker excluded — including paths
// outside the workspace produced by a malformed `../` location — are silently
// ignored. ONNX parse failures degrade gracefully: a WARNING is printed and
// the affected .onnx's external tensors keep whatever classification the
// walker assigned (the pre-fix behavior).
func (mf *modelfile) reclassifyONNXExternalData() {
walkerCollected := func(rel string) bool {
return mf.model.Contains(rel) || mf.code.Contains(rel) ||
mf.config.Contains(rel) || mf.doc.Contains(rel)
}

for _, raw := range mf.model.Values() {
modelRel, ok := raw.(string)
if !ok || !strings.HasSuffix(strings.ToLower(modelRel), ".onnx") {
continue
}
onnxAbs := filepath.Join(mf.workspace, modelRel)
extPaths, err := ExtractONNXExternalDataPaths(onnxAbs)
if err != nil {
fmt.Fprintf(os.Stderr,
"WARNING: modelfile: failed to parse ONNX external_data from %s: %v "+
"-- external tensor files (if any) will keep walker-assigned classification\n",
modelRel, err)
continue
}
onnxDir := filepath.Dir(modelRel)
for _, ext := range extPaths {
// Reject absolute external_data.location values outright. ONNX
// spec defines location as relative to the .onnx file's
// directory, so an absolute path is malformed; worse,
// filepath.Join silently strips the leading separator
// (Join(".", "/etc/secret") -> "etc/secret"), which would let
// an unrelated workspace file get reclassified to MODEL.
if filepath.IsAbs(ext) {
continue
}
relExt := filepath.Clean(filepath.Join(onnxDir, ext))
// Walker membership check absorbs all of:
// - exclude pattern (walker dropped it -> not in any bucket)
// - skippable directories (.git, etc.)
// - file count / size limits (walker errored before adding)
// - workspace boundary (walker never sees ../outside paths)
// - file simply doesn't exist on disk
if !walkerCollected(relExt) {
continue
}
mf.code.Remove(relExt)
mf.config.Remove(relExt)
mf.doc.Remove(relExt)
mf.model.Add(relExt)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve include/exclude filters for ONNX reclassification

The ONNX post-pass unconditionally does mf.model.Add(relExt) for every referenced tensor path, but this runs after the walk/filter phase and does not reapply GenerateConfig include/exclude rules. As a result, files that users explicitly excluded can be reintroduced into MODEL and then included in builds/uploads, which violates CLI filtering intent. Reclassification should only move paths that were already admitted by the workspace filter logic.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 660ae9b. The walker-membership check is exactly the proposed fix: a path can only be reclassified if it survived the walker's filter pass — which already enforces ExcludePatterns, isSkippable, file count / size limits, and workspace boundary. So an --exclude'd tensor can never be re-added by the ONNX post-pass.

}
}
}

// inferFormat fills mf.format from filename evidence collected by the walker
// when the user did not pass --format on the CLI. It only emits a value for
// highly specific signals (saved_model.pb[txt] / *.onnx / *.gguf /
// *.safetensors); generic extensions like *.bin / *.pt are left alone because
// they appear in many formats and would produce false positives.
//
// Priority order, when multiple signals coexist:
//
//  1. tensorflow  — saved_model.pb / saved_model.pbtxt (SavedModel directory)
//  2. onnx        — *.onnx
//  3. gguf        — *.gguf
//  4. safetensors — *.safetensors
//
// SavedModel and ONNX come first because their layouts are uniquely
// identifiable; safetensors is last because it sometimes coexists with raw
// PyTorch shards in HF repos.
//
// All four walker buckets (model / config / code / doc) are scanned, not just
// mf.model: signals like `saved_model.pbtxt` are not in ModelFilePatterns and
// the walker therefore lands them in code/doc; scanning only mf.model would
// let a SavedModel that ships only the .pbtxt variant fall through silently.
//
// Failure modes (no recognized signal, panic from a malformed value in the
// hashset, etc.) MUST NOT abort generation. The recover() guard degrades any
// unexpected panic to "format stays empty" rather than killing the whole
// modelfile build; Format is best-effort metadata and the package handles a
// blank Format throughout the build/push/pull pipeline.
func (mf *modelfile) inferFormat() {
	defer func() {
		if r := recover(); r != nil {
			fmt.Fprintf(os.Stderr,
				"WARNING: modelfile: format inference panicked, leaving Format empty: %v\n", r)
		}
	}()

	if mf.format != "" {
		return
	}

	// Signals ranked by priority; a lower rank wins. noSignal means the
	// basename matched nothing specific.
	const noSignal = 4
	formats := [noSignal]string{"tensorflow", "onnx", "gguf", "safetensors"}

	best := noSignal
	for _, bucket := range []*hashset.Set{mf.model, mf.config, mf.code, mf.doc} {
		for _, entry := range bucket.Values() {
			name, isStr := entry.(string)
			if !isStr {
				continue
			}
			base := strings.ToLower(filepath.Base(name))
			rank := noSignal
			switch {
			case base == "saved_model.pb" || base == "saved_model.pbtxt":
				rank = 0
			case strings.HasSuffix(base, ".onnx"):
				rank = 1
			case strings.HasSuffix(base, ".gguf"):
				rank = 2
			case strings.HasSuffix(base, ".safetensors"):
				rank = 3
			}
			if rank < best {
				best = rank
			}
		}
	}

	if best < noSignal {
		mf.format = formats[best]
	}
}

// generateByModelConfig generates the modelfile by the model config, such as config.json and generation_config.json.
func (mf *modelfile) generateByModelConfig() error {
// Get config map from json files. Collect all the keys and values from the config files
Expand Down
Loading
Loading