Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
4e65858
feat(retrypolicy): add dynamic retry package with file-size-based par…
aftersnow Mar 23, 2026
92c9ca3
feat(cli): add --no-retry and --retry-max-time flags to push/pull/bui…
aftersnow Mar 23, 2026
cf1bcf2
feat(backend): use retrypolicy.Do with independent retry, remove casc…
aftersnow Mar 23, 2026
be391fd
refactor: remove legacy defaultRetryOpts, delegate to retrypolicy
aftersnow Mar 23, 2026
2fcdba2
fix(push): add OnRetry handler for config and manifest retry
aftersnow Mar 23, 2026
4cdd8ae
fix: add non-retryable local errors and thread retryCtx in pull
aftersnow Mar 23, 2026
5047acc
fix: propagate context cancellation after Wait to prevent incomplete …
aftersnow Mar 23, 2026
fcdbcb0
fix: handle DeadlineExceeded in IsRetryable, fix ctx shadow, speed up…
aftersnow Mar 23, 2026
f155ccb
refactor(retrypolicy): fix off-by-one in computeBackoff logging
aftersnow Apr 23, 2026
514f604
refactor(backend): extract getAnnotationFilepath helper
aftersnow Apr 23, 2026
973f508
fix(backend): surface errgroup cancellation and use Placeholder on retry
aftersnow Apr 23, 2026
fc3ac13
feat(retrypolicy): decouple per-attempt timeout from retry budget
aftersnow May 6, 2026
4eaa8b4
style(retrypolicy): apply gci and golines formatting
aftersnow May 6, 2026
4cb2b96
fix(retrypolicy): apply per-attempt timeout when NoRetry is set
aftersnow May 6, 2026
a31fbb9
feat(cli)!: remove --no-retry flag, use --retry-attempts=1 instead
aftersnow May 6, 2026
7e3f3e8
feat(cli)!: drop retry CLI flags, defaults only
aftersnow May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions cmd/push.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ import (
"context"
"fmt"

"github.com/modelpack/modctl/pkg/backend"
"github.com/modelpack/modctl/pkg/config"

"github.com/spf13/cobra"
"github.com/spf13/viper"

"github.com/modelpack/modctl/pkg/backend"
"github.com/modelpack/modctl/pkg/config"
)

var pushConfig = config.NewPush()
Expand All @@ -48,10 +48,25 @@ var pushCmd = &cobra.Command{
// init initializes push command.
func init() {
flags := pushCmd.Flags()
flags.IntVar(&pushConfig.Concurrency, "concurrency", pushConfig.Concurrency, "specify the number of concurrent push operations")
flags.IntVar(
&pushConfig.Concurrency,
"concurrency",
pushConfig.Concurrency,
"specify the number of concurrent push operations",
)
flags.BoolVar(&pushConfig.PlainHTTP, "plain-http", false, "use plain HTTP instead of HTTPS")
flags.BoolVar(&pushConfig.Insecure, "insecure", false, "turning on this flag will disable TLS verification")
flags.BoolVar(&pushConfig.Nydusify, "nydusify", false, "[EXPERIMENTAL] nydusify the model artifact")
flags.BoolVar(
&pushConfig.Insecure,
"insecure",
false,
"turning on this flag will disable TLS verification",
)
flags.BoolVar(
&pushConfig.Nydusify,
"nydusify",
false,
"[EXPERIMENTAL] nydusify the model artifact",
)
flags.MarkHidden("nydusify")

if err := viper.BindPFlags(flags); err != nil {
Expand Down
22 changes: 22 additions & 0 deletions internal/pb/pb.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,28 @@ func (p *ProgressBar) Add(prompt, name string, size int64, reader io.Reader) io.
return reader
}

// Placeholder creates or resets a progress bar entry without a reader.
// It is used during retry backoff to keep a visible bar for the item.
func (p *ProgressBar) Placeholder(name string, prompt string, size int64) {
if disableProgress {
return
}

p.mu.RLock()
existing := p.bars[name]
p.mu.RUnlock()

// If the bar already exists, just reset its message.
if existing != nil {
existing.msg = fmt.Sprintf("%s %s", prompt, name)
existing.Bar.SetCurrent(0)
return
Comment on lines +151 to +154
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The Placeholder method should encapsulate the full reset logic used during retries, including setting the refill value and resetting the EWMA speed calculation. This ensures consistency across different transfer paths and avoids manual bar manipulation in the backend packages.

Suggested change
if existing != nil {
existing.msg = fmt.Sprintf("%s %s", prompt, name)
existing.Bar.SetCurrent(0)
return
if existing != nil {
existing.msg = fmt.Sprintf("%s %s", prompt, name)
existing.SetRefill(existing.Current())
existing.SetCurrent(0)
existing.EwmaSetCurrent(0, time.Second)
return
}

}

// Create a new placeholder bar.
p.Add(prompt, name, size, nil)
Comment on lines +146 to +158
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

There is a data race here. The existing.msg field is written without holding a lock while the progress bar's rendering goroutine may read it concurrently. This can lead to unpredictable behavior or crashes.

To fix this, you should use a write lock to protect both the read from the p.bars map and the subsequent write to the msg field. The lock should be released before calling p.Add to avoid deadlocks, as p.Add acquires its own locks.

Suggested change
p.mu.RLock()
existing := p.bars[name]
p.mu.RUnlock()
// If the bar already exists, just reset its message.
if existing != nil {
existing.msg = fmt.Sprintf("%s %s", prompt, name)
existing.Bar.SetCurrent(0)
return
}
// Create a new placeholder bar.
p.Add(prompt, name, size, nil)
p.mu.Lock()
if existing, ok := p.bars[name]; ok {
// If the bar already exists, just reset its message.
existing.msg = fmt.Sprintf("%s %s", prompt, name)
existing.Bar.SetCurrent(0)
p.mu.Unlock()
return
}
p.mu.Unlock()
// Create a new placeholder bar.
p.Add(prompt, name, size, nil)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will be resolved after PR #474 merges — that PR converts bar.msg to atomic.Value, which removes this race. Rebasing here to add a different lock-based fix would conflict with #474's approach, so I'd rather wait and let the rebase pick it up.

}

// Get returns the progress bar.
func (p *ProgressBar) Get(name string) *progressBar {
p.mu.RLock()
Expand Down
22 changes: 14 additions & 8 deletions pkg/backend/retry.go → pkg/backend/annotation.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,20 @@
package backend

import (
"time"

retry "github.com/avast/retry-go/v4"
legacymodelspec "github.com/dragonflyoss/model-spec/specs-go/v1"
modelspec "github.com/modelpack/model-spec/specs-go/v1"
)

var defaultRetryOpts = []retry.Option{
retry.Attempts(6),
retry.DelayType(retry.BackOffDelay),
retry.Delay(5 * time.Second),
retry.MaxDelay(60 * time.Second),
// getAnnotationFilepath looks up the file path recorded in a descriptor's
// annotations. The modelpack key takes precedence; the legacy dragonflyoss
// key is consulted only when the modelpack value is missing or empty, which
// keeps older artifacts readable. The result is the empty string when
// neither key carries a value (including when annotations is nil).
func getAnnotationFilepath(annotations map[string]string) string {
	// Indexing a nil map yields the zero value, so no explicit nil guard is
	// required before the lookups below.
	fp := annotations[modelspec.AnnotationFilepath]
	if fp == "" {
		// Fall back to the legacy key written by older tooling.
		fp = annotations[legacymodelspec.AnnotationFilepath]
	}
	return fp
}
53 changes: 41 additions & 12 deletions pkg/backend/build.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"os"
"path/filepath"

retry "github.com/avast/retry-go/v4"
modelspec "github.com/modelpack/model-spec/specs-go/v1"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/sirupsen/logrus"
Expand All @@ -35,6 +34,7 @@ import (
"github.com/modelpack/modctl/pkg/backend/processor"
"github.com/modelpack/modctl/pkg/config"
"github.com/modelpack/modctl/pkg/modelfile"
"github.com/modelpack/modctl/pkg/retrypolicy"
"github.com/modelpack/modctl/pkg/source"
)

Expand All @@ -44,7 +44,11 @@ const (
)

// Build builds the user materials into the model artifact which follows the Model Spec.
func (b *backend) Build(ctx context.Context, modelfilePath, workDir, target string, cfg *config.Build) error {
func (b *backend) Build(
ctx context.Context,
modelfilePath, workDir, target string,
cfg *config.Build,
) error {
logrus.Infof("build: building artifact %s", target)
// parse the repo name and tag name from target.
ref, err := ParseReference(target)
Expand Down Expand Up @@ -123,8 +127,8 @@ func (b *backend) Build(ctx context.Context, modelfilePath, workDir, target stri

var configDesc ocispec.Descriptor
// Build the model config.
if err := retry.Do(func() error {
configDesc, err = builder.BuildConfig(ctx, config, hooks.NewHooks(
if err := retrypolicy.Do(ctx, func(rctx context.Context) error {
configDesc, err = builder.BuildConfig(rctx, config, hooks.NewHooks(
hooks.WithOnStart(func(name string, size int64, reader io.Reader) io.Reader {
return pb.Add(internalpb.NormalizePrompt("Building config"), name, size, reader)
}),
Expand All @@ -136,13 +140,16 @@ func (b *backend) Build(ctx context.Context, modelfilePath, workDir, target stri
}),
))
return err
}, append(defaultRetryOpts, retry.Context(ctx))...); err != nil {
}, retrypolicy.DoOpts{
FileSize: 0, // config is small
FileName: "config",
}); err != nil {
return fmt.Errorf("failed to build model config: %w", err)
}

// Build the model manifest.
if err := retry.Do(func() error {
_, err = builder.BuildManifest(ctx, layers, configDesc, manifestAnnotation(modelfile), hooks.NewHooks(
if err := retrypolicy.Do(ctx, func(rctx context.Context) error {
_, err = builder.BuildManifest(rctx, layers, configDesc, manifestAnnotation(modelfile), hooks.NewHooks(
hooks.WithOnStart(func(name string, size int64, reader io.Reader) io.Reader {
return pb.Add(internalpb.NormalizePrompt("Building manifest"), name, size, reader)
}),
Expand All @@ -154,23 +161,32 @@ func (b *backend) Build(ctx context.Context, modelfilePath, workDir, target stri
}),
))
return err
}, append(defaultRetryOpts, retry.Context(ctx))...); err != nil {
}, retrypolicy.DoOpts{
FileSize: 0, // manifest is small
FileName: "manifest",
}); err != nil {
return fmt.Errorf("failed to build model manifest: %w", err)
}

logrus.Infof("build: built artifact %s", target)
return nil
}

func (b *backend) getProcessors(modelfile modelfile.Modelfile, cfg *config.Build) []processor.Processor {
func (b *backend) getProcessors(
modelfile modelfile.Modelfile,
cfg *config.Build,
) []processor.Processor {
processors := []processor.Processor{}

if configs := modelfile.GetConfigs(); len(configs) > 0 {
mediaType := modelspec.MediaTypeModelWeightConfig
if cfg.Raw {
mediaType = modelspec.MediaTypeModelWeightConfigRaw
}
processors = append(processors, processor.NewModelConfigProcessor(b.store, mediaType, configs, ""))
processors = append(
processors,
processor.NewModelConfigProcessor(b.store, mediaType, configs, ""),
)
}

if models := modelfile.GetModels(); len(models) > 0 {
Expand Down Expand Up @@ -201,10 +217,23 @@ func (b *backend) getProcessors(modelfile modelfile.Modelfile, cfg *config.Build
}

// process walks the user work directory and process the identified files.
func (b *backend) process(ctx context.Context, builder build.Builder, workDir string, pb *internalpb.ProgressBar, cfg *config.Build, processors ...processor.Processor) ([]ocispec.Descriptor, error) {
func (b *backend) process(
ctx context.Context,
builder build.Builder,
workDir string,
pb *internalpb.ProgressBar,
cfg *config.Build,
processors ...processor.Processor,
) ([]ocispec.Descriptor, error) {
descriptors := []ocispec.Descriptor{}
for _, p := range processors {
descs, err := p.Process(ctx, builder, workDir, processor.WithConcurrency(cfg.Concurrency), processor.WithProgressTracker(pb))
descs, err := p.Process(
ctx,
builder,
workDir,
processor.WithConcurrency(cfg.Concurrency),
processor.WithProgressTracker(pb),
)
if err != nil {
return nil, err
}
Expand Down
46 changes: 34 additions & 12 deletions pkg/backend/fetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@ package backend
import (
"context"
"encoding/json"
"errors"
"fmt"
"sync"
"time"

"github.com/bmatcuk/doublestar/v4"
legacymodelspec "github.com/dragonflyoss/model-spec/specs-go/v1"
modelspec "github.com/modelpack/model-spec/specs-go/v1"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"

internalpb "github.com/modelpack/modctl/internal/pb"
"github.com/modelpack/modctl/pkg/backend/remote"
"github.com/modelpack/modctl/pkg/config"
"github.com/modelpack/modctl/pkg/retrypolicy"
)

// Fetch fetches partial files to the output.
Expand Down Expand Up @@ -74,10 +76,7 @@ func (b *backend) Fetch(ctx context.Context, target string, cfg *config.Fetch) e
for _, layer := range manifest.Layers {
for _, pattern := range cfg.Patterns {
if anno := layer.Annotations; anno != nil {
path := anno[modelspec.AnnotationFilepath]
if path == "" {
path = anno[legacymodelspec.AnnotationFilepath]
}
path := getAnnotationFilepath(anno)
// Use doublestar.PathMatch for pattern matching to support ** recursive matching
// PathMatch uses the system's native path separator (like filepath.Match) while
// also supporting recursive patterns like **/*.json
Expand All @@ -101,9 +100,12 @@ func (b *backend) Fetch(ctx context.Context, target string, cfg *config.Fetch) e
pb.Start()
defer pb.Stop()

g, ctx := errgroup.WithContext(ctx)
g := new(errgroup.Group)
g.SetLimit(cfg.Concurrency)

var mu sync.Mutex
var errs []error

logrus.Infof("fetch: fetching %d matched layers", len(layers))
for _, layer := range layers {
g.Go(func() error {
Expand All @@ -113,17 +115,37 @@ func (b *backend) Fetch(ctx context.Context, target string, cfg *config.Fetch) e
default:
}

annoFilepath := getAnnotationFilepath(layer.Annotations)

logrus.Debugf("fetch: processing layer %s", layer.Digest)
if err := pullAndExtractFromRemote(ctx, pb, internalpb.NormalizePrompt("Fetching blob"), client, cfg.Output, layer); err != nil {
return err
if err := retrypolicy.Do(ctx, func(rctx context.Context) error {
return pullAndExtractFromRemote(rctx, pb, internalpb.NormalizePrompt("Fetching blob"), client, cfg.Output, layer)
}, retrypolicy.DoOpts{
FileSize: layer.Size,
FileName: annoFilepath,
OnRetry: func(attempt uint, reason string, backoff time.Duration) {
if bar := pb.Get(layer.Digest.String()); bar != nil {
bar.SetRefill(bar.Current())
bar.SetCurrent(0)
bar.EwmaSetCurrent(0, time.Second)
}
},
Comment on lines +126 to +132
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use the pb.Placeholder method to reset the progress bar during retries. This avoids manual manipulation of the progress bar and ensures that the reset logic (including refill and EWMA reset) is applied consistently.

				OnRetry: func(attempt uint, reason string, backoff time.Duration) {
					pb.Placeholder(layer.Digest.String(), internalpb.NormalizePrompt("Fetching blob"), layer.Size)
				},

}); err != nil {
mu.Lock()
errs = append(errs, err)
mu.Unlock()
} else {
logrus.Debugf("fetch: successfully processed layer %s", layer.Digest)
}

logrus.Debugf("fetch: successfully processed layer %s", layer.Digest)
return nil
})
}

if err := g.Wait(); err != nil {
_ = g.Wait()
if ctx.Err() != nil {
return fmt.Errorf("fetch cancelled: %w", ctx.Err())
}
if err := errors.Join(errs...); err != nil {
return err
}

Expand Down
Loading