diff --git a/pkg/registry/auth/transport.go b/pkg/registry/auth/transport.go index 44ac0d614a..183d66a98f 100644 --- a/pkg/registry/auth/transport.go +++ b/pkg/registry/auth/transport.go @@ -23,7 +23,10 @@ func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) { // Get token from source token, err := t.Source.Token(req.Context()) if err != nil { - return nil, fmt.Errorf("failed to get auth token: %w", err) + // Any token acquisition failure is an auth error regardless of cause. + // Wrapping with ErrRegistryAuthRequired ensures refreshCache() and other + // callers can distinguish auth failures from transient network errors. + return nil, fmt.Errorf("%w: failed to get auth token: %w", ErrRegistryAuthRequired, err) } // If token is empty, pass through without auth diff --git a/pkg/registry/provider_cached.go b/pkg/registry/provider_cached.go index 85bdced249..e22001be49 100644 --- a/pkg/registry/provider_cached.go +++ b/pkg/registry/provider_cached.go @@ -6,6 +6,7 @@ package registry import ( "context" "encoding/json" + "errors" "fmt" "os" "path/filepath" @@ -112,7 +113,9 @@ func (p *CachedAPIRegistryProvider) GetRegistry() (*types.Registry, error) { } // refreshCache fetches fresh data from the API and updates the cache. -// If the API fetch fails, returns stale cache if available. +// Auth errors (ErrRegistryAuthRequired, ErrRegistryUnauthorized) are always +// propagated — stale cache must never mask a changed authentication state. +// For transient failures (network blip, 5xx) stale cache is returned if available. func (p *CachedAPIRegistryProvider) refreshCache() (*types.Registry, error) { p.cacheMu.Lock() defer p.cacheMu.Unlock() @@ -120,7 +123,11 @@ func (p *CachedAPIRegistryProvider) refreshCache() (*types.Registry, error) { // Fetch from API registry, err := p.APIRegistryProvider.GetRegistry() if err != nil { - // If fetch fails and we have stale cache, return it + // Auth errors must propagate — stale cache must not mask a changed auth state. + if errors.Is(err, auth.ErrRegistryAuthRequired) || errors.Is(err, api.ErrRegistryUnauthorized) { + return nil, err + } + // Transient failures (network blip, 5xx): degrade gracefully to stale cache. if p.cachedData != nil { return p.cachedData, nil } diff --git a/pkg/registry/provider_cached_authbug_test.go b/pkg/registry/provider_cached_authbug_test.go new file mode 100644 index 0000000000..4815940b4f --- /dev/null +++ b/pkg/registry/provider_cached_authbug_test.go @@ -0,0 +1,172 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package registry + +import ( + "context" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/stacklok/toolhive/pkg/registry/api" + "github.com/stacklok/toolhive/pkg/registry/auth" +) + +// TestCachedProvider_AuthErrorNotMaskedByStaleCache reproduces a bug in +// refreshCache() at provider_cached.go:116. +// +// Scenario: a user has an existing cache populated from a previous successful +// fetch. Later, their registry returns 401 (token revoked, credentials +// rejected server-side). The CLI should surface the auth error, because the +// user's authorization state may have changed since the cache was populated. +// +// Current behavior: refreshCache() treats ANY error from the upstream fetch +// as a signal to fall back to stale cached data, including authentication +// errors. The CLI silently prints stale registry contents with no error. +// +// This test covers the 401/403 branch (server-side rejection). +func TestCachedProvider_AuthErrorNotMaskedByStaleCache(t *testing.T) { + t.Parallel() + + var returnUnauthorized atomic.Bool + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + if returnUnauthorized.Load() { + w.WriteHeader(http.StatusUnauthorized) + return + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"servers":[],"metadata":{"next_cursor":""}}`)) + })) + t.Cleanup(srv.Close) + + // allowPrivateIp=true because httptest binds to 127.0.0.1 + // usePersistent=false exercises only the in-memory cache path + // tokenSource=nil so the constructor's validation probe runs against the healthy server + provider, err := NewCachedAPIRegistryProvider(srv.URL, true, false, nil) + require.NoError(t, err, "constructor should succeed while server is healthy") + + // Populate the in-memory cache with a successful fetch. + _, err = provider.ListServers() + require.NoError(t, err, "first fetch should succeed and populate cache") + + // Flip the server to 401 — equivalent to the registry now rejecting + // the user's credentials server-side. + returnUnauthorized.Store(true) + + // Force a refresh — mirrors what happens when the in-memory cache TTL + // expires (default 1h) and the next `thv registry list` triggers a + // fresh fetch. The upstream API call will fail with 401. + err = provider.ForceRefresh() + + require.Error(t, err, + "expected auth error to propagate on refresh; got nil — "+ + "stale cache is masking the auth failure (bug)") + require.True(t, + errors.Is(err, api.ErrRegistryUnauthorized) || + errors.Is(err, auth.ErrRegistryAuthRequired), + "expected ErrRegistryUnauthorized or ErrRegistryAuthRequired; got: %v", err) +} + +// failingTokenSource is a test double for auth.TokenSource. +// It returns an empty token (no Authorization header added) when fail is +// false, and returns an error wrapped like oauthTokenSource.Token() would +// when its browser OAuth flow fails, when fail is true. +type failingTokenSource struct { + fail atomic.Bool +} + +// Token mirrors the exact error-wrapping oauthTokenSource.Token() applies +// when its browser flow times out / is cancelled. See: +// - pkg/registry/auth/oauth_token_source.go:61-67 (outer wrap) +// - pkg/registry/auth/oauth_token_source.go:110-113 (inner wrap) +// +// The point of matching the wrapping is to verify that the eventual +// refreshCache() fallback behavior does not depend on the specific error +// type — any error causes stale cache to be served. +func (s *failingTokenSource) Token(_ context.Context) (string, error) { + if s.fail.Load() { + inner := fmt.Errorf("oauth flow start failed: %w", + errors.New("authorization timed out waiting for browser callback")) + return "", fmt.Errorf("oauth flow failed: %w", inner) + } + // No auth header needed — the httptest server does not validate credentials. + return "", nil +} + +// TestCachedProvider_OAuthFlowFailureNotMaskedByStaleCache reproduces the +// same masking bug via the OAuth-browser-flow-failure code path — the one +// actually described in the bug report: +// +// "thv registry list sent me a URL to OAuth. I did nothing. After a minute +// it exited the OAuth flow and went to another OAuth flow. I also did +// nothing. In another minute it exited that OAuth flow and then it +// showed me everything in the registry." +// +// This path differs from the 401/403 path because the error returned by +// oauthTokenSource.Token() when the browser flow times out is a generic +// wrapped error — it does NOT match errors.Is(err, auth.ErrRegistryAuthRequired) +// or errors.Is(err, api.ErrRegistryUnauthorized). It is propagated through: +// +// oauthTokenSource.Token ("oauth flow failed: ...") +// -> auth.Transport.RoundTrip ("failed to get auth token: ...") +// -> api.Client.fetchServersPage ("failed to fetch servers: ...") +// -> APIRegistryProvider.GetRegistry (wrapped into *UnavailableError) +// -> refreshCache (swallowed; stale cache returned with nil err) +// +// A sentinel-check fix like +// +// if errors.Is(err, auth.ErrRegistryAuthRequired) || +// errors.Is(err, api.ErrRegistryUnauthorized) { return nil, err } +// +// will NOT catch this path — the OAuth browser-flow error carries neither +// sentinel. A correct fix has to classify token-acquisition failures as +// auth errors before they are flattened into UnavailableError. +func TestCachedProvider_OAuthFlowFailureNotMaskedByStaleCache(t *testing.T) { + t.Parallel() + + // Server always serves a valid empty list. All failures in this test + // come from the token source, not the server — simulating an OAuth + // flow that never completes while the registry itself is reachable. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"servers":[],"metadata":{"next_cursor":""}}`)) + })) + t.Cleanup(srv.Close) + + ts := &failingTokenSource{} + + // Non-nil tokenSource causes NewAPIRegistryProvider to skip its + // construction-time validation probe (provider_api.go:57) — matching + // the real OAuth-configured code path where the probe is skipped + // because a browser flow cannot complete within 10 seconds. + provider, err := NewCachedAPIRegistryProvider(srv.URL, true, false, ts) + require.NoError(t, err, "constructor should succeed (probe skipped when tokenSource != nil)") + + // Populate the in-memory cache with a successful fetch. + // ts.fail == false, so Token() returns ("", nil) and the request passes + // through the auth transport without an Authorization header. + _, err = provider.ListServers() + require.NoError(t, err, "first fetch should succeed and populate cache") + + // Simulate the OAuth browser flow failing / timing out on the next + // token acquisition. + ts.fail.Store(true) + + err = provider.ForceRefresh() + + // DESIRED: an OAuth-flow failure must not be hidden by stale cache. + // CURRENT BUG: refreshCache returns cached data with nil error. + require.Error(t, err, + "expected OAuth flow failure to propagate on refresh; got nil — "+ + "stale cache is masking the OAuth failure (bug, "+ + "matches user-reported scenario)") +}