diff --git a/cmd/entire/cli/activity_cmd_test.go b/cmd/entire/cli/activity_cmd_test.go index 32ccafbfc..d932cad41 100644 --- a/cmd/entire/cli/activity_cmd_test.go +++ b/cmd/entire/cli/activity_cmd_test.go @@ -36,6 +36,10 @@ func TestRunActivity_SilencesContextCanceled(t *testing.T) { return nil, context.Canceled }) t.Cleanup(auth.SetManagerForTest(t, mgr)) + // Force discovery-unavailable so ResolveDataAPIToken takes the static + // fallback through the singleton test manager above, rather than making a + // real network fetch to the configured data host. + t.Cleanup(auth.SetResolveContextForAPIForTest(t, auth.DiscoveryUnavailableForTest)) var out, errOut bytes.Buffer err := runActivity(t.Context(), &out, &errOut) @@ -67,6 +71,10 @@ func TestRunActivity_PrintsLoginHintOnNotLoggedIn(t *testing.T) { return nil, errors.New("unreachable") }) t.Cleanup(auth.SetManagerForTest(t, mgr)) + // Force discovery-unavailable so ResolveDataAPIToken takes the static + // fallback through the singleton test manager above, rather than making a + // real network fetch to the configured data host. + t.Cleanup(auth.SetResolveContextForAPIForTest(t, auth.DiscoveryUnavailableForTest)) var out, errOut bytes.Buffer err := runActivity(t.Context(), &out, &errOut) diff --git a/cmd/entire/cli/api_client.go b/cmd/entire/cli/api_client.go index 5981fc049..3d87bbde4 100644 --- a/cmd/entire/cli/api_client.go +++ b/cmd/entire/cli/api_client.go @@ -38,10 +38,12 @@ func NewAuthenticatedAPIClient(ctx context.Context, insecureHTTP bool) (*api.Cli } } - // tokenmanager validates Resource as a strict origin URL; strip any path - // the operator may have included in ENTIRE_API_BASE_URL before handing - // it across the package boundary. 
- token, err := auth.TokenForResource(ctx, api.OriginOnly(dataURL)) + // ResolveDataAPIToken discovers which login context the data host trusts + // (via its /.well-known/entire-api.json) and exchanges that context's + // login token for a bearer scoped to the data host origin, falling back to static resolution + // when the host doesn't advertise discovery. It normalises dataURL to an + // origin internally. + token, err := auth.ResolveDataAPIToken(ctx, dataURL) if err != nil { if errors.Is(err, auth.ErrNotLoggedIn) { // Wrap the original err (not the sentinel) so any context diff --git a/cmd/entire/cli/auth.go b/cmd/entire/cli/auth.go index 99ded1799..34f23c5fb 100644 --- a/cmd/entire/cli/auth.go +++ b/cmd/entire/cli/auth.go @@ -75,18 +75,6 @@ func newAuthSessionsClient(coreURL, token string) *api.Client { return api.NewClientWithBaseURL(token, coreURL).WithAuthSessionsPath(coreAuthSessionsPath) } -// resolveAuthHostToken returns a bearer scoped for the auth host (entire-core). -// For the auth host's own origin the tokenmanager hits the same-host shortcut -// and returns the stored login JWT unchanged — keeping the entire:session -// scope that core's session endpoints (and /me) require, with no STS exchange. -func resolveAuthHostToken(ctx context.Context) (string, error) { - token, err := auth.TokenForResource(ctx, api.OriginOnly(api.AuthBaseURL())) - if err != nil { - return "", fmt.Errorf("resolve auth-host token: %w", err) - } - return token, nil -} - // isKeychainTokenRejected reports whether err indicates the stored // keyring token can't authenticate against entire-core. 
Failure modes that // collapse into the single "the user must re-login" branch: @@ -167,7 +155,7 @@ func newAuthStatusCmd() *cobra.Command { if err := requireSecureBaseURL(insecureHTTPAuth); err != nil { return err } - target, err := resolveStatusTarget(auth.NewContextStore(), auth.Contexts, api.AuthBaseURL()) + target, err := resolveStatusTarget(cmd.Context(), auth.NewContextStore(), auth.Contexts, auth.RefreshedLoginToken, api.AuthBaseURL()) if err != nil { return err } @@ -208,6 +196,11 @@ type authSessionLister func(ctx context.Context, coreURL, token string) ([]api.A // name. Injected for testability; production wires auth.Contexts. type contextsProvider func() ([]*contexts.Context, string, error) +// loginTokenResolver returns a usable login JWT for a context, transparently +// re-minting an expired one from the stored refresh token. Injected so status +// tests don't reach the network; production wires auth.RefreshedLoginToken. +type loginTokenResolver func(ctx context.Context, c *contexts.Context) (string, error) + // statusTarget is the resolved core to act against: the active context's // CoreURL + its session token, or (no active context) the configured // AuthBaseURL + legacy keyring entry. Shared by `auth status` (profile + @@ -219,17 +212,29 @@ type statusTarget struct { totalContexts int } -// resolveStatusTarget picks the core + token for `entire auth status`. The -// active contexts.json context wins (so `auth use` retargets status onto that -// login server); otherwise it falls back to the legacy keyring entry keyed by -// the configured auth host. +// resolveStatusTarget picks the core + token for `entire auth status` (and +// `logout`). The active contexts.json context wins (so `auth use` retargets +// status onto that login server); otherwise it falls back to the legacy keyring +// entry keyed by the configured auth host. 
+// +// For the active context the token is resolved through resolveLogin, which +// transparently re-mints an expired login JWT from the stored refresh token. +// This is the point of the refresh: an expired-but-refreshable session must +// report "logged in", not "re-login" — the same false negative +// auth.ResolveControlPlaneTarget already avoids for org/repo/project/grant. +// `logout` benefits too: the refreshed bearer can authenticate the revoke call +// instead of failing on an expired token. When refresh fails (revoked family, +// network, opaque token), we fall back to the stored token and let the /me +// liveness probe be the arbiter — preserving the accurate "no longer valid" +// outcome for a genuinely dead session (ErrReauthRequired → expired token → +// 401 → re-login). // // A genuine contexts.json read/parse error is surfaced, not swallowed — a // missing file reads as "no contexts" (no error), so an error here means the // file is corrupt or unreadable, which the user must see. This keeps status // symmetric with the control-plane commands (auth.ResolveControlPlaneTarget), // which fail the same way rather than silently degrading to a stale identity. -func resolveStatusTarget(store tokenStore, listContexts contextsProvider, fallbackBaseURL string) (statusTarget, error) { +func resolveStatusTarget(ctx context.Context, store tokenStore, listContexts contextsProvider, resolveLogin loginTokenResolver, fallbackBaseURL string) (statusTarget, error) { all, current, err := listContexts() if err != nil { return statusTarget{}, fmt.Errorf("load contexts: %w", err) @@ -239,6 +244,12 @@ func resolveStatusTarget(store tokenStore, listContexts contextsProvider, fallba if c.Name != current || c.CoreURL == "" { continue } + // Prefer a refreshed token; fall back to the raw stored token so a + // refresh failure degrades to today's behaviour rather than dropping + // to the legacy entry. 
+ if tok, terr := resolveLogin(ctx, c); terr == nil && tok != "" { + return statusTarget{coreURL: c.CoreURL, token: tok, activeContext: c.Name, totalContexts: total}, nil + } if tok, terr := auth.LoginTokenForContext(c); terr == nil && tok != "" { return statusTarget{coreURL: c.CoreURL, token: tok, activeContext: c.Name, totalContexts: total}, nil } diff --git a/cmd/entire/cli/auth/data_api.go b/cmd/entire/cli/auth/data_api.go new file mode 100644 index 000000000..4773f25a9 --- /dev/null +++ b/cmd/entire/cli/auth/data_api.go @@ -0,0 +1,118 @@ +package auth + +import ( + "context" + "errors" + "net/http" + "net/url" + "time" + + "github.com/entireio/cli/cmd/entire/cli/api" + "github.com/entireio/cli/internal/entireclient/clusterdiscovery" + "github.com/entireio/cli/internal/entireclient/contexts" + "github.com/entireio/cli/internal/entireclient/discovery" +) + +// dataAPIDiscoveryTimeout bounds the one /.well-known/entire-api.json GET we +// add per data-API command. Kept short: on any failure we fall back to static +// resolution, so a slow or absent endpoint must not stall the command. +const dataAPIDiscoveryTimeout = 8 * time.Second + +// resolveContextForAPIFunc is the shape of the discovery seam: it mirrors +// clusterdiscovery.ResolveContextForAPI (ctx, configDir, cacheDir, apiHost, +// httpClient, debugf). +type resolveContextForAPIFunc func(context.Context, string, string, string, *http.Client, clusterdiscovery.DebugFunc) (*contexts.Context, error) + +// resolveContextForAPI is the discovery seam, swapped in tests so they don't +// reach the network. See SetResolveContextForAPIForTest for cross-package tests. +var resolveContextForAPI resolveContextForAPIFunc = clusterdiscovery.ResolveContextForAPI + +// SetResolveContextForAPIForTest overrides the /.well-known/entire-api.json +// discovery seam and returns a cleanup func. 
Tests in other packages that +// exercise a data-API command (activity/search/dispatch/recap) MUST install +// this — otherwise ResolveDataAPIToken makes a real network call to the +// configured data host and bypasses any SetManagerForTest fallback seam. Pass +// a func returning clusterdiscovery.ErrDiscoveryUnavailable to force the static +// fallback path. Test-only. +func SetResolveContextForAPIForTest(t interface{ Helper() }, fn resolveContextForAPIFunc) func() { + t.Helper() + prev := resolveContextForAPI + resolveContextForAPI = fn + return func() { resolveContextForAPI = prev } +} + +// DiscoveryUnavailableForTest is a ready-made SetResolveContextForAPIForTest +// value that forces the discovery-unavailable fallback (no network), so a +// cross-package test exercises the static TokenForResource path deterministically. +func DiscoveryUnavailableForTest(context.Context, string, string, string, *http.Client, clusterdiscovery.DebugFunc) (*contexts.Context, error) { + return nil, clusterdiscovery.ErrDiscoveryUnavailable +} + +// ResolveDataAPIToken returns a bearer for the data API at dataBaseURL. +// +// It dials the API's /.well-known/entire-api.json to learn which login +// server(s) the API trusts, picks the +// matching local auth context (active-wins-if-eligible → sole → explicit +// choice), and exchanges that context's login JWT at that context's core for a +// bearer whose audience is the data host origin (derived from dataBaseURL, not read from discovery). This is what makes +// +// ENTIRE_API_BASE_URL=https://partial.to entire activity +// +// authenticate as the partial.to login even while the active context is a +// prod entire.io login — without the operator also setting ENTIRE_AUTH_BASE_URL. +// +// When the API doesn't advertise discovery (404 / unreachable / 503 / +// malformed — e.g. a deployment predating the well-known), it falls back to +// the pre-discovery static path (TokenForResource through the singleton +// manager) so behaviour is never worse than before. 
A reachable API whose +// context selection fails (no eligible context, or several with none active) +// surfaces that error directly — the user must log in or pick one. +// +// Callers that honour --insecure-http-auth must call EnableInsecureHTTP before +// invoking this (as they already do); the per-context exchange and the static +// fallback both read that global opt-in. +func ResolveDataAPIToken(ctx context.Context, dataBaseURL string) (string, error) { + dataOrigin := api.OriginOnly(dataBaseURL) + host, ok := hostOf(dataOrigin) + if !ok { + // Can't derive a host to discover against — use static resolution. + return TokenForResource(ctx, dataOrigin) + } + + // Bridge any pre-contexts.json login so the resolver can match it, mirroring + // the git remote helper's cold-boot path. Best-effort: a migration failure + // must not block resolution. + _, _ = MigrateLegacyLoginContext() //nolint:errcheck // best-effort bridge; resolution proceeds regardless + + dctx, cancel := context.WithTimeout(ctx, dataAPIDiscoveryTimeout) + defer cancel() + httpClient := &http.Client{Timeout: dataAPIDiscoveryTimeout} + + selected, err := resolveContextForAPI(dctx, contexts.DefaultConfigDir(), discovery.DefaultCacheDir(), host, httpClient, nil) + if errors.Is(err, clusterdiscovery.ErrDiscoveryUnavailable) { + // Old deployment / not rolled out / transient — preserve today's behaviour. + return TokenForResource(ctx, dataOrigin) + } + if err != nil { + return "", err + } + + // Exchange for the data host origin; the token manager derives the RFC 8693 + // audience from it, which is the aud the API requires (aud == base URI). + allowInsecure := insecureHTTPEnabled() || isLoopbackHTTP(selected.CoreURL) + provider, err := NewRefreshingResourceProvider(selected, dataOrigin, nil, allowInsecure) + if err != nil { + return "", err + } + return provider(ctx) +} + +// hostOf returns the host[:port] of an origin URL, ok=false when it can't be +// parsed into a host. 
+func hostOf(origin string) (string, bool) { + u, err := url.Parse(origin) + if err != nil || u.Host == "" { + return "", false + } + return u.Host, true +} diff --git a/cmd/entire/cli/auth/data_api_test.go b/cmd/entire/cli/auth/data_api_test.go new file mode 100644 index 000000000..e9d2ac41a --- /dev/null +++ b/cmd/entire/cli/auth/data_api_test.go @@ -0,0 +1,182 @@ +package auth + +import ( + "context" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "path/filepath" + "testing" + "time" + + "github.com/entireio/auth-go/sts" + "github.com/entireio/auth-go/tokenmanager" + + "github.com/entireio/cli/cmd/entire/cli/api" + "github.com/entireio/cli/internal/entireclient/clusterdiscovery" + "github.com/entireio/cli/internal/entireclient/contexts" + "github.com/entireio/cli/internal/entireclient/tokenstore" +) + +// These tests drive process-global state (the token-store backend, the +// discovery seam, the provider singleton) so they cannot run in parallel. + +// stubResolveContextForAPI swaps the discovery seam for the duration of the +// test, restoring it after. +func stubResolveContextForAPI(t *testing.T, fn resolveContextForAPIFunc) { + t.Helper() + prev := resolveContextForAPI + resolveContextForAPI = fn + t.Cleanup(func() { resolveContextForAPI = prev }) +} + +// When the API doesn't advertise discovery, resolution falls back to the static +// path — so with no login it surfaces ErrNotLoggedIn, exactly as before the +// discovery layer existed (proving we took the fallback branch, not the +// per-context one, which would name a context instead). 
+func TestResolveDataAPIToken_FallbackWhenDiscoveryUnavailable(t *testing.T) { + t.Setenv("ENTIRE_CONFIG_DIR", t.TempDir()) + t.Setenv(api.AuthBaseURLEnvVar, "") + restore := tokenstore.UseFileBackendForTesting(filepath.Join(t.TempDir(), "tokens.json")) + t.Cleanup(restore) + + stubResolveContextForAPI(t, func(context.Context, string, string, string, *http.Client, clusterdiscovery.DebugFunc) (*contexts.Context, error) { + return nil, fmt.Errorf("%w: 404", clusterdiscovery.ErrDiscoveryUnavailable) + }) + + // Pin the singleton manager to an empty store so the static fallback's + // TokenForResource reports not-logged-in deterministically — the + // process-global manager is otherwise frozen by whichever earlier test + // built it first. + mgr, err := tokenmanager.New(tokenmanager.Config{ + Issuer: "https://entire.io", + ClientID: "entire-cli", + Store: contextTokenStore{service: "empty-service", handle: "nobody"}, + }) + if err != nil { + t.Fatalf("build empty manager: %v", err) + } + t.Cleanup(SetManagerForTest(t, mgr)) + + _, err = ResolveDataAPIToken(context.Background(), "https://entire.io") + if !errors.Is(err, ErrNotLoggedIn) { + t.Fatalf("want ErrNotLoggedIn from the static fallback, got %v", err) + } +} + +// A reachable API whose context selection fails is a real error the user must +// act on — it must surface, not silently fall back to static resolution. 
+func TestResolveDataAPIToken_SurfacesSelectionError(t *testing.T) { + t.Setenv("ENTIRE_CONFIG_DIR", t.TempDir()) + restore := tokenstore.UseFileBackendForTesting(filepath.Join(t.TempDir(), "tokens.json")) + t.Cleanup(restore) + + sentinel := errors.New("multiple login contexts can authenticate against API host entire.io") + stubResolveContextForAPI(t, func(context.Context, string, string, string, *http.Client, clusterdiscovery.DebugFunc) (*contexts.Context, error) { + return nil, sentinel + }) + + _, err := ResolveDataAPIToken(context.Background(), "https://entire.io") + if !errors.Is(err, sentinel) { + t.Fatalf("want the selection error surfaced verbatim, got %v", err) + } +} + +// The success path: discovery picks a context, and the provider exchanges that +// context's login JWT at its core for an audience equal to the data host +// origin (the aud the API requires), returning the exchanged token. The +// audience is derived from the resource origin by the token manager, not read +// from discovery. +func TestResolveDataAPIToken_ExchangesForDataHostOrigin(t *testing.T) { + t.Setenv("ENTIRE_CONFIG_DIR", t.TempDir()) + restore := tokenstore.UseFileBackendForTesting(filepath.Join(t.TempDir(), "tokens.json")) + t.Cleanup(restore) + + // v2 provider so the exchange POSTs to the core's /oauth/token STS path. 
+ SetProviderForTest(t, Provider{ClientID: "entire-cli", TokenPath: "/oauth/token", STSPath: "/oauth/token"}) + + const dataOrigin = "https://data.example" + const wantAudience = "https://data.example" + + var gotAudience, gotResource, gotGrant string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = r.ParseForm() //nolint:errcheck // test handler + gotGrant = r.FormValue("grant_type") + gotAudience = r.FormValue("audience") + gotResource = r.FormValue("resource") + w.Header().Set("Content-Type", "application/json") + _, _ = fmt.Fprint(w, `{"access_token":"exchanged-token","token_type":"Bearer","expires_in":3600}`) + })) + defer srv.Close() + + // Seed a fresh login JWT for a context whose core is the STS server, so the + // provider needs no refresh and goes straight to the exchange. + svc := tokenstore.CoreKeyringService(srv.URL) + jwt := makeJWT(t, fmt.Sprintf(`{"iss":%q,"handle":"me","exp":%d}`, srv.URL, time.Now().Add(2*time.Hour).Unix())) + if err := tokenstore.Set(svc, "me", tokenstore.EncodeTokenWithExpiration(jwt, 7200)); err != nil { + t.Fatalf("seed token: %v", err) + } + ctxObj := &contexts.Context{Name: "me@core", CoreURL: srv.URL, Handle: "me", KeychainService: svc} + + stubResolveContextForAPI(t, func(context.Context, string, string, string, *http.Client, clusterdiscovery.DebugFunc) (*contexts.Context, error) { + return ctxObj, nil + }) + + // allowInsecure flows from the loopback http core (srv.URL) automatically. 
+ token, err := ResolveDataAPIToken(context.Background(), dataOrigin) + if err != nil { + t.Fatalf("ResolveDataAPIToken: %v", err) + } + if token != "exchanged-token" { + t.Fatalf("token = %q, want the exchanged token", token) + } + if gotGrant != sts.GrantTypeTokenExchange { + t.Fatalf("grant_type = %q, want token-exchange", gotGrant) + } + if gotAudience != wantAudience { + t.Fatalf("audience = %q, want the data host origin %q (derived from the resource)", gotAudience, wantAudience) + } + if want := mustOrigin(t, dataOrigin); gotResource != want { + t.Fatalf("resource = %q, want the data origin %q", gotResource, want) + } +} + +func TestNewRefreshingResourceProvider_Validation(t *testing.T) { + t.Parallel() + if _, err := NewRefreshingResourceProvider(nil, "https://data.example", nil, false); err == nil { + t.Fatal("want error for nil context") + } + if _, err := NewRefreshingResourceProvider(&contexts.Context{Name: "x", CoreURL: "https://core.example"}, "https://data.example", nil, false); err == nil { + t.Fatal("want error for a context with no keychain slot") + } +} + +// When the selected context has no stored token, the provider's error must +// still unwrap to ErrNotLoggedIn so callers (NewAuthenticatedAPIClient, search, +// dispatch) that branch on errors.Is render their login guidance — the +// regression the PR review flagged on the discovery path. 
+func TestNewRefreshingResourceProvider_NotLoggedInPreservesSentinel(t *testing.T) { + restore := tokenstore.UseFileBackendForTesting(filepath.Join(t.TempDir(), "tokens.json")) + t.Cleanup(restore) + + c := &contexts.Context{Name: "me@core", CoreURL: "https://core.example", Handle: "me", KeychainService: "kc:me"} + provider, err := NewRefreshingResourceProvider(c, "https://data.example", nil, false) + if err != nil { + t.Fatalf("NewRefreshingResourceProvider: %v", err) + } + _, err = provider(context.Background()) + if !errors.Is(err, ErrNotLoggedIn) { + t.Fatalf("provider error must unwrap to ErrNotLoggedIn, got %v", err) + } +} + +func mustOrigin(t *testing.T, raw string) string { + t.Helper() + u, err := url.Parse(raw) + if err != nil { + t.Fatalf("parse %q: %v", raw, err) + } + return u.Scheme + "://" + u.Host +} diff --git a/cmd/entire/cli/auth/refresh.go b/cmd/entire/cli/auth/refresh.go index 2e1d1451a..6399796e0 100644 --- a/cmd/entire/cli/auth/refresh.go +++ b/cmd/entire/cli/auth/refresh.go @@ -104,6 +104,81 @@ func (s contextTokenStore) DeleteTokens(string) error { return nil } +// newContextTokenManager builds the per-context auth-go tokenmanager that both +// NewRefreshingLoginProvider and NewRefreshingResourceProvider sit on. Keying +// Issuer on c.CoreURL is the whole point: store reads, the refresh grant, and +// the STS exchange all target that context's core (the bug the singleton +// manager — pinned to AuthBaseURL — has when the active context lives on a +// different core). +// +// STSPath is set unconditionally even for the login-only provider: Refresh() +// never reaches the exchange path, so an unused STSPath is harmless, and a +// single config keeps the two providers from drifting. +// +// transport carries the caller's TLS configuration; allowInsecureHTTP permits +// an http:// core/resource for loopback/dev. 
+func newContextTokenManager(c *contexts.Context, transport http.RoundTripper, allowInsecureHTTP bool) (*tokenmanager.Manager, error) { + if c == nil { + return nil, errors.New("nil context") + } + if c.KeychainService == "" || c.Handle == "" { + return nil, fmt.Errorf("context %q has no keychain slot", c.Name) + } + mgr, err := tokenmanager.New(tokenmanager.Config{ + Issuer: strings.TrimRight(c.CoreURL, "/"), + ClientID: CurrentProvider().ClientID, + STSPath: CurrentProvider().STSPath, + RefreshPath: CurrentProvider().TokenPath, + Store: contextTokenStore{service: c.KeychainService, handle: c.Handle}, + Transport: transport, + AllowInsecureHTTP: allowInsecureHTTP, + UserAgent: CurrentProvider().ClientID, + }) + if err != nil { + return nil, fmt.Errorf("init token manager for context %q: %w", c.Name, err) + } + return mgr, nil +} + +// reauthError carries a friendly, context-named re-login message while still +// unwrapping to the underlying tokenmanager sentinel. Callers that branch on +// errors.Is(err, ErrNotLoggedIn) (NewAuthenticatedAPIClient, search, dispatch) +// keep matching — without this, the discovery path turned a missing keyring +// token into an opaque string and those callers fell through to their generic +// error, a regression vs the pre-discovery TokenForResource path. Error() +// returns only msg so the sentinel's terse text ("not logged in") doesn't leak +// into the rendered message. +type reauthError struct { + msg string + sentinel error +} + +func (e *reauthError) Error() string { return e.msg } +func (e *reauthError) Unwrap() error { return e.sentinel } + +// contextReauthError maps the two re-auth sentinels a per-context manager can +// return into a friendly message that names the context and its core (so a +// multi-core user logs back into the right one — matching +// clusterdiscovery.RenderLoginHint's idiom), preserving the sentinel for +// errors.Is. 
Returns nil when err is neither sentinel, leaving the caller to +// wrap the residual error in its own terms (refresh vs exchange). +func contextReauthError(c *contexts.Context, err error) error { + coreURL := strings.TrimRight(c.CoreURL, "/") + switch { + case errors.Is(err, tokenmanager.ErrReauthRequired): + return &reauthError{ + msg: fmt.Sprintf("login session for %q (%s) expired; run `entire login` to re-authenticate", c.Name, coreURL), + sentinel: tokenmanager.ErrReauthRequired, + } + case errors.Is(err, tokenmanager.ErrNotLoggedIn): + return &reauthError{ + msg: fmt.Sprintf("no usable login for %q (%s); run `entire login`", c.Name, coreURL), + sentinel: tokenmanager.ErrNotLoggedIn, + } + } + return nil +} + // NewRefreshingLoginProvider returns a login-JWT provider (the shape // repocreds wants) for context c that transparently re-mints an expired // login JWT from the stored refresh token. @@ -125,38 +200,73 @@ func (s contextTokenStore) DeleteTokens(string) error { // transport carries the caller's TLS configuration; allowInsecureHTTP // permits an http:// core for loopback/dev. func NewRefreshingLoginProvider(c *contexts.Context, transport http.RoundTripper, allowInsecureHTTP bool) (func(context.Context) (string, error), error) { + mgr, err := newContextTokenManager(c, transport, allowInsecureHTTP) + if err != nil { + return nil, err + } + return func(ctx context.Context) (string, error) { + tok, err := mgr.Refresh(ctx) + if mapped := contextReauthError(c, err); mapped != nil { + return "", mapped + } + if err != nil { + return "", fmt.Errorf("refresh login token: %w", err) + } + return tok, nil + }, nil +} + +// RefreshedLoginToken returns context c's login JWT, transparently re-minting +// an expired one from the stored refresh token. It is the convenience form of +// NewRefreshingLoginProvider for callers that want a single token now (e.g. 
+// `auth status` / `logout`, which must report a refreshable session as alive +// rather than telling the user to re-login). The insecure-HTTP decision mirrors +// the control-plane resolver: loopback cores and the --insecure-http-auth +// opt-in are permitted, everything else requires https. +// +// Errors preserve the tokenmanager sentinels (ErrReauthRequired when the +// session is genuinely dead, ErrNotLoggedIn when no credential is usable) so +// callers can branch on errors.Is. +func RefreshedLoginToken(ctx context.Context, c *contexts.Context) (string, error) { if c == nil { - return nil, errors.New("nil context") + return "", errors.New("nil context") } - if c.KeychainService == "" || c.Handle == "" { - return nil, fmt.Errorf("context %q has no keychain slot", c.Name) + provider, err := NewRefreshingLoginProvider(c, nil, insecureHTTPEnabled() || isLoopbackHTTP(c.CoreURL)) + if err != nil { + return "", err } - mgr, err := tokenmanager.New(tokenmanager.Config{ - Issuer: strings.TrimRight(c.CoreURL, "/"), - ClientID: CurrentProvider().ClientID, - RefreshPath: CurrentProvider().TokenPath, - Store: contextTokenStore{service: c.KeychainService, handle: c.Handle}, - Transport: transport, - AllowInsecureHTTP: allowInsecureHTTP, - UserAgent: CurrentProvider().ClientID, - }) + return provider(ctx) +} + +// NewRefreshingResourceProvider returns a provider that mints a bearer valid +// for resourceOrigin, by exchanging context c's login JWT at c's own core (RFC +// 8693). It is NewRefreshingLoginProvider's sibling for resource servers: where +// that returns the bare login JWT (the control plane / cluster cases, where the +// host is the core), this performs the token exchange the data API requires. +// +// Both the silent login-JWT re-mint and the exchange run through the shared +// per-context tokenmanager (newContextTokenManager). resourceOrigin must +// already be origin-only (no path). 
No audience is passed: the token manager +// defaults the RFC 8693 audience to the resource origin, which is exactly what +// the data API requires (aud == its base URI), so the audience is derived from +// the host being dialed rather than read from discovery. Exchanged tokens are +// cached in-process by the tokenmanager for the life of this process. +// +// transport carries the caller's TLS configuration; allowInsecureHTTP permits +// an http:// core/resource for loopback/dev. +func NewRefreshingResourceProvider(c *contexts.Context, resourceOrigin string, transport http.RoundTripper, allowInsecureHTTP bool) (func(context.Context) (string, error), error) { + mgr, err := newContextTokenManager(c, transport, allowInsecureHTTP) if err != nil { - return nil, fmt.Errorf("init token manager for context %q: %w", c.Name, err) + return nil, err } - name := c.Name - coreURL := strings.TrimRight(c.CoreURL, "/") - // Name the core in the re-login hint so a multi-core user logs back - // into the right one; matches clusterdiscovery.RenderLoginHint's idiom. 
- relogin := fmt.Sprintf("ENTIRE_AUTH_BASE_URL=%s entire login", coreURL) + req := tokenmanager.TokenRequest{Resource: resourceOrigin} return func(ctx context.Context) (string, error) { - tok, err := mgr.Refresh(ctx) - switch { - case errors.Is(err, tokenmanager.ErrReauthRequired): - return "", fmt.Errorf("login session for %q (%s) expired; run `%s` to re-authenticate", name, coreURL, relogin) - case errors.Is(err, tokenmanager.ErrNotLoggedIn): - return "", fmt.Errorf("no usable login for %q (%s); run `%s`", name, coreURL, relogin) - case err != nil: - return "", fmt.Errorf("refresh login token: %w", err) + tok, err := mgr.Token(ctx, req) + if mapped := contextReauthError(c, err); mapped != nil { + return "", mapped + } + if err != nil { + return "", fmt.Errorf("exchange token for %s: %w", resourceOrigin, err) } return tok, nil }, nil diff --git a/cmd/entire/cli/auth/repo_token.go b/cmd/entire/cli/auth/repo_token.go index 2f90fe610..89b966cfe 100644 --- a/cmd/entire/cli/auth/repo_token.go +++ b/cmd/entire/cli/auth/repo_token.go @@ -60,18 +60,21 @@ func SetRepoExchangeTransportForTest(rt http.RoundTripper) func() { // surface verbatim from the STS endpoint (e.g. invalid_target when no // mirror matches the slug+cluster). // -// The subject token is the stored login access token read directly, -// rather than routed through the refresh-aware tokenmanager. That's -// deliberate for two reasons: (1) `entire login` (device flow) stores only -// a bare access token — no refresh token — so there is nothing the manager -// could refresh that this path can't equally use; an expired login token -// fails both ways. (2) The manager's exchange also emits an RFC 8693 -// `resource` parameter alongside `audience`, whereas the data-plane gate +// The subject token is the stored login access token read directly, rather +// than routed through the refresh-aware tokenmanager — so this path does NOT +// silently re-mint an expired login JWT, even though one is now refreshable. 
+// This is a known gap slated for removal: COR-395 reworks RepoScopedToken to +// go through cluster discovery + the per-context refreshing provider (the +// dead resolveAuthHostToken helper has already been removed). Two reasons it stayed +// direct until then: (1) historically `entire login` stored only a bare access +// token, so there was nothing to refresh — no longer true now that login +// requests `offline_access` and persists a refresh token, which is exactly why +// this needs the COR-395 rework. (2) The manager's exchange also emits an RFC +// 8693 `resource` parameter alongside `audience`, whereas the data-plane gate // keys solely on `audience`; going direct keeps the wire form byte-for-byte // what git-remote-entire (and the standalone entiredb CLI) already send. // Each call performs a fresh exchange and does not cache — callers that -// poll (e.g. the mirror clone wait) re-invoke on token expiry. If the CLI -// gains refresh tokens, route this through the tokenmanager instead. 
func RepoScopedToken(ctx context.Context, clusterBaseURL, repoSlug, action string) (string, error) { provider := CurrentProvider() if strings.TrimSpace(provider.STSPath) == "" { diff --git a/cmd/entire/cli/auth_context_test.go b/cmd/entire/cli/auth_context_test.go index d2be00692..fbcfd1554 100644 --- a/cmd/entire/cli/auth_context_test.go +++ b/cmd/entire/cli/auth_context_test.go @@ -2,6 +2,7 @@ package cli import ( "bytes" + "context" "encoding/base64" "fmt" "os" @@ -11,6 +12,7 @@ import ( "time" "github.com/entireio/cli/cmd/entire/cli/auth" + "github.com/entireio/cli/internal/entireclient/contexts" "github.com/entireio/cli/internal/entireclient/tokenstore" "github.com/spf13/cobra" ) @@ -25,15 +27,15 @@ func TestResolveStatusTarget_PrefersActiveContext(t *testing.T) { t.Cleanup(restore) exp := time.Now().Add(time.Hour).Unix() - if _, err := auth.RecordLoginContext(makeContextJWT(t, fmt.Sprintf(`{"iss":"https://eu.auth.entire.io","handle":"alice","exp":%d}`, exp)), "", true); err != nil { + if _, err := auth.RecordLoginContext(makeContextJWT(t, fmt.Sprintf(`{"iss":"`+testCoreURL+`","handle":"alice","exp":%d}`, exp)), "", true); err != nil { t.Fatalf("record context: %v", err) } - got, err := resolveStatusTarget(auth.NewContextStore(), auth.Contexts, "https://fallback.example.com") + got, err := resolveStatusTarget(t.Context(), auth.NewContextStore(), auth.Contexts, auth.RefreshedLoginToken, "https://fallback.example.com") if err != nil { t.Fatalf("resolveStatusTarget: %v", err) } - if got.coreURL != "https://eu.auth.entire.io" { + if got.coreURL != testCoreURL { t.Errorf("coreURL = %q, want the active context's CoreURL", got.coreURL) } if got.token == "" { @@ -44,6 +46,67 @@ func TestResolveStatusTarget_PrefersActiveContext(t *testing.T) { } } +// TestResolveStatusTarget_PrefersRefreshedToken pins the fix: status uses the +// refreshed login JWT for the active context, so an expired-but-refreshable +// session reports "logged in" rather than the false "re-login" 
the raw read +// produced. The resolver returns a token distinct from what's stored; we assert +// status carries the refreshed one. +func TestResolveStatusTarget_PrefersRefreshedToken(t *testing.T) { + cfgDir := t.TempDir() + t.Setenv("ENTIRE_CONFIG_DIR", cfgDir) + restore := tokenstore.UseFileBackendForTesting(filepath.Join(t.TempDir(), "tokens.json")) + t.Cleanup(restore) + + // Stored token is expired; a raw read would 401 at /me → "re-login". + expired := time.Now().Add(-time.Hour).Unix() + if _, err := auth.RecordLoginContext(makeContextJWT(t, fmt.Sprintf(`{"iss":"`+testCoreURL+`","handle":"alice","exp":%d}`, expired)), "entr_refresh", true); err != nil { + t.Fatalf("record context: %v", err) + } + + refreshed := func(_ context.Context, _ *contexts.Context) (string, error) { return "refreshed-jwt", nil } + got, err := resolveStatusTarget(t.Context(), auth.NewContextStore(), auth.Contexts, refreshed, "https://fallback.example.com") + if err != nil { + t.Fatalf("resolveStatusTarget: %v", err) + } + if got.token != "refreshed-jwt" { + t.Errorf("token = %q, want the refreshed token (not the stale stored one)", got.token) + } + if got.coreURL != testCoreURL { + t.Errorf("coreURL = %q, want the active context's CoreURL", got.coreURL) + } +} + +// TestResolveStatusTarget_FallsBackToStoredWhenRefreshFails pins the safety net: +// when refresh fails (revoked family, network, opaque token) status drops to the +// stored token and lets the /me probe arbitrate — rather than skipping to the +// legacy entry or losing the active context. 
+func TestResolveStatusTarget_FallsBackToStoredWhenRefreshFails(t *testing.T) { + cfgDir := t.TempDir() + t.Setenv("ENTIRE_CONFIG_DIR", cfgDir) + restore := tokenstore.UseFileBackendForTesting(filepath.Join(t.TempDir(), "tokens.json")) + t.Cleanup(restore) + + exp := time.Now().Add(time.Hour).Unix() + stored := makeContextJWT(t, fmt.Sprintf(`{"iss":"`+testCoreURL+`","handle":"alice","exp":%d}`, exp)) + if _, err := auth.RecordLoginContext(stored, "", true); err != nil { + t.Fatalf("record context: %v", err) + } + + failRefresh := func(_ context.Context, _ *contexts.Context) (string, error) { + return "", auth.ErrNotLoggedIn + } + got, err := resolveStatusTarget(t.Context(), auth.NewContextStore(), auth.Contexts, failRefresh, "https://fallback.example.com") + if err != nil { + t.Fatalf("resolveStatusTarget: %v", err) + } + if got.token != stored { + t.Errorf("token = %q, want the stored token as fallback", got.token) + } + if got.coreURL != testCoreURL || got.activeContext == "" { + t.Errorf("want the active context preserved on fallback, got coreURL=%q activeContext=%q", got.coreURL, got.activeContext) + } +} + // A genuine contexts.json read/parse error is surfaced by resolveStatusTarget, // symmetric with the control-plane commands — not swallowed into the legacy // fallback. (A missing file reads as "no contexts" and is not an error.) 
@@ -56,7 +119,7 @@ func TestResolveStatusTarget_CorruptContextsErrors(t *testing.T) { if err := os.WriteFile(filepath.Join(cfgDir, "contexts.json"), []byte("{ not valid json"), 0o600); err != nil { t.Fatalf("write corrupt contexts.json: %v", err) } - if _, err := resolveStatusTarget(auth.NewContextStore(), auth.Contexts, "https://fallback.example.com"); err == nil { + if _, err := resolveStatusTarget(t.Context(), auth.NewContextStore(), auth.Contexts, auth.RefreshedLoginToken, "https://fallback.example.com"); err == nil { t.Fatal("want an error when contexts.json is corrupt, got nil") } } diff --git a/cmd/entire/cli/auth_test.go b/cmd/entire/cli/auth_test.go index 9f248a055..7a6edebfe 100644 --- a/cmd/entire/cli/auth_test.go +++ b/cmd/entire/cli/auth_test.go @@ -271,72 +271,65 @@ func TestAuthCmd_RegistersExpectedSubcommands(t *testing.T) { } } -// --- resolveDataAPIToken ---------------------------------------------------- -// -// These tests exercise the production path: they install a real -// tokenmanager.Manager via auth.SetManagerForTest and stub only the -// STS wire call via SetExchangeForTest. That covers the audience- -// matching logic the function-injection tests above can't reach -// (revokeCurrentAuthSession / revokeAllAuthSessions call -// resolveAuthHostToken directly, but unit tests for the surrounding flows -// inject fakes that bypass it). - // authResolveTestIssuer is intentionally distinct from api.AuthBaseURL() so // the manager's same-host shortcut is skipped and the STS-exchange path runs. const authResolveTestIssuer = "https://auth.resolve-test.example.com" -func TestResolveAuthHostToken_ScopesExchangeToAuthHostOrigin(t *testing.T) { - // No t.Parallel: SetManagerForTest mutates package-level state in the - // auth package. Concurrent tests in this package don't reach the real - // auth.TokenForResource path (they inject lister/revoker fakes), so - // serial execution here is purely defensive. 
+// --- shared token-manager test helpers -------------------------------------- +// +// Used by the data-API resolution tests (activity_cmd_test.go and friends) to +// install a real tokenmanager.Manager via auth.SetManagerForTest while stubbing +// only the STS wire call, so the static fallback path runs end-to-end without a +// live core. - store := newAuthMemStore() - saveCoreToken(t, store, authResolveTestIssuer, "opaque-core-token") +// authMemStore is an in-memory tokenstore.Store for tests that need a +// real tokenmanager.Manager. Mirrors the private memStore in auth-go's +// tokenmanager_test.go — that one isn't exported, so we duplicate the +// trivial implementation rather than pull in a fragile internal package. +type authMemStore struct { + data map[string]tokens.TokenSet +} - var capturedResource string - mgr := newResolveTestManager(t, store, func(_ context.Context, req sts.ExchangeRequest) (*tokens.TokenSet, error) { - capturedResource = req.Resource - return &tokens.TokenSet{AccessToken: "exchanged-auth-host-tok"}, nil - }) - t.Cleanup(auth.SetManagerForTest(t, mgr)) +func newAuthMemStore() *authMemStore { return &authMemStore{data: map[string]tokens.TokenSet{}} } - got, err := resolveAuthHostToken(t.Context()) - if err != nil { - t.Fatalf("resolveAuthHostToken: %v", err) - } +func (s *authMemStore) SaveTokens(profile string, t tokens.TokenSet) error { + s.data[profile] = t + return nil +} - if got != "exchanged-auth-host-tok" { - t.Errorf("token = %q, want %q", got, "exchanged-auth-host-tok") - } - // The whole point of the helper: when an exchange happens, the resource - // handed to STS must be the auth host's origin (where the session - // endpoints live), not the raw env-var value. 
- if want := api.OriginOnly(api.AuthBaseURL()); capturedResource != want { - t.Errorf("STS exchange Resource = %q, want %q (api.OriginOnly(api.AuthBaseURL()))", - capturedResource, want) +func (s *authMemStore) LoadTokens(profile string) (tokens.TokenSet, error) { + t, ok := s.data[profile] + if !ok { + return tokens.TokenSet{}, tokenstore.ErrNotFound } + return t, nil } -func TestResolveAuthHostToken_WrapsManagerError(t *testing.T) { - store := newAuthMemStore() - saveCoreToken(t, store, authResolveTestIssuer, "opaque-core-token") - - mgr := newResolveTestManager(t, store, func(context.Context, sts.ExchangeRequest) (*tokens.TokenSet, error) { - return nil, errors.New("simulated transport failure") - }) - t.Cleanup(auth.SetManagerForTest(t, mgr)) +func (s *authMemStore) DeleteTokens(profile string) error { + delete(s.data, profile) + return nil +} - _, err := resolveAuthHostToken(t.Context()) - if err == nil { - t.Fatal("expected error when exchange fails") - } - if !strings.Contains(err.Error(), "resolve auth-host token") { - t.Errorf("error = %v, want 'resolve auth-host token' wrap prefix", err) +func saveCoreToken(t *testing.T, store tokenstore.Store, profile, accessToken string) { + t.Helper() + if err := store.SaveTokens(profile, tokens.TokenSet{AccessToken: accessToken}); err != nil { + t.Fatalf("SaveTokens: %v", err) } - if !strings.Contains(err.Error(), "simulated transport failure") { - t.Errorf("error = %v, want underlying message preserved", err) +} + +func newResolveTestManager(t *testing.T, store tokenstore.Store, exchange func(context.Context, sts.ExchangeRequest) (*tokens.TokenSet, error)) *tokenmanager.Manager { + t.Helper() + mgr, err := tokenmanager.New(tokenmanager.Config{ + Issuer: authResolveTestIssuer, + ClientID: "entire-cli-test", + STSPath: "/sts/token", + Store: store, + }) + if err != nil { + t.Fatalf("tokenmanager.New: %v", err) } + tokenmanager.SetExchangeForTest(t, mgr, exchange) + return mgr } // --- isKeychainTokenRejected 
----------------------------------------------- @@ -379,58 +372,6 @@ func TestIsKeychainTokenRejected_AllShapes(t *testing.T) { } } -// --- helpers for resolveAuthHostToken tests --------------------------------- - -// authMemStore is an in-memory tokenstore.Store for tests that need a -// real tokenmanager.Manager. Mirrors the private memStore in auth-go's -// tokenmanager_test.go — that one isn't exported, so we duplicate the -// trivial implementation rather than pull in a fragile internal package. -type authMemStore struct { - data map[string]tokens.TokenSet -} - -func newAuthMemStore() *authMemStore { return &authMemStore{data: map[string]tokens.TokenSet{}} } - -func (s *authMemStore) SaveTokens(profile string, t tokens.TokenSet) error { - s.data[profile] = t - return nil -} - -func (s *authMemStore) LoadTokens(profile string) (tokens.TokenSet, error) { - t, ok := s.data[profile] - if !ok { - return tokens.TokenSet{}, tokenstore.ErrNotFound - } - return t, nil -} - -func (s *authMemStore) DeleteTokens(profile string) error { - delete(s.data, profile) - return nil -} - -func saveCoreToken(t *testing.T, store tokenstore.Store, profile, accessToken string) { - t.Helper() - if err := store.SaveTokens(profile, tokens.TokenSet{AccessToken: accessToken}); err != nil { - t.Fatalf("SaveTokens: %v", err) - } -} - -func newResolveTestManager(t *testing.T, store tokenstore.Store, exchange func(context.Context, sts.ExchangeRequest) (*tokens.TokenSet, error)) *tokenmanager.Manager { - t.Helper() - mgr, err := tokenmanager.New(tokenmanager.Config{ - Issuer: authResolveTestIssuer, - ClientID: "entire-cli-test", - STSPath: "/sts/token", - Store: store, - }) - if err != nil { - t.Fatalf("tokenmanager.New: %v", err) - } - tokenmanager.SetExchangeForTest(t, mgr, exchange) - return mgr -} - func TestAuthCmd_TopLevelLoginAndLogoutStillRegistered(t *testing.T) { t.Parallel() diff --git a/cmd/entire/cli/dispatch/mode_local.go b/cmd/entire/cli/dispatch/mode_local.go index 
780a4d4da..3b9c00c3d 100644 --- a/cmd/entire/cli/dispatch/mode_local.go +++ b/cmd/entire/cli/dispatch/mode_local.go @@ -24,11 +24,12 @@ import ( ) var ( - // lookupResourceToken returns a bearer scoped to the given resource - // origin. Production wiring goes through auth.TokenForResource so - // the tokenmanager's same-host shortcut, JWT-aud shortcut, and - // exchange dispatch all apply. Tests swap to a fixed-token closure. - lookupResourceToken = auth.TokenForResource + // lookupResourceToken returns a bearer for the given data-API base URL. + // Production wiring goes through auth.ResolveDataAPIToken so the dispatch + // host's /.well-known/entire-api.json picks the matching login context + // (falling back to static resolution when unadvertised). Tests swap to a + // fixed-token closure. + lookupResourceToken = auth.ResolveDataAPIToken nowUTC = func() time.Time { return time.Now().UTC() } ) diff --git a/cmd/entire/cli/logout.go b/cmd/entire/cli/logout.go index d3cdff7d3..bb9ce4799 100644 --- a/cmd/entire/cli/logout.go +++ b/cmd/entire/cli/logout.go @@ -73,8 +73,10 @@ func newLogoutCmd() *cobra.Command { } // Revoke against the active context's core (matching what - // `auth status` lists), not a static AuthBaseURL. - target, err := resolveStatusTarget(auth.NewContextStore(), auth.Contexts, api.AuthBaseURL()) + // `auth status` lists), not a static AuthBaseURL. The refreshing + // resolver means an expired-but-refreshable session still yields a + // bearer that can authenticate the revoke call. 
+ target, err := resolveStatusTarget(cmd.Context(), auth.NewContextStore(), auth.Contexts, auth.RefreshedLoginToken, api.AuthBaseURL()) if err != nil { return err } diff --git a/cmd/entire/cli/recap.go b/cmd/entire/cli/recap.go index d80f6b197..bf4c245d6 100644 --- a/cmd/entire/cli/recap.go +++ b/cmd/entire/cli/recap.go @@ -168,20 +168,22 @@ func runRecap(ctx context.Context, w, errW io.Writer, f *recapFlags) error { // 401s via recapLoadErrorMessage so flag effects (--week, --agent, ...) // and the real auth error are not collapsed into one "sign in" hint. // -// Goes through auth.TokenForResource so split-host deployments get a -// resource-scoped bearer via RFC 8693 exchange. ErrNotLoggedIn is -// collapsed back into an empty token so the caller's "render with no -// bearer, let the server respond 401" path still fires. Every other -// resolution failure (STS exchange rejected, network error, audience -// misconfiguration, keyring locked) surfaces verbatim to the caller — -// previously these were all relabelled as keyring read failures via -// keyringReadError, which sent users on wild goose chases when the -// keyring was fine and the real problem was downstream. +// Goes through auth.ResolveDataAPIToken (the same context-aware path as +// activity/search/dispatch) so the data host's /.well-known/entire-api.json +// picks the matching login context and exchanges for the advertised audience, +// falling back to static resolution when discovery is unavailable. +// ErrNotLoggedIn is collapsed back into an empty token so the caller's "render +// with no bearer, let the server respond 401" path still fires. Every other +// resolution failure (no eligible/ambiguous context, STS exchange rejected, +// network error, keyring locked) surfaces verbatim to the caller — previously +// these were all relabelled as keyring read failures via keyringReadError, +// which sent users on wild goose chases when the keyring was fine and the real +// problem was downstream. 
func newRecapClient(ctx context.Context, insecureHTTP bool) (*api.Client, error) { if insecureHTTP { auth.EnableInsecureHTTP() } - token, err := auth.TokenForResource(ctx, api.OriginOnly(api.BaseURL())) + token, err := auth.ResolveDataAPIToken(ctx, api.BaseURL()) if errors.Is(err, auth.ErrNotLoggedIn) { token = "" err = nil diff --git a/cmd/entire/cli/search_cmd.go b/cmd/entire/cli/search_cmd.go index 1f85ee720..6f2e9f964 100644 --- a/cmd/entire/cli/search_cmd.go +++ b/cmd/entire/cli/search_cmd.go @@ -216,7 +216,7 @@ func resolveSearchToken(ctx context.Context, serviceURL string, insecureHTTPAuth if insecureHTTPAuth { auth.EnableInsecureHTTP() } - token, err := auth.TokenForResource(ctx, api.OriginOnly(serviceURL)) + token, err := auth.ResolveDataAPIToken(ctx, serviceURL) if errors.Is(err, auth.ErrNotLoggedIn) { return "", errors.New("not authenticated. Run 'entire login' to authenticate") } diff --git a/docs/architecture/upstream-host-resolution.md b/docs/architecture/upstream-host-resolution.md index 86168ba03..25364ae89 100644 --- a/docs/architecture/upstream-host-resolution.md +++ b/docs/architecture/upstream-host-resolution.md @@ -20,7 +20,7 @@ accept a core's JWTs. 
|---|---|---|---| | **Core** — IdP **and** control-plane API, co-located | `entire-core` (`us.auth.entire.io`) | `org` / `repo` / `project` / `grant`, `auth *`, `login` | none needed — the host *is* the core | | **Resource: git cluster** | `entire-server` / `entiredb` | `git-remote-entire` (clone/push) | `/.well-known/entire-cluster.json` → `core_urls` | -| **Resource: web/data API** | `entire.io` (`partial.to`) | `activity` / `search` / `trail` / `dispatch` | **none today** — see [Deferred](#deferred) | +| **Resource: web/data API** | `entire.io` (`partial.to`) | `activity` / `search` / `trail` / `dispatch` | `/.well-known/entire-api.json` → `trusted_issuers` (audience = the host origin) | `contexts.json` (`$ENTIRE_CONFIG_DIR/contexts.json`, shared with entiredb's CLIs) stores each login as `{Name, CoreURL, Handle, KeychainService}` plus a @@ -69,38 +69,74 @@ api.AuthBaseURL()`. When the active context lives on a *different* core, both its token-store reads and its STS/refresh endpoint are keyed on the wrong host. The per-context provider fixes that by keying on `c.CoreURL`. -## Deferred: the web/data API (`entire.io`) +### Web/data API (done) `activity` / `search` / `trail` / `dispatch` dial `ENTIRE_API_BASE_URL` -(default `entire.io`) and statically resolve their token. `entire.io` is a -**resource server** — it validates incoming JWTs against statically-configured -trusted issuers (`ENTIRE_CORE_BASE_URL` + `ENTIRE_CORE_TRUSTED_ISSUERS`) and a -fixed audience (`ENTIRE_CORE_JWT_AUDIENCE`, e.g. `entire-web-api`) — but it -does **not advertise** any of this. So the CLI can't map an `entire.io` host -back to a core/context the way it does for a git cluster. - -To close the gap (so `ENTIRE_API_BASE_URL=https://partial.to entire activity` -auto-selects the right context without also setting `ENTIRE_AUTH_BASE_URL`): - -1. **Server**: `entire.io` grows a `/.well-known/entire-api.json` advertising - its trust roots. 
Unlike the cluster blob (`core_urls` only), the API blob - must also carry the **audience** the CLI exchanges for: - ```json - { - "issuer": "https://us.auth.partial.to", - "trusted_issuers": ["https://us.auth.partial.to", "https://eu.auth.partial.to"], - "audience": "entire-web-api", - "jwks_uri": "https://us.auth.partial.to/.well-known/jwks.json" - } - ``` -2. **CLI**: generalize the cluster resolver into a shared "host → trusted - issuers → pick context" path whose *source* of trusted issuers is pluggable - (cluster.json / api.json / the core itself), then exchange the context's - token for the advertised audience via `auth.TokenForResource`. Wire it into - the `activity` / `search` / `trail` / `dispatch` constructors - (`NewAuthenticatedAPIClient`, `dispatch.NewCloudClient`, `search.Search`). - -`ENTIRE_API_BASE_URL` names the resource host to dial (the discovery target); -the context is then chosen from the issuers that host advertises. It does not -need to be paired with `ENTIRE_AUTH_BASE_URL` once discovery exists — that is -the whole point of the well-known. +(default `entire.io`; staging `partial.to`). `entire.io` is a **resource +server** — it validates incoming JWTs against trusted issuers +(`ENTIRE_CORE_BASE_URL` + `ENTIRE_CORE_TRUSTED_ISSUERS`) and a fixed audience +(`ENTIRE_CORE_JWT_AUDIENCE`). It now **advertises** all of this at +`/.well-known/entire-api.json`, so the CLI can map the API host back to a +core/context just like a git cluster: + +```json +{ + "issuer": "https://us.auth.partial.to", + "trusted_issuers": ["https://us.auth.partial.to", "https://eu.auth.partial.to"], + "audience": "https://partial.to", + "jwks_uris": {"https://us.auth.partial.to": "https://us.auth.partial.to/.well-known/jwks.json"} +} +``` + +The CLI reads **only `trusted_issuers`** — exactly the way the git path reads a +cluster's `core_urls`. `issuer`, `audience`, and `jwks_uris` are advertised but +ignored on decode (see the audience note below). 
+ +> **Audience = the data host origin.** entire.io's `ENTIRE_CORE_JWT_AUDIENCE` is +> `https://entire.io` (prod) / `https://partial.to` (staging) — the data host's +> own base URI, on both environments. The token manager already defaults the RFC +> 8693 audience to the resource origin it's exchanging for, so dialing +> `https://entire.io` produces `aud = https://entire.io` with no special +> handling. The CLI therefore **derives** the audience from the host it's already +> dialing rather than reading the advertised `audience` field. (This trades away +> the "server changes audience without a CLI release" flexibility — acceptable +> because `aud == base URI` is a hard requirement on both environments.) + +Because the only field the CLI consumes is the trusted-issuer list — which *is* +a set of core URLs — the data-API discovery cache is literally the git cluster's +cores cache (`ClusterCoresCache`), in a separate file (`api_discovery.json`). + +Resolution (`auth.ResolveDataAPIToken`): + +1. Resolve the API host's trusted issuers: `api_discovery.json` when fresh, else + a live `/.well-known/entire-api.json` fetch (TLS-authenticated — it's a trust + root; redirects refused), cached with a 24h TTL and stale-fallback on a failed + re-fetch. Same `resolveClusterCores` shape the git path uses. +2. Pick the context with the **same cluster semantics** as the git path: + active-context-wins-if-eligible → sole eligible → explicit-choice error. + This is the lever that makes `ENTIRE_API_BASE_URL=https://partial.to entire + activity` authenticate as the partial.to login even while the active context + is a prod entire.io login — without also setting `ENTIRE_AUTH_BASE_URL`. +3. Exchange that context's login JWT at **its** core for the data host origin + (`auth.NewRefreshingResourceProvider`, keyed on `c.CoreURL` like the + control-plane provider; the token manager sets `aud` = that origin). +4. 
**Fallback**: if the host doesn't advertise discovery (404 / unreachable / + 503 / malformed) *and* no cache entry exists, fall back to the pre-discovery + static path (`TokenForResource` via the singleton manager), so behaviour is + never worse than before. A *reachable* host whose context selection fails + surfaces that error — the user must log in or pick one. (A transient outage + with a warm cache uses the stale entry, not the fallback.) + +The selection rule differs from the control plane (where the active context +*always* wins because there's no host to match): here a host **is** matched, so +the active context wins only when eligible. + +Key files: `cmd/entire/cli/auth/data_api.go` (`ResolveDataAPIToken` + fallback), +`cmd/entire/cli/auth/refresh.go` (`NewRefreshingResourceProvider`), +`internal/entireclient/clusterdiscovery/api_discovery.go` (`DiscoverAPI`, +`ResolveContextForAPI`, sharing `selectContext` *and* the cores cache with the +cluster path), `internal/entireclient/discovery/cluster_cores.go` +(`LoadAPICores`/`ModifyAPICores`). Seams: +`NewAuthenticatedAPIClient` (activity/trail/search-completion), +`dispatch/mode_local.go` `lookupResourceToken` (dispatch), +`search_cmd.go` `resolveSearchToken` (search). diff --git a/internal/entireclient/clusterdiscovery/api_discovery.go b/internal/entireclient/clusterdiscovery/api_discovery.go new file mode 100644 index 000000000..6f8c23cde --- /dev/null +++ b/internal/entireclient/clusterdiscovery/api_discovery.go @@ -0,0 +1,142 @@ +package clusterdiscovery + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/entireio/cli/internal/entireclient/contexts" + "github.com/entireio/cli/internal/entireclient/discovery" +) + +// APIPath is the well-known path a data/web API (entire.io) serves to advertise +// its trust roots, mirroring entire.io's api/src/app.ts route. 
The document also +// carries an `audience`, but the CLI doesn't consume it: the data API requires +// `aud` == its own base URI (https://entire.io / https://partial.to), which the +// token manager already derives from the resource origin it's dialing, so the +// only field the CLI needs is the trusted-issuer list — exactly like a git +// cluster's core_urls. +const APIPath = "/.well-known/entire-api.json" + +// APIResponse is the parsed shape of /.well-known/entire-api.json. The CLI reads +// only trusted_issuers (to pick the login context); issuer/audience/jwks_uris +// are server-side concerns and ignored on decode. +type APIResponse struct { + // TrustedIssuers is every core whose JWTs the API accepts. Used the same way + // cluster discovery uses core_urls: to pick the local context whose CoreURL + // the API will honour. + TrustedIssuers []string `json:"trusted_issuers"` +} + +// ErrDiscoveryUnavailable wraps every "the API didn't give us a usable +// trust-root document" outcome: it doesn't serve /.well-known/entire-api.json +// (404 — old deployment), is unreachable, answers 503 (unconfigured), or +// returns a malformed/empty body. Callers match on it to fall back to +// static token resolution so behaviour is never worse than before +// discovery existed. Selection failures (no eligible / ambiguous +// context) are NOT wrapped — those are real "log in / pick one" errors +// the user must see. +var ErrDiscoveryUnavailable = errors.New("api discovery unavailable") + +// DiscoverAPI fetches and parses an API host's /.well-known/entire-api.json, +// returning its trusted issuers. Every failure mode (transport, non-200, +// decode, empty trusted_issuers) is folded under ErrDiscoveryUnavailable so the +// caller has a single sentinel to fall back on. +// +// debugf is optional; nil suppresses debug output. 
+func DiscoverAPI(ctx context.Context, apiHost string, c *http.Client, debugf DebugFunc) (*APIResponse, error) { + if debugf == nil { + debugf = func(string, ...any) {} + } + var body APIResponse + if err := fetchWellKnownJSON(ctx, apiHost, APIPath, c, &body, debugf); err != nil { + return nil, fmt.Errorf("%w: %w", ErrDiscoveryUnavailable, err) + } + if len(body.TrustedIssuers) == 0 { + debugf("api discovery: no trusted_issuers in response from https://%s%s", apiHost, APIPath) + return nil, fmt.Errorf("%w: incomplete /.well-known/entire-api.json from %s", ErrDiscoveryUnavailable, apiHost) + } + return &body, nil +} + +// resolveAPICores returns apiHost's trusted issuer URLs, from +// api_discovery.json when fresh, otherwise via a live +// /.well-known/entire-api.json fetch (which is then cached). A stale-but-present +// cache entry is used as a fallback when the live fetch fails, so a brief outage +// doesn't break a command whose trust roots we already knew. Mirrors +// resolveClusterCores exactly — the data-API trusted issuers ARE core URLs, so +// they share the cores cache (different file). Cold failures stay folded under +// ErrDiscoveryUnavailable (from DiscoverAPI) for the caller's static fallback. +func resolveAPICores(ctx context.Context, cacheDir, apiHost string, httpClient *http.Client, debugf DebugFunc) ([]string, error) { + cache, err := discovery.LoadAPICores(cacheDir) + if err != nil { + // A cache read problem must not block resolution — discover live. 
+ debugf("api-discovery cache load failed: %v; discovering live", err) + cache = nil + } + + var stale []string + if cache != nil { + if urls, fresh, ok := cache.Get(apiHost); ok { + if fresh { + debugf("api host %s trusted issuers from cache: %v", apiHost, urls) + return urls, nil + } + stale = urls + debugf("api host %s trusted-issuers cache expired; re-fetching %s", apiHost, APIPath) + } + } + + body, err := DiscoverAPI(ctx, apiHost, httpClient, debugf) + if err != nil { + if stale != nil { + debugf("api discovery for %s failed (%v); falling back to stale cached trusted issuers %v", apiHost, err, stale) + return stale, nil + } + return nil, err + } + + if mErr := discovery.ModifyAPICores(cacheDir, func(c discovery.ClusterCoresCache) error { + c.Set(apiHost, body.TrustedIssuers) + return nil + }); mErr != nil { + // Non-fatal: we resolved the issuers, the next command just re-fetches. + debugf("api-discovery cache write for %s failed: %v", apiHost, mErr) + } + return body.TrustedIssuers, nil +} + +// ResolveContextForAPI picks the local login context to authenticate data-API +// calls against apiHost. +// +// It mirrors ResolveContextForCluster: active context wins when its CoreURL is +// among the API's trusted issuers, else the sole eligible context, else an +// explicit-choice / login error — sourcing the trusted issuers from +// /.well-known/entire-api.json (cached in api_discovery.json, long TTL, +// re-fetched on expiry with stale fallback) instead of entire-cluster.json. +// Account selection is recomputed every call from the live contexts, never +// persisted. The caller exchanges the chosen context's token for the data host +// origin (which is the audience the API requires); no audience is read here. +// +// When the API doesn't advertise discovery (404 / unreachable / 503 / +// malformed) and no cache entry exists, the returned error wraps +// ErrDiscoveryUnavailable so the caller falls back to static resolution. 
A +// successful fetch whose context selection fails returns that selection error +// unwrapped — the user must act on it. +// +// debugf is optional; nil suppresses debug output. +func ResolveContextForAPI(ctx context.Context, configDir, cacheDir, apiHost string, httpClient *http.Client, debugf DebugFunc) (*contexts.Context, error) { + if debugf == nil { + debugf = func(string, ...any) {} + } + trustedIssuers, err := resolveAPICores(ctx, cacheDir, apiHost, httpClient, debugf) + if err != nil { + return nil, err + } + f, err := contexts.Load(configDir) + if err != nil { + return nil, fmt.Errorf("load contexts: %w", err) + } + return selectContext(f, "API host "+apiHost, trustedIssuers, debugf) +} diff --git a/internal/entireclient/clusterdiscovery/api_discovery_test.go b/internal/entireclient/clusterdiscovery/api_discovery_test.go new file mode 100644 index 000000000..7cf62db40 --- /dev/null +++ b/internal/entireclient/clusterdiscovery/api_discovery_test.go @@ -0,0 +1,332 @@ +package clusterdiscovery + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/entireio/cli/internal/entireclient/contexts" + "github.com/entireio/cli/internal/entireclient/discovery" +) + +// schemeRewriteTransport rewrites the scheme to http (DiscoverAPI hard-codes +// https://) while leaving the host untouched, so a cross-origin redirect +// reaches its real target rather than being pinned back to the first server. 
+type schemeRewriteTransport struct{ base http.RoundTripper } + +func (s schemeRewriteTransport) RoundTrip(req *http.Request) (*http.Response, error) { + req.URL.Scheme = "http" + return s.base.RoundTrip(req) +} + +// apiDiscoveryBody carries issuer/audience/jwks_uris alongside trusted_issuers +// to prove the CLI reads only trusted_issuers and ignores the rest (audience is +// derived from the data host origin; jwks is server-side). +const apiDiscoveryBody = `{ + "issuer": "https://us.auth.partial.to", + "trusted_issuers": ["https://us.auth.partial.to", "https://eu.auth.partial.to"], + "audience": "https://partial.to", + "jwks_uris": {"https://us.auth.partial.to": "https://us.auth.partial.to/.well-known/jwks.json"} +}` + +func TestDiscoverAPI(t *testing.T) { + t.Parallel() + + t.Run("parses the document on 200", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, APIPath, r.URL.Path) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(apiDiscoveryBody)) //nolint:errcheck // test handler + })) + defer srv.Close() + + doc, err := DiscoverAPI(t.Context(), "partial.to", hostPinningClient(t, srv), t.Logf) + require.NoError(t, err) + assert.Equal(t, []string{"https://us.auth.partial.to", "https://eu.auth.partial.to"}, doc.TrustedIssuers) + }) + + // 404 (deployment predating the well-known), 503 (unconfigured), transport + // failure, malformed body, and an incomplete document all fold into + // ErrDiscoveryUnavailable so the caller has a single fallback signal. 
+ t.Run("404 → ErrDiscoveryUnavailable", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "not found", http.StatusNotFound) + })) + defer srv.Close() + + _, err := DiscoverAPI(t.Context(), "partial.to", hostPinningClient(t, srv), t.Logf) + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) + + t.Run("503 → ErrDiscoveryUnavailable", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "not configured", http.StatusServiceUnavailable) + })) + defer srv.Close() + + _, err := DiscoverAPI(t.Context(), "partial.to", hostPinningClient(t, srv), t.Logf) + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) + + t.Run("transport error → ErrDiscoveryUnavailable", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {})) + client := hostPinningClient(t, srv) + srv.Close() + + _, err := DiscoverAPI(t.Context(), "partial.to", client, t.Logf) + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) + + t.Run("malformed JSON → ErrDiscoveryUnavailable", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`{not json`)) //nolint:errcheck // test handler + })) + defer srv.Close() + + _, err := DiscoverAPI(t.Context(), "partial.to", hostPinningClient(t, srv), t.Logf) + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) + + // A trust-root fetch must not follow a 3xx to another origin. The redirect + // target serves a perfectly valid document, so this test only passes if the + // redirect is genuinely refused (not merely erroring on a loop): following + // it would succeed and return the target's doc. 
+ t.Run("refuses cross-origin redirect → ErrDiscoveryUnavailable", func(t *testing.T) { + t.Parallel() + target := httptest.NewServer(apiHandler(t, "https://us.auth.partial.to")) + defer target.Close() + redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, target.URL+APIPath, http.StatusFound) + })) + defer redirector.Close() + + // schemeRewriteTransport rewrites the hard-coded https:// to http:// but + // leaves the host alone, so the redirect actually reaches `target` + // rather than being pinned back to `redirector`. + client := &http.Client{Transport: schemeRewriteTransport{base: http.DefaultTransport}} + host := strings.TrimPrefix(redirector.URL, "http://") + + doc, err := DiscoverAPI(t.Context(), host, client, t.Logf) + assert.Nil(t, doc, "must not return the redirect target's document") + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) + + t.Run("missing trusted_issuers → ErrDiscoveryUnavailable", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + // Only audience, no trusted_issuers — the one field the CLI needs. 
+ _, _ = w.Write([]byte(`{"audience":"https://partial.to"}`)) //nolint:errcheck // test handler + })) + defer srv.Close() + + _, err := DiscoverAPI(t.Context(), "partial.to", hostPinningClient(t, srv), t.Logf) + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) +} + +// apiHandler returns a handler that serves a JSON-encoded APIResponse listing +// trustedIssuers and asserts that every request path equals APIPath. +func apiHandler(t *testing.T, trustedIssuers ...string) http.HandlerFunc { + t.Helper() + doc := APIResponse{TrustedIssuers: trustedIssuers} + body, err := json.Marshal(doc) + require.NoError(t, err) + return func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, APIPath, r.URL.Path) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(body) //nolint:errcheck // test handler + } +} + +func TestResolveContextForAPI(t *testing.T) { + t.Parallel() + + t.Run("active context wins when eligible, returns the doc", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(apiHandler(t, "https://us.auth.partial.to", "https://eu.auth.partial.to")) + defer srv.Close() + + configDir := t.TempDir() + require.NoError(t, contexts.Save(configDir, &contexts.File{ + CurrentContext: "me@us-partial", + Contexts: []*contexts.Context{ + {Name: "me@prod", CoreURL: "https://us.auth.entire.io", Handle: "me", KeychainService: "kc:prod"}, + {Name: "me@us-partial", CoreURL: "https://us.auth.partial.to", Handle: "me", KeychainService: "kc:partial"}, + }, + })) + + c, err := ResolveContextForAPI(t.Context(), configDir, t.TempDir(), "partial.to", hostPinningClient(t, srv), t.Logf) + require.NoError(t, err) + assert.Equal(t, "me@us-partial", c.Name) + }) + + // The cross-core case the slice exists to fix: the active context is a prod + // login, but the only context eligible for the partial.to API is the + // staging one — pick it without the operator setting ENTIRE_AUTH_BASE_URL. 
+ t.Run("sole eligible context used despite unrelated active", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(apiHandler(t, "https://us.auth.partial.to", "https://eu.auth.partial.to")) + defer srv.Close() + + configDir := t.TempDir() + require.NoError(t, contexts.Save(configDir, &contexts.File{ + CurrentContext: "me@prod", + Contexts: []*contexts.Context{ + {Name: "me@prod", CoreURL: "https://us.auth.entire.io", Handle: "me", KeychainService: "kc:prod"}, + {Name: "me@staging", CoreURL: "https://eu.auth.partial.to", Handle: "me", KeychainService: "kc:staging"}, + }, + })) + + c, err := ResolveContextForAPI(t.Context(), configDir, t.TempDir(), "partial.to", hostPinningClient(t, srv), t.Logf) + require.NoError(t, err) + assert.Equal(t, "me@staging", c.Name) + }) + + t.Run("no eligible context → login hint naming the API host", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(apiHandler(t, "https://us.auth.partial.to")) + defer srv.Close() + + configDir := t.TempDir() + require.NoError(t, contexts.Save(configDir, &contexts.File{ + CurrentContext: "me@prod", + Contexts: []*contexts.Context{{Name: "me@prod", CoreURL: "https://us.auth.entire.io", Handle: "me", KeychainService: "kc:prod"}}, + })) + + _, err := ResolveContextForAPI(t.Context(), configDir, t.TempDir(), "partial.to", hostPinningClient(t, srv), t.Logf) + require.Error(t, err) + require.NotErrorIs(t, err, ErrDiscoveryUnavailable, "a reachable-but-unmatched API is a real login error, not a fallback case") + assert.Contains(t, err.Error(), "no auth context for API host partial.to") + }) + + t.Run("ambiguous eligible contexts → explicit-choice error", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(apiHandler(t, "https://us.auth.partial.to")) + defer srv.Close() + + configDir := t.TempDir() + require.NoError(t, contexts.Save(configDir, &contexts.File{ + CurrentContext: "me@prod", + Contexts: []*contexts.Context{ + {Name: "me@prod", CoreURL: 
"https://us.auth.entire.io", Handle: "me", KeychainService: "kc:prod"}, + {Name: "alice@partial", CoreURL: "https://us.auth.partial.to", Handle: "alice", KeychainService: "kc:a"}, + {Name: "bob@partial", CoreURL: "https://us.auth.partial.to", Handle: "bob", KeychainService: "kc:b"}, + }, + })) + + _, err := ResolveContextForAPI(t.Context(), configDir, t.TempDir(), "partial.to", hostPinningClient(t, srv), t.Logf) + require.Error(t, err) + assert.Contains(t, err.Error(), "multiple login contexts") + assert.Contains(t, err.Error(), "API host partial.to") + }) + + t.Run("unadvertised → ErrDiscoveryUnavailable for fallback", func(t *testing.T) { + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "not found", http.StatusNotFound) + })) + defer srv.Close() + + _, err := ResolveContextForAPI(t.Context(), t.TempDir(), t.TempDir(), "partial.to", hostPinningClient(t, srv), t.Logf) + assert.ErrorIs(t, err, ErrDiscoveryUnavailable) + }) +} + +// countingAPIHandler serves a fixed trust-root document and counts how many +// times /.well-known/entire-api.json is hit, so tests can assert cache hits vs +// live fetches. 
+func countingAPIHandler(t *testing.T, calls *int32, trustedIssuers ...string) http.HandlerFunc { + t.Helper() + doc := APIResponse{TrustedIssuers: trustedIssuers} + body, err := json.Marshal(doc) + require.NoError(t, err) + return func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(calls, 1) + assert.Equal(t, APIPath, r.URL.Path) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(body) //nolint:errcheck // test handler + } +} + +// partialContexts saves a contexts file into configDir that holds a single +// context, me@us-partial on https://us.auth.partial.to, and marks it as the +// current context. +func partialContexts(t *testing.T, configDir string) { + t.Helper() + require.NoError(t, contexts.Save(configDir, &contexts.File{ + CurrentContext: "me@us-partial", + Contexts: []*contexts.Context{ + {Name: "me@us-partial", CoreURL: "https://us.auth.partial.to", Handle: "me", KeychainService: "kc:partial"}, + }, + })) +} + +// TestResolveContextForAPI_CachedAcrossCalls: the first call fetches +// /.well-known/entire-api.json and caches it; the second is served from +// api_discovery.json with no network hit — symmetric with the cluster cache. +func TestResolveContextForAPI_CachedAcrossCalls(t *testing.T) { + t.Parallel() + var calls int32 + srv := httptest.NewServer(countingAPIHandler(t, &calls, "https://us.auth.partial.to")) + defer srv.Close() + + configDir := t.TempDir() + cacheDir := t.TempDir() + partialContexts(t, configDir) + + c, err := ResolveContextForAPI(t.Context(), configDir, cacheDir, "partial.to", hostPinningClient(t, srv), t.Logf) + require.NoError(t, err) + assert.Equal(t, "me@us-partial", c.Name) + require.Equal(t, int32(1), atomic.LoadInt32(&calls), "first call fetches /.well-known") + + c2, err := ResolveContextForAPI(t.Context(), configDir, cacheDir, "partial.to", hostPinningClient(t, srv), t.Logf) + require.NoError(t, err) + assert.Equal(t, "me@us-partial", c2.Name) + assert.Equal(t, int32(1), atomic.LoadInt32(&calls), "second call is served from the api-discovery cache") + + // The trusted issuers are persisted (in the cores cache, separate file). 
+ cache, err := discovery.LoadAPICores(cacheDir) + require.NoError(t, err) + urls, fresh, ok := cache.Get("partial.to") + require.True(t, ok) + assert.True(t, fresh) + assert.Equal(t, []string{"https://us.auth.partial.to"}, urls) +} + +// TestResolveContextForAPI_StaleFallbackOnFetchFailure: a present-but-stale +// cache entry is used when the live re-fetch fails, so a brief outage doesn't +// break the command — and crucially does NOT degrade to ErrDiscoveryUnavailable +// (which would drop the caller to static resolution). +func TestResolveContextForAPI_StaleFallbackOnFetchFailure(t *testing.T) { + t.Parallel() + // Server always 503s, so every live fetch fails. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "not configured", http.StatusServiceUnavailable) + })) + defer srv.Close() + + configDir := t.TempDir() + cacheDir := t.TempDir() + partialContexts(t, configDir) + + // Seed a stale entry (fetched longer ago than the TTL). 
+ require.NoError(t, discovery.ModifyAPICores(cacheDir, func(c discovery.ClusterCoresCache) error { + c["partial.to"] = &discovery.CoresEntry{ + CoreURLs: []string{"https://us.auth.partial.to"}, + FetchedAt: time.Now().Add(-discovery.ClusterCoresTTL - time.Hour), + } + return nil + })) + + c, err := ResolveContextForAPI(t.Context(), configDir, cacheDir, "partial.to", hostPinningClient(t, srv), t.Logf) + require.NoError(t, err, "stale cache entry should rescue a failed re-fetch") + require.NotErrorIs(t, err, ErrDiscoveryUnavailable) + assert.Equal(t, "me@us-partial", c.Name) +} diff --git a/internal/entireclient/clusterdiscovery/discovery.go b/internal/entireclient/clusterdiscovery/discovery.go index 3b9022baf..726a02a75 100644 --- a/internal/entireclient/clusterdiscovery/discovery.go +++ b/internal/entireclient/clusterdiscovery/discovery.go @@ -53,6 +53,60 @@ var ( ErrNoCoreURLs = errors.New("cluster advertises no trusted core URLs") ) +// statusError carries the HTTP status from a well-known fetch that +// returned a non-200, so each caller (cluster vs api) can map specific +// codes to its own sentinel (503 → not-configured, 404 → not-advertised) +// without the shared fetcher knowing either contract. +type statusError struct { + Code int + URL string +} + +func (e *statusError) Error() string { return fmt.Sprintf("HTTP %d from %s", e.Code, e.URL) } + +// fetchWellKnownJSON GETs https://host+path and decodes a 200 body into +// out. Transport failures are wrapped under ErrUnreachable; a non-200 +// returns a *statusError so the caller can branch on Code; a malformed +// 200 body returns a wrapped decode error. The scheme is hard-coded to +// https: the response is a trust root (which login servers to honour), +// so it must be TLS-authenticated — a plaintext fetch would let a +// network attacker advertise an attacker-controlled issuer. 
+func fetchWellKnownJSON(ctx context.Context, host, path string, c *http.Client, out any, debugf DebugFunc) error { + url := "https://" + host + path + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return fmt.Errorf("build discovery request: %w", err) + } + // Refuse redirects. This is a trust-root fetch — the response decides which + // login servers we honour — so a 3xx to another origin (or a plaintext + // downgrade) from a hostile/misconfigured host must not be followed. + // Shallow-copy the caller's client so we don't mutate its redirect policy + // (it's reused for other operations); the copy shares Transport/TLS config. + if c == nil { + c = http.DefaultClient + } + noRedirect := *c + noRedirect.CheckRedirect = func(*http.Request, []*http.Request) error { + return errors.New("discovery does not follow redirects (trust root)") + } + // A refused redirect makes Do return an error, so it takes the same + // ErrUnreachable path as a transport failure. debugf is invoked without a + // nil check throughout this function; callers must pass a non-nil func. + resp, err := noRedirect.Do(req) + if err != nil { + debugf("discovery: %v", err) + return fmt.Errorf("%w: %w", ErrUnreachable, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + debugf("discovery: HTTP %d from %s", resp.StatusCode, url) + return &statusError{Code: resp.StatusCode, URL: url} + } + if err := json.NewDecoder(resp.Body).Decode(out); err != nil { + debugf("discovery: decode: %v", err) + return fmt.Errorf("decode %s: %w", url, err) + } + return nil +} + // Discover fetches and parses the cluster's // /.well-known/entire-cluster.json. On success returns the parsed body // with a non-empty CoreURLs list. 
On failure returns one of the @@ -66,32 +120,17 @@ func Discover(ctx context.Context, clusterHost string, c *http.Client, debugf De if debugf == nil { debugf = func(string, ...any) {} } - url := "https://" + clusterHost + Path - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) - if err != nil { - return nil, fmt.Errorf("build discovery request: %w", err) - } - resp, err := c.Do(req) - if err != nil { - debugf("cluster discovery: %v", err) - return nil, fmt.Errorf("%w: %w", ErrUnreachable, err) - } - defer resp.Body.Close() - - if resp.StatusCode == http.StatusServiceUnavailable { - return nil, ErrNoIssuers - } - if resp.StatusCode != http.StatusOK { - debugf("cluster discovery: HTTP %d from %s", resp.StatusCode, url) - return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url) - } var body Response - if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { - debugf("cluster discovery: decode: %v", err) - return nil, fmt.Errorf("decode %s: %w", url, err) + err := fetchWellKnownJSON(ctx, clusterHost, Path, c, &body, debugf) + var se *statusError + switch { + case errors.As(err, &se) && se.Code == http.StatusServiceUnavailable: + return nil, ErrNoIssuers + case err != nil: + return nil, err } if len(body.CoreURLs) == 0 { - debugf("cluster discovery: no core_urls in response from %s", url) + debugf("cluster discovery: no core_urls in response from https://%s%s", clusterHost, Path) return nil, ErrNoCoreURLs } return &body, nil @@ -102,13 +141,20 @@ func Discover(ctx context.Context, clusterHost string, c *http.Client, debugf De // clusterHost. The output is stable (one indented URL per line) so // callers can pattern-match in tests. 
func RenderLoginHint(clusterHost string, coreURLs []string) string { + return renderLoginHint("cluster "+clusterHost, coreURLs) +} + +// renderLoginHint is the subject-agnostic form behind RenderLoginHint: +// subject is a noun phrase like "cluster nyc.entire.io" or "API host +// partial.to" so the same hint serves both the git-cluster and data-API +// resolvers. +func renderLoginHint(subject string, coreURLs []string) string { var b strings.Builder - fmt.Fprintf(&b, "no auth context for cluster %s. This cluster accepts logins from:\n", clusterHost) + fmt.Fprintf(&b, "no auth context for %s. Accepts logins from:\n", subject) for _, u := range coreURLs { fmt.Fprintf(&b, " %s\n", u) } - fmt.Fprint(&b, "\nAuthenticate against one of those login servers and re-run your command:\n"+ - " ENTIRE_AUTH_BASE_URL= entire login\n"+ - "or, if you already have a login there, switch to it with `entire auth use `.") + fmt.Fprint(&b, "\nLog in with `entire login`, then re-run your command.\n"+ + "If you already have a login on one of those servers, switch to it with `entire auth use `.") return b.String() } diff --git a/internal/entireclient/clusterdiscovery/discovery_test.go b/internal/entireclient/clusterdiscovery/discovery_test.go index a8aaa0fb5..446be9d65 100644 --- a/internal/entireclient/clusterdiscovery/discovery_test.go +++ b/internal/entireclient/clusterdiscovery/discovery_test.go @@ -132,5 +132,5 @@ func TestRenderLoginHint(t *testing.T) { assert.Contains(t, hint, "\n https://a.example\n", "missing indented URL line: %q", hint) assert.Contains(t, hint, "\n https://b.example\n", "missing indented URL line: %q", hint) assert.Contains(t, hint, "entire login") - assert.Contains(t, hint, "ENTIRE_AUTH_BASE_URL") + assert.Contains(t, hint, "entire auth use") } diff --git a/internal/entireclient/clusterdiscovery/resolve.go b/internal/entireclient/clusterdiscovery/resolve.go index 7b51756ba..f6762ac1d 100644 --- a/internal/entireclient/clusterdiscovery/resolve.go +++ 
b/internal/entireclient/clusterdiscovery/resolve.go @@ -63,7 +63,7 @@ func ResolveContextForCluster(ctx context.Context, configDir, cacheDir, clusterH return nil, err } - return selectContext(f, clusterHost, coreURLs, debugf) + return selectContext(f, "cluster "+clusterHost, coreURLs, debugf) } // ResolveClusterCores returns the trusted control-plane core URLs that @@ -124,16 +124,19 @@ func resolveClusterCores(ctx context.Context, cacheDir, clusterHost string, http return body.CoreURLs, nil } -// selectContext applies the account-selection rules over the cluster's -// advertised cores. See ResolveContextForCluster for the rationale. -func selectContext(f *contexts.File, clusterHost string, coreURLs []string, debugf DebugFunc) (*contexts.Context, error) { +// selectContext applies the account-selection rules over a resource's +// advertised trusted issuers. subject is a noun phrase identifying the +// resource ("cluster nyc.entire.io" / "API host partial.to") used in +// messages, so the same rules serve both the git-cluster and data-API +// resolvers. See ResolveContextForCluster for the rationale. +func selectContext(f *contexts.File, subject string, coreURLs []string, debugf DebugFunc) (*contexts.Context, error) { eligible := eligibleContexts(f, coreURLs) - // 1. Active context wins when it's eligible for this cluster. + // 1. Active context wins when it's eligible for this resource. if current := f.Find(f.CurrentContext); current != nil { for _, c := range eligible { if c.Name == current.Name { - debugf("cluster %s -> active context %s", clusterHost, current.Name) + debugf("%s -> active context %s", subject, current.Name) return current, nil } } @@ -142,12 +145,12 @@ func selectContext(f *contexts.File, clusterHost string, coreURLs []string, debu // 2. Otherwise the eligible set decides. 
switch len(eligible) { case 0: - return nil, errors.New(RenderLoginHint(clusterHost, coreURLs)) + return nil, errors.New(renderLoginHint(subject, coreURLs)) case 1: - debugf("cluster %s -> sole eligible context %s", clusterHost, eligible[0].Name) + debugf("%s -> sole eligible context %s", subject, eligible[0].Name) return eligible[0], nil default: - return nil, ambiguousContextError(clusterHost, eligible) + return nil, ambiguousContextError(subject, eligible) } } @@ -169,16 +172,16 @@ func eligibleContexts(f *contexts.File, coreURLs []string) []*contexts.Context { } // ambiguousContextError is returned when more than one local context could -// authenticate against the cluster and none is active. We refuse to guess — +// authenticate against the resource and none is active. We refuse to guess — // the user picks explicitly. Names are sorted so the message is stable. -func ambiguousContextError(clusterHost string, eligible []*contexts.Context) error { +func ambiguousContextError(subject string, eligible []*contexts.Context) error { names := make([]string, len(eligible)) for i, c := range eligible { names[i] = c.Name } sort.Strings(names) - return fmt.Errorf("multiple login contexts can authenticate against cluster %s (%s); choose one with `entire auth use ` and re-run", - clusterHost, strings.Join(names, ", ")) + return fmt.Errorf("multiple login contexts can authenticate against %s (%s); choose one with `entire auth use ` and re-run", + subject, strings.Join(names, ", ")) } // formatDiscoveryError turns a Discover error into the message diff --git a/internal/entireclient/clusterdiscovery/resolve_test.go b/internal/entireclient/clusterdiscovery/resolve_test.go index f96cb9b4c..af8ded052 100644 --- a/internal/entireclient/clusterdiscovery/resolve_test.go +++ b/internal/entireclient/clusterdiscovery/resolve_test.go @@ -121,7 +121,7 @@ func TestResolve_NoEligibleContextReturnsLoginHint(t *testing.T) { assert.Contains(t, err.Error(), "no auth context for cluster 
aws-eu-central-1.entire.io") assert.Contains(t, err.Error(), "https://eu.auth.entire.io") assert.Contains(t, err.Error(), "entire login") - assert.Contains(t, err.Error(), "ENTIRE_AUTH_BASE_URL") + assert.Contains(t, err.Error(), "entire auth use") } // TestResolve_CoresCachedAcrossCalls: the first call hits /.well-known and diff --git a/internal/entireclient/discovery/api_discovery_test.go b/internal/entireclient/discovery/api_discovery_test.go new file mode 100644 index 000000000..37a3c7488 --- /dev/null +++ b/internal/entireclient/discovery/api_discovery_test.go @@ -0,0 +1,42 @@ +package discovery + +import "testing" + +// TestAPICores_SeparateFileFromClusterCores: the API and cluster caches share a +// type + TTL but must live in distinct files, so a cluster host and an API host +// with the same name can't clobber each other. +func TestAPICores_SeparateFileFromClusterCores(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + if err := ModifyClusterCores(dir, func(c ClusterCoresCache) error { + c.Set("shared.example", []string{"https://cluster-core.example"}) + return nil + }); err != nil { + t.Fatalf("ModifyClusterCores: %v", err) + } + if err := ModifyAPICores(dir, func(c ClusterCoresCache) error { + c.Set("shared.example", []string{"https://api-core.example"}) + return nil + }); err != nil { + t.Fatalf("ModifyAPICores: %v", err) + } + + clusterCache, err := LoadClusterCores(dir) + if err != nil { + t.Fatalf("LoadClusterCores: %v", err) + } + apiCache, err := LoadAPICores(dir) + if err != nil { + t.Fatalf("LoadAPICores: %v", err) + } + + clusterURLs, _, ok := clusterCache.Get("shared.example") + if !ok || clusterURLs[0] != "https://cluster-core.example" { + t.Fatalf("cluster cache = %v (ok=%v), want the cluster core", clusterURLs, ok) + } + apiURLs, fresh, ok := apiCache.Get("shared.example") + if !ok || !fresh || apiURLs[0] != "https://api-core.example" { + t.Fatalf("api cache = %v (fresh=%v ok=%v), want the api core", apiURLs, fresh, ok) + } +} diff 
--git a/internal/entireclient/discovery/cluster_cores.go b/internal/entireclient/discovery/cluster_cores.go index 4879a0a99..c6d698c78 100644 --- a/internal/entireclient/discovery/cluster_cores.go +++ b/internal/entireclient/discovery/cluster_cores.go @@ -8,11 +8,17 @@ import ( const ( clusterCoresFileName = "cluster_cores.json" - // ClusterCoresTTL bounds how long a cached cluster→core_urls mapping is - // treated as fresh. Which control plane(s) front a data-plane cluster is - // near-static infra — once a cluster is homed to a core it stays — so a - // long TTL is fine. On expiry we re-fetch /.well-known and only fall back - // to the stale entry if that fetch fails. + // apiDiscoveryFileName caches a data-API host's trusted issuer URLs, the + // same shape a git cluster's cores take. Kept in a separate file from + // cluster_cores.json so a cluster host and an API host that happen to share + // a name can't collide on one cache key. + apiDiscoveryFileName = "api_discovery.json" + + // ClusterCoresTTL bounds how long a cached host→trusted-issuer-URLs mapping + // is treated as fresh, for both clusters and data APIs. Which login + // server(s) front a resource is near-static infra — once homed it stays — + // so a long TTL is fine. On expiry we re-fetch /.well-known and only fall + // back to the stale entry if that fetch fails. ClusterCoresTTL = 24 * time.Hour ) @@ -58,6 +64,25 @@ func writeClusterCoresNoLock(path string, cache ClusterCoresCache) error { return writeCacheFile(path, cache) } +// LoadAPICores / ModifyAPICores are the data-API siblings of +// LoadClusterCores / ModifyClusterCores: same ClusterCoresCache type and TTL, +// different cache file (api_discovery.json). A data API's advertised +// trusted_issuers ARE core URLs (the login servers whose JWTs it accepts), so +// the cluster cores cache fits verbatim — the audience the CLI exchanges for is +// the data host origin itself, derived at call time, never cached. 
+ +// LoadAPICores reads the api-host→trusted-issuer-URLs cache. Unlocked read; use +// ModifyAPICores for a read-modify-write sequence. +func LoadAPICores(cacheDir string) (ClusterCoresCache, error) { + return readClusterCoresNoLock(filepath.Join(cacheDir, apiDiscoveryFileName)) +} + +// ModifyAPICores atomically applies fn to the api-host→trusted-issuer-URLs +// cache under a single exclusive flock. +func ModifyAPICores(cacheDir string, fn func(ClusterCoresCache) error) error { + return modifyCacheFile(cacheDir, apiDiscoveryFileName, readClusterCoresNoLock, writeClusterCoresNoLock, fn) +} + // Get returns a cluster's cached core URLs, whether the entry is still fresh, // and whether it exists at all. A present-but-stale entry returns // (urls, false, true) so callers can attempt a re-fetch yet fall back to the