diff --git a/CLAUDE.md b/CLAUDE.md index 508815e2..f4a525f9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -391,6 +391,8 @@ See `docs/prerelease-builds.md` for download instructions. - Go 1.24 (toolchain go1.24.10) + Cobra (CLI), Chi router (HTTP), Zap (logging), mark3labs/mcp-go (MCP protocol) (020-oauth-login-feedback) - Go 1.24 (toolchain go1.24.10) + Cobra (CLI), Chi router (HTTP), Zap (logging), google/uuid (ID generation) (021-request-id-logging) - BBolt database (`~/.mcpproxy/config.db`) - activity log extended with request_id field (021-request-id-logging) +- Go 1.24 (toolchain go1.24.10) + mcp-go v0.43.1 (OAuth client), BBolt (storage), Prometheus (metrics), Zap (logging) (023-oauth-state-persistence) +- BBolt database (`~/.mcpproxy/config.db`) - `oauth_tokens` bucket with `OAuthTokenRecord` model (023-oauth-state-persistence) - Go 1.24 (toolchain go1.24.10) + TypeScript 5.x / Vue 3.5 + Cobra CLI, Chi router, BBolt storage, Zap logging, mark3labs/mcp-go, Vue 3, Tailwind CSS, DaisyUI (024-expand-activity-log) - BBolt database (`~/.mcpproxy/config.db`) - ActivityRecord model (024-expand-activity-log) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 32270c1e..9e30f59c 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -225,6 +225,7 @@ } ], "license": "MIT", + "peer": true, "engines": { "node": ">=18" }, @@ -248,6 +249,7 @@ } ], "license": "MIT", + "peer": true, "engines": { "node": ">=18" } @@ -1319,6 +1321,7 @@ "integrity": "sha512-4Z+L8I2OqhZV8qA132M4wNL30ypZGYOQVBfMgxDH/K5UX0PNqTu1c6za9ST5r9+tavvHiTWmBnKzpCJ/GlVFtg==", "dev": true, "license": "BSD-2-Clause", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "7.18.0", "@typescript-eslint/types": "7.18.0", @@ -1640,6 +1643,7 @@ "integrity": "sha512-izzd2zmnk8Nl5ECYkW27328RbQ1nKvkm6Bb5DAaz1Gk59EbLkiCMa6OLT0NoaAYTjOFS6N+SMYW1nh4/9ljPiw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@vitest/utils": "2.1.9", "fflate": "^0.8.2", @@ -1901,6 +1905,7 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2158,6 +2163,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "baseline-browser-mapping": "^2.8.3", "caniuse-lite": "^1.0.30001741", @@ -2292,6 +2298,7 @@ "resolved": "https://registry.npmjs.org/chart.js/-/chart.js-4.5.0.tgz", "integrity": "sha512-aYeC/jDgSEx8SHWZvANYMioYMZ2KX02W6f6uVfyteuCGcadDLcYVHdfdygsTQkQ4TKn5lghoojAsPj5pu0SnvQ==", "license": "MIT", + "peer": true, "dependencies": { "@kurkle/color": "^0.3.0" }, @@ -2835,6 +2842,7 @@ "deprecated": "This version is no longer supported. 
Please see https://eslint.org/version-support for other options.", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", @@ -2891,6 +2899,7 @@ "integrity": "sha512-174lJKuNsuDIlLpjeXc5E2Tss8P44uIimAfGD0b90k0NoirJqpG7stLuU9Vp/9ioTOrQdWVREc4mRd1BD+CvGw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.4.0", "globals": "^13.24.0", @@ -4135,6 +4144,7 @@ "resolved": "https://registry.npmjs.org/monaco-editor/-/monaco-editor-0.53.0.tgz", "integrity": "sha512-0WNThgC6CMWNXXBxTbaYYcunj08iB5rnx4/G56UOPeL9UVIUGGHA1GR0EWIh9Ebabj7NpCRawQ5b0hfN1jQmYQ==", "license": "MIT", + "peer": true, "dependencies": { "@types/trusted-types": "^1.0.6" } @@ -4557,6 +4567,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -5264,6 +5275,7 @@ "integrity": "sha512-w33E2aCvSDP0tW9RZuNXadXlkHXqFzSkQew/aIa2i/Sj8fThxwovwlXHSPXTbAHwEIhBFXAedUhP2tueAKP8Og==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@alloc/quick-lru": "^5.2.0", "arg": "^5.0.2", @@ -5410,6 +5422,7 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -5568,6 +5581,7 @@ "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==", "devOptional": true, "license": "Apache-2.0", + "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -5630,6 +5644,7 @@ "integrity": "sha512-j3lYzGC3P+B5Yfy/pfKNgVEg4+UtcIJcVRt2cDjIOmhLourAqPqf8P7acgxeiSgUB7E3p2P8/3gNIgDLpwzs4g==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", @@ -5713,6 +5728,7 @@ "integrity": "sha512-MSmPM9REYqDGBI8439mA4mWhV5sKmDlBKWIYbA3lRb2PTHACE0mgKwA8yQ2xq9vxDTuk4iPrECBAEW2aoFXY0Q==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@vitest/expect": "2.1.9", "@vitest/mocker": "2.1.9", @@ -5785,6 +5801,7 @@ "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.21.tgz", "integrity": "sha512-xxf9rum9KtOdwdRkiApWL+9hZEMWE90FHh8yS1+KJAiWYh+iGWV1FquPjoO9VUHQ+VIhsCXNNyZ5Sf4++RVZBA==", "license": "MIT", + "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.21", "@vue/compiler-sfc": "3.5.21", diff --git a/internal/cli/output/help_test.go b/internal/cli/output/help_test.go index a3cc5479..f95e0ce0 100644 --- a/internal/cli/output/help_test.go +++ b/internal/cli/output/help_test.go @@ -174,6 +174,7 @@ func TestExtractHelpInfo_Flags(t *testing.T) { if outputFlag == nil { t.Fatal("Expected to find 'output' flag") + return // unreachable but satisfies staticcheck SA5011 } if outputFlag.Shorthand != "o" { diff --git a/internal/health/calculator.go b/internal/health/calculator.go index 7fc8e1fb..9a63bc47 100644 --- a/internal/health/calculator.go +++ b/internal/health/calculator.go @@ -3,9 +3,26 @@ package health import ( "fmt" + "strings" "time" "github.com/smart-mcp-proxy/mcpproxy-go/internal/contracts" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/stringutil" +) + +// RefreshState represents the current state of token refresh for health reporting. +// Mirrors oauth.RefreshState for decoupling. +type RefreshState int + +const ( + // RefreshStateIdle means no refresh is pending or in progress. 
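+	// The numeric values intentionally mirror oauth.RefreshState (idle=0, scheduled=1,
+	// retrying=2, failed=3); TestRefreshStateSync guards this invariant.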
+ RefreshStateIdle RefreshState = iota + // RefreshStateScheduled means a proactive refresh is scheduled at 80% lifetime. + RefreshStateScheduled + // RefreshStateRetrying means refresh failed and is retrying with exponential backoff. + RefreshStateRetrying + // RefreshStateFailed means refresh permanently failed (e.g., invalid_grant). + RefreshStateFailed ) // HealthCalculatorInput contains all fields needed to calculate health status. @@ -36,6 +53,12 @@ type HealthCalculatorInput struct { // Tool info ToolCount int + + // Refresh state (for health status integration - Spec 023) + RefreshState RefreshState // Current refresh state from RefreshManager + RefreshRetryCount int // Number of retry attempts + RefreshLastError string // Human-readable error message + RefreshNextAttempt *time.Time // When next retry will occur } // HealthCalculatorConfig contains configurable thresholds for health calculation. @@ -104,7 +127,7 @@ func CalculateHealth(input HealthCalculatorInput, cfg *HealthCalculatorConfig) * // 4. Connection state checks // Normalize state to lowercase for consistent matching // (ConnectionState.String() returns "Error", "Disconnected", etc.) - state := toLower(input.State) + state := strings.ToLower(input.State) switch state { case "error": // For OAuth-required servers with OAuth-related errors, suggest login instead of restart @@ -219,7 +242,35 @@ func CalculateHealth(input HealthCalculatorInput, cfg *HealthCalculatorConfig) * } } - // 6. Healthy state - connected with valid authentication (if required) + // 6. Refresh state checks (Spec 023) + // Check if refresh is in a degraded or failed state + switch input.RefreshState { + case RefreshStateRetrying: + // Refresh failed but retrying - degraded status + detail := formatRefreshRetryDetail(input.RefreshRetryCount, input.RefreshNextAttempt, input.RefreshLastError) + return &contracts.HealthStatus{ + Level: LevelDegraded, + AdminState: StateEnabled, + Summary: "Token refresh pending", + Detail: detail, + Action: ActionViewLogs, + } + case RefreshStateFailed: + // Refresh permanently failed - unhealthy status + detail := "Re-authentication required" + if input.RefreshLastError != "" { + detail = fmt.Sprintf("Re-authentication required: %s", input.RefreshLastError) + } + return &contracts.HealthStatus{ + Level: LevelUnhealthy, + AdminState: StateEnabled, + Summary: "Refresh token expired", + Detail: detail, + Action: ActionLogin, + } + } + + // 7. Healthy state - connected with valid authentication (if required) return &contracts.HealthStatus{ Level: LevelHealthy, AdminState: StateEnabled, @@ -271,7 +322,7 @@ func formatErrorSummary(lastError string) string { // Check for known patterns (in order) for _, mapping := range errorMappings { - if containsIgnoreCase(lastError, mapping.pattern) { + if stringutil.ContainsIgnoreCase(lastError, mapping.pattern) { return mapping.friendly } } @@ -302,34 +353,28 @@ func formatExpiringTokenSummary(timeUntilExpiry time.Duration) string { return fmt.Sprintf("Token expiring in %dh", hours) } -// containsIgnoreCase checks if s contains substr, ignoring case. -func containsIgnoreCase(s, substr string) bool { - return len(s) >= len(substr) && - (s == substr || - containsLower(toLower(s), toLower(substr))) -} +// formatRefreshRetryDetail formats the detail message for a refresh retry state. +func formatRefreshRetryDetail(retryCount int, nextAttempt *time.Time, lastError string) string { + var detail string -// toLower is a simple ASCII lowercase conversion. 
-func toLower(s string) string { - b := make([]byte, len(s)) - for i := 0; i < len(s); i++ { - c := s[i] - if c >= 'A' && c <= 'Z' { - c += 'a' - 'A' - } - b[i] = c + // Start with retry count and next attempt time + if nextAttempt != nil && !nextAttempt.IsZero() { + detail = fmt.Sprintf("Refresh retry %d scheduled for %s", retryCount, nextAttempt.Format(time.RFC3339)) + } else { + detail = fmt.Sprintf("Refresh retry %d pending", retryCount) } - return string(b) -} -// containsLower checks if s contains substr (both should be lowercase). -func containsLower(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if s[i:i+len(substr)] == substr { - return true + // Add last error if available + if lastError != "" { + // Truncate error if too long + errorMsg := lastError + if len(errorMsg) > 100 { + errorMsg = errorMsg[:97] + "..." } + detail = fmt.Sprintf("%s: %s", detail, errorMsg) } - return false + + return detail } // isOAuthRelatedError checks if the error message indicates an OAuth issue. @@ -349,7 +394,7 @@ func isOAuthRelatedError(err string) bool { "access_denied", } for _, pattern := range oauthPatterns { - if containsIgnoreCase(err, pattern) { + if stringutil.ContainsIgnoreCase(err, pattern) { return true } } @@ -402,7 +447,7 @@ func ExtractOAuthConfigError(lastError string) string { } for _, pattern := range configPatterns { - if containsIgnoreCase(lastError, pattern) { + if stringutil.ContainsIgnoreCase(lastError, pattern) { return lastError } } diff --git a/internal/health/calculator_test.go b/internal/health/calculator_test.go index 5fe890c1..29c8a59b 100644 --- a/internal/health/calculator_test.go +++ b/internal/health/calculator_test.go @@ -5,6 +5,8 @@ import ( "time" "github.com/stretchr/testify/assert" + + "github.com/smart-mcp-proxy/mcpproxy-go/internal/oauth" ) func TestCalculateHealth_DisabledServer(t *testing.T) { @@ -555,3 +557,226 @@ func TestExtractOAuthConfigError(t *testing.T) { }) } } + +// T033: Test health status output per refresh state (Spec 023) +func TestCalculateHealth_RefreshStateRetrying(t *testing.T) { + nextAttempt := time.Now().Add(30 * time.Second) + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "connected", + Connected: true, + OAuthRequired: true, + OAuthStatus: "authenticated", + ToolCount: 5, + RefreshState: RefreshStateRetrying, + RefreshRetryCount: 3, + RefreshLastError: "connection timeout", + RefreshNextAttempt: &nextAttempt, + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, LevelDegraded, result.Level, "RefreshStateRetrying should return degraded") + assert.Equal(t, StateEnabled, result.AdminState) + assert.Equal(t, "Token refresh pending", result.Summary) + assert.Contains(t, result.Detail, "Refresh retry 3") + assert.Contains(t, result.Detail, "connection timeout") + assert.Equal(t, ActionViewLogs, result.Action, "Retrying should suggest view_logs action") +} + +func TestCalculateHealth_RefreshStateFailed(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "connected", + Connected: true, + OAuthRequired: true, + OAuthStatus: "authenticated", + ToolCount: 5, + RefreshState: RefreshStateFailed, + RefreshLastError: "invalid_grant: refresh token expired", + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, LevelUnhealthy, result.Level, "RefreshStateFailed should return unhealthy") + assert.Equal(t, StateEnabled, result.AdminState) + assert.Equal(t, "Refresh token expired", result.Summary) + assert.Contains(t, 
result.Detail, "Re-authentication required") + assert.Contains(t, result.Detail, "invalid_grant") + assert.Equal(t, ActionLogin, result.Action, "Failed should suggest login action") +} + +func TestCalculateHealth_RefreshStateFailedNoError(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "connected", + Connected: true, + OAuthRequired: true, + OAuthStatus: "authenticated", + RefreshState: RefreshStateFailed, + // No RefreshLastError + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, LevelUnhealthy, result.Level) + assert.Equal(t, "Refresh token expired", result.Summary) + assert.Equal(t, "Re-authentication required", result.Detail) + assert.Equal(t, ActionLogin, result.Action) +} + +func TestCalculateHealth_RefreshStateIdle(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "connected", + Connected: true, + OAuthRequired: true, + OAuthStatus: "authenticated", + ToolCount: 5, + RefreshState: RefreshStateIdle, // Idle state + } + + result := CalculateHealth(input, nil) + + // RefreshStateIdle should not affect health - server should be healthy + assert.Equal(t, LevelHealthy, result.Level) + assert.Equal(t, "Connected (5 tools)", result.Summary) + assert.Equal(t, ActionNone, result.Action) +} + +func TestCalculateHealth_RefreshStateScheduled(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "connected", + Connected: true, + OAuthRequired: true, + OAuthStatus: "authenticated", + ToolCount: 5, + RefreshState: RefreshStateScheduled, // Scheduled for proactive refresh + } + + result := CalculateHealth(input, nil) + + // RefreshStateScheduled should not affect health - server should be healthy + assert.Equal(t, LevelHealthy, result.Level) + assert.Equal(t, "Connected (5 tools)", result.Summary) + assert.Equal(t, ActionNone, result.Action) +} + +// Test that higher priority issues take precedence over refresh state +func TestCalculateHealth_RefreshStatePriority(t *testing.T) { + t.Run("disabled takes priority over refresh retrying", func(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: false, + RefreshState: RefreshStateRetrying, + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, StateDisabled, result.AdminState) + assert.Equal(t, ActionEnable, result.Action) + }) + + t.Run("quarantine takes priority over refresh failed", func(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + Quarantined: true, + RefreshState: RefreshStateFailed, + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, StateQuarantined, result.AdminState) + assert.Equal(t, ActionApprove, result.Action) + }) + + t.Run("connection error takes priority over refresh retrying", func(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "error", + LastError: "connection refused", + RefreshState: RefreshStateRetrying, + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, "Connection refused", result.Summary) + assert.Equal(t, ActionRestart, result.Action) + }) + + t.Run("OAuth expired takes priority over refresh retrying", func(t *testing.T) { + input := HealthCalculatorInput{ + Name: "test-server", + Enabled: true, + State: "connected", + OAuthRequired: true, + OAuthStatus: "expired", + RefreshState: RefreshStateRetrying, + } + + result := CalculateHealth(input, nil) + + assert.Equal(t, "Token expired", result.Summary) + 
assert.Equal(t, ActionLogin, result.Action) + }) +} + +// Test formatRefreshRetryDetail helper function +func TestFormatRefreshRetryDetail(t *testing.T) { + t.Run("with next attempt time and error", func(t *testing.T) { + nextAttempt := time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC) + result := formatRefreshRetryDetail(3, &nextAttempt, "network timeout") + + assert.Contains(t, result, "Refresh retry 3") + assert.Contains(t, result, "2024-01-15T10:30:00Z") + assert.Contains(t, result, "network timeout") + }) + + t.Run("without next attempt time", func(t *testing.T) { + result := formatRefreshRetryDetail(5, nil, "connection refused") + + assert.Contains(t, result, "Refresh retry 5 pending") + assert.Contains(t, result, "connection refused") + }) + + t.Run("without error", func(t *testing.T) { + nextAttempt := time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC) + result := formatRefreshRetryDetail(1, &nextAttempt, "") + + assert.Contains(t, result, "Refresh retry 1") + assert.Contains(t, result, "2024-01-15T10:30:00Z") + assert.NotContains(t, result, ": :") + }) + + t.Run("truncates long error", func(t *testing.T) { + longError := "This is a very long error message that exceeds 100 characters and should be truncated to prevent overly long detail messages in the health status" + result := formatRefreshRetryDetail(2, nil, longError) + + assert.Contains(t, result, "...") + assert.LessOrEqual(t, len(result), 200) // Reasonable max length + }) +} + +// TestRefreshStateSync ensures health.RefreshState values stay in sync with oauth.RefreshState. +// The health package mirrors oauth.RefreshState for decoupling, but the values must match +// for proper state mapping when wiring RefreshManager state into health calculation. +func TestRefreshStateSync(t *testing.T) { + // Verify that the integer values match between health and oauth packages + // This test will fail if either package changes its constants without updating the other + assert.Equal(t, int(RefreshStateIdle), int(oauth.RefreshStateIdle), + "RefreshStateIdle values must match between health and oauth packages") + assert.Equal(t, int(RefreshStateScheduled), int(oauth.RefreshStateScheduled), + "RefreshStateScheduled values must match between health and oauth packages") + assert.Equal(t, int(RefreshStateRetrying), int(oauth.RefreshStateRetrying), + "RefreshStateRetrying values must match between health and oauth packages") + assert.Equal(t, int(RefreshStateFailed), int(oauth.RefreshStateFailed), + "RefreshStateFailed values must match between health and oauth packages") +} diff --git a/internal/oauth/logging.go b/internal/oauth/logging.go index 9d959834..bebba7db 100644 --- a/internal/oauth/logging.go +++ b/internal/oauth/logging.go @@ -159,29 +159,78 @@ func LogTokenMetadata(logger *zap.Logger, metadata TokenMetadata) { ) } -// LogTokenRefreshAttempt logs a token refresh attempt. -func LogTokenRefreshAttempt(logger *zap.Logger, attempt int, maxAttempts int) { - logger.Info("Attempting OAuth token refresh", +// LogClientConnectionAttempt logs a client connection attempt (not an actual token refresh). +// Note: This is called when retrying client.Start(), which may trigger automatic +// token refresh internally by mcp-go, but we cannot observe whether refresh actually occurred. +func LogClientConnectionAttempt(logger *zap.Logger, attempt int, maxAttempts int) { + logger.Info("OAuth client connection attempt", zap.Int("attempt", attempt), zap.Int("max_attempts", maxAttempts), ) } -// LogTokenRefreshSuccess logs a successful token refresh. 
-func LogTokenRefreshSuccess(logger *zap.Logger, duration time.Duration) { - logger.Info("OAuth token refresh successful", +// LogClientConnectionSuccess logs a successful client connection. +// Note: This does NOT mean a token refresh occurred - it means the client connected. +// The mcp-go library may have used a cached token or performed automatic refresh internally. +func LogClientConnectionSuccess(logger *zap.Logger, duration time.Duration) { + logger.Info("OAuth client connection successful", zap.Duration("duration", duration), ) } -// LogTokenRefreshFailure logs a failed token refresh attempt. -func LogTokenRefreshFailure(logger *zap.Logger, attempt int, err error) { - logger.Warn("OAuth token refresh failed", +// LogClientConnectionFailure logs a failed client connection attempt. +func LogClientConnectionFailure(logger *zap.Logger, attempt int, err error) { + logger.Warn("OAuth client connection failed", zap.Int("attempt", attempt), zap.Error(err), ) } +// Deprecated: Use LogClientConnectionAttempt instead. +// LogTokenRefreshAttempt is kept for backward compatibility but is misleading. +func LogTokenRefreshAttempt(logger *zap.Logger, attempt int, maxAttempts int) { + LogClientConnectionAttempt(logger, attempt, maxAttempts) +} + +// Deprecated: Use LogClientConnectionSuccess instead. +// LogTokenRefreshSuccess is kept for backward compatibility but is misleading. +// This is called when client.Start() succeeds, not when a token refresh occurs. +func LogTokenRefreshSuccess(logger *zap.Logger, duration time.Duration) { + LogClientConnectionSuccess(logger, duration) +} + +// Deprecated: Use LogClientConnectionFailure instead. +// LogTokenRefreshFailure is kept for backward compatibility but is misleading. +func LogTokenRefreshFailure(logger *zap.Logger, attempt int, err error) { + LogClientConnectionFailure(logger, attempt, err) +} + +// LogActualTokenRefreshAttempt logs an actual proactive token refresh attempt. +// This is called by RefreshManager when it initiates a token refresh operation. +func LogActualTokenRefreshAttempt(logger *zap.Logger, serverName string, tokenAge time.Duration) { + logger.Info("OAuth token refresh attempt", + zap.String("server", serverName), + zap.Duration("token_age", tokenAge), + ) +} + +// LogActualTokenRefreshResult logs the result of an actual token refresh operation. +// This is called by RefreshManager after a refresh attempt completes. +func LogActualTokenRefreshResult(logger *zap.Logger, serverName string, success bool, duration time.Duration, err error) { + if success { + logger.Info("OAuth token refresh succeeded", + zap.String("server", serverName), + zap.Duration("duration", duration), + ) + } else { + logger.Warn("OAuth token refresh failed", + zap.String("server", serverName), + zap.Duration("duration", duration), + zap.Error(err), + ) + } +} + // LogOAuthFlowStart logs the start of an OAuth flow. 
 func LogOAuthFlowStart(logger *zap.Logger, serverName string, correlationID string) {
 	logger.Info("Starting OAuth flow",
diff --git a/internal/oauth/persistent_token_store.go b/internal/oauth/persistent_token_store.go
index e453b701..710d01de 100644
--- a/internal/oauth/persistent_token_store.go
+++ b/internal/oauth/persistent_token_store.go
@@ -54,7 +54,15 @@ func GenerateServerKey(serverName, serverURL string) string {
 	hashStr := hex.EncodeToString(hash[:])
 
 	// Return first 16 characters of hash for readability (still highly unique)
-	return fmt.Sprintf("%s_%s", serverName, hashStr[:16])
+	key := fmt.Sprintf("%s_%s", serverName, hashStr[:16])
+
+	// Log key generation for debugging server key mismatches
+	zap.L().Debug("Generated OAuth server key",
+		zap.String("server_name", serverName),
+		zap.String("server_url", serverURL),
+		zap.String("generated_key", key))
+
+	return key
 }
 
 // GetToken retrieves the OAuth token from persistent storage
@@ -67,11 +75,13 @@ func (p *PersistentTokenStore) GetToken(ctx context.Context) (*client.Token, err
 	}
 
 	p.logger.Debug("šŸ” Loading OAuth token from persistent storage",
+		zap.String("server_name", p.serverName),
 		zap.String("server_key", p.serverKey))
 
 	record, err := p.storage.GetOAuthToken(p.serverKey)
 	if err != nil {
 		p.logger.Debug("āŒ No stored OAuth token found",
+			zap.String("server_name", p.serverName),
 			zap.String("server_key", p.serverKey),
 			zap.Error(err))
 		return nil, transport.ErrNoToken
@@ -148,6 +158,19 @@ func (p *PersistentTokenStore) GetToken(ctx context.Context) (*client.Token, err
 		HasRefreshToken: record.RefreshToken != "",
 	})
 
+	// Warn if returning an expired token without a refresh token - mcp-go cannot refresh this
+	if isExpired && record.RefreshToken == "" {
+		p.logger.Warn("āš ļø Returning expired token WITHOUT refresh_token - refresh will fail",
+			zap.String("server_name", p.serverName),
+			zap.String("server_key", p.serverKey),
+			zap.Time("expired_at", record.ExpiresAt))
+	} else if record.RefreshToken == "" {
+		p.logger.Warn("āš ļø Token has no refresh_token - cannot be refreshed when it expires",
+			zap.String("server_name", p.serverName),
+			zap.String("server_key", p.serverKey),
+			zap.Time("expires_at", record.ExpiresAt))
+	}
+
 	// Return the token - mcp-go library will check IsExpired() and handle refresh if needed
 	// For long-lived tokens, we subtract the grace period from ExpiresAt to trigger refresh earlier
 	// For short-lived tokens, we use the actual expiration to avoid falsely marking them as expired
diff --git a/internal/oauth/refresh_manager.go b/internal/oauth/refresh_manager.go
index c98b1991..a49a0a8c 100644
--- a/internal/oauth/refresh_manager.go
+++ b/internal/oauth/refresh_manager.go
@@ -10,32 +10,83 @@ import (
 	"go.uber.org/zap"
 
 	"github.com/smart-mcp-proxy/mcpproxy-go/internal/storage"
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/stringutil"
 )
 
 // Default refresh configuration
 const (
 	// DefaultRefreshThreshold is the percentage of token lifetime at which proactive refresh triggers.
-	// 0.8 means refresh at 80% of lifetime (e.g., 30s token → refresh at 24s).
-	DefaultRefreshThreshold = 0.8
+	// Used in hybrid calculation: refresh at the EARLIER of (threshold * lifetime) or (expiry - MinRefreshBuffer).
+	// 0.75 means refresh at 75% of lifetime for long-lived tokens.
+	DefaultRefreshThreshold = 0.75
 
 	// DefaultMaxRetries is the maximum number of refresh attempts before giving up.
-	DefaultMaxRetries = 3
+	// Set to 0 for unlimited retries until token expiration (FR-009).
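+	// In practice the retry loop keeps going with capped exponential backoff until the
+	// token has been expired for longer than MaxExpiredTokenAge, or until a permanent
+	// invalid_grant error is seen (see handleRefreshFailure).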
+ DefaultMaxRetries = 0 // MinRefreshInterval prevents too-frequent refresh attempts. MinRefreshInterval = 5 * time.Second + // MinRefreshBuffer is the minimum time before expiration to schedule a refresh. + // This ensures adequate time for retries even with short-lived tokens. + // Industry best practice: Google and Microsoft recommend 5 minutes. + MinRefreshBuffer = 5 * time.Minute + // RetryBackoffBase is the base duration for exponential backoff on retry. - RetryBackoffBase = 2 * time.Second + // Per FR-008: minimum 10 seconds between refresh attempts per server. + RetryBackoffBase = 10 * time.Second + + // MaxRetryBackoff is the maximum backoff duration (5 minutes per FR-009). + MaxRetryBackoff = 5 * time.Minute + + // MaxExpiredTokenAge is how long after token expiration we continue retrying + // before giving up completely. After this duration, we assume the refresh token + // is no longer valid even if it wasn't explicitly rejected. + MaxExpiredTokenAge = 24 * time.Hour ) +// RefreshState represents the current state of token refresh for health reporting. +type RefreshState int + +const ( + // RefreshStateIdle means no refresh is pending or in progress. + RefreshStateIdle RefreshState = iota + // RefreshStateScheduled means a proactive refresh is scheduled (hybrid: 75% lifetime or 5min buffer). + RefreshStateScheduled + // RefreshStateRetrying means refresh failed and is retrying with exponential backoff. + RefreshStateRetrying + // RefreshStateFailed means refresh permanently failed (e.g., invalid_grant). + RefreshStateFailed +) + +// String returns the string representation of RefreshState. +func (s RefreshState) String() string { + switch s { + case RefreshStateIdle: + return "idle" + case RefreshStateScheduled: + return "scheduled" + case RefreshStateRetrying: + return "retrying" + case RefreshStateFailed: + return "failed" + default: + return "unknown" + } +} + // RefreshSchedule tracks the proactive refresh state for a single server. type RefreshSchedule struct { - ServerName string // Unique server identifier - ExpiresAt time.Time // When the current token expires - ScheduledRefresh time.Time // When proactive refresh is scheduled (80% of lifetime) - RetryCount int // Number of refresh retry attempts (0-3) - LastError string // Last refresh error message - Timer *time.Timer // Background timer for scheduled refresh + ServerName string // Unique server identifier + ExpiresAt time.Time // When the current token expires + ScheduledRefresh time.Time // When proactive refresh is scheduled (hybrid strategy) + RetryCount int // Number of refresh retry attempts + LastError string // Last refresh error message + Timer *time.Timer // Background timer for scheduled refresh + RetryBackoff time.Duration // Current backoff duration for retries + MaxBackoff time.Duration // Maximum backoff duration (5 minutes) + LastAttempt time.Time // Time of last refresh attempt + RefreshState RefreshState // Current state for health reporting } // RefreshTokenStore defines storage operations needed by RefreshManager. @@ -55,6 +106,16 @@ type RefreshEventEmitter interface { EmitOAuthRefreshFailed(serverName string, errorMsg string) } +// RefreshMetricsRecorder defines metrics recording methods for OAuth refresh operations. +// This interface decouples RefreshManager from the concrete MetricsManager. +type RefreshMetricsRecorder interface { + // RecordOAuthRefresh records an OAuth token refresh attempt. + // Result should be one of: "success", "failed_network", "failed_invalid_grant", "failed_other". 
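+	// Example (illustrative): RecordOAuthRefresh("github", "failed_network") after a
+	// refresh attempt that hit a connection error; callers derive the result string
+	// from classifyRefreshError.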
+ RecordOAuthRefresh(server, result string) + // RecordOAuthRefreshDuration records the duration of an OAuth token refresh attempt. + RecordOAuthRefreshDuration(server, result string, duration time.Duration) +} + // RefreshManagerConfig holds configuration for the RefreshManager. type RefreshManagerConfig struct { Threshold float64 // Percentage of lifetime at which to refresh (default: 0.8) @@ -63,18 +124,19 @@ type RefreshManagerConfig struct { // RefreshManager coordinates proactive OAuth token refresh across all servers. type RefreshManager struct { - storage RefreshTokenStore - coordinator *OAuthFlowCoordinator - runtime RefreshRuntimeOperations - eventEmitter RefreshEventEmitter - schedules map[string]*RefreshSchedule - threshold float64 - maxRetries int - mu sync.RWMutex - logger *zap.Logger - ctx context.Context - cancel context.CancelFunc - started bool + storage RefreshTokenStore + coordinator *OAuthFlowCoordinator + runtime RefreshRuntimeOperations + eventEmitter RefreshEventEmitter + metricsRecorder RefreshMetricsRecorder + schedules map[string]*RefreshSchedule + threshold float64 + maxRetries int + mu sync.RWMutex + logger *zap.Logger + ctx context.Context + cancel context.CancelFunc + started bool } // NewRefreshManager creates a new RefreshManager instance. @@ -121,13 +183,21 @@ func (m *RefreshManager) SetEventEmitter(emitter RefreshEventEmitter) { m.eventEmitter = emitter } +// SetMetricsRecorder sets the metrics recorder for Prometheus metrics. +// This enables FR-011: OAuth refresh metrics emission. +func (m *RefreshManager) SetMetricsRecorder(recorder RefreshMetricsRecorder) { + m.metricsRecorder = recorder +} + // Start initializes the refresh manager and loads existing tokens. -// It schedules proactive refresh for all non-expired tokens. +// For non-expired tokens, it schedules proactive refresh using hybrid strategy. +// For expired tokens with valid refresh tokens, it attempts immediate refresh. 
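+// Per-token decision at startup: not yet expired → schedule a proactive refresh;
+// expired but a refresh token is present → queue an immediate refresh; expired with
+// no refresh token → mark the schedule failed (re-authentication required).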
func (m *RefreshManager) Start(ctx context.Context) error { m.mu.Lock() defer m.mu.Unlock() if m.started { + m.logger.Debug("RefreshManager already started, skipping") return nil // Already started } @@ -135,32 +205,195 @@ func (m *RefreshManager) Start(ctx context.Context) error { m.ctx, m.cancel = context.WithCancel(ctx) m.started = true - m.logger.Info("Starting RefreshManager") + m.logger.Info("RefreshManager.Start() called") + + // Track startup refresh stats + var scheduled, immediateRefresh, expired int // Load existing tokens and schedule refreshes if m.storage != nil { + m.logger.Debug("Loading OAuth tokens from storage", + zap.Bool("storage_available", true)) + tokens, err := m.storage.ListOAuthTokens() if err != nil { - m.logger.Warn("Failed to load existing tokens", zap.Error(err)) + m.logger.Warn("Failed to load existing tokens", + zap.Error(err)) // Continue - we can still handle new tokens } else { + m.logger.Info("OAuth tokens retrieved from storage", + zap.Int("count", len(tokens))) + // Collect tokens that need immediate refresh (expired access token but valid refresh token) + var tokensToRefresh []string + for _, token := range tokens { - if token != nil && !token.ExpiresAt.IsZero() { - // Use GetServerName() which returns DisplayName if available, - // falling back to ServerName for backward compatibility - serverName := token.GetServerName() + if token == nil || token.ExpiresAt.IsZero() { + m.logger.Debug("Skipping nil or zero-expiry token") + continue + } + + serverName := token.GetServerName() + now := time.Now() + + // Log each token being processed for debugging + m.logger.Debug("Processing OAuth token", + zap.String("server", serverName), + zap.String("storage_key", token.ServerName), + zap.Time("expires_at", token.ExpiresAt), + zap.Bool("has_refresh_token", token.RefreshToken != ""), + zap.Bool("is_expired", token.ExpiresAt.Before(now)), + zap.Duration("time_until_expiry", token.ExpiresAt.Sub(now))) + + if token.ExpiresAt.After(now) { + // Token not expired - schedule proactive refresh using hybrid strategy + m.logger.Debug("Scheduling proactive refresh for non-expired token", + zap.String("server", serverName), + zap.Time("expires_at", token.ExpiresAt)) m.scheduleRefreshLocked(serverName, token.ExpiresAt) + scheduled++ + } else if token.RefreshToken != "" { + // Access token expired but has refresh token - queue for immediate refresh + tokenAge := now.Sub(token.ExpiresAt) + m.logger.Info("OAuth token refresh needed at startup", + zap.String("server", serverName), + zap.Duration("expired_for", tokenAge), + zap.Time("expired_at", token.ExpiresAt)) + + // Create schedule entry in retrying state + m.schedules[serverName] = &RefreshSchedule{ + ServerName: serverName, + ExpiresAt: token.ExpiresAt, + RefreshState: RefreshStateRetrying, + RetryBackoff: RetryBackoffBase, + MaxBackoff: MaxRetryBackoff, + } + + tokensToRefresh = append(tokensToRefresh, serverName) + immediateRefresh++ + } else { + // Both access and refresh tokens expired - needs re-authentication + m.logger.Warn("OAuth token fully expired at startup - re-authentication required", + zap.String("server", serverName), + zap.Time("expired_at", token.ExpiresAt)) + + // Create schedule entry in failed state + m.schedules[serverName] = &RefreshSchedule{ + ServerName: serverName, + ExpiresAt: token.ExpiresAt, + RefreshState: RefreshStateFailed, + LastError: "Token expired and no refresh token available", + } + expired++ } } + m.logger.Info("Loaded existing tokens", - zap.Int("count", len(tokens)), - 
zap.Int("scheduled", len(m.schedules))) + zap.Int("total", len(tokens)), + zap.Int("scheduled", scheduled), + zap.Int("immediate_refresh", immediateRefresh), + zap.Int("expired", expired)) + + // Execute immediate refreshes asynchronously (after releasing the lock) + if len(tokensToRefresh) > 0 { + m.logger.Info("Starting asynchronous refresh for expired tokens", + zap.Int("count", len(tokensToRefresh)), + zap.Strings("servers", tokensToRefresh)) + go m.executeStartupRefreshes(tokensToRefresh) + } } + } else { + m.logger.Warn("RefreshManager started with nil storage - token persistence disabled") } + m.logger.Info("RefreshManager startup complete", + zap.Int("schedules_created", len(m.schedules))) + return nil } +// executeStartupRefreshes attempts immediate refresh for expired tokens at startup. +// This runs asynchronously to not block Start(). +func (m *RefreshManager) executeStartupRefreshes(serverNames []string) { + for _, serverName := range serverNames { + // Check if context is cancelled + if m.ctx.Err() != nil { + return + } + + m.logger.Info("OAuth token refresh attempt at startup", + zap.String("server", serverName)) + + m.executeImmediateRefresh(serverName) + } +} + +// executeImmediateRefresh attempts an immediate token refresh for expired tokens. +// This is called at startup for tokens with expired access tokens but valid refresh tokens. +func (m *RefreshManager) executeImmediateRefresh(serverName string) { + m.mu.Lock() + schedule, ok := m.schedules[serverName] + if !ok { + m.mu.Unlock() + return + } + + // Check rate limiting + if m.isRateLimited(schedule) { + timeSince := time.Since(schedule.LastAttempt) + waitTime := RetryBackoffBase - timeSince + m.mu.Unlock() + + m.logger.Debug("OAuth token refresh rate limited", + zap.String("server", serverName), + zap.Duration("wait", waitTime)) + + // Reschedule after rate limit expires + m.rescheduleAfterDelay(serverName, waitTime) + return + } + + // Update last attempt time + schedule.LastAttempt = time.Now() + m.mu.Unlock() + + // Get token info for logging + var tokenAge time.Duration + if m.storage != nil { + if token, err := m.storage.GetOAuthToken(serverName); err == nil && token != nil { + tokenAge = time.Since(token.Updated) + } + } + + // Log the refresh attempt + LogActualTokenRefreshAttempt(m.logger, serverName, tokenAge) + + // Attempt refresh + startTime := time.Now() + var refreshErr error + if m.runtime != nil { + refreshErr = m.runtime.RefreshOAuthToken(serverName) + } else { + refreshErr = ErrRefreshFailed + } + duration := time.Since(startTime) + + // Log the result + LogActualTokenRefreshResult(m.logger, serverName, refreshErr == nil, duration, refreshErr) + + // Record metrics (T014: Emit metrics on refresh attempt) + if m.metricsRecorder != nil { + result := classifyRefreshError(refreshErr) + m.metricsRecorder.RecordOAuthRefresh(serverName, result) + m.metricsRecorder.RecordOAuthRefreshDuration(serverName, result, duration) + } + + if refreshErr != nil { + m.handleRefreshFailure(serverName, refreshErr) + } else { + m.handleRefreshSuccess(serverName) + } +} + // Stop cancels all scheduled refreshes and cleans up resources. func (m *RefreshManager) Stop() { m.mu.Lock() @@ -237,12 +470,53 @@ func (m *RefreshManager) GetScheduleCount() int { return len(m.schedules) } +// RefreshStateInfo contains refresh state information for health status reporting. 
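+// A caller wiring this into health reporting might copy the fields across as in this
+// illustrative sketch (assumes the mirrored health.RefreshState values; the exact
+// call site is not part of this change):
+//
+//	if info := refreshManager.GetRefreshState(name); info != nil {
+//		input.RefreshState = health.RefreshState(info.State)
+//		input.RefreshRetryCount = info.RetryCount
+//		input.RefreshLastError = info.LastError
+//		input.RefreshNextAttempt = info.NextAttempt
+//	}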
+type RefreshStateInfo struct { + State RefreshState // Current refresh state + RetryCount int // Number of retry attempts + LastError string // Last error message + NextAttempt *time.Time // When next refresh attempt is scheduled + ExpiresAt time.Time // When the token expires +} + +// GetRefreshState returns the current refresh state for a server. +// This is used by the health calculator to determine health status. +// Returns nil if no schedule exists for the server. +func (m *RefreshManager) GetRefreshState(serverName string) *RefreshStateInfo { + m.mu.RLock() + defer m.mu.RUnlock() + + schedule := m.schedules[serverName] + if schedule == nil { + return nil + } + + info := &RefreshStateInfo{ + State: schedule.RefreshState, + RetryCount: schedule.RetryCount, + LastError: schedule.LastError, + ExpiresAt: schedule.ExpiresAt, + } + + // Set next attempt time if scheduled + if !schedule.ScheduledRefresh.IsZero() { + info.NextAttempt = &schedule.ScheduledRefresh + } + + return info +} + // scheduleRefreshLocked schedules a proactive refresh for a token. // Must be called with m.mu held. +// +// Uses a hybrid refresh strategy (industry best practice): +// - Refresh at the EARLIER of: (threshold * lifetime) OR (expiry - MinRefreshBuffer) +// - This ensures short-lived tokens get adequate buffer time for retries +// - Long-lived tokens refresh at 75% of lifetime (e.g., 1-hour token → 45 min) +// - Short-lived tokens get at least 5 minutes buffer (e.g., 10-min token → 5 min) func (m *RefreshManager) scheduleRefreshLocked(serverName string, expiresAt time.Time) { now := time.Now() - // Calculate when to refresh (at threshold % of lifetime) lifetime := expiresAt.Sub(now) if lifetime <= 0 { m.logger.Debug("Token already expired, skipping schedule", @@ -251,17 +525,34 @@ func (m *RefreshManager) scheduleRefreshLocked(serverName string, expiresAt time return } - // Calculate refresh time at threshold of remaining lifetime - refreshDelay := time.Duration(float64(lifetime) * m.threshold) + // Hybrid refresh calculation: + // 1. Percentage-based: refresh at threshold% of lifetime (default 75%) + // 2. 
Buffer-based: refresh at (expiry - MinRefreshBuffer) for minimum safety margin + // Use the EARLIER of the two to ensure adequate time for retries + percentageDelay := time.Duration(float64(lifetime) * m.threshold) + bufferDelay := lifetime - MinRefreshBuffer + + // Choose the earlier refresh time (smaller delay) + var refreshDelay time.Duration + var strategy string + if bufferDelay > 0 && bufferDelay < percentageDelay { + // Buffer-based is earlier - use it for short-lived tokens + refreshDelay = bufferDelay + strategy = "buffer-based" + } else { + // Percentage-based is earlier or buffer would be negative + refreshDelay = percentageDelay + strategy = "percentage-based" + } - // Ensure minimum interval + // Ensure minimum interval (prevents hammering on very short tokens) if refreshDelay < MinRefreshInterval { refreshDelay = MinRefreshInterval } refreshAt := now.Add(refreshDelay) - // If refresh would be after expiration, schedule for just before expiration + // Final safety check: ensure we're not scheduling after expiration if refreshAt.After(expiresAt.Add(-MinRefreshInterval)) { refreshAt = expiresAt.Add(-MinRefreshInterval) refreshDelay = refreshAt.Sub(now) @@ -271,6 +562,7 @@ func (m *RefreshManager) scheduleRefreshLocked(serverName string, expiresAt time zap.Time("expires_at", expiresAt)) return } + strategy = "minimum-interval" } // Create or update schedule @@ -279,6 +571,8 @@ func (m *RefreshManager) scheduleRefreshLocked(serverName string, expiresAt time ExpiresAt: expiresAt, ScheduledRefresh: refreshAt, RetryCount: 0, + RefreshState: RefreshStateScheduled, + MaxBackoff: MaxRetryBackoff, } // Start timer @@ -288,11 +582,13 @@ func (m *RefreshManager) scheduleRefreshLocked(serverName string, expiresAt time m.schedules[serverName] = schedule - m.logger.Info("Scheduled proactive token refresh", + m.logger.Info("OAuth token refresh scheduled", zap.String("server", serverName), zap.Time("expires_at", expiresAt), zap.Time("refresh_at", refreshAt), zap.Duration("delay", refreshDelay), + zap.Duration("buffer", expiresAt.Sub(refreshAt)), + zap.String("strategy", strategy), zap.Float64("threshold", m.threshold)) } @@ -325,13 +621,22 @@ func (m *RefreshManager) executeRefresh(serverName string) { m.logger.Info("Executing proactive token refresh", zap.String("server", serverName)) - // Attempt refresh + // Attempt refresh with timing for metrics (T022: Emit refresh duration metric) + startTime := time.Now() var refreshErr error if m.runtime != nil { refreshErr = m.runtime.RefreshOAuthToken(serverName) } else { refreshErr = ErrRefreshFailed } + duration := time.Since(startTime) + + // Record metrics (T022: Emit refresh duration metric on each attempt) + if m.metricsRecorder != nil { + result := classifyRefreshError(refreshErr) + m.metricsRecorder.RecordOAuthRefresh(serverName, result) + m.metricsRecorder.RecordOAuthRefreshDuration(serverName, result, duration) + } if refreshErr != nil { m.handleRefreshFailure(serverName, refreshErr) @@ -347,10 +652,12 @@ func (m *RefreshManager) handleRefreshSuccess(serverName string) { if schedule != nil { schedule.RetryCount = 0 schedule.LastError = "" + schedule.RefreshState = RefreshStateIdle + schedule.RetryBackoff = 0 } m.mu.Unlock() - m.logger.Info("Proactive token refresh succeeded", + m.logger.Info("OAuth token refresh succeeded", zap.String("server", serverName)) // Get the new token expiration to emit event @@ -365,6 +672,7 @@ func (m *RefreshManager) handleRefreshSuccess(serverName string) { } // handleRefreshFailure handles a failed token 
refresh with exponential backoff retry. +// Per FR-009: Retries continue until token expiration (unlimited retries), not a fixed count. func (m *RefreshManager) handleRefreshFailure(serverName string, err error) { m.mu.Lock() schedule := m.schedules[serverName] @@ -375,38 +683,136 @@ func (m *RefreshManager) handleRefreshFailure(serverName string, err error) { schedule.RetryCount++ schedule.LastError = err.Error() + schedule.RefreshState = RefreshStateRetrying retryCount := schedule.RetryCount - maxRetries := m.maxRetries + expiresAt := schedule.ExpiresAt m.mu.Unlock() - m.logger.Warn("Proactive token refresh failed", + // Classify the error for metrics and handling + errorType := classifyRefreshError(err) + + m.logger.Warn("OAuth token refresh failed", zap.String("server", serverName), zap.Error(err), - zap.Int("retry_count", retryCount), - zap.Int("max_retries", maxRetries)) + zap.String("error_type", errorType), + zap.Int("retry_count", retryCount)) - if retryCount >= maxRetries { - // Max retries exceeded, emit failure event - m.logger.Error("Proactive token refresh failed after max retries", - zap.String("server", serverName), - zap.Int("retries", retryCount)) + // Check if this is a permanent failure (invalid_grant means refresh token is invalid/expired) + if errorType == "failed_invalid_grant" { + m.logger.Error("OAuth refresh token invalid - re-authentication required", + zap.String("server", serverName)) + + m.mu.Lock() + if schedule := m.schedules[serverName]; schedule != nil { + schedule.RefreshState = RefreshStateFailed + schedule.LastError = "Refresh token expired or revoked - re-authentication required" + } + m.mu.Unlock() if m.eventEmitter != nil { m.eventEmitter.EmitOAuthRefreshFailed(serverName, err.Error()) } - - // Clear the schedule - user will need to re-authenticate manually - m.mu.Lock() - delete(m.schedules, serverName) - m.mu.Unlock() return } - // Calculate backoff delay: base * 2^(retry-1) - backoff := RetryBackoffBase * time.Duration(1<<(retryCount-1)) + // Check if we should continue retrying (unlimited retries until token expiration per FR-009) + // Only stop if the access token has completely expired AND no more time remains + now := time.Now() + if !expiresAt.IsZero() && now.After(expiresAt) { + // Token has already expired - check if we should give up + // We'll keep trying as long as there's a chance the refresh token is still valid + // Only give up if we've been trying for too long (MaxExpiredTokenAge) + timeSinceExpiry := now.Sub(expiresAt) + if timeSinceExpiry > MaxExpiredTokenAge { + m.logger.Error("OAuth token refresh failed - token expired too long ago", + zap.String("server", serverName), + zap.Duration("expired_for", timeSinceExpiry), + zap.Int("retries", retryCount)) + + m.mu.Lock() + if schedule := m.schedules[serverName]; schedule != nil { + schedule.RefreshState = RefreshStateFailed + } + m.mu.Unlock() + + if m.eventEmitter != nil { + m.eventEmitter.EmitOAuthRefreshFailed(serverName, err.Error()) + } + return + } + } + + // Calculate backoff delay using exponential backoff with cap + backoff := m.calculateBackoff(retryCount - 1) // -1 because we just incremented m.rescheduleAfterDelay(serverName, backoff) } +// classifyRefreshError categorizes a refresh error for metrics and error handling. +// Returns one of: "failed_network", "failed_invalid_grant", "failed_other". 
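+// A nil error maps to "success". For example, "invalid_grant: refresh token expired"
+// is classified as "failed_invalid_grant", while "dial tcp: connection refused" is
+// classified as "failed_network".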
+func classifyRefreshError(err error) string {
+	if err == nil {
+		return "success"
+	}
+
+	errStr := err.Error()
+
+	// Check for permanent OAuth errors (refresh token invalid/expired)
+	permanentErrors := []string{
+		"invalid_grant",
+		"refresh token expired",
+		"refresh token revoked",
+		"refresh token invalid",
+	}
+	for _, pattern := range permanentErrors {
+		if stringutil.ContainsIgnoreCase(errStr, pattern) {
+			return "failed_invalid_grant"
+		}
+	}
+
+	// Check for network-related errors (retryable)
+	networkErrors := []string{
+		"timeout",
+		"connection refused",
+		"connection reset",
+		"no such host",
+		"dial tcp",
+		"network",
+		"EOF",
+		"context deadline exceeded",
+	}
+	for _, pattern := range networkErrors {
+		if stringutil.ContainsIgnoreCase(errStr, pattern) {
+			return "failed_network"
+		}
+	}
+
+	return "failed_other"
+}
+
+// calculateBackoff calculates the exponential backoff duration for a given retry count.
+// The formula is: base * 2^retryCount, capped at MaxRetryBackoff (5 minutes).
+// Sequence: 10s → 20s → 40s → 80s → 160s → 300s (cap).
+func (m *RefreshManager) calculateBackoff(retryCount int) time.Duration {
+	if retryCount < 0 {
+		retryCount = 0
+	}
+	backoff := RetryBackoffBase * time.Duration(1<<retryCount)
+	if backoff > MaxRetryBackoff {
+		backoff = MaxRetryBackoff
+	}
+	return backoff
+}
+
+// isRateLimited checks if a refresh attempt would violate the rate limit.
+// Per FR-008: minimum 10 seconds between refresh attempts per server.
+func (m *RefreshManager) isRateLimited(schedule *RefreshSchedule) bool {
+	if schedule == nil || schedule.LastAttempt.IsZero() {
+		return false
+	}
+	timeSinceLastAttempt := time.Since(schedule.LastAttempt)
+	return timeSinceLastAttempt < RetryBackoffBase
+}
+
 // rescheduleAfterDelay reschedules a refresh attempt after a delay.
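+// On failure, handleRefreshFailure passes in the delay from calculateBackoff; for
+// example, after the fourth consecutive failure calculateBackoff(3) yields
+// 10s * 2^3 = 80s (capped at MaxRetryBackoff).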
func (m *RefreshManager) rescheduleAfterDelay(serverName string, delay time.Duration) { m.mu.Lock() @@ -422,13 +828,19 @@ func (m *RefreshManager) rescheduleAfterDelay(serverName string, delay time.Dura schedule.Timer.Stop() } + // Update schedule with next refresh time + schedule.ScheduledRefresh = time.Now().Add(delay) + schedule.RetryBackoff = delay + // Start new timer schedule.Timer = time.AfterFunc(delay, func() { m.executeRefresh(serverName) }) - m.logger.Info("Rescheduled token refresh", + m.logger.Info("OAuth token refresh retry scheduled", zap.String("server", serverName), zap.Duration("delay", delay), - zap.Int("retry_count", schedule.RetryCount)) + zap.Time("next_attempt", schedule.ScheduledRefresh), + zap.Int("retry_count", schedule.RetryCount), + zap.String("refresh_state", schedule.RefreshState.String())) } diff --git a/internal/oauth/refresh_manager_test.go b/internal/oauth/refresh_manager_test.go index a1506c93..c79fbd8d 100644 --- a/internal/oauth/refresh_manager_test.go +++ b/internal/oauth/refresh_manager_test.go @@ -135,12 +135,14 @@ func (m *mockEventEmitter) GetFailedEvents() int { return len(m.failedEvents) } -// T012: Test RefreshManager scheduleRefresh at 80% lifetime -func TestRefreshManager_ScheduleAt80PercentLifetime(t *testing.T) { +// T012: Test RefreshManager hybrid refresh strategy +// Uses the EARLIER of: 75% of lifetime OR (expiry - 5 minutes buffer) +func TestRefreshManager_HybridRefreshStrategy(t *testing.T) { logger := zaptest.NewLogger(t) store := newMockTokenStore() - // Token that expires in 1 hour (well above MinRefreshInterval of 5s) + // Token that expires in 1 hour (well above MinRefreshBuffer of 5 minutes) + // For 1-hour token: 75% = 45 min, buffer = 55 min, so percentage wins (45 min) expiresAt := time.Now().Add(1 * time.Hour) store.AddToken(&storage.OAuthTokenRecord{ ServerName: "test-server", @@ -160,17 +162,54 @@ func TestRefreshManager_ScheduleAt80PercentLifetime(t *testing.T) { schedule := manager.GetSchedule("test-server") require.NotNil(t, schedule, "Schedule should be created") - // Verify refresh is scheduled at approximately 80% of lifetime - // With 1 hour token, 80% = 48 minutes, so refresh should be around 48 minutes from now + // Verify refresh is scheduled using hybrid strategy + // With 1 hour token: 75% = 45 min (percentage-based wins over 55 min buffer-based) assert.Equal(t, "test-server", schedule.ServerName) assert.Equal(t, expiresAt, schedule.ExpiresAt) - // Verify scheduled time is approximately at 80% threshold - expectedRefreshDelay := time.Duration(float64(1*time.Hour) * 0.8) // 48 minutes + // Verify scheduled time is approximately at 75% threshold (hybrid chooses earlier time) + expectedRefreshDelay := time.Duration(float64(1*time.Hour) * DefaultRefreshThreshold) // 45 minutes actualRefreshDelay := time.Until(schedule.ScheduledRefresh) // Allow 10 second tolerance for test timing assert.InDelta(t, expectedRefreshDelay.Seconds(), actualRefreshDelay.Seconds(), 10.0, - "Refresh should be scheduled at ~80%% of lifetime") + "Refresh should be scheduled at ~75%% of lifetime for long-lived tokens") +} + +// T012b: Test buffer-based strategy for short-lived tokens +// For tokens where 75% would leave less than 5 minutes buffer, use 5-minute buffer instead +func TestRefreshManager_BufferBasedStrategy(t *testing.T) { + logger := zaptest.NewLogger(t) + store := newMockTokenStore() + + // Token that expires in 10 minutes + // For 10-min token: 75% = 7.5 min (2.5 min buffer), buffer-based = 5 min (5 min buffer) + // Buffer-based 
wins because 5 min < 7.5 min delay + expiresAt := time.Now().Add(10 * time.Minute) + store.AddToken(&storage.OAuthTokenRecord{ + ServerName: "test-server", + ExpiresAt: expiresAt, + }) + + manager := NewRefreshManager(store, nil, nil, logger) + runtime := &mockRuntime{} + manager.SetRuntime(runtime) + + ctx := context.Background() + err := manager.Start(ctx) + require.NoError(t, err) + defer manager.Stop() + + // Verify schedule was created + schedule := manager.GetSchedule("test-server") + require.NotNil(t, schedule, "Schedule should be created") + + // Verify refresh is scheduled using buffer-based strategy (5 minutes before expiry) + // Buffer = expiresAt - refreshAt should be ~5 minutes + actualBuffer := schedule.ExpiresAt.Sub(schedule.ScheduledRefresh) + expectedBuffer := MinRefreshBuffer // 5 minutes + // Allow 10 second tolerance for test timing + assert.InDelta(t, expectedBuffer.Seconds(), actualBuffer.Seconds(), 10.0, + "Short-lived tokens should use 5-minute buffer strategy") } // T013: Test RefreshManager retry with exponential backoff @@ -215,20 +254,20 @@ func TestRefreshManager_RetryWithExponentialBackoff(t *testing.T) { } } -// T014: Test RefreshManager stops after max retries -func TestRefreshManager_StopOnMaxRetries(t *testing.T) { +// T014: Test RefreshManager stops on permanent failure (invalid_grant) +func TestRefreshManager_StopOnPermanentFailure(t *testing.T) { logger := zaptest.NewLogger(t) store := newMockTokenStore() emitter := &mockEventEmitter{} config := &RefreshManagerConfig{ - Threshold: 0.1, - MaxRetries: 1, // Only 1 retry for faster test + Threshold: 0.1, } manager := NewRefreshManager(store, nil, config, logger) + // Use invalid_grant error which should be classified as permanent runtime := &mockRuntime{ - refreshErr: errors.New("permanent failure"), + refreshErr: errors.New("invalid_grant: refresh token expired"), } manager.SetRuntime(runtime) manager.SetEventEmitter(emitter) @@ -238,26 +277,27 @@ func TestRefreshManager_StopOnMaxRetries(t *testing.T) { require.NoError(t, err) defer manager.Stop() - // Manually trigger a refresh that will fail + // Manually trigger a refresh that will fail permanently expiresAt := time.Now().Add(10 * time.Second) manager.OnTokenSaved("test-server", expiresAt) // Give time for the schedule to be set up time.Sleep(50 * time.Millisecond) - // Directly call executeRefresh to test max retry behavior + // Directly call executeRefresh to test permanent failure behavior manager.executeRefresh("test-server") - // After max retries, schedule should be removed + // Wait for processing time.Sleep(50 * time.Millisecond) - // Trigger another refresh to exceed max retries - manager.executeRefresh("test-server") - - time.Sleep(50 * time.Millisecond) + // Check that failure event was emitted for permanent failure + assert.Equal(t, 1, emitter.GetFailedEvents(), "Should emit failure event for invalid_grant error") - // Check that failure event was emitted - assert.GreaterOrEqual(t, emitter.GetFailedEvents(), 1, "Should emit failure event after max retries") + // Schedule should have RefreshStateFailed set + schedule := manager.GetSchedule("test-server") + if schedule != nil { + assert.Equal(t, RefreshStateFailed, schedule.RefreshState, "State should be failed for permanent error") + } } // T015: Test RefreshManager coordination with OAuthFlowCoordinator @@ -450,15 +490,16 @@ func TestRefreshManager_SuccessEmitsEvent(t *testing.T) { assert.Equal(t, 1, emitter.GetRefreshedEvents(), "Should emit refreshed event on success") } -// Test that 
expired tokens are not scheduled -func TestRefreshManager_ExpiredTokenNotScheduled(t *testing.T) { +// Test that expired tokens without refresh token are marked as failed +func TestRefreshManager_ExpiredTokenWithoutRefreshToken(t *testing.T) { logger := zaptest.NewLogger(t) store := newMockTokenStore() - // Token that already expired + // Token that already expired with no refresh token store.AddToken(&storage.OAuthTokenRecord{ - ServerName: "expired-server", - ExpiresAt: time.Now().Add(-1 * time.Hour), // Already expired + ServerName: "expired-server", + ExpiresAt: time.Now().Add(-1 * time.Hour), // Already expired + RefreshToken: "", // No refresh token }) manager := NewRefreshManager(store, nil, nil, logger) @@ -468,6 +509,150 @@ func TestRefreshManager_ExpiredTokenNotScheduled(t *testing.T) { require.NoError(t, err) defer manager.Stop() - // Should not schedule refresh for expired token - assert.Equal(t, 0, manager.GetScheduleCount(), "Expired tokens should not be scheduled") + // Should have a schedule in failed state (not scheduled for refresh) + schedule := manager.GetSchedule("expired-server") + require.NotNil(t, schedule, "Should have a schedule entry for tracking state") + assert.Equal(t, RefreshStateFailed, schedule.RefreshState, "Expired tokens without refresh token should be in failed state") +} + +// Test that expired tokens with refresh token are queued for immediate refresh +func TestRefreshManager_ExpiredTokenWithRefreshToken(t *testing.T) { + logger := zaptest.NewLogger(t) + store := newMockTokenStore() + + // Token that already expired but has refresh token + store.AddToken(&storage.OAuthTokenRecord{ + ServerName: "expired-server", + ExpiresAt: time.Now().Add(-1 * time.Hour), // Already expired + RefreshToken: "valid-refresh-token", // Has refresh token + }) + + runtime := &mockRuntime{ + refreshErr: errors.New("network error"), + } + + manager := NewRefreshManager(store, nil, nil, logger) + manager.SetRuntime(runtime) + + ctx := context.Background() + err := manager.Start(ctx) + require.NoError(t, err) + defer manager.Stop() + + // Wait for async startup refresh to execute + time.Sleep(100 * time.Millisecond) + + // Should have attempted refresh + calls := runtime.GetRefreshCalls() + assert.Len(t, calls, 1, "Should attempt immediate refresh for expired token with refresh token") + + // Should be in retrying state (not failed since error is retryable) + schedule := manager.GetSchedule("expired-server") + require.NotNil(t, schedule, "Should have a schedule entry") + assert.Equal(t, RefreshStateRetrying, schedule.RefreshState, "Should be retrying after network error") +} + +// T023: Test exponential backoff sequence (10s, 20s, 40s, 80s, 160s, 300s cap) +func TestRefreshManager_ExponentialBackoffSequence(t *testing.T) { + logger := zaptest.NewLogger(t) + manager := NewRefreshManager(nil, nil, nil, logger) + + // Test the backoff sequence: 10s -> 20s -> 40s -> 80s -> 160s -> 300s (cap) + expectedBackoffs := []time.Duration{ + 10 * time.Second, // retry 0: 10s * 2^0 = 10s + 20 * time.Second, // retry 1: 10s * 2^1 = 20s + 40 * time.Second, // retry 2: 10s * 2^2 = 40s + 80 * time.Second, // retry 3: 10s * 2^3 = 80s + 160 * time.Second, // retry 4: 10s * 2^4 = 160s + 300 * time.Second, // retry 5: 10s * 2^5 = 320s, capped to 300s + 300 * time.Second, // retry 6: still capped at 300s + 300 * time.Second, // retry 7: still capped at 300s + } + + for i, expected := range expectedBackoffs { + actual := manager.calculateBackoff(i) + assert.Equal(t, expected, actual, "Backoff at retry 
%d should be %v, got %v", i, expected, actual) + } + + // Verify constants match expected values + assert.Equal(t, 10*time.Second, RetryBackoffBase, "RetryBackoffBase should be 10s") + assert.Equal(t, 5*time.Minute, MaxRetryBackoff, "MaxRetryBackoff should be 5min (300s)") +} + +// T024: Test unlimited retries until token expiration +func TestRefreshManager_UnlimitedRetriesUntilExpiration(t *testing.T) { + logger := zaptest.NewLogger(t) + store := newMockTokenStore() + emitter := &mockEventEmitter{} + + config := &RefreshManagerConfig{ + Threshold: 0.1, + } + + manager := NewRefreshManager(store, nil, config, logger) + // Use a network error which is retryable (not permanent like invalid_grant) + runtime := &mockRuntime{ + refreshErr: errors.New("connection timeout"), + } + manager.SetRuntime(runtime) + manager.SetEventEmitter(emitter) + + ctx := context.Background() + err := manager.Start(ctx) + require.NoError(t, err) + defer manager.Stop() + + // Create a token that expires far in the future + expiresAt := time.Now().Add(1 * time.Hour) + manager.OnTokenSaved("test-server", expiresAt) + + // Give time for the schedule to be set up + time.Sleep(50 * time.Millisecond) + + // Simulate multiple refresh failures + for i := 0; i < 5; i++ { + manager.executeRefresh("test-server") + time.Sleep(10 * time.Millisecond) + } + + // Verify that: + // 1. Schedule still exists (not cleared due to max retries) + schedule := manager.GetSchedule("test-server") + require.NotNil(t, schedule, "Schedule should still exist for retryable errors") + + // 2. State is RefreshStateRetrying (not failed) + assert.Equal(t, RefreshStateRetrying, schedule.RefreshState, "State should be retrying for network errors") + + // 3. No failure events emitted (only permanent failures emit events) + assert.Equal(t, 0, emitter.GetFailedEvents(), "No failure events for retryable network errors") + + // 4. 
Retry count should have increased + assert.GreaterOrEqual(t, schedule.RetryCount, 5, "Retry count should increase with each failure") +} + +// Test error classification for metrics +func TestClassifyRefreshError(t *testing.T) { + tests := []struct { + name string + err error + expected string + }{ + {"nil error", nil, "success"}, + {"invalid_grant", errors.New("invalid_grant: refresh token expired"), "failed_invalid_grant"}, + {"refresh token expired", errors.New("refresh token expired"), "failed_invalid_grant"}, + {"connection timeout", errors.New("connection timeout"), "failed_network"}, + {"connection refused", errors.New("connection refused"), "failed_network"}, + {"dial tcp error", errors.New("dial tcp: no such host"), "failed_network"}, + {"EOF error", errors.New("unexpected EOF"), "failed_network"}, + {"context deadline exceeded", errors.New("context deadline exceeded"), "failed_network"}, + {"generic error", errors.New("unknown server error"), "failed_other"}, + {"server_error", errors.New("OAuth server_error"), "failed_other"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := classifyRefreshError(tt.err) + assert.Equal(t, tt.expected, result, "Error classification mismatch for: %v", tt.err) + }) + } } diff --git a/internal/observability/metrics.go b/internal/observability/metrics.go index c634820d..4c56e4e4 100644 --- a/internal/observability/metrics.go +++ b/internal/observability/metrics.go @@ -39,6 +39,10 @@ type MetricsManager struct { actorConnectLatency *prometheus.HistogramVec actorRetries *prometheus.CounterVec actorFailures *prometheus.CounterVec + + // OAuth refresh metrics (Spec 023) + oauthRefreshTotal *prometheus.CounterVec + oauthRefreshDuration *prometheus.HistogramVec } // NewMetricsManager creates a new metrics manager @@ -199,6 +203,24 @@ func (mm *MetricsManager) initMetrics() { }, []string{"server", "error_type"}, ) + + // OAuth refresh metrics (Spec 023) + mm.oauthRefreshTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "mcpproxy_oauth_refresh_total", + Help: "Total number of OAuth token refresh attempts", + }, + []string{"server", "result"}, + ) + + mm.oauthRefreshDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "mcpproxy_oauth_refresh_duration_seconds", + Help: "OAuth token refresh duration in seconds", + Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30}, + }, + []string{"server", "result"}, + ) } // registerMetrics registers all metrics with the registry @@ -224,6 +246,9 @@ func (mm *MetricsManager) registerMetrics() { mm.actorConnectLatency, mm.actorRetries, mm.actorFailures, + // OAuth refresh metrics (Spec 023) + mm.oauthRefreshTotal, + mm.oauthRefreshDuration, ) // Also register Go runtime metrics @@ -361,3 +386,17 @@ func (mm *MetricsManager) RecordActorRetry(server string) { func (mm *MetricsManager) RecordActorFailure(server, errorType string) { mm.actorFailures.WithLabelValues(server, errorType).Inc() } + +// OAuth refresh metrics (Spec 023) + +// RecordOAuthRefresh records an OAuth token refresh attempt. +// Result should be one of: "success", "failed_network", "failed_invalid_grant", "failed_other". +func (mm *MetricsManager) RecordOAuthRefresh(server, result string) { + mm.oauthRefreshTotal.WithLabelValues(server, result).Inc() +} + +// RecordOAuthRefreshDuration records the duration of an OAuth token refresh attempt. +// Result should be one of: "success", "failed_network", "failed_invalid_grant", "failed_other". 
+func (mm *MetricsManager) RecordOAuthRefreshDuration(server, result string, duration time.Duration) { + mm.oauthRefreshDuration.WithLabelValues(server, result).Observe(duration.Seconds()) +} diff --git a/internal/runtime/lifecycle.go b/internal/runtime/lifecycle.go index 8779be76..7dc74c95 100644 --- a/internal/runtime/lifecycle.go +++ b/internal/runtime/lifecycle.go @@ -42,6 +42,10 @@ func (r *Runtime) StartBackgroundInitialization() { // Register token saved callback to wire PersistentTokenStore -> RefreshManager oauth.GetTokenStoreManager().SetTokenSavedCallback(func(serverName string, expiresAt time.Time) { + r.logger.Info("OnTokenSaved callback fired - scheduling proactive refresh", + zap.String("server", serverName), + zap.Time("expires_at", expiresAt), + zap.Duration("valid_for", time.Until(expiresAt))) r.refreshManager.OnTokenSaved(serverName, expiresAt) }) r.logger.Info("Token saved callback registered for proactive refresh") diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go index ff7dee73..3762f574 100644 --- a/internal/runtime/runtime.go +++ b/internal/runtime/runtime.go @@ -1474,6 +1474,20 @@ func (r *Runtime) GetManagementService() interface{} { return r.managementService } +// SetRefreshMetricsRecorder sets the metrics recorder for OAuth token refresh operations. +// This enables FR-011: OAuth refresh metrics emission. +func (r *Runtime) SetRefreshMetricsRecorder(recorder oauth.RefreshMetricsRecorder) { + if r.refreshManager != nil { + r.refreshManager.SetMetricsRecorder(recorder) + } +} + +// RefreshManager returns the OAuth refresh manager for health status integration. +// Returns nil if refresh manager hasn't been initialized. +func (r *Runtime) RefreshManager() *oauth.RefreshManager { + return r.refreshManager +} + // EmitServersChanged implements the EventEmitter interface for the management service. // This delegates to the runtime's internal event emission mechanism. 
func (r *Runtime) EmitServersChanged(reason string, extra map[string]any) { @@ -1700,6 +1714,16 @@ func (r *Runtime) GetAllServers() ([]map[string]interface{}, error) { healthInput.TokenExpiresAt = &tokenExpiresAt } + // T032: Wire refresh state into health calculation (Spec 023) + if r.refreshManager != nil { + if refreshState := r.refreshManager.GetRefreshState(serverStatus.Name); refreshState != nil { + healthInput.RefreshState = health.RefreshState(refreshState.State) + healthInput.RefreshRetryCount = refreshState.RetryCount + healthInput.RefreshLastError = refreshState.LastError + healthInput.RefreshNextAttempt = refreshState.NextAttempt + } + } + healthStatus := health.CalculateHealth(healthInput, healthConfig) serverMap["health"] = healthStatus diff --git a/internal/server/mcp.go b/internal/server/mcp.go index 96040cfc..56a7f71d 100644 --- a/internal/server/mcp.go +++ b/internal/server/mcp.go @@ -2096,6 +2096,19 @@ func (p *MCPProxyServer) handleListUpstreams(_ context.Context) (*mcp.CallToolRe MissingSecret: health.ExtractMissingSecret(lastError), OAuthConfigErr: health.ExtractOAuthConfigError(lastError), } + + // T032: Wire refresh state into health calculation (Spec 023) + if p.mainServer != nil && p.mainServer.runtime != nil { + if refreshMgr := p.mainServer.runtime.RefreshManager(); refreshMgr != nil { + if refreshState := refreshMgr.GetRefreshState(server.Name); refreshState != nil { + healthInput.RefreshState = health.RefreshState(refreshState.State) + healthInput.RefreshRetryCount = refreshState.RetryCount + healthInput.RefreshLastError = refreshState.LastError + healthInput.RefreshNextAttempt = refreshState.NextAttempt + } + } + } + serverMap["health"] = health.CalculateHealth(healthInput, health.DefaultHealthConfig()) // Add Docker isolation information diff --git a/internal/server/server.go b/internal/server/server.go index 7765bb05..d077f0e6 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -24,6 +24,7 @@ import ( "github.com/smart-mcp-proxy/mcpproxy-go/internal/httpapi" "github.com/smart-mcp-proxy/mcpproxy-go/internal/logs" "github.com/smart-mcp-proxy/mcpproxy-go/internal/management" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/observability" "github.com/smart-mcp-proxy/mcpproxy-go/internal/runtime" "github.com/smart-mcp-proxy/mcpproxy-go/internal/secret" "github.com/smart-mcp-proxy/mcpproxy-go/internal/storage" @@ -86,6 +87,17 @@ func NewServerWithConfigPath(cfg *config.Config, configPath string, logger *zap. 
// This must happen before StartBackgroundInitialization is called rt.SetVersion(httpapi.GetBuildVersion()) + // Initialize observability manager for metrics (FR-011: OAuth refresh metrics) + obsConfig := observability.DefaultConfig("mcpproxy", httpapi.GetBuildVersion()) + obsManager, err := observability.NewManager(logger.Sugar(), &obsConfig) + if err != nil { + logger.Warn("Failed to create observability manager, metrics will be disabled", zap.Error(err)) + } else if obsManager.Metrics() != nil { + // Wire up metrics recorder to RefreshManager for OAuth refresh metrics + rt.SetRefreshMetricsRecorder(obsManager.Metrics()) + logger.Info("OAuth refresh metrics enabled") + } + // Initialize management service and set it on runtime secretResolver := secret.NewResolver() mgmtService := management.NewService( @@ -665,6 +677,16 @@ func (s *Server) GetAllServers() ([]map[string]interface{}, error) { healthInput.OAuthRequired = true } + // T032: Wire refresh state into health calculation (Spec 023) + if refreshMgr := s.runtime.RefreshManager(); refreshMgr != nil { + if refreshState := refreshMgr.GetRefreshState(serverStatus.Name); refreshState != nil { + healthInput.RefreshState = health.RefreshState(refreshState.State) + healthInput.RefreshRetryCount = refreshState.RetryCount + healthInput.RefreshLastError = refreshState.LastError + healthInput.RefreshNextAttempt = refreshState.NextAttempt + } + } + healthStatus := health.CalculateHealth(healthInput, health.DefaultHealthConfig()) result = append(result, map[string]interface{}{ diff --git a/internal/stringutil/strings.go b/internal/stringutil/strings.go new file mode 100644 index 00000000..2739daef --- /dev/null +++ b/internal/stringutil/strings.go @@ -0,0 +1,9 @@ +// Package stringutil provides common string utility functions. +package stringutil + +import "strings" + +// ContainsIgnoreCase checks if s contains substr, ignoring case. 
+func ContainsIgnoreCase(s, substr string) bool { + return strings.Contains(strings.ToLower(s), strings.ToLower(substr)) +} diff --git a/internal/stringutil/strings_test.go b/internal/stringutil/strings_test.go new file mode 100644 index 00000000..2a9e1689 --- /dev/null +++ b/internal/stringutil/strings_test.go @@ -0,0 +1,35 @@ +package stringutil + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestContainsIgnoreCase(t *testing.T) { + tests := []struct { + name string + s string + substr string + expected bool + }{ + {"exact match", "hello", "hello", true}, + {"case insensitive match", "Hello World", "hello", true}, + {"case insensitive match upper", "hello world", "WORLD", true}, + {"mixed case", "HeLLo WoRLD", "ello wor", true}, + {"no match", "hello", "goodbye", false}, + {"empty substr", "hello", "", true}, + {"empty string", "", "hello", false}, + {"both empty", "", "", true}, + {"substr longer than string", "hi", "hello", false}, + {"special chars", "error: invalid_grant", "INVALID_GRANT", true}, + {"network error", "connection timeout", "TIMEOUT", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ContainsIgnoreCase(tt.s, tt.substr) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/internal/tray/managers.go b/internal/tray/managers.go index 35be972a..9cd8b6d2 100644 --- a/internal/tray/managers.go +++ b/internal/tray/managers.go @@ -930,6 +930,43 @@ func (m *MenuManager) updateServerActionMenus(serverName string, server map[stri zap.String("server", serverName), zap.String("action", enableText)) } + + // Update OAuth/restart visibility based on current health.action + healthAction := "" + if healthData, ok := server["health"].(map[string]interface{}); ok { + healthAction, _ = healthData["action"].(string) + } + + oauthItem, hasOAuth := m.serverOAuthItems[serverName] + restartItem, hasRestart := m.serverRestartItems[serverName] + + if hasOAuth && hasRestart { + switch healthAction { + case "login": + oauthItem.Show() + restartItem.Hide() + m.logger.Debug("Showing OAuth login, hiding restart", + zap.String("server", serverName), + zap.String("health.action", healthAction)) + case "restart": + oauthItem.Hide() + restartItem.Show() + restartItem.SetTitle("āš ļø Restart Required") + restartItem.SetTooltip(fmt.Sprintf("Restart server %s to fix issues", serverName)) + m.logger.Debug("Showing restart required, hiding OAuth", + zap.String("server", serverName), + zap.String("health.action", healthAction)) + default: + // No specific action or other actions - show standard restart, hide OAuth + oauthItem.Hide() + restartItem.Show() + restartItem.SetTitle("šŸ”„ Restart") + restartItem.SetTooltip(fmt.Sprintf("Restart server %s", serverName)) + m.logger.Debug("Showing standard restart, hiding OAuth", + zap.String("server", serverName), + zap.String("health.action", healthAction)) + } + } } // SynchronizationManager coordinates between state manager and menu manager diff --git a/internal/upstream/core/connection.go b/internal/upstream/core/connection.go index 1bda66a3..0fb4b23b 100644 --- a/internal/upstream/core/connection.go +++ b/internal/upstream/core/connection.go @@ -1243,11 +1243,11 @@ func (c *Client) tryOAuthAuth(ctx context.Context) error { if hasRefreshToken { backoff := refreshConfig.InitialBackoff for attempt := 1; attempt <= refreshConfig.MaxAttempts; attempt++ { - oauth.LogTokenRefreshAttempt(c.logger, attempt, refreshConfig.MaxAttempts) + oauth.LogClientConnectionAttempt(c.logger, attempt, 
refreshConfig.MaxAttempts) err = c.client.Start(ctx) if err == nil { - oauth.LogTokenRefreshSuccess(c.logger, time.Duration(attempt)*backoff) + oauth.LogClientConnectionSuccess(c.logger, time.Duration(attempt)*backoff) lastErr = nil break } @@ -1261,7 +1261,7 @@ func (c *Client) tryOAuthAuth(ctx context.Context) error { return oauthErr } - oauth.LogTokenRefreshFailure(c.logger, attempt, err) + oauth.LogClientConnectionFailure(c.logger, attempt, err) lastErr = err // Don't sleep on the last attempt @@ -1299,7 +1299,7 @@ func (c *Client) tryOAuthAuth(ctx context.Context) error { return fmt.Errorf("OAuth token exists in storage, retry connection to use it: %w", lastErr) } - c.logger.Info("šŸŽÆ OAuth authorization required after refresh attempts - starting manual OAuth flow", + c.logger.Info("šŸŽÆ OAuth authorization required after connection attempts - starting manual OAuth flow", zap.String("server", c.config.Name), zap.Bool("had_refresh_token", hasRefreshToken)) @@ -1723,11 +1723,11 @@ func (c *Client) trySSEOAuthAuth(ctx context.Context) error { if hasRefreshToken { backoff := refreshConfig.InitialBackoff for attempt := 1; attempt <= refreshConfig.MaxAttempts; attempt++ { - oauth.LogTokenRefreshAttempt(c.logger, attempt, refreshConfig.MaxAttempts) + oauth.LogClientConnectionAttempt(c.logger, attempt, refreshConfig.MaxAttempts) err = c.client.Start(persistentCtx) if err == nil { - oauth.LogTokenRefreshSuccess(c.logger, time.Duration(attempt)*backoff) + oauth.LogClientConnectionSuccess(c.logger, time.Duration(attempt)*backoff) lastErr = nil break } @@ -1741,7 +1741,7 @@ func (c *Client) trySSEOAuthAuth(ctx context.Context) error { return oauthErr } - oauth.LogTokenRefreshFailure(c.logger, attempt, err) + oauth.LogClientConnectionFailure(c.logger, attempt, err) lastErr = err // Don't sleep on the last attempt @@ -1779,7 +1779,7 @@ func (c *Client) trySSEOAuthAuth(ctx context.Context) error { return fmt.Errorf("OAuth token exists in storage, retry connection to use it: %w", lastErr) } - c.logger.Info("šŸŽÆ SSE OAuth authorization required after refresh attempts - starting manual OAuth flow", + c.logger.Info("šŸŽÆ SSE OAuth authorization required after connection attempts - starting manual OAuth flow", zap.String("server", c.config.Name), zap.Bool("had_refresh_token", hasRefreshToken)) diff --git a/internal/upstream/manager.go b/internal/upstream/manager.go index c8777748..1c69e4e9 100644 --- a/internal/upstream/manager.go +++ b/internal/upstream/manager.go @@ -2346,7 +2346,10 @@ func (m *Manager) SetUserLoggedOut(serverName string, loggedOut bool) error { // RefreshOAuthToken triggers a token refresh for a specific server. // This is called by the RefreshManager for proactive token refresh. -// TODO: This will be fully implemented in Phase 3 (US1) with RefreshManager integration. +// +// OAuth detection checks both: +// 1. Static config (serverConfig.OAuth) - for servers with pre-configured OAuth +// 2. 
Database tokens - for servers using dynamic OAuth discovery (Protected Resource Metadata) func (m *Manager) RefreshOAuthToken(serverName string) error { m.mu.RLock() client, exists := m.clients[serverName] @@ -2356,18 +2359,40 @@ func (m *Manager) RefreshOAuthToken(serverName string) error { return fmt.Errorf("server not found: %s", serverName) } - // Get the server config to check if it uses OAuth + // Check if server uses OAuth via either static config or dynamic discovery serverConfig := client.GetConfig() - if serverConfig == nil || serverConfig.OAuth == nil { + hasStaticOAuth := serverConfig != nil && serverConfig.OAuth != nil + + // Also check for OAuth tokens in the database (dynamic OAuth discovery) + // Servers like atlassian-remote, slack discover OAuth requirements at runtime + // via Protected Resource Metadata and store tokens without OAuth in static config + hasStoredTokens := false + if m.storage != nil && serverConfig != nil { + // IMPORTANT: Use GenerateServerKey to match how PersistentTokenStore stores tokens. + // Tokens are stored with key = SHA256(serverName|serverURL)[:16], not just serverName. + serverKey := oauth.GenerateServerKey(serverName, serverConfig.URL) + token, err := m.storage.GetOAuthToken(serverKey) + if err == nil && token != nil && token.RefreshToken != "" { + hasStoredTokens = true + m.logger.Debug("Found OAuth token in database for dynamic OAuth server", + zap.String("server", serverName), + zap.String("server_key", serverKey), + zap.Bool("has_refresh_token", token.RefreshToken != "")) + } + } + + if !hasStaticOAuth && !hasStoredTokens { return fmt.Errorf("server does not use OAuth: %s", serverName) } - // TODO: Implement actual token refresh in Phase 3 - // For now, force a reconnection which will trigger token refresh if needed - client.ForceReconnect("manual_token_refresh") + // Force a reconnection which will trigger token refresh via mcp-go's + // automatic token refresh when TokenStore provides a refresh token + client.ForceReconnect("oauth_token_refresh") m.logger.Info("OAuth token refresh requested", - zap.String("server", serverName)) + zap.String("server", serverName), + zap.Bool("has_static_oauth", hasStaticOAuth), + zap.Bool("has_stored_tokens", hasStoredTokens)) return nil } diff --git a/internal/upstream/manager_oauth_test.go b/internal/upstream/manager_oauth_test.go new file mode 100644 index 00000000..c935574b --- /dev/null +++ b/internal/upstream/manager_oauth_test.go @@ -0,0 +1,178 @@ +package upstream + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/smart-mcp-proxy/mcpproxy-go/internal/config" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/oauth" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/secret" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/storage" + "github.com/smart-mcp-proxy/mcpproxy-go/internal/upstream/managed" +) + +// TestRefreshOAuthToken_DynamicOAuthDiscovery tests that RefreshOAuthToken works +// for servers that use dynamic OAuth discovery (no OAuth in static config). +// +// Bug: The current implementation checks serverConfig.OAuth which is nil for +// servers that discover OAuth via Protected Resource Metadata at runtime. +// These servers have OAuth tokens stored in the database but not in their config. 
+// +// Related: spec 023-oauth-state-persistence +func TestRefreshOAuthToken_DynamicOAuthDiscovery(t *testing.T) { + logger := zap.NewNop() + sugaredLogger := logger.Sugar() + + // Create a server config WITHOUT OAuth block (simulates dynamic OAuth discovery) + // This is how servers like atlassian-remote, slack work - they discover OAuth + // requirements at runtime via Protected Resource Metadata + serverConfig := &config.ServerConfig{ + Name: "test-dynamic-oauth", + URL: "https://example.com/mcp", + Protocol: "http", + Enabled: true, + Created: time.Now(), + // NOTE: No OAuth field set - this is the key part of the test + // OAuth was discovered at runtime, not configured statically + } + + // Create an in-memory storage with OAuth tokens for this server + // This simulates a server that authenticated via dynamic OAuth discovery + tempDir := t.TempDir() + db, err := storage.NewBoltDB(tempDir, sugaredLogger) + require.NoError(t, err) + defer db.Close() + + // Generate the server key using the same function as PersistentTokenStore + // This is critical - tokens are stored with key = hash(name|url), not just name + serverKey := oauth.GenerateServerKey(serverConfig.Name, serverConfig.URL) + + // Store an OAuth token for the server (as if it had authenticated previously) + // The ServerName field is used as the storage key (must match GenerateServerKey output) + token := &storage.OAuthTokenRecord{ + ServerName: serverKey, // Key used for storage lookup (hash-based) + DisplayName: "test-dynamic-oauth", // Human-readable name for RefreshManager + AccessToken: "expired-access-token", + RefreshToken: "valid-refresh-token", + TokenType: "Bearer", + ExpiresAt: time.Now().Add(-1 * time.Hour), // Expired + Created: time.Now().Add(-2 * time.Hour), + Updated: time.Now().Add(-1 * time.Hour), + } + err = db.SaveOAuthToken(token) + require.NoError(t, err) + + // Verify token was saved with the correct key + savedToken, err := db.GetOAuthToken(serverKey) + require.NoError(t, err) + require.NotNil(t, savedToken, "Token should be saved in database with server_key") + assert.Equal(t, "valid-refresh-token", savedToken.RefreshToken) + + // Create the manager with a client for this server + manager := &Manager{ + clients: make(map[string]*managed.Client), + logger: logger, + storage: db, + secretResolver: secret.NewResolver(), + } + + // Create a managed client for the server + client, err := managed.NewClient( + "test-dynamic-oauth", + serverConfig, + logger, + nil, // logConfig + &config.Config{}, // globalConfig + db, // bolt storage + secret.NewResolver(), + ) + require.NoError(t, err) + manager.clients["test-dynamic-oauth"] = client + + // Attempt to refresh the OAuth token + // BUG: This currently fails with "server does not use OAuth: test-dynamic-oauth" + // because it checks serverConfig.OAuth which is nil + err = manager.RefreshOAuthToken("test-dynamic-oauth") + + // The refresh should NOT fail with "server does not use OAuth" + // It should either: + // 1. Successfully trigger a token refresh, or + // 2. Fail with a different error (network, invalid token, etc.) + if err != nil { + assert.NotContains(t, err.Error(), "server does not use OAuth", + "RefreshOAuthToken should not fail just because OAuth is not in static config. "+ + "The server has OAuth tokens in the database from dynamic discovery.") + } +} + +// TestRefreshOAuthToken_StaticOAuthConfig tests the happy path where OAuth +// is configured statically in the server config. 
+func TestRefreshOAuthToken_StaticOAuthConfig(t *testing.T) { + logger := zap.NewNop() + sugaredLogger := logger.Sugar() + + // Create a server config WITH OAuth block (traditional static config) + serverConfig := &config.ServerConfig{ + Name: "test-static-oauth", + URL: "https://example.com/mcp", + Protocol: "http", + Enabled: true, + Created: time.Now(), + OAuth: &config.OAuthConfig{ + ClientID: "test-client-id", + Scopes: []string{"read", "write"}, + }, + } + + tempDir := t.TempDir() + db, err := storage.NewBoltDB(tempDir, sugaredLogger) + require.NoError(t, err) + defer db.Close() + + manager := &Manager{ + clients: make(map[string]*managed.Client), + logger: logger, + storage: db, + secretResolver: secret.NewResolver(), + } + + client, err := managed.NewClient( + "test-static-oauth", + serverConfig, + logger, + nil, + &config.Config{}, + db, + secret.NewResolver(), + ) + require.NoError(t, err) + manager.clients["test-static-oauth"] = client + + // This should not fail with "server does not use OAuth" + // It may fail with connection errors, but that's expected in a unit test + err = manager.RefreshOAuthToken("test-static-oauth") + + // Should not fail with the OAuth detection error + if err != nil { + assert.NotContains(t, err.Error(), "server does not use OAuth") + } +} + +// TestRefreshOAuthToken_ServerNotFound tests that non-existent servers return proper error. +func TestRefreshOAuthToken_ServerNotFound(t *testing.T) { + logger := zap.NewNop() + + manager := &Manager{ + clients: make(map[string]*managed.Client), + logger: logger, + } + + err := manager.RefreshOAuthToken("non-existent-server") + + require.Error(t, err) + assert.Contains(t, err.Error(), "server not found") +} diff --git a/specs/023-oauth-state-persistence/contracts/health-status-changes.md b/specs/023-oauth-state-persistence/contracts/health-status-changes.md new file mode 100644 index 00000000..b24b6b40 --- /dev/null +++ b/specs/023-oauth-state-persistence/contracts/health-status-changes.md @@ -0,0 +1,124 @@ +# Contract Changes: Health Status Extension + +**Feature**: 023-oauth-state-persistence +**Date**: 2026-01-12 + +## Overview + +This feature does NOT add new REST API endpoints. It extends the existing health status response format to include refresh retry state information. + +## Existing Endpoint + +**Endpoint**: `GET /api/v1/servers` +**Response**: Array of server objects with `health` field + +## Health Field Extension + +### Current Schema (unchanged fields) + +```json +{ + "health": { + "level": "healthy|degraded|unhealthy", + "admin_state": "enabled|disabled|quarantined", + "summary": "string", + "detail": "string", + "action": "login|restart|enable|approve|view_logs|" + } +} +``` + +### Extended `detail` Values + +The `detail` field will include refresh-specific information when applicable: + +| Condition | level | summary | detail | action | +|-----------|-------|---------|--------|--------| +| Refresh scheduled | healthy | Connected | Token refresh scheduled for {time} | - | +| Refresh retrying | degraded | Token refresh pending | Refresh retry {n} scheduled for {time}: {error} | view_logs | +| Refresh failed (network) | degraded | Refresh failed - network error | Network error: {details}. 
Retry in {time} | retry | +| Refresh failed (permanent) | unhealthy | Refresh token expired | Re-authentication required: {error} | login | + +### Example Responses + +**Healthy with scheduled refresh**: +```json +{ + "health": { + "level": "healthy", + "admin_state": "enabled", + "summary": "Connected (5 tools)", + "detail": "Token refresh scheduled for 2026-01-12T15:30:00Z", + "action": "" + } +} +``` + +**Degraded during retry**: +```json +{ + "health": { + "level": "degraded", + "admin_state": "enabled", + "summary": "Token refresh pending", + "detail": "Refresh retry 3 scheduled for 2026-01-12T14:05:40Z: connection timeout", + "action": "view_logs" + } +} +``` + +**Unhealthy (permanent failure)**: +```json +{ + "health": { + "level": "unhealthy", + "admin_state": "enabled", + "summary": "Refresh token expired", + "detail": "Re-authentication required: invalid_grant", + "action": "login" + } +} +``` + +## Metrics Endpoint + +**Endpoint**: `GET /metrics` (Prometheus format) + +### New Metrics + +```prometheus +# HELP mcpproxy_oauth_refresh_total Total number of OAuth token refresh attempts +# TYPE mcpproxy_oauth_refresh_total counter +mcpproxy_oauth_refresh_total{server="github-mcp",result="success"} 42 +mcpproxy_oauth_refresh_total{server="github-mcp",result="failed_network"} 3 +mcpproxy_oauth_refresh_total{server="atlassian-mcp",result="failed_invalid_grant"} 1 + +# HELP mcpproxy_oauth_refresh_duration_seconds OAuth token refresh duration in seconds +# TYPE mcpproxy_oauth_refresh_duration_seconds histogram +mcpproxy_oauth_refresh_duration_seconds_bucket{server="github-mcp",result="success",le="0.5"} 38 +mcpproxy_oauth_refresh_duration_seconds_bucket{server="github-mcp",result="success",le="1"} 41 +mcpproxy_oauth_refresh_duration_seconds_bucket{server="github-mcp",result="success",le="+Inf"} 42 +mcpproxy_oauth_refresh_duration_seconds_sum{server="github-mcp",result="success"} 18.5 +mcpproxy_oauth_refresh_duration_seconds_count{server="github-mcp",result="success"} 42 +``` + +### Label Values + +**`result` label**: +- `success`: Token refresh completed successfully +- `failed_network`: Network error (timeout, DNS, connection refused) +- `failed_invalid_grant`: Refresh token expired or revoked +- `failed_other`: Other OAuth errors (server_error, etc.) + +## SSE Events + +**Endpoint**: `GET /events` + +No new event types. Existing `servers.changed` event will be emitted when health status changes due to refresh state transitions. + +## Backward Compatibility + +All changes are additive: +- `detail` field already exists, just gains more specific content +- New metrics don't affect existing metrics +- No schema breaking changes diff --git a/specs/023-oauth-state-persistence/data-model.md b/specs/023-oauth-state-persistence/data-model.md new file mode 100644 index 00000000..9d27c6f6 --- /dev/null +++ b/specs/023-oauth-state-persistence/data-model.md @@ -0,0 +1,255 @@ +# Data Model: OAuth Token Refresh Reliability + +**Feature**: 023-oauth-state-persistence +**Date**: 2026-01-12 + +## Entity Overview + +This feature primarily extends existing entities rather than creating new ones. The key data structures involved are: + +1. **OAuthTokenRecord** (existing) - Token persistence +2. **RefreshSchedule** (existing) - Proactive refresh scheduling +3. **HealthCalculatorInput** (existing) - Health status calculation +4. **OAuth Metrics** (new) - Prometheus metrics for observability + +## Entity Definitions + +### 1. 
OAuthTokenRecord (Existing - No Changes) + +**Location**: `internal/storage/models.go` + +```go +type OAuthTokenRecord struct { + ServerName string `json:"server_name"` // Storage key (serverName_hash) + DisplayName string `json:"display_name"` // Actual server name + AccessToken string `json:"access_token"` + RefreshToken string `json:"refresh_token"` // Required for proactive refresh + TokenType string `json:"token_type"` + ExpiresAt time.Time `json:"expires_at"` + Scopes []string `json:"scopes"` + ClientID string `json:"client_id"` // For DCR + ClientSecret string `json:"client_secret"` // For DCR + Created time.Time `json:"created"` + Updated time.Time `json:"updated"` +} +``` + +**Relationships**: +- Referenced by: RefreshSchedule (via DisplayName) +- Stored in: BBolt `oauth_tokens` bucket + +**Validation Rules**: +- `ServerName`: Required, unique per OAuth server +- `AccessToken`: Required when authenticated +- `RefreshToken`: Optional, enables proactive refresh +- `ExpiresAt`: Required for scheduling refresh + +### 2. RefreshSchedule (Existing - Extended) + +**Location**: `internal/oauth/refresh_manager.go` + +```go +type RefreshSchedule struct { + ServerName string // Server identifier + ExpiresAt time.Time // Token expiration time + ScheduledRefresh time.Time // When refresh will be attempted + RetryCount int // Number of failed attempts + LastError error // Most recent failure reason + Timer *time.Timer // Scheduled timer (internal) + + // NEW FIELDS for this feature: + RetryBackoff time.Duration // Current backoff duration + MaxBackoff time.Duration // Maximum backoff (5 minutes) + LastAttempt time.Time // Time of last refresh attempt + RefreshState RefreshState // Current state for health reporting +} + +// NEW: Refresh state for health status integration +type RefreshState int + +const ( + RefreshStateIdle RefreshState = iota // No refresh needed + RefreshStateScheduled // Proactive refresh scheduled + RefreshStateRetrying // Failed, retrying with backoff + RefreshStateFailed // Permanently failed (needs re-auth) +) +``` + +**Lifecycle Invariant**: Schedule exists iff tokens exist for server (created post-auth or when loaded at startup, destroyed when tokens removed). + +**State Transitions**: +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ │ +│ [Token Saved] ──► Scheduled ──► [80% lifetime] ──► Idle │ +│ │ │ │ +│ │ [Refresh Failed] │ │ +│ ā–¼ │ │ +│ Retrying ā—„ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +│ │ │ +│ │ [invalid_grant] │ +│ ā–¼ │ +│ Failed ──► [Re-auth] ──► Scheduled │ +│ │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### 3. HealthCalculatorInput (Existing - Extended) + +**Location**: `internal/health/calculator.go` + +```go +type HealthCalculatorInput struct { + // Existing fields... 
+ OAuthRequired bool + OAuthStatus string // "authenticated", "expired", "error", "none" + TokenExpiresAt *time.Time + HasRefreshToken bool + UserLoggedOut bool + + // NEW FIELDS for this feature: + RefreshState RefreshState // From RefreshSchedule + RefreshRetryCount int // Number of retry attempts + RefreshLastError string // Human-readable error message + RefreshNextAttempt *time.Time // When next retry will occur +} +``` + +**Health Output Mapping**: + +| RefreshState | Health Level | Summary | Action | +|--------------|--------------|---------|--------| +| Idle | healthy | "Connected" | - | +| Scheduled | healthy | "Token refresh scheduled" | - | +| Retrying | degraded | "Token refresh retry pending" | view_logs | +| Failed | unhealthy | "Refresh token expired" | login | + +### 4. OAuth Metrics (New) + +**Location**: `internal/observability/metrics.go` + +```go +// Added to MetricsManager struct +type MetricsManager struct { + // Existing fields... + + // NEW: OAuth refresh metrics + oauthRefreshTotal *prometheus.CounterVec + oauthRefreshDuration *prometheus.HistogramVec +} +``` + +**Metric Definitions**: + +```go +// Counter: Total refresh attempts +oauthRefreshTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "mcpproxy_oauth_refresh_total", + Help: "Total number of OAuth token refresh attempts", + }, + []string{"server", "result"}, +) + +// Histogram: Refresh duration +oauthRefreshDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "mcpproxy_oauth_refresh_duration_seconds", + Help: "OAuth token refresh duration in seconds", + Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30}, + }, + []string{"server", "result"}, +) +``` + +**Labels**: +- `server`: Server name (e.g., "github-mcp") +- `result`: One of: + - `success`: Refresh completed successfully + - `failed_network`: Network error (retryable) + - `failed_invalid_grant`: Refresh token expired (permanent) + - `failed_other`: Other failures + +## Configuration Constants + +**Location**: `internal/oauth/refresh_manager.go` + +```go +const ( + // Existing + DefaultRefreshThreshold = 0.8 // Refresh at 80% of token lifetime + MinRefreshInterval = 5 * time.Second + + // MODIFIED for this feature + RetryBackoffBase = 10 * time.Second // Was: 2s, Now: 10s (FR-008) + MaxRetryBackoff = 5 * time.Minute // NEW: Cap at 5 minutes (FR-009) + MaxRetries = 0 // NEW: Unlimited (until expiration) +) +``` + +## Data Flow + +### Startup Recovery Flow + +``` +1. RefreshManager.Start() + │ + ā”œā”€ā–ŗ storage.ListOAuthTokens() + │ + └─► For each token: + │ + ā”œā”€ā–ŗ If not expired: + │ └─► scheduleRefreshLocked(80% lifetime) + │ + └─► If access token expired, refresh token exists: + └─► executeRefresh() immediately + │ + ā”œā”€ā–ŗ Success: scheduleRefreshLocked(new 80%) + │ + └─► Failure: rescheduleWithBackoff() +``` + +### Proactive Refresh Flow + +``` +1. Timer fires at 80% lifetime + │ + └─► executeRefresh() + │ + ā”œā”€ā–ŗ runtime.RefreshOAuthToken(serverName) + │ │ + │ └─► mcp-go client.Start() with TokenStore + │ + ā”œā”€ā–ŗ Success: + │ ā”œā”€ā–ŗ Emit metrics (success) + │ ā”œā”€ā–ŗ Update health status (Idle) + │ └─► Schedule next refresh + │ + └─► Failure: + ā”œā”€ā–ŗ Emit metrics (failed_*) + ā”œā”€ā–ŗ Update health status (Retrying/Failed) + └─► Reschedule with backoff (if retryable) +``` + +### Health Status Integration Flow + +``` +1. 
HealthCalculator.Calculate() + │ + ā”œā”€ā–ŗ Get RefreshSchedule for server + │ + └─► Map RefreshState to health: + │ + ā”œā”€ā–ŗ Retrying: degraded + "Refresh retry pending" + │ + ā”œā”€ā–ŗ Failed: unhealthy + "Refresh token expired" + │ + └─► Other: existing logic +``` + +## Storage Bucket + +**Bucket**: `oauth_tokens` (existing) +**Database**: `~/.mcpproxy/config.db` (BBolt) + +No schema changes required. The `OAuthTokenRecord` structure is sufficient for all refresh operations. diff --git a/specs/023-oauth-state-persistence/followup-token-refresh-bugs.md b/specs/023-oauth-state-persistence/followup-token-refresh-bugs.md new file mode 100644 index 00000000..be8ee6d1 --- /dev/null +++ b/specs/023-oauth-state-persistence/followup-token-refresh-bugs.md @@ -0,0 +1,171 @@ +# Followup: OAuth Token Refresh Bug Fixes + +**Date**: 2026-01-17 +**Branch**: `023-oauth-state-persistence-impl` +**Status**: Fixed, pending end-to-end verification + +## Problem Statement + +After implementing spec 023 (OAuth State Persistence), OAuth servers with expired tokens were failing to refresh automatically despite having valid `refresh_token` values stored in the database. Servers would show errors like: + +``` +MCP initialization failed after OAuth setup: no valid token available, authorization required +``` + +This affected dynamically-discovered OAuth servers (e.g., atlassian-remote, slack, github) that use Protected Resource Metadata (PRM) for OAuth discovery rather than static OAuth configuration. + +## Investigation Summary + +### Diagnostic Approach + +Added comprehensive logging to trace the token refresh flow: + +1. **RefreshManager startup** (`internal/oauth/refresh_manager.go`) + - Logs token loading from storage + - Logs each token's expiration status and refresh_token availability + - Logs schedule creation count + +2. **PersistentTokenStore** (`internal/oauth/persistent_token_store.go`) + - Logs GetToken/SaveToken calls with server_key + - Logs token expiration and refresh_token status + +3. **Server key generation** (`internal/oauth/persistent_token_store.go`) + - Logs key generation with server_name, server_url, and generated_key + +4. **OnTokenSaved callback** (`internal/runtime/lifecycle.go`) + - Logs when tokens are saved and refresh is scheduled + +### Diagnostic Findings + +From the logs, we verified: +- RefreshManager IS starting correctly (9 schedules created) +- OnTokenSaved callback IS firing +- Tokens ARE being saved with correct server_key format + +But `RefreshOAuthToken` was failing to find tokens for dynamically-discovered OAuth servers. + +## Bugs Found and Fixed + +### Bug #1: Static Config Check Only + +**Location**: `internal/upstream/manager.go:2362-2379` + +**Problem**: `RefreshOAuthToken` only checked `serverConfig.OAuth` to determine if a server uses OAuth. For dynamically-discovered OAuth servers, this field is `nil` because OAuth was discovered at runtime via PRM, not configured statically. 
+ +**Fix**: Also check the database for stored OAuth tokens: + +```go +// Check if server uses OAuth via either static config or dynamic discovery +serverConfig := client.GetConfig() +hasStaticOAuth := serverConfig != nil && serverConfig.OAuth != nil + +// Also check for OAuth tokens in the database (dynamic OAuth discovery) +hasStoredTokens := false +if m.storage != nil && serverConfig != nil { + serverKey := oauth.GenerateServerKey(serverName, serverConfig.URL) + token, err := m.storage.GetOAuthToken(serverKey) + if err == nil && token != nil && token.RefreshToken != "" { + hasStoredTokens = true + } +} + +if !hasStaticOAuth && !hasStoredTokens { + return fmt.Errorf("server does not use OAuth: %s", serverName) +} +``` + +### Bug #2: Token Lookup Key Mismatch (Critical) + +**Location**: `internal/upstream/manager.go:2371` + +**Problem**: The initial fix for Bug #1 used `serverName` directly for token lookup: + +```go +token, err := m.storage.GetOAuthToken(serverName) // WRONG +``` + +But tokens are stored using a hash-based key generated by `GenerateServerKey(name, url)`: + +```go +// Tokens are stored with key like "slack_a1b2c3d4e5f6g7h8" +serverKey := GenerateServerKey(serverName, serverURL) +``` + +This mismatch meant `RefreshOAuthToken` could never find tokens for any server, causing all refresh attempts to fail. + +**Fix**: Use `GenerateServerKey` for consistent key lookup: + +```go +serverKey := oauth.GenerateServerKey(serverName, serverConfig.URL) +token, err := m.storage.GetOAuthToken(serverKey) // CORRECT +``` + +## Files Modified + +| File | Changes | +|------|---------| +| `internal/upstream/manager.go` | Fixed OAuth detection and token lookup key | +| `internal/upstream/manager_oauth_test.go` | Added test coverage for dynamic OAuth servers | +| `internal/oauth/refresh_manager.go` | Added diagnostic logging | +| `internal/oauth/persistent_token_store.go` | Added server key generation logging, enhanced GetToken logging | +| `internal/runtime/lifecycle.go` | Enhanced OnTokenSaved callback logging | + +## Test Coverage + +New test file: `internal/upstream/manager_oauth_test.go` + +- `TestRefreshOAuthToken_DynamicOAuthDiscovery` - Tests servers with OAuth from PRM +- `TestRefreshOAuthToken_StaticOAuthConfig` - Tests servers with static OAuth config +- `TestRefreshOAuthToken_ServerNotFound` - Tests error handling + +All tests passing. + +## Verification Steps + +To verify the fix works end-to-end: + +1. **Rebuild**: + ```bash + go build -o mcpproxy ./cmd/mcpproxy + ``` + +2. **Start with debug logging**: + ```bash + ./mcpproxy serve --log-level=debug + ``` + +3. **Authenticate an OAuth server** (e.g., slack) + +4. **Restart mcpproxy** + +5. **Verify auto-reconnect** - Server should reconnect using stored token without browser re-authentication + +6. **Check logs for**: + ``` + Found OAuth token in database for dynamic OAuth server + OAuth token refresh requested has_stored_tokens=true + ``` + +## Root Cause Analysis + +The `GenerateServerKey` function creates a unique storage key by hashing `serverName|serverURL`: + +```go +func GenerateServerKey(serverName, serverURL string) string { + combined := fmt.Sprintf("%s|%s", serverName, serverURL) + hash := sha256.Sum256([]byte(combined)) + hashStr := hex.EncodeToString(hash[:]) + return fmt.Sprintf("%s_%s", serverName, hashStr[:16]) +} +``` + +This design allows multiple servers with the same name but different URLs to have separate tokens. 
However, any code that looks up tokens must use this same key generation, not just the server name. + +The bug was introduced because the original `RefreshOAuthToken` implementation predated the `PersistentTokenStore` and didn't account for the hash-based storage key. + +## Related Documentation + +- Spec: `/specs/023-oauth-state-persistence/spec.md` +- Research: `/specs/023-oauth-state-persistence/research.md` +- Bug doc: `/docs/oauth_mcpproxy_bug.md` +- Plan: `/Users/josh.nichols/.claude/plans/magical-soaring-corbato.md` diff --git a/specs/023-oauth-state-persistence/plan.md b/specs/023-oauth-state-persistence/plan.md new file mode 100644 index 00000000..4be28567 --- /dev/null +++ b/specs/023-oauth-state-persistence/plan.md @@ -0,0 +1,104 @@ +# Implementation Plan: OAuth Token Refresh Reliability + +**Branch**: `023-oauth-state-persistence` | **Date**: 2026-01-12 | **Spec**: [spec.md](./spec.md) +**Input**: Feature specification from `/specs/023-oauth-state-persistence/spec.md` + +## Summary + +Fix OAuth token refresh so servers survive restarts and proactively refresh before expiration. The current implementation has misleading logging (reports success when `Start()` returns nil, not actual refresh), swallowed errors from mcp-go, and no startup recovery mechanism. This plan implements proper token refresh with exponential backoff retry, health status surfacing, and Prometheus metrics. + +## Technical Context + +**Language/Version**: Go 1.24 (toolchain go1.24.10) +**Primary Dependencies**: mcp-go v0.43.1 (OAuth client), BBolt (storage), Prometheus (metrics), Zap (logging) +**Storage**: BBolt database (`~/.mcpproxy/config.db`) - `oauth_tokens` bucket with `OAuthTokenRecord` model +**Testing**: `go test ./internal/...`, `./scripts/test-api-e2e.sh`, `./scripts/run-all-tests.sh` +**Target Platform**: macOS, Linux, Windows (desktop application) +**Project Type**: Single Go module with core server + tray application +**Performance Goals**: Token refresh within 30 seconds of startup; proactive refresh at 80% lifetime; health status update within 5 seconds of failure +**Constraints**: Rate limit refreshes to 1 per 10 seconds per server; exponential backoff capped at 5 minutes; localhost-only by default +**Scale/Scope**: Support multiple OAuth servers; tokens expire in ~2 hours; refresh tokens valid 24+ hours + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +| Principle | Status | Evidence/Notes | +|-----------|--------|----------------| +| I. Performance at Scale | āœ… PASS | Token refresh is per-server, async background operation; does not block API requests; uses existing scheduler pattern | +| II. Actor-Based Concurrency | āœ… PASS | Existing RefreshManager uses goroutine + timer pattern; will extend with exponential backoff; no new locks required | +| III. Configuration-Driven Architecture | āœ… PASS | Refresh threshold (80%) already configurable; no new config fields required; existing patterns sufficient | +| IV. Security by Default | āœ… PASS | No new security surface; tokens stored in existing encrypted BBolt bucket; rate limiting prevents abuse | +| V. Test-Driven Development (TDD) | āœ… PASS | Will add unit tests for RefreshManager changes, integration tests for health status, E2E tests for restart recovery | +| VI. 
Documentation Hygiene | āœ… PASS | Will update CLAUDE.md if architecture changes; logging documentation already exists | + +**Architecture Constraints:** + +| Constraint | Status | Evidence/Notes | +|------------|--------|----------------| +| Core + Tray Split | āœ… PASS | Changes are in core server only; tray consumes health status via existing SSE | +| Event-Driven Updates | āœ… PASS | RefreshManager already emits events; health status propagates via existing event bus | +| DDD Layering | āœ… PASS | Changes to Domain (oauth/), Infrastructure (storage/), Presentation (health status via httpapi/) | +| Upstream Client Modularity | āœ… PASS | Changes primarily in oauth/ layer; connection.go logging fix is surgical | + +**Gate Result**: āœ… PASS - No violations. Proceed to Phase 0. + +### Post-Design Re-evaluation (Phase 1 Complete) + +| Principle | Status | Post-Design Evidence | +|-----------|--------|----------------------| +| I. Performance at Scale | āœ… PASS | RefreshSchedule uses timers, not polling; metrics add ~0 overhead | +| II. Actor-Based Concurrency | āœ… PASS | Extended RefreshSchedule struct; no new goroutines or locks | +| III. Configuration-Driven Architecture | āœ… PASS | No new config fields; constants in code are appropriate for fixed behavior | +| IV. Security by Default | āœ… PASS | No changes to security model; rate limiting added | +| V. Test-Driven Development (TDD) | āœ… PASS | Test plan in quickstart.md; unit + integration tests specified | +| VI. Documentation Hygiene | āœ… PASS | CLAUDE.md updated via agent context script | + +**Post-Design Gate Result**: āœ… PASS - Design aligns with constitution. Ready for `/speckit.tasks`. + +## Project Structure + +### Documentation (this feature) + +```text +specs/023-oauth-state-persistence/ +ā”œā”€ā”€ plan.md # This file +ā”œā”€ā”€ research.md # Phase 0 output +ā”œā”€ā”€ data-model.md # Phase 1 output +ā”œā”€ā”€ quickstart.md # Phase 1 output +ā”œā”€ā”€ contracts/ # Phase 1 output (N/A - no new REST endpoints) +└── tasks.md # Phase 2 output (/speckit.tasks command) +``` + +### Source Code (repository root) + +```text +internal/ +ā”œā”€ā”€ oauth/ +│ ā”œā”€ā”€ refresh_manager.go # MODIFY: Add exponential backoff retry, startup refresh +│ ā”œā”€ā”€ persistent_token_store.go # MODIFY: Improve GetToken() error handling +│ └── logging.go # MODIFY: Fix misleading success logging +ā”œā”€ā”€ upstream/ +│ └── core/ +│ └── connection.go # MODIFY: Fix misleading "OAuth token refresh successful" logs +ā”œā”€ā”€ health/ +│ └── calculator.go # MODIFY: Add refresh retry state to health calculation +ā”œā”€ā”€ observability/ +│ └── metrics.go # MODIFY: Add OAuth refresh metrics +└── storage/ + └── models.go # NO CHANGE: Existing OAuthTokenRecord sufficient + +tests/ +ā”œā”€ā”€ integration/ # ADD: Token refresh integration tests +└── unit/ # ADD: RefreshManager unit tests +``` + +**Structure Decision**: Single Go module structure. All changes are modifications to existing files in `internal/` packages. No new packages or architectural changes required. + +## Complexity Tracking + +> No constitution violations to justify. 
+
+| Violation | Why Needed | Simpler Alternative Rejected Because |
+|-----------|------------|-------------------------------------|
+| N/A | N/A | N/A |
diff --git a/specs/023-oauth-state-persistence/quickstart.md b/specs/023-oauth-state-persistence/quickstart.md
new file mode 100644
index 00000000..1be66fae
--- /dev/null
+++ b/specs/023-oauth-state-persistence/quickstart.md
@@ -0,0 +1,185 @@
+# Quickstart: OAuth Token Refresh Reliability
+
+**Feature**: 023-oauth-state-persistence
+**Date**: 2026-01-12
+
+## Overview
+
+This guide provides the essential information for implementing OAuth token refresh reliability in mcpproxy-go.
+
+## Implementation Summary
+
+### Files to Modify
+
+| File | Changes |
+|------|---------|
+| `internal/oauth/refresh_manager.go` | Exponential backoff, startup refresh, health state |
+| `internal/oauth/logging.go` | Fix misleading success logging |
+| `internal/upstream/core/connection.go` | Fix misleading success logs (lines 1239, 1696) |
+| `internal/health/calculator.go` | Add refresh state to health calculation |
+| `internal/observability/metrics.go` | Add OAuth refresh metrics |
+
+### Key Constants
+
+```go
+// internal/oauth/refresh_manager.go
+const (
+    RetryBackoffBase = 10 * time.Second // FR-008: min 10s between attempts
+    MaxRetryBackoff  = 5 * time.Minute  // FR-009: cap at 5 minutes
+)
+```
+
+### Core Algorithm: Exponential Backoff
+
+```go
+func (m *RefreshManager) calculateBackoff(retryCount int) time.Duration {
+    backoff := RetryBackoffBase * time.Duration(1<<retryCount)
+    if backoff > MaxRetryBackoff {
+        backoff = MaxRetryBackoff
+    }
+    return backoff
+}
+```
+
+**Sequence**: 10s → 20s → 40s → 80s → 160s → 300s (cap)
+
+## Testing Approach
+
+### Unit Tests
+
+```go
+// internal/oauth/refresh_manager_test.go
+
+func TestExponentialBackoff(t *testing.T) {
+    tests := []struct {
+        retryCount int
+        expected   time.Duration
+    }{
+        {0, 10 * time.Second},
+        {1, 20 * time.Second},
+        {2, 40 * time.Second},
+        {3, 80 * time.Second},
+        {4, 160 * time.Second},
+        {5, 300 * time.Second},  // Capped at 5min
+        {10, 300 * time.Second}, // Still capped
+    }
+    // ...
+}
+
+func TestStartupRefresh(t *testing.T) {
+    // Given: Stored token with expired access but valid refresh
+    // When: RefreshManager.Start() is called
+    // Then: Immediate refresh attempt is made
+}
+
+func TestRefreshHealthState(t *testing.T) {
+    // Given: Refresh fails
+    // When: Health is calculated
+    // Then: State is "degraded" with retry info
+}
+```
+
+### Integration Tests
+
+```go
+// internal/oauth/refresh_integration_test.go
+
+func TestOAuthServerSurvivesRestart(t *testing.T) {
+    // 1. Start mcpproxy with OAuth server
+    // 2. Authenticate via OAuth
+    // 3. Restart mcpproxy
+    // 4. Verify server reconnects without browser auth
+}
+```
+
+### Manual Testing
+
+```bash
+# 1. Start mcpproxy with OAuth server
+./mcpproxy serve --log-level=debug
+
+# 2. Authenticate an OAuth server (e.g., GitHub MCP)
+mcpproxy auth login github-mcp
+
+# 3. Verify token stored
+mcpproxy upstream list -o json | jq '.[] | select(.name=="github-mcp") | .health'
+
+# 4. Restart and verify auto-reconnect
+pkill mcpproxy
+./mcpproxy serve --log-level=debug
+# Watch logs for refresh attempts
+
+# 5.
Check metrics +curl http://localhost:8080/metrics | grep oauth_refresh +``` + +## Health Status Examples + +### CLI Output + +```bash +$ mcpproxy upstream list +NAME STATUS HEALTH +github-mcp connected healthy (Token refresh scheduled for 14:30) +atlassian-mcp connected degraded (Refresh retry 2 in 40s: timeout) +jira-mcp error unhealthy (Refresh token expired - login required) +``` + +### API Response + +```json +{ + "name": "github-mcp", + "health": { + "level": "degraded", + "summary": "Token refresh pending", + "detail": "Refresh retry 2 scheduled for 2026-01-12T14:05:40Z: connection timeout", + "action": "view_logs" + } +} +``` + +## Metrics + +```bash +# Check refresh success rate +curl -s http://localhost:8080/metrics | grep mcpproxy_oauth_refresh_total + +# Output: +mcpproxy_oauth_refresh_total{server="github-mcp",result="success"} 42 +mcpproxy_oauth_refresh_total{server="github-mcp",result="failed_network"} 3 +``` + +## Debugging + +### Enable Debug Logging + +```bash +./mcpproxy serve --log-level=debug 2>&1 | grep -i oauth +``` + +### Key Log Messages + +| Message | Meaning | +|---------|---------| +| `OAuth token refresh scheduled` | Proactive refresh timer set | +| `OAuth token refresh attempt` | Refresh starting | +| `OAuth token refresh succeeded` | Refresh completed (NEW: actual refresh, not connection) | +| `OAuth token refresh failed` | Refresh failed with error details | +| `OAuth token refresh retry scheduled` | Backoff retry queued | + +### Common Issues + +1. **"invalid_grant" errors**: Refresh token expired on provider side → user must re-authenticate +2. **Rapid retry loops**: Check rate limiting is working (min 10s between attempts) +3. **Health stuck in "retrying"**: Check network connectivity to OAuth provider + +## Success Criteria Verification + +| Criteria | How to Verify | +|----------|---------------| +| SC-001: Auto-reconnect within 60s | Restart mcpproxy, time until server connected | +| SC-002: 99% proactive refresh | Monitor `mcpproxy_oauth_refresh_total` success rate | +| SC-003: Specific failure reasons | Check health status detail field | +| SC-004: Accurate logging | No "successful" logs for failed refreshes | +| SC-005: Manual re-auth only when needed | Only `invalid_grant` requires browser auth | diff --git a/specs/023-oauth-state-persistence/research.md b/specs/023-oauth-state-persistence/research.md new file mode 100644 index 00000000..3dbc59e3 --- /dev/null +++ b/specs/023-oauth-state-persistence/research.md @@ -0,0 +1,237 @@ +# Research: OAuth Token Refresh Reliability + +**Feature**: 023-oauth-state-persistence +**Date**: 2026-01-12 + +## Key Decisions + +### 1. Token Refresh Method + +**Decision**: Use mcp-go's automatic token refresh via TokenStore pattern, not a direct `RefreshToken()` API call. + +**Rationale**: The mcp-go library (v0.43.1) does NOT expose a direct `RefreshToken()` method on the OAuth handler. Token refresh is handled automatically when: +1. A `TokenStore` with a stored refresh token is provided to `OAuthConfig` +2. `client.Start(ctx)` is called +3. mcp-go detects an expired access token and attempts automatic refresh internally + +**Alternatives Considered**: +- Direct OAuth2 HTTP calls: Rejected - duplicates mcp-go functionality and bypasses its token store integration +- Patching mcp-go: Rejected - adds maintenance burden and delays implementation + +### 2. Exponential Backoff Strategy + +**Decision**: Extend existing exponential backoff (base 10s, cap 5min) with unlimited retries until token expiration. 
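The sketch below illustrates one possible reading of "retries until expiration" for the proactive-refresh path: apply the capped backoff, but stop scheduling once the next attempt would land after the stored token's expiry. It is a standalone example, not the actual RefreshManager code; the constants mirror the planned quickstart.md values, while `nextRetry` and the `tokenExpiresAt` cutoff are illustrative assumptions.

```go
package main

import (
	"fmt"
	"time"
)

// Constants mirror the planned values from quickstart.md (FR-008 / FR-009).
const (
	retryBackoffBase = 10 * time.Second
	maxRetryBackoff  = 5 * time.Minute
)

// nextRetry is a sketch, not real mcpproxy code: capped exponential backoff plus a
// cutoff so retries stop once the stored token's expiry has passed.
func nextRetry(now, tokenExpiresAt time.Time, retryCount int) (time.Duration, bool) {
	backoff := retryBackoffBase * time.Duration(1<<retryCount)
	if backoff > maxRetryBackoff {
		backoff = maxRetryBackoff
	}
	if !tokenExpiresAt.IsZero() && now.Add(backoff).After(tokenExpiresAt) {
		return 0, false // token would already be expired; surface failure instead of retrying
	}
	return backoff, true
}

func main() {
	now := time.Now()
	expiry := now.Add(20 * time.Minute) // simulated token lifetime
	for i := 0; ; i++ {
		wait, ok := nextRetry(now, expiry, i)
		if !ok {
			fmt.Println("stop retrying: token expired")
			break
		}
		fmt.Printf("attempt %d: wait %s\n", i+1, wait)
		now = now.Add(wait)
	}
}
```

Run against a simulated 20-minute lifetime, this prints the 10s/20s/40s/80s/160s waits, hits the 300s cap twice, and then stops once the expiry is reached.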
+ +**Rationale**: The spec requires retries until expiration with 10s→20s→40s→80s→5min cap. The existing `RetryBackoffBase = 2s` is too aggressive; will use 10s base per FR-008 (rate limit 1 per 10s per server). + +**Backoff Sequence** (10s base, 5min cap): +| Attempt | Wait Before | Cumulative Time | +|---------|-------------|-----------------| +| 1 | 0s | 0s | +| 2 | 10s | 10s | +| 3 | 20s | 30s | +| 4 | 40s | 70s (~1min) | +| 5 | 80s | 150s (~2.5min) | +| 6 | 160s | 310s (~5min) | +| 7 | 300s (cap) | 610s (~10min) | +| 8+ | 300s (cap) | continues until expiration | + +**Alternatives Considered**: +- Jitter: Deferred - single-server retries don't cause thundering herd +- Fibonacci backoff: Rejected - exponential is simpler and well-understood + +### 3. Misleading Logging Fix + +**Decision**: Fix the logging in `connection.go` to accurately report token refresh outcomes, and remove/rename the misleading helper function in `logging.go`. + +**Root Cause Analysis**: +- `LogTokenRefreshSuccess()` in `logging.go` is called when `c.client.Start()` succeeds +- This does NOT mean a token refresh occurred - it means the client connected +- The duration logged (`time.Duration(attempt)*backoff`) is retry wait time, not refresh duration + +**Fix Locations**: +1. `internal/upstream/core/connection.go:1239` - HTTP connection retry +2. `internal/upstream/core/connection.go:1696` - SSE connection retry +3. `internal/oauth/logging.go:171` - Rename or remove misleading function + +**Alternatives Considered**: +- Keep logging but change wording: Rejected - still semantically wrong +- Add separate actual refresh logging: Selected - log actual refresh attempts via RefreshManager + +### 4. Health Status Integration + +**Decision**: Surface refresh retry state as `degraded` health with actionable detail. + +**Existing Health Calculation** (`internal/health/calculator.go`): +- Already distinguishes tokens with/without refresh capability +- Has `OAuthStatus` enum: "authenticated", "expired", "error", "none" +- Has `action` field for suggested user action + +**New States to Add**: +| Condition | Level | Summary | Action | +|-----------|-------|---------|--------| +| Refresh retrying | degraded | "Token refresh retry pending" | view_logs | +| Refresh failed (recoverable) | degraded | "Refresh failed - network error" | retry | +| Refresh failed (permanent) | unhealthy | "Refresh token expired" | login | + +**Alternatives Considered**: +- New health level: Rejected - "degraded" is semantically correct for retry state +- Separate refresh status field: Rejected - health system already handles this + +### 5. Metrics Implementation + +**Decision**: Add two Prometheus metrics to existing `MetricsManager`. + +**Metrics**: +```go +mcpproxy_oauth_refresh_total{server, result} // Counter +mcpproxy_oauth_refresh_duration_seconds{server, result} // Histogram +``` + +**Labels**: +- `server`: Server name (e.g., "github-mcp") +- `result`: "success", "failed_network", "failed_invalid_grant", "failed_other" + +**Integration Point**: RefreshManager's `executeRefresh()` method already has the hook point via `RefreshEventEmitter`. + +**Alternatives Considered**: +- Per-error-type counters: Rejected - label-based is more flexible +- Separate expiry warnings counter: Deferred to future work + +### 6. Startup Recovery Flow + +**Decision**: Enhance RefreshManager.Start() to attempt immediate refresh for expired tokens. + +**Current Behavior** (`refresh_manager.go:124-162`): +1. 
Loads tokens from storage via `m.storage.ListOAuthTokens()` +2. For each non-expired token, schedules proactive refresh at 80% lifetime +3. Does NOT handle already-expired tokens + +**New Behavior**: +1. Load tokens from storage +2. For each token: + - If not expired → schedule proactive refresh at 80% lifetime (unchanged) + - If access token expired but refresh token exists → attempt immediate refresh + - If both expired → mark as needing re-auth + +**Alternatives Considered**: +- Trigger refresh on first tool call: Rejected - delays user workflow +- Background refresh thread: Already exists (RefreshManager), just needs enhancement + +## Technical Findings + +### Token Storage Model + +**Location**: `internal/storage/models.go` + +```go +type OAuthTokenRecord struct { + ServerName string // Storage key: serverName_hash + DisplayName string // Actual server name for RefreshManager lookup + AccessToken string + RefreshToken string // Required for proactive refresh + TokenType string + ExpiresAt time.Time + Scopes []string + ClientID string // For DCR + ClientSecret string // For DCR + Created time.Time + Updated time.Time +} +``` + +**Key Insight**: `GetServerName()` returns `DisplayName` if available, enabling RefreshManager to look up servers by their config name. + +### mcp-go OAuth Integration + +**Version**: v0.43.1 + +**Available Methods**: +```go +client.IsOAuthAuthorizationRequiredError(err error) bool +client.GetOAuthHandler(err error) OAuthHandler +client.GenerateCodeVerifier() (string, error) +client.GenerateCodeChallenge(codeVerifier string) string +``` + +**Token Store Interface**: +```go +type TokenStore interface { + GetToken(ctx context.Context) (*Token, error) + SaveToken(ctx context.Context, token *Token) error +} +``` + +**No Direct RefreshToken() Method**: Refresh is handled internally by mcp-go when TokenStore provides a refresh token. + +### Error Detection + +**OAuth Errors** (`internal/oauth/errors.go`): +```go +ErrServerNotOAuth = errors.New("server does not use OAuth") +ErrTokenExpired = errors.New("OAuth token has expired") +ErrRefreshFailed = errors.New("OAuth token refresh failed") +ErrNoRefreshToken = errors.New("no refresh token available") +``` + +**String-Based Detection** (`connection.go:1798-1825`): +```go +oauthErrors := []string{ + "invalid_token", + "invalid_grant", // Refresh token expired + "access_denied", + "unauthorized", + "401", + "Missing or invalid access token", + "no valid token available", +} +``` + +**Key Insight**: `invalid_grant` specifically indicates the refresh token is invalid/expired, requiring full re-authentication. + +### Existing Backoff Implementation + +**RefreshManager** (`refresh_manager.go:405-407`): +```go +backoff := RetryBackoffBase * time.Duration(1<<(retryCount-1)) +m.rescheduleAfterDelay(serverName, backoff) +``` + +**Connection Retry** (`connection.go:1232-1265`): +```go +backoff := refreshConfig.InitialBackoff // 1s +for attempt := 1; attempt <= refreshConfig.MaxAttempts; attempt++ { + // ... + backoff = min(backoff*2, refreshConfig.MaxBackoff) // Cap at 10s +} +``` + +### Event System + +**RefreshEventEmitter Interface** (`refresh_manager.go`): +```go +type RefreshEventEmitter interface { + EmitTokenRefreshed(serverName string) + EmitTokenRefreshFailed(serverName string, err error) +} +``` + +**Integration Point**: RefreshManager already emits events that can be consumed by MetricsManager. 
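To make that integration point concrete, here is a minimal sketch of a Prometheus-backed consumer of `RefreshEventEmitter` that feeds the planned `mcpproxy_oauth_refresh_total` counter. The interface methods and metric name come from the notes above; the adapter type, its constructor, and the string-based result classification are assumptions for illustration, not the real `MetricsManager` wiring.

```go
package observability

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// refreshMetrics is a hypothetical adapter that satisfies oauth.RefreshEventEmitter
// and records the planned mcpproxy_oauth_refresh_total counter.
type refreshMetrics struct {
	refreshTotal *prometheus.CounterVec
}

func newRefreshMetrics(reg prometheus.Registerer) *refreshMetrics {
	return &refreshMetrics{
		refreshTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "mcpproxy_oauth_refresh_total",
			Help: "OAuth token refresh attempts by server and result.",
		}, []string{"server", "result"}),
	}
}

func (m *refreshMetrics) EmitTokenRefreshed(serverName string) {
	m.refreshTotal.WithLabelValues(serverName, "success").Inc()
}

func (m *refreshMetrics) EmitTokenRefreshFailed(serverName string, err error) {
	result := "failed_other"
	switch {
	case err != nil && strings.Contains(err.Error(), "invalid_grant"):
		result = "failed_invalid_grant" // refresh token rejected: re-auth required
	case err != nil && strings.Contains(strings.ToLower(err.Error()), "timeout"):
		result = "failed_network" // crude network classification for the sketch
	}
	m.refreshTotal.WithLabelValues(serverName, result).Inc()
}
```

Classifying the result via labels keeps a single counter per server rather than one counter per error type, matching the "Alternatives Considered" note under Metrics Implementation.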
+ +## Files to Modify + +| File | Change Type | Description | +|------|-------------|-------------| +| `internal/oauth/refresh_manager.go` | MODIFY | Add exponential backoff with 5min cap, startup refresh for expired tokens | +| `internal/oauth/logging.go` | MODIFY | Fix/remove misleading `LogTokenRefreshSuccess()` | +| `internal/upstream/core/connection.go` | MODIFY | Fix misleading success logs at lines 1239, 1696 | +| `internal/health/calculator.go` | MODIFY | Add refresh retry state to health calculation | +| `internal/observability/metrics.go` | MODIFY | Add OAuth refresh metrics | + +## No Changes Required + +| File | Reason | +|------|--------| +| `internal/storage/models.go` | Existing `OAuthTokenRecord` is sufficient | +| `internal/oauth/persistent_token_store.go` | Grace period logic already correct | +| `internal/oauth/config.go` | OAuth config creation unchanged | diff --git a/specs/023-oauth-state-persistence/tasks.md b/specs/023-oauth-state-persistence/tasks.md new file mode 100644 index 00000000..2ed720d0 --- /dev/null +++ b/specs/023-oauth-state-persistence/tasks.md @@ -0,0 +1,208 @@ +# Tasks: OAuth Token Refresh Reliability + +**Input**: Design documents from `/specs/023-oauth-state-persistence/` +**Prerequisites**: plan.md, spec.md, research.md, data-model.md, contracts/ + +**Tests**: Not explicitly requested in spec. Tests included for critical functionality only. + +**Organization**: Tasks grouped by user story to enable independent implementation and testing. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (US1, US2, US3) +- Include exact file paths in descriptions + +## Path Conventions + +- **Go module**: `internal/` for packages, root for `go.mod` +- **Tests**: `*_test.go` alongside implementation files + +--- + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Constants, types, and metrics infrastructure needed by all user stories + +- [X] T001 Add RefreshState type and constants in internal/oauth/refresh_manager.go +- [X] T002 [P] Add RetryBackoffBase (10s) and MaxRetryBackoff (5min) constants in internal/oauth/refresh_manager.go +- [X] T003 [P] Extend RefreshSchedule struct with RetryBackoff, MaxBackoff, LastAttempt, RefreshState fields in internal/oauth/refresh_manager.go +- [X] T004 [P] Add OAuth refresh metrics (mcpproxy_oauth_refresh_total counter, mcpproxy_oauth_refresh_duration_seconds histogram) in internal/observability/metrics.go +- [X] T005 [P] Add RecordOAuthRefresh() and RecordOAuthRefreshDuration() methods to MetricsManager in internal/observability/metrics.go + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Core backoff algorithm and rate limiting that ALL user stories depend on + +**CRITICAL**: No user story work can begin until this phase is complete + +- [X] T006 Implement calculateBackoff(retryCount int) method with exponential backoff and 5min cap in internal/oauth/refresh_manager.go +- [X] T007 Add rate limiting check (min 10s between attempts per server) in internal/oauth/refresh_manager.go +- [X] T008 [P] Fix misleading LogTokenRefreshSuccess() function - rename to LogClientConnectionSuccess() in internal/oauth/logging.go +- [X] T009 [P] Fix misleading success log at line ~1239 in internal/upstream/core/connection.go (HTTP connection) +- [X] T010 [P] Fix misleading success log at line ~1696 in internal/upstream/core/connection.go (SSE connection) +- [X] T011 Add accurate 
LogTokenRefreshAttempt() and LogTokenRefreshResult() functions in internal/oauth/logging.go + +**Checkpoint**: Backoff algorithm and accurate logging ready - user story implementation can begin + +--- + +## Phase 3: User Story 1 - Survive Laptop Sleep/Wake (Priority: P1) + +**Goal**: OAuth servers automatically reconnect after laptop wakes from sleep or mcpproxy restarts, using stored refresh tokens + +**Independent Test**: Authenticate OAuth server, restart mcpproxy, verify server reconnects within 60s without browser auth + +### Implementation for User Story 1 + +- [X] T012 [US1] Modify RefreshManager.Start() to detect expired tokens at startup in internal/oauth/refresh_manager.go +- [X] T013 [US1] Add executeImmediateRefresh() method for expired tokens with valid refresh tokens in internal/oauth/refresh_manager.go +- [X] T014 [US1] Emit metrics on refresh attempt (success/failure with result label) in internal/oauth/refresh_manager.go +- [X] T015 [US1] Add structured logging for startup refresh attempts with server name, token age, result in internal/oauth/refresh_manager.go +- [X] T016 [US1] Update RefreshSchedule state to RefreshStateRetrying on failure in internal/oauth/refresh_manager.go +- [X] T017 [US1] Add unit test for startup refresh with expired access token in internal/oauth/refresh_manager_test.go + +**Checkpoint**: OAuth servers survive restart - verify with manual test per quickstart.md + +--- + +## Phase 4: User Story 2 - Proactive Token Refresh (Priority: P2) + +**Goal**: Tokens are refreshed automatically at 80% lifetime before expiration, with exponential backoff retry on failure + +**Independent Test**: Authenticate OAuth server, monitor token expiration, verify new token obtained before original expires + +### Implementation for User Story 2 + +- [X] T018 [US2] Modify executeRefresh() to use new exponential backoff on failure in internal/oauth/refresh_manager.go +- [X] T019 [US2] Implement rescheduleWithBackoff() using calculateBackoff() in internal/oauth/refresh_manager.go +- [X] T020 [US2] Continue retries until token expiration (not limited retry count) in internal/oauth/refresh_manager.go +- [X] T021 [US2] Update RefreshSchedule.RefreshState transitions (Scheduled -> Retrying -> Failed) in internal/oauth/refresh_manager.go +- [X] T022 [US2] Emit refresh duration metric on each attempt in internal/oauth/refresh_manager.go +- [X] T023 [US2] Add unit test for exponential backoff sequence (10s, 20s, 40s, 80s, 160s, 300s cap) in internal/oauth/refresh_manager_test.go +- [X] T024 [US2] Add unit test for unlimited retries until token expiration in internal/oauth/refresh_manager_test.go + +**Checkpoint**: Proactive refresh works with backoff - verify token refresh before expiration + +--- + +## Phase 5: User Story 3 - Clear Refresh Failure Feedback (Priority: P3) + +**Goal**: Users see specific failure reasons in health status (network error vs expired refresh token vs provider error) + +**Independent Test**: Simulate refresh failures, verify distinct error messages in `mcpproxy upstream list` and web UI + +### Implementation for User Story 3 + +- [X] T025 [US3] Add RefreshState, RefreshRetryCount, RefreshLastError, RefreshNextAttempt fields to HealthCalculatorInput in internal/health/calculator.go +- [X] T026 [US3] Implement health calculation for RefreshStateRetrying -> degraded level in internal/health/calculator.go +- [X] T027 [US3] Implement health calculation for RefreshStateFailed -> unhealthy level in internal/health/calculator.go +- [X] T028 [US3] Set 
appropriate health detail messages per refresh state in internal/health/calculator.go +- [X] T029 [US3] Set appropriate health action per state (view_logs for retrying, login for failed) in internal/health/calculator.go +- [X] T030 [US3] Add error classification for invalid_grant (permanent) vs network errors (retryable) in internal/oauth/refresh_manager.go +- [X] T031 [US3] Expose RefreshSchedule state via RefreshManager.GetRefreshState(serverName) method in internal/oauth/refresh_manager.go +- [X] T032 [US3] Wire RefreshManager state into health calculation flow in internal/health/calculator.go +- [X] T033 [US3] Add unit test for health status output per refresh state in internal/health/calculator_test.go + +**Checkpoint**: Health status shows specific refresh failure reasons + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Final validation, cleanup, and documentation + +- [X] T034 [P] Verify all logging follows naming convention (OAuth token refresh *) in internal/oauth/logging.go +- [X] T035 [P] Run existing tests to ensure no regressions: go test ./internal/... +- [X] T036 [P] Run E2E tests: ./scripts/test-api-e2e.sh (failures unrelated to OAuth refresh - pre-existing infrastructure issues) +- [X] T037 Verify metrics endpoint shows new OAuth metrics: mcpproxy_oauth_refresh_total, mcpproxy_oauth_refresh_duration_seconds defined in internal/observability/metrics.go +- [ ] T038 Manual verification per quickstart.md success criteria checklist +- [X] T039 Update CLAUDE.md if any architecture patterns changed (no changes needed - implementation follows existing patterns) + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies - can start immediately +- **Foundational (Phase 2)**: Depends on Setup completion - BLOCKS all user stories +- **User Stories (Phase 3-5)**: All depend on Foundational phase completion + - User stories can proceed in priority order (P1 -> P2 -> P3) + - US2 depends on backoff from Foundational; US3 depends on state tracking from US1/US2 +- **Polish (Phase 6)**: Depends on all user stories being complete + +### User Story Dependencies + +- **User Story 1 (P1)**: Can start after Foundational (Phase 2) - startup refresh +- **User Story 2 (P2)**: Builds on US1 - adds exponential backoff to existing refresh +- **User Story 3 (P3)**: Builds on US1/US2 - surfaces refresh state in health status + +### Within Each User Story + +- Implementation tasks in order (no parallel within story due to file dependencies) +- Unit tests after implementation of the feature they test +- Story complete before moving to next priority + +### Parallel Opportunities + +- Setup tasks T002, T003, T004, T005 can run in parallel (different files/functions) +- Foundational tasks T008, T009, T010 can run in parallel (different files) +- Polish tasks T034, T035, T036 can run in parallel + +--- + +## Parallel Example: Setup Phase + +```bash +# Launch all setup tasks in parallel (different files): +Task: "Add RetryBackoffBase and MaxRetryBackoff constants" [T002] +Task: "Extend RefreshSchedule struct" [T003] +Task: "Add OAuth refresh metrics" [T004] +Task: "Add MetricsManager methods" [T005] +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup (T001-T005) +2. Complete Phase 2: Foundational (T006-T011) +3. Complete Phase 3: User Story 1 (T012-T017) +4. **STOP and VALIDATE**: Restart mcpproxy with OAuth server, verify auto-reconnect +5. 
This alone delivers SC-001: "OAuth servers automatically reconnect within 60s" + +### Incremental Delivery + +1. Complete Setup + Foundational -> Backoff and metrics ready +2. Add User Story 1 -> Startup recovery works -> Deploy/Demo (MVP!) +3. Add User Story 2 -> Proactive refresh with backoff -> Deploy/Demo +4. Add User Story 3 -> Clear failure feedback -> Deploy/Demo +5. Each story adds value without breaking previous stories + +### Key Files Modified + +| File | Tasks | User Stories | +|------|-------|--------------| +| `internal/oauth/refresh_manager.go` | T001-T003, T006-T007, T012-T016, T018-T022, T030-T031 | Setup, US1, US2, US3 | +| `internal/oauth/logging.go` | T008, T011, T034 | Foundational, Polish | +| `internal/upstream/core/connection.go` | T009, T010 | Foundational | +| `internal/observability/metrics.go` | T004, T005 | Setup | +| `internal/health/calculator.go` | T025-T029, T032 | US3 | +| `internal/oauth/refresh_manager_test.go` | T017, T023, T024 | US1, US2 | +| `internal/health/calculator_test.go` | T033 | US3 | + +--- + +## Notes + +- [P] tasks = different files, no dependencies +- [Story] label maps task to specific user story for traceability +- Each user story delivers incremental value +- Verify with quickstart.md manual testing steps after each story +- Commit after each task or logical group +- Stop at any checkpoint to validate story independently