diff --git a/src/IdLE.Core/Private/Invoke-IdleWithRetry.ps1 b/src/IdLE.Core/Private/Invoke-IdleWithRetry.ps1
new file mode 100644
index 00000000..ccbc08bf
--- /dev/null
+++ b/src/IdLE.Core/Private/Invoke-IdleWithRetry.ps1
@@ -0,0 +1,246 @@
+function Test-IdleTransientError {
+    <#
+    .SYNOPSIS
+    Tests whether an exception, or any exception in its InnerException chain, carries a transient marker.
+
+    .PARAMETER Exception
+    The exception to inspect. Its Data dictionary is checked for the marker keys.
+
+    .OUTPUTS
+    System.Boolean. $true when a marker key holds a truthy value; otherwise $false.
+    #>
+    [CmdletBinding()]
+    param(
+        [Parameter(Mandatory)]
+        [ValidateNotNull()]
+        [System.Exception] $Exception
+    )
+
+    # Retries must be safe-by-default:
+    # We only retry when a trusted code path explicitly marks an exception as transient.
+    #
+    # Supported markers:
+    # - Exception.Data['Idle.IsTransient'] = $true
+    # - Exception.Data['IdleIsTransient'] = $true
+    #
+    # We accept common "truthy" representations to avoid fragile integrations:
+    # - $true
+    # - 'true' (case-insensitive)
+    # - 1 (as [int] or [long]; integers read from JSON are typically [long])
+    $markerKeys = @(
+        'Idle.IsTransient',
+        'IdleIsTransient'
+    )
+
+    foreach ($key in $markerKeys) {
+        if (-not $Exception.Data.Contains($key)) {
+            continue
+        }
+
+        $value = $Exception.Data[$key]
+
+        if ($value -is [bool] -and $value) {
+            return $true
+        }
+
+        if (($value -is [int] -or $value -is [long]) -and $value -eq 1) {
+            return $true
+        }
+
+        if ($value -is [string] -and $value.Trim().ToLowerInvariant() -eq 'true') {
+            return $true
+        }
+    }
+
+    if ($null -ne $Exception.InnerException) {
+        return Test-IdleTransientError -Exception $Exception.InnerException
+    }
+
+    return $false
+}
+
+function Get-IdleDeterministicJitter {
+    <#
+    .SYNOPSIS
+    Derives a repeatable jitter factor in [-JitterRatio, +JitterRatio] from a seed string.
+
+    .PARAMETER JitterRatio
+    Maximum magnitude of the returned factor (0.0 - 1.0). A value of 0 always yields 0.0.
+
+    .PARAMETER Seed
+    Text that is hashed to pick the factor; the same seed always yields the same factor.
+
+    .OUTPUTS
+    System.Double.
+    #>
+    [CmdletBinding()]
+    param(
+        [Parameter(Mandatory)]
+        [ValidateRange(0.0, 1.0)]
+        [double] $JitterRatio,
+
+        [Parameter(Mandatory)]
+        [ValidateNotNullOrEmpty()]
+        [string] $Seed
+    )
+
+    if ($JitterRatio -le 0.0) {
+        return 0.0
+    }
+
+    $bytes = [System.Text.Encoding]::UTF8.GetBytes($Seed)
+
+    # SHA256.Create()/ComputeHash is used instead of the static SHA256.HashData, which
+    # only exists on .NET 5+ and therefore fails on Windows PowerShell 5.1.
+    $sha256 = [System.Security.Cryptography.SHA256]::Create()
+    try {
+        $hash = $sha256.ComputeHash($bytes)
+    }
+    finally {
+        $sha256.Dispose()
+    }
+
+    # Map the first 8 hash bytes to [0, 1], then rescale to [-JitterRatio, +JitterRatio].
+    $u64 = [System.BitConverter]::ToUInt64($hash, 0)
+    $unit = $u64 / [double][UInt64]::MaxValue
+
+    return (($unit * 2.0) - 1.0) * $JitterRatio
+}
+
+function Invoke-IdleWithRetry {
+    <#
+    .SYNOPSIS
+    Runs a script block and retries it with exponential backoff while it fails with a transient error.
+
+    .DESCRIPTION
+    Only exceptions accepted by Test-IdleTransientError are retried; any other error, and the
+    error of the last allowed attempt, are rethrown. The wait after a failed attempt is
+    InitialDelayMilliseconds * BackoffFactor^(attempt - 1), capped at MaxDelayMilliseconds and
+    then scaled by (1 + jitter), so the final wait can exceed the cap by up to JitterRatio.
+
+    .PARAMETER Operation
+    The script block to invoke.
+
+    .PARAMETER EventSink
+    Optional object; when it exposes a WriteEvent method, a 'StepRetrying' event is written
+    before each wait. Failures while writing the event never abort the retry loop.
+
+    .PARAMETER DeterministicSeed
+    Jitter seed prefix. When blank, the seed is built from OperationName and StepName instead.
+
+    .OUTPUTS
+    IdLE.RetryResult with Value (the operation output) and Attempts (attempts used).
+    #>
+    [CmdletBinding()]
+    param(
+        [Parameter(Mandatory)]
+        [ValidateNotNull()]
+        [scriptblock] $Operation,
+
+        [Parameter()]
+        [ValidateRange(1, 50)]
+        [int] $MaxAttempts = 3,
+
+        [Parameter()]
+        [ValidateRange(0, 600000)]
+        [int] $InitialDelayMilliseconds = 250,
+
+        [Parameter()]
+        [ValidateRange(1.0, 100.0)]
+        [double] $BackoffFactor = 2.0,
+
+        [Parameter()]
+        [ValidateRange(0, 600000)]
+        [int] $MaxDelayMilliseconds = 5000,
+
+        [Parameter()]
+        [ValidateRange(0.0, 1.0)]
+        [double] $JitterRatio = 0.2,
+
+        [Parameter()]
+        [AllowNull()]
+        [object] $EventSink,
+
+        [Parameter()]
+        [AllowEmptyString()]
+        [string] $StepName = '',
+
+        [Parameter()]
+        [AllowEmptyString()]
+        [string] $OperationName = 'Operation',
+
+        [Parameter()]
+        [AllowEmptyString()]
+        [string] $DeterministicSeed = ''
+    )
+
+    $attempt = 0
+
+    while ($attempt -lt $MaxAttempts) {
+        $attempt++
+
+        try {
+            $value = & $Operation
+            return [pscustomobject]@{
+                PSTypeName = 'IdLE.RetryResult'
+                Value = $value
+                Attempts = $attempt
+            }
+        }
+        catch {
+            $exception = $_.Exception
+
+            if (-not (Test-IdleTransientError -Exception $exception)) {
+                # Fail fast for non-transient errors.
+                throw
+            }
+
+            if ($attempt -ge $MaxAttempts) {
+                # Attempts exhausted: rethrow the last transient error.
+                throw
+            }
+
+            $baseDelay = [math]::Min(
+                $MaxDelayMilliseconds,
+                [math]::Round($InitialDelayMilliseconds * [math]::Pow($BackoffFactor, ($attempt - 1)))
+            )
+
+            $seed = if ([string]::IsNullOrWhiteSpace($DeterministicSeed)) {
+                "$OperationName|$StepName|$attempt"
+            } else {
+                "$DeterministicSeed|$attempt"
+            }
+
+            $jitterFactor = Get-IdleDeterministicJitter -JitterRatio $JitterRatio -Seed $seed
+            $delay = [math]::Round($baseDelay * (1.0 + $jitterFactor))
+            if ($delay -lt 0) { $delay = 0 }
+
+            if ($null -ne $EventSink -and $EventSink.PSObject.Methods.Name -contains 'WriteEvent') {
+                try {
+                    $EventSink.WriteEvent(
+                        'StepRetrying',
+                        "Transient failure in '$OperationName' (attempt $attempt/$MaxAttempts). Retrying.",
+                        $StepName,
+                        @{
+                            attempt = $attempt
+                            maxAttempts = $MaxAttempts
+                            delayMs = $delay
+                            errorType = $exception.GetType().FullName
+                            message = $exception.Message
+                        }
+                    )
+                }
+                catch {
+                    # Eventing is best-effort: a broken sink must never abort the retry loop.
+                    Write-Verbose "Could not write 'StepRetrying' event: $($_.Exception.Message)"
+                }
+            }
+
+            if ($delay -gt 0) {
+                Start-Sleep -Milliseconds $delay
+            }
+
+            continue
+        }
+    }
+}
diff --git a/src/IdLE.Core/Public/Invoke-IdlePlanObject.ps1 b/src/IdLE.Core/Public/Invoke-IdlePlanObject.ps1
index 10083add..3cec0408 100644
--- a/src/IdLE.Core/Public/Invoke-IdlePlanObject.ps1
+++ b/src/IdLE.Core/Public/Invoke-IdlePlanObject.ps1
@@ -58,13 +58,25 @@ function Invoke-IdlePlanObject {
     $stepRegistry = Get-IdleStepRegistry -Providers $Providers
 
     $context = [pscustomobject]@{
-        PSTypeName = 'IdLE.ExecutionContext'
-        Plan = $Plan
-        Providers = $Providers
+        PSTypeName = 'IdLE.ExecutionContext'
+        Plan = $Plan
+        Providers = $Providers
 
         # Object-based, stable eventing contract.
         # Steps and the engine call: $Context.EventSink.WriteEvent(...)
-        EventSink = $engineEventSink
+        EventSink = $engineEventSink
+    }
+
+    # Execution retry policy (safe-by-default):
+    # - Only retry errors explicitly marked transient by trusted code paths (Exception.Data['Idle.IsTransient'] = $true).
+    # - Fail fast for all other errors.
+    # NOTE: This is currently engine-owned and not configurable via plan/workflow to keep the surface small in this increment.
+    $retryPolicy = @{
+        MaxAttempts = 3
+        InitialDelayMilliseconds = 250
+        BackoffFactor = 2.0
+        MaxDelayMilliseconds = 5000
+        JitterRatio = 0.2
     }
 
     # Emit run start event.
@@ -108,6 +120,7 @@ function Invoke-IdlePlanObject {
                     Type = $stepType
                     Status = 'NotApplicable'
                     Error = $null
+                    Attempts = 0
                 }
 
                 $context.EventSink.WriteEvent('StepNotApplicable', "Step '$stepName' not applicable (condition not met).", $stepName, @{
@@ -140,8 +153,25 @@ function Invoke-IdlePlanObject {
                 throw [System.ArgumentException]::new("Step handler for type '$stepType' is not a valid function name.", 'Providers')
             }
 
-            # Execute the step via handler.
-            $result = & $handlerName -Context $context -Step $step
+            # Execute the step via handler using safe retries for transient failures.
+            # Retries are only performed if trusted code marks the exception as transient.
+            $operationName = "Step '$stepName' ($stepType)"
+            $retrySeed = "Plan:$corr|Step:$stepName|Type:$stepType"
+
+            $retryResult = Invoke-IdleWithRetry -Operation {
+                & $handlerName -Context $context -Step $step
+            } -MaxAttempts $retryPolicy.MaxAttempts `
+                -InitialDelayMilliseconds $retryPolicy.InitialDelayMilliseconds `
+                -BackoffFactor $retryPolicy.BackoffFactor `
+                -MaxDelayMilliseconds $retryPolicy.MaxDelayMilliseconds `
+                -JitterRatio $retryPolicy.JitterRatio `
+                -EventSink $context.EventSink `
+                -StepName $stepName `
+                -OperationName $operationName `
+                -DeterministicSeed $retrySeed
+
+            $result = $retryResult.Value
+            $attempts = [int]$retryResult.Attempts
 
             # Normalize result shape (minimal contract).
             $stepResults += [pscustomobject]@{
@@ -150,23 +180,28 @@ function Invoke-IdlePlanObject {
                 Type = $stepType
                 Status = if ($null -ne $result -and $result.PSObject.Properties.Name -contains 'Status') { [string]$result.Status } else { 'Completed' }
                 Error = if ($null -ne $result -and $result.PSObject.Properties.Name -contains 'Error') { $result.Error } else { $null }
+                Attempts = $attempts
             }
 
             $context.EventSink.WriteEvent('StepCompleted', "Step '$stepName' completed.", $stepName, @{
-                StepType = $stepType
-                Index = $i
+                StepType = $stepType
+                Index = $i
+                Attempts = $attempts
             })
         }
         catch {
             $failed = $true
             $err = $_
 
+            # We cannot reliably know the number of attempts on failure without wrapping errors.
+            # For this increment, we keep the output stable and report a minimum of 1 attempt.
             $stepResults += [pscustomobject]@{
                 PSTypeName = 'IdLE.StepResult'
                 Name = $stepName
                 Type = $stepType
                 Status = 'Failed'
                 Error = $err.Exception.Message
+                Attempts = 1
             }
 
             $context.EventSink.WriteEvent('StepFailed', "Step '$stepName' failed.", $stepName, @{
diff --git a/tests/Invoke-IdlePlan.Retry.Tests.ps1 b/tests/Invoke-IdlePlan.Retry.Tests.ps1
new file mode 100644
index 00000000..46a92f2f
--- /dev/null
+++ b/tests/Invoke-IdlePlan.Retry.Tests.ps1
@@ -0,0 +1,159 @@
+BeforeDiscovery {
+    . (Join-Path $PSScriptRoot '_testHelpers.ps1')
+    Import-IdleTestModule
+}
+
+BeforeAll {
+    # Create a dedicated, ephemeral test module that exports the step handlers.
+    # This avoids global scope pollution while ensuring the engine can resolve
+    # handler names deterministically via module-qualified command names.
+    $script:RetryTestModuleName = 'IdLE.RetryTest'
+    $script:RetryTestModule = New-Module -Name $script:RetryTestModuleName -ScriptBlock {
+        Set-StrictMode -Version Latest
+
+        $script:TransientCallCount = 0
+
+        function Invoke-IdleRetryTestTransientStep {
+            [CmdletBinding()]
+            param(
+                [Parameter(Mandatory)]
+                [ValidateNotNull()]
+                [object] $Context,
+
+                [Parameter(Mandatory)]
+                [ValidateNotNull()]
+                [object] $Step
+            )
+
+            $script:TransientCallCount++
+
+            if ($script:TransientCallCount -eq 1) {
+                $ex = [System.Exception]::new('Transient failure (simulated)')
+                $ex.Data['Idle.IsTransient'] = $true
+                throw $ex
+            }
+
+            return [pscustomobject]@{
+                PSTypeName = 'IdLE.StepResult'
+                Name = [string]$Step.Name
+                Type = [string]$Step.Type
+                Status = 'Completed'
+                Error = $null
+            }
+        }
+
+        function Invoke-IdleRetryTestNonTransientStep {
+            [CmdletBinding()]
+            param(
+                [Parameter(Mandatory)]
+                [ValidateNotNull()]
+                [object] $Context,
+
+                [Parameter(Mandatory)]
+                [ValidateNotNull()]
+                [object] $Step
+            )
+
+            throw [System.Exception]::new('Non-transient failure (simulated)')
+        }
+
+        function Reset-IdleRetryTestState {
+            [CmdletBinding()]
+            param()
+
+            $script:TransientCallCount = 0
+        }
+
+        function Get-IdleRetryTestTransientCallCount {
+            [CmdletBinding()]
+            param()
+
+            return [int]$script:TransientCallCount
+        }
+
+        Export-ModuleMember -Function @(
+            'Invoke-IdleRetryTestTransientStep',
+            'Invoke-IdleRetryTestNonTransientStep',
+            'Reset-IdleRetryTestState',
+            'Get-IdleRetryTestTransientCallCount'
+        )
+    }
+
+    Import-Module -ModuleInfo $script:RetryTestModule -Force -ErrorAction Stop
+}
+
+AfterAll {
+    # Remove the ephemeral module.
+    Remove-Module -Name $script:RetryTestModuleName -Force -ErrorAction SilentlyContinue
+}
+
+Describe 'Invoke-IdlePlan - safe retries for transient failures (fail-fast)' {
+
+    BeforeEach {
+        & "$script:RetryTestModuleName\Reset-IdleRetryTestState"
+    }
+
+    It 'retries a step when the error is explicitly marked transient and then succeeds' {
+        Mock -ModuleName IdLE.Core -CommandName Start-Sleep -MockWith { }
+
+        $wfPath = Join-Path -Path $TestDrive -ChildPath 'retry-transient.psd1'
+        Set-Content -Path $wfPath -Encoding UTF8 -Value @'
+@{
+    Name = 'Retry Transient Demo'
+    LifecycleEvent = 'Joiner'
+    Steps = @(
+        @{ Name = 'TransientStep'; Type = 'IdLE.Step.Transient' }
+    )
+}
+'@
+
+        $req = New-IdleLifecycleRequest -LifecycleEvent 'Joiner'
+        $plan = New-IdlePlan -WorkflowPath $wfPath -Request $req
+
+        $providers = @{
+            StepRegistry = @{
+                'IdLE.Step.Transient' = "$script:RetryTestModuleName\Invoke-IdleRetryTestTransientStep"
+            }
+        }
+
+        $result = Invoke-IdlePlan -Plan $plan -Providers $providers
+
+        $result.Status | Should -Be 'Completed'
+        $result.Steps[0].Status | Should -Be 'Completed'
+
+        @($result.Events | Where-Object Type -eq 'StepRetrying').Count | Should -Be 1
+        Should -Invoke -ModuleName IdLE.Core -CommandName Start-Sleep -Times 1 -Exactly
+
+        (& "$script:RetryTestModuleName\Get-IdleRetryTestTransientCallCount") | Should -Be 2
+    }
+
+    It 'fails fast and does not retry when the error is not marked transient' {
+        Mock -ModuleName IdLE.Core -CommandName Start-Sleep -MockWith { }
+
+        $wfPath = Join-Path -Path $TestDrive -ChildPath 'retry-nontransient.psd1'
+        Set-Content -Path $wfPath -Encoding UTF8 -Value @'
+@{
+    Name = 'Retry Non-Transient Demo'
+    LifecycleEvent = 'Joiner'
+    Steps = @(
+        @{ Name = 'NonTransientStep'; Type = 'IdLE.Step.NonTransient' }
+    )
+}
+'@
+
+        $req = New-IdleLifecycleRequest -LifecycleEvent 'Joiner'
+        $plan = New-IdlePlan -WorkflowPath $wfPath -Request $req
+
+        $providers = @{
+            StepRegistry = @{
+                'IdLE.Step.NonTransient' = "$script:RetryTestModuleName\Invoke-IdleRetryTestNonTransientStep"
+            }
+        }
+
+        $result = Invoke-IdlePlan -Plan $plan -Providers $providers
+
+        $result.Status | Should -Be 'Failed'
+        @($result.Events | Where-Object Type -eq 'StepRetrying').Count | Should -Be 0
+        Should -Invoke -ModuleName IdLE.Core -CommandName Start-Sleep -Times 0
+    }
+}