Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b76c117
[WIP] Add support for machine preservation through annotations
thiyyakat Oct 29, 2025
b814d93
Add MachinePreserveTimeout to SafetyOptions.
thiyyakat Nov 5, 2025
f958786
Add PreserveExpiryTime to `machine.Status.CurrentStatus`.
thiyyakat Nov 5, 2025
86502b3
Remove `AutoPreserveFailedMachineCount` from machine set
thiyyakat Nov 5, 2025
b92cfb2
Fix linting error
thiyyakat Nov 5, 2025
6dce992
Add generated files
thiyyakat Nov 5, 2025
1469138
Add support for preserve=now on node and machine objects
thiyyakat Nov 5, 2025
2bbb7e7
Update TODOs
thiyyakat Nov 5, 2025
59edd3b
[WIP] Implement add/remove/update of node and machine annotations
thiyyakat Nov 10, 2025
7792fa3
Update preserve logic to honour node annotations over machine
thiyyakat Nov 13, 2025
5796f51
Add preservation logic in machineset controller. TODO: remove debug logs
thiyyakat Nov 19, 2025
1636ace
Add drain logic post preservation of failed machine
thiyyakat Nov 19, 2025
ee1afa2
Fix return for reconcileMachineHealth. Unit tests passing
thiyyakat Nov 19, 2025
bb515d9
Update CRDs
thiyyakat Nov 19, 2025
f462f8f
Fix bug causing repeated requeuing
thiyyakat Nov 24, 2025
da2427c
Fix drain logic in machine preservation for Unknown->Failed case:
thiyyakat Nov 26, 2025
0cdfc4f
Fix toggle between now and when-failed when machine has not failed.
thiyyakat Nov 27, 2025
dd0c04d
Refactor changes to support auto-preservation of failed machines
thiyyakat Dec 4, 2025
83fef68
Fix bugs that prevented MCS update, and auto-preservation of machines
thiyyakat Dec 5, 2025
377332a
Add support for uncordoning preserved node that is healthy
thiyyakat Dec 8, 2025
d1766de
Refactor code:
thiyyakat Dec 10, 2025
15a3ea2
Fix bug so that recovered preserved nodes are uncordoned
thiyyakat Dec 10, 2025
9e4ee5c
Minor changes
thiyyakat Dec 10, 2025
1a52463
Change verb used in log statements for machine/node name
thiyyakat Dec 10, 2025
e66e2b7
Fix mistake made during rebasing
thiyyakat Dec 10, 2025
ead3648
Change return types of preservation util functions such that only cal…
thiyyakat Dec 11, 2025
9e44aee
Address review comments
thiyyakat Dec 12, 2025
8225d20
Remove incorrect json tag and regenerate CRDs.
thiyyakat Dec 18, 2025
9eb910d
Apply suggestions from code review - part 1
thiyyakat Dec 19, 2025
5e7e2d7
Delete invalid gitlink
thiyyakat Dec 19, 2025
d560664
Address review comments- part 2:
thiyyakat Dec 22, 2025
b60af94
Address review comments- part 3:
thiyyakat Dec 23, 2025
1a97522
Address review comments- part 4:
thiyyakat Dec 23, 2025
8da62a7
Add unit tests for preservation logic in machine.go
thiyyakat Dec 24, 2025
718b451
Refactor tests to reduce redundancy in code.
thiyyakat Dec 26, 2025
8145d8b
Add tests for preservation logic in machine_util.go
thiyyakat Dec 29, 2025
74603a4
Refactor test code to reduce redundant code
thiyyakat Dec 31, 2025
43cf3a1
Fix bugs after merging
thiyyakat Dec 31, 2025
1fedc9f
Remove testing code
thiyyakat Jan 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 105 additions & 4 deletions docs/documents/apis.md
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,21 @@ not be estimated during the time a MachineDeployment is paused. This is not set
by default, which is treated as infinite deadline.</p>
</td>
</tr>
<tr>
<td>
<code>autoPreserveFailedMachineMax</code>
</td>
<td>
<em>
int32
</em>
</td>
<td>
<em>(Optional)</em>
<p>The maximum number of machines in the machine deployment that will be auto-preserved.
In the gardener context, this number is derived from the AutoPreserveFailedMachineMax set at the worker level, distributed amongst the worker&rsquo;s machine deployments</p>
</td>
</tr>
</table>
</td>
</tr>
Expand Down Expand Up @@ -678,6 +693,19 @@ int32
<em>(Optional)</em>
</td>
</tr>
<tr>
<td>
<code>autoPreserveFailedMachineMax</code>
</td>
<td>
<em>
int32
</em>
</td>
<td>
<em>(Optional)</em>
</td>
</tr>
</table>
</td>
</tr>
Expand Down Expand Up @@ -833,6 +861,21 @@ Kubernetes meta/v1.Time
<p>Last update time of current status</p>
</td>
</tr>
<tr>
<td>
<code>preserveExpiryTime</code>
</td>
<td>
<em>
<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.29/#time-v1-meta">
Kubernetes meta/v1.Time
</a>
</em>
</td>
<td>
<p>PreserveExpiryTime is the time at which MCM will stop preserving the machine</p>
</td>
</tr>
</tbody>
</table>
<br>
Expand Down Expand Up @@ -1071,6 +1114,22 @@ Kubernetes meta/v1.Duration
</tr>
<tr>
<td>
<code>machinePreserveTimeout</code>
</td>
<td>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Kubernetes meta/v1.Duration
</a>
</em>
</td>
<td>
<em>(Optional)</em>
<p>MachinePreserveTimeout is the timeout after which the machine preservation is stopped</p>
</td>
</tr>
<tr>
<td>
<code>disableHealthTimeout</code>
</td>
<td>
Expand Down Expand Up @@ -1398,6 +1457,21 @@ not be estimated during the time a MachineDeployment is paused. This is not set
by default, which is treated as infinite deadline.</p>
</td>
</tr>
<tr>
<td>
<code>autoPreserveFailedMachineMax</code>
</td>
<td>
<em>
int32
</em>
</td>
<td>
<em>(Optional)</em>
<p>The maximum number of machines in the machine deployment that will be auto-preserved.
In the gardener context, this number is derived from the AutoPreserveFailedMachineMax set at the worker level, distributed amongst the worker&rsquo;s machine deployments</p>
</td>
</tr>
</tbody>
</table>
<br>
Expand Down Expand Up @@ -1543,8 +1617,8 @@ newest MachineSet.</p>
</td>
<td>
<em>
<a href="#machine.sapcloud.io/v1alpha1.*../../pkg/apis/machine/v1alpha1.MachineSummary">
[]*../../pkg/apis/machine/v1alpha1.MachineSummary
<a href="#machine.sapcloud.io/v1alpha1.*github.com/thiyyakat/machine-controller-manager/pkg/apis/machine/v1alpha1.MachineSummary">
[]*github.com/thiyyakat/machine-controller-manager/pkg/apis/machine/v1alpha1.MachineSummary
</a>
</em>
</td>
Expand Down Expand Up @@ -1860,6 +1934,19 @@ int32
<em>(Optional)</em>
</td>
</tr>
<tr>
<td>
<code>autoPreserveFailedMachineMax</code>
</td>
<td>
<em>
int32
</em>
</td>
<td>
<em>(Optional)</em>
</td>
</tr>
</tbody>
</table>
<br>
Expand Down Expand Up @@ -1988,8 +2075,8 @@ LastOperation
</td>
<td>
<em>
<a href="#machine.sapcloud.io/v1alpha1.[]../../pkg/apis/machine/v1alpha1.MachineSummary">
[]../../pkg/apis/machine/v1alpha1.MachineSummary
<a href="#machine.sapcloud.io/v1alpha1.[]github.com/thiyyakat/machine-controller-manager/pkg/apis/machine/v1alpha1.MachineSummary">
[]github.com/thiyyakat/machine-controller-manager/pkg/apis/machine/v1alpha1.MachineSummary
</a>
</em>
</td>
Expand All @@ -1998,6 +2085,20 @@ LastOperation
<p>FailedMachines has summary of machines on which lastOperation Failed</p>
</td>
</tr>
<tr>
<td>
<code>autoPreserveFailedMachineCount</code>
</td>
<td>
<em>
int32
</em>
</td>
<td>
<em>(Optional)</em>
<p>AutoPreserveFailedMachineCount has a count of the number of failed machines in the machineset that have been auto-preserved</p>
</td>
</tr>
</tbody>
</table>
<br>
Expand Down
10 changes: 10 additions & 0 deletions kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ spec:
spec:
description: Specification of the desired behavior of the MachineDeployment.
properties:
autoPreserveFailedMachineMax:
description: |-
The maximum number of machines in the machine deployment that will be auto-preserved.
In the gardener context, this number is derived from the AutoPreserveFailedMachineMax set at the worker level, distributed amongst the worker's machine deployments
format: int32
type: integer
minReadySeconds:
description: |-
Minimum number of seconds for which a newly created machine should be ready
Expand Down Expand Up @@ -296,6 +302,10 @@ spec:
description: MachineInPlaceUpdateTimeout is the timeout after
which in-place update is declared failed.
type: string
machinePreserveTimeout:
description: MachinePreserveTimeout is the timeout after which
the machine preservation is stopped
type: string
maxEvictRetries:
description: MaxEvictRetries is the number of retries that
will be attempted while draining the node.
Expand Down
9 changes: 9 additions & 0 deletions kubernetes/crds/machine.sapcloud.io_machines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ spec:
description: MachineInPlaceUpdateTimeout is the timeout after which
in-place update is declared failed.
type: string
machinePreserveTimeout:
description: MachinePreserveTimeout is the timeout after which the
machine preservation is stopped
type: string
maxEvictRetries:
description: MaxEvictRetries is the number of retries that will be
attempted while draining the node.
Expand Down Expand Up @@ -287,6 +291,11 @@ spec:
description: MachinePhase is a label for the condition of a machine
at the current time.
type: string
preserveExpiryTime:
description: PreserveExpiryTime is the time at which MCM will
stop preserving the machine
format: date-time
type: string
timeoutActive:
type: boolean
type: object
Expand Down
12 changes: 12 additions & 0 deletions kubernetes/crds/machine.sapcloud.io_machinesets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ spec:
spec:
description: MachineSetSpec is the specification of a MachineSet.
properties:
autoPreserveFailedMachineMax:
format: int32
type: integer
machineClass:
description: ClassSpec is the class specification of machine
properties:
Expand Down Expand Up @@ -178,6 +181,10 @@ spec:
description: MachineInPlaceUpdateTimeout is the timeout after
which in-place update is declared failed.
type: string
machinePreserveTimeout:
description: MachinePreserveTimeout is the timeout after which
the machine preservation is stopped
type: string
maxEvictRetries:
description: MaxEvictRetries is the number of retries that
will be attempted while draining the node.
Expand Down Expand Up @@ -312,6 +319,11 @@ spec:
description: MachineSetStatus holds the most recently observed status
of MachineSet.
properties:
autoPreserveFailedMachineCount:
description: AutoPreserveFailedMachineCount has a count of the number
of failed machines in the machineset that have been auto-preserved
format: int32
type: integer
availableReplicas:
description: The number of available replicas (ready for at least
minReadySeconds) for this replica set.
Expand Down
14 changes: 14 additions & 0 deletions pkg/apis/machine/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ type MachineConfiguration struct {
// MachineInPlaceUpdateTimeout is the timeout after which in-place update is declared failed.
MachineInPlaceUpdateTimeout *metav1.Duration

// MachinePreserveTimeout is the timeout after which the machine preservation is stopped
MachinePreserveTimeout *metav1.Duration
// DisableHealthTimeout if set to true, health timeout will be ignored. Leading to machine never being declared failed.
// This is intended to be used only for in-place updates.
DisableHealthTimeout *bool
Expand Down Expand Up @@ -158,6 +160,9 @@ type CurrentStatus struct {

// Last update time of current status
LastUpdateTime metav1.Time

// PreserveExpiryTime is the time at which MCM will stop preserving the machine
PreserveExpiryTime metav1.Time
}

// MachineStatus holds the most recently observed status of Machine.
Expand Down Expand Up @@ -351,6 +356,8 @@ type MachineSetSpec struct {
Template MachineTemplateSpec

MinReadySeconds int32

AutoPreserveFailedMachineMax int32
}

// MachineSetConditionType is the condition on machineset object
Expand Down Expand Up @@ -409,6 +416,9 @@ type MachineSetStatus struct {

// FailedMachines has summary of machines on which lastOperation Failed
FailedMachines *[]MachineSummary

// AutoPreserveFailedMachineCount has a count of the number of failed machines in the machineset that have been auto-preserved
AutoPreserveFailedMachineCount int32
}

// MachineSummary store the summary of machine.
Expand Down Expand Up @@ -487,6 +497,10 @@ type MachineDeploymentSpec struct {
// not be estimated during the time a MachineDeployment is paused. This is not set
// by default.
ProgressDeadlineSeconds *int32

// The maximum number of machines in the machine deployment that will be auto-preserved.
// In the gardener context, this number is derived from the AutoPreserveFailedMachineMax set at the worker level, distributed amongst the worker's machine deployments
AutoPreserveFailedMachineMax int32
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
23 changes: 23 additions & 0 deletions pkg/apis/machine/v1alpha1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,26 @@ const (
UpdateFailed string = "UpdateFailed"
)

// Constants used for the node condition that reports machine/node preservation:
// the condition type, the reasons it can carry, and the drain-result messages.
const (
// NodePreserved is the node condition type that lets an end-user know that a node is preserved.
NodePreserved corev1.NodeConditionType = "NodePreserved"

// NodePreservedByMCM is the node condition reason indicating that the node was auto-preserved by MCM.
NodePreservedByMCM string = "PreservedByMCM"

// NodePreservedByUser is the node condition reason indicating that the machine/node was preserved
// because of an explicit annotation set by the user.
NodePreservedByUser string = "PreservedByUser"

// NodePreservationStopped is the node condition reason indicating that preservation of the machine/node
// has been stopped, either due to an annotation update or due to a timeout.
NodePreservationStopped string = "PreservationStopped"

// PreservedNodeDrainSuccessful is the condition message indicating that the preserved node was drained successfully.
PreservedNodeDrainSuccessful string = "Preserved Node drained successfully"

// PreservedNodeDrainUnsuccessful is the condition message indicating that the preserved node could not be drained.
PreservedNodeDrainUnsuccessful string = "Preserved Node could not be drained"
)

// CurrentStatus contains information about the current status of Machine.
type CurrentStatus struct {
Phase MachinePhase `json:"phase,omitempty"`
Expand All @@ -252,6 +272,9 @@ type CurrentStatus struct {

// Last update time of current status
LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"`

// PreserveExpiryTime is the time at which MCM will stop preserving the machine
PreserveExpiryTime metav1.Time `json:"preserveExpiryTime,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/machine/v1alpha1/machinedeployment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ type MachineDeploymentSpec struct {
// by default, which is treated as infinite deadline.
// +optional
ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"`

// The maximum number of machines in the machine deployment that will be auto-preserved.
// In the gardener context, this number is derived from the AutoPreserveFailedMachineMax set at the worker level, distributed amongst the worker's machine deployments
// +optional
AutoPreserveFailedMachineMax int32 `json:"autoPreserveFailedMachineMax,omitempty"`
}

const (
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/machine/v1alpha1/machineset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ type MachineSetSpec struct {

// +optional
MinReadySeconds int32 `json:"minReadySeconds,omitempty"`

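// AutoPreserveFailedMachineMax is the maximum number of machines in the machine set that will be auto-preserved.
// +optional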
AutoPreserveFailedMachineMax int32 `json:"autoPreserveFailedMachineMax,omitempty"`
}

// MachineSetConditionType is the condition on machineset object
Expand Down Expand Up @@ -135,4 +138,8 @@ type MachineSetStatus struct {
// FailedMachines has summary of machines on which lastOperation Failed
// +optional
FailedMachines *[]MachineSummary `json:"failedMachines,omitempty"`

// AutoPreserveFailedMachineCount has a count of the number of failed machines in the machineset that have been auto-preserved
// +optional
AutoPreserveFailedMachineCount int32 `json:"autoPreserveFailedMachineCount,omitempty"`
}
4 changes: 4 additions & 0 deletions pkg/apis/machine/v1alpha1/shared_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ type MachineConfiguration struct {
// +optional
MachineInPlaceUpdateTimeout *metav1.Duration `json:"inPlaceUpdateTimeout,omitempty"`

// MachinePreserveTimeout is the timeout after which the machine preservation is stopped
// +optional
MachinePreserveTimeout *metav1.Duration `json:"machinePreserveTimeout,omitempty"`

// DisableHealthTimeout if set to true, health timeout will be ignored. Leading to machine never being declared failed.
// This is intended to be used only for in-place updates.
// +optional
Expand Down
Loading
Loading