Skip to content

Commit eee87ae

Browse files
author
weiliu2
committed
Add private AKS cluster support with VPN setup
Implements private cluster connectivity for edge nodes via VPN through a Gateway VM in Azure. Integrates as bootstrapper steps that are no-ops for non-private clusters. Squashes the original 13 commits into one so the change can be rebased onto main.
1 parent 91e44f8 commit eee87ae

20 files changed

Lines changed: 2389 additions & 19 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*.so
66
*.dylib
77
aks-flex-node
8+
AKSFlexNode
89

910
# Test binary, built with `go test -c`
1011
*.test
@@ -42,6 +43,7 @@ Thumbs.db
4243

4344
# Config files with sensitive data (keep sample config)
4445
config.json
46+
Standard_D8pds_v6_sku.json
4547

4648
# Environment files with secrets
4749
.env

commands.go

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ var (
2525
BuildTime = "unknown"
2626
)
2727

28+
// Unbootstrap command flags
29+
var cleanupMode string
30+
2831
// NewAgentCommand creates a new agent command
2932
func NewAgentCommand() *cobra.Command {
3033
cmd := &cobra.Command{
@@ -44,12 +47,19 @@ func NewUnbootstrapCommand() *cobra.Command {
4447
cmd := &cobra.Command{
4548
Use: "unbootstrap",
4649
Short: "Remove AKS node configuration and Arc connection",
47-
Long: "Clean up and remove all AKS node components and Arc registration from this machine",
50+
Long: `Clean up and remove all AKS node components and Arc registration from this machine.
51+
52+
For private clusters (config has private: true), this also handles VPN cleanup:
53+
--cleanup-mode=local Remove node and local VPN config, keep Gateway (default)
54+
--cleanup-mode=full Remove everything including Gateway VM and Azure resources`,
4855
RunE: func(cmd *cobra.Command, args []string) error {
4956
return runUnbootstrap(cmd.Context())
5057
},
5158
}
5259

60+
cmd.Flags().StringVar(&cleanupMode, "cleanup-mode", "local",
61+
"[private cluster only] Cleanup mode: 'local' (keep Gateway) or 'full' (remove all Azure resources)")
62+
5363
return cmd
5464
}
5565

@@ -76,7 +86,7 @@ func runAgent(ctx context.Context) error {
7686
return fmt.Errorf("failed to load config from %s: %w", configPath, err)
7787
}
7888

79-
bootstrapExecutor := bootstrapper.New(cfg, logger)
89+
bootstrapExecutor := bootstrapper.New(logger)
8090
result, err := bootstrapExecutor.Bootstrap(ctx)
8191
if err != nil {
8292
return err
@@ -87,6 +97,13 @@ func runAgent(ctx context.Context) error {
8797
return err
8898
}
8999

100+
// Print visible success message
101+
fmt.Println()
102+
fmt.Println("========================================")
103+
fmt.Println(" Join process finished successfully!")
104+
fmt.Println("========================================")
105+
fmt.Println()
106+
90107
// After successful bootstrap, transition to daemon mode
91108
logger.Info("Bootstrap completed successfully, transitioning to daemon mode...")
92109
return runDaemonLoop(ctx, cfg)
@@ -101,14 +118,27 @@ func runUnbootstrap(ctx context.Context) error {
101118
return fmt.Errorf("failed to load config from %s: %w", configPath, err)
102119
}
103120

104-
bootstrapExecutor := bootstrapper.New(cfg, logger)
121+
// Pass cleanup mode to config so the PrivateClusterUninstall step can read it
122+
if cfg.Azure.TargetCluster != nil {
123+
cfg.Azure.TargetCluster.CleanupMode = cleanupMode
124+
}
125+
126+
bootstrapExecutor := bootstrapper.New(logger)
105127
result, err := bootstrapExecutor.Unbootstrap(ctx)
106128
if err != nil {
107129
return err
108130
}
109131

110132
// Handle and log the result (unbootstrap is more lenient with failures)
111-
return handleExecutionResult(result, "unbootstrap", logger)
133+
if err := handleExecutionResult(result, "unbootstrap", logger); err != nil {
134+
return err
135+
}
136+
137+
// Print final success message
138+
fmt.Println()
139+
fmt.Println("\033[0;32mSUCCESS:\033[0m Unbootstrap completed successfully!")
140+
141+
return nil
112142
}
113143

114144
// runVersion displays version information
@@ -214,7 +244,7 @@ func checkAndBootstrap(ctx context.Context, cfg *config.Config) error {
214244
logger.Info("Node requires re-bootstrapping, initiating auto-bootstrap...")
215245

216246
// Perform bootstrap
217-
bootstrapExecutor := bootstrapper.New(cfg, logger)
247+
bootstrapExecutor := bootstrapper.New(logger)
218248
result, err := bootstrapExecutor.Bootstrap(ctx)
219249
if err != nil {
220250
// Bootstrap failed - remove status file so next check will detect the problem

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ require (
66
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0
77
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1
88
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2
9+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0
910
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0
1011
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0
12+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0
13+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0
1114
github.com/Azure/go-autorest/autorest/to v0.4.1
1215
github.com/google/renameio/v2 v2.0.2
1316
github.com/google/uuid v1.6.0

go.sum

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,22 @@ github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDo
88
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
99
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2 h1:qiir/pptnHqp6hV8QwV+IExYIf6cPsXBfUDUXQ27t2Y=
1010
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2/go.mod h1:jVRrRDLCOuif95HDYC23ADTMlvahB7tMdl519m9Iyjc=
11+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0 h1:z7Mqz6l0EFH549GvHEqfjKvi+cRScxLWbaoeLm9wxVQ=
12+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0/go.mod h1:v6gbfH+7DG7xH2kUNs+ZJ9tF6O3iNnR85wMtmr+F54o=
1113
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0 h1:5n7dPVqsWfVKw+ZiEKSd3Kzu7gwBkbEBkeXb8rgaE9Q=
1214
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0/go.mod h1:HcZY0PHPo/7d75p99lB6lK0qYOP4vLRJUBpiehYXtLQ=
1315
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0 h1:7UuAn4ljE+H3GQ7qts3c7oAaMRvge68EgyckoNP/1Ro=
1416
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0/go.mod h1:F2eDq/BGK2LOEoDtoHbBOphaPqcjT0K/Y5Am8vf7+0w=
1517
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFGRSlMKCQelWwxUyYVEUqseBJVemLyqWJjvMyt0do=
1618
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0/go.mod h1:LRr2FzBTQlONPPa5HREE5+RjSCTXl7BwOvYOaWTqCaI=
17-
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1 h1:7CBQ+Ei8SP2c6ydQTGCCrS35bDxgTMfoP2miAwK++OU=
18-
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1/go.mod h1:c/wcGeGx5FUPbM/JltUYHZcKmigwyVLJlDq+4HdtXaw=
19+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0 h1:2qsIIvxVT+uE6yrNldntJKlLRgxGbZ85kgtz5SNBhMw=
20+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0/go.mod h1:AW8VEadnhw9xox+VaVd9sP7NjzOAnaZBLRH6Tq3cJ38=
21+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 h1:HYGD75g0bQ3VO/Omedm54v4LrD3B1cGImuRF3AJ5wLo=
22+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0/go.mod h1:ulHyBFJOI0ONiRL4vcJTmS7rx18jQQlEPmAgo80cRdM=
23+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0 h1:Dd+RhdJn0OTtVGaeDLZpcumkIVCtA/3/Fo42+eoYvVM=
24+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0/go.mod h1:5kakwfW5CjC9KK+Q4wjXAg+ShuIm2mBMua0ZFj2C8PE=
25+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 h1:wxQx2Bt4xzPIKvW59WQf1tJNx/ZZKPfN+EhPX3Z6CYY=
26+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0/go.mod h1:TpiwjwnW/khS0LKs4vW5UmmT9OWcxaveS8U7+tlknzo=
1927
github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs=
2028
github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24=
2129
github.com/Azure/go-autorest/autorest/to v0.4.1 h1:CxNHBqdzTr7rLtdrtb5CMjJcDut+WNGCVv7OmS5+lTc=

main.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ func main() {
2525
}
2626

2727
// Add global flags for configuration
28-
rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required)")
28+
rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required for agent/unbootstrap)")
29+
_ = rootCmd.PersistentFlags().MarkHidden("config")
2930
// Don't mark as required globally - we'll check in PersistentPreRunE for commands that need it
3031

3132
// Add commands
@@ -39,8 +40,9 @@ func main() {
3940

4041
// Set up persistent pre-run to initialize config and logger
4142
rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error {
42-
// Skip config loading for version command
43-
if cmd.Name() == "version" {
43+
// Skip config loading for commands that don't need it
44+
switch cmd.Name() {
45+
case "version":
4446
return nil
4547
}
4648

pkg/bootstrapper/bootstrapper.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414
"go.goms.io/aks/AKSFlexNode/pkg/components/runc"
1515
"go.goms.io/aks/AKSFlexNode/pkg/components/services"
1616
"go.goms.io/aks/AKSFlexNode/pkg/components/system_configuration"
17-
"go.goms.io/aks/AKSFlexNode/pkg/config"
17+
"go.goms.io/aks/AKSFlexNode/pkg/privatecluster"
1818
)
1919

2020
// Bootstrapper executes bootstrap steps sequentially
@@ -23,16 +23,17 @@ type Bootstrapper struct {
2323
}
2424

2525
// New creates a new bootstrapper
26-
func New(cfg *config.Config, logger *logrus.Logger) *Bootstrapper {
26+
func New(logger *logrus.Logger) *Bootstrapper {
2727
return &Bootstrapper{
28-
BaseExecutor: NewBaseExecutor(cfg, logger),
28+
BaseExecutor: NewBaseExecutor(logger),
2929
}
3030
}
3131

3232
// Bootstrap executes all bootstrap steps sequentially
3333
func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error) {
3434
// Define the bootstrap steps in order - using modules directly
3535
steps := []Executor{
36+
privatecluster.NewInstaller(b.logger), // VPN/Gateway setup (if private cluster)
3637
arc.NewInstaller(b.logger), // Setup Arc
3738
services.NewUnInstaller(b.logger), // Stop kubelet before setup
3839
system_configuration.NewInstaller(b.logger), // Configure system (early)
@@ -51,6 +52,7 @@ func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error)
5152
// Unbootstrap executes all cleanup steps sequentially (in reverse order of bootstrap)
5253
func (b *Bootstrapper) Unbootstrap(ctx context.Context) (*ExecutionResult, error) {
5354
steps := []Executor{
55+
privatecluster.NewUninstaller(b.logger), // Node removal + VPN teardown (if private cluster)
5456
services.NewUnInstaller(b.logger), // Stop services first
5557
npd.NewUnInstaller(b.logger), // Uninstall Node Problem Detector
5658
kubelet.NewUnInstaller(b.logger), // Clean kubelet configuration

pkg/bootstrapper/executor.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"time"
88

99
"github.com/sirupsen/logrus"
10-
"go.goms.io/aks/AKSFlexNode/pkg/config"
1110
)
1211

1312
// executor is a common base interface for all executors
@@ -51,14 +50,12 @@ type StepResult struct {
5150

5251
// BaseExecutor provides common functionality for bootstrap and unbootstrap operations
5352
type BaseExecutor struct {
54-
config *config.Config
5553
logger *logrus.Logger
5654
}
5755

5856
// NewBaseExecutor creates a new base executor
59-
func NewBaseExecutor(cfg *config.Config, logger *logrus.Logger) *BaseExecutor {
57+
func NewBaseExecutor(logger *logrus.Logger) *BaseExecutor {
6058
return &BaseExecutor{
61-
config: cfg,
6259
logger: logger,
6360
}
6461
}

pkg/config/structs.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,12 @@ type BootstrapTokenConfig struct {
5555

5656
// TargetClusterConfig holds configuration for the target AKS cluster the ARC machine will connect to.
5757
type TargetClusterConfig struct {
58-
ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster
59-
Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2")
58+
ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster
59+
Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2")
60+
IsPrivateCluster bool `json:"private" mapstructure:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup)
61+
GatewayVMSize string `json:"gatewayVMSize,omitempty" mapstructure:"gatewayVMSize"` // VPN Gateway VM size (defaults to "Standard_D2s_v3")
62+
GatewayPort int `json:"gatewayPort,omitempty" mapstructure:"gatewayPort"` // VPN Gateway port (defaults to 51820)
63+
CleanupMode string `json:"-"` // Runtime-only, set by CLI flag for unbootstrap
6064
Name string // will be populated from ResourceID
6165
ResourceGroup string // will be populated from ResourceID
6266
SubscriptionID string // will be populated from ResourceID

pkg/privatecluster/README.md

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Private AKS Cluster - Edge Node Join/Leave
2+
3+
## Prerequisites
4+
5+
### 1. Log in to Azure CLI
6+
7+
```bash
8+
az login
9+
```
10+
11+
> **Note:** When running the agent with `sudo`, use `sudo -E` to preserve your Azure CLI token. Alternatively, run `sudo az login` to log in as root directly.
12+
13+
### 2. Create a Private AKS Cluster
14+
15+
Create a Private AKS cluster with AAD and Azure RBAC enabled, and assign the required roles to your user.
16+
17+
See: [create_private_cluster.md](create_private_cluster.md)
18+
19+
### 3. Prepare Configuration File
20+
21+
Create a `config.json` with `"private": true` in the `targetCluster` section:
22+
23+
```json
24+
{
25+
"azure": {
26+
"subscriptionId": "<SUBSCRIPTION_ID>",
27+
"tenantId": "<TENANT_ID>",
28+
"targetCluster": {
29+
"resourceId": "/subscriptions/<SUB_ID>/resourceGroups/<RG>/providers/Microsoft.ContainerService/managedClusters/<CLUSTER_NAME>",
30+
"location": "eastus2",
31+
"private": true
32+
},
33+
"arc": {
34+
"enabled": true,
35+
"resourceGroup": "<RG>",
36+
"location": "eastus2"
37+
}
38+
},
39+
"kubernetes": {
40+
"version": "1.33.0"
41+
},
42+
"containerd": {
43+
"version": "1.7.11",
44+
"pauseImage": "mcr.microsoft.com/oss/kubernetes/pause:3.6"
45+
},
46+
"agent": {
47+
"logLevel": "info",
48+
"logDir": "/var/log/aks-flex-node"
49+
}
50+
}
51+
```
52+
53+
## Join Private AKS Cluster
54+
55+
### 1. Build the project
56+
57+
```bash
58+
go build -o aks-flex-node .
59+
```
60+
61+
### 2. Join the cluster
62+
63+
When the config has `"private": true`, the `agent` command automatically sets up the Gateway/VPN before bootstrapping:
64+
65+
```bash
66+
sudo -E ./aks-flex-node agent --config config.json
67+
```
68+
69+
This will:
70+
1. Detect private cluster from config
71+
2. Set up Gateway VM and VPN tunnel (WireGuard)
72+
3. Run normal bootstrap (Arc, containerd, kubelet, etc.)
73+
4. Enter daemon mode for status monitoring
74+
75+
### 3. Verify
76+
77+
```bash
78+
kubectl get nodes
79+
```
80+
81+
## Leave Private AKS Cluster
82+
83+
When the config has `"private": true`, the `unbootstrap` command automatically handles VPN/Gateway cleanup:
84+
85+
```bash
86+
sudo -E ./aks-flex-node unbootstrap --config config.json [--cleanup-mode <local|full>]
87+
```
88+
89+
### Mode Comparison
90+
91+
| Mode | Command | Description |
92+
|------|---------|-------------|
93+
| `local` (default) | `sudo -E ./aks-flex-node unbootstrap --config config.json` | Remove node and local VPN config, **keep Gateway** for other nodes |
94+
| `full` | `sudo -E ./aks-flex-node unbootstrap --config config.json --cleanup-mode full` | Remove all components **including Gateway VM and Azure resources** |
95+
96+
### When to use each mode
97+
98+
- **`--cleanup-mode=local`** (default): Other nodes are still using the Gateway, or you plan to rejoin later
99+
- **`--cleanup-mode=full`**: This is the last node leaving, so clean up all Azure resources (Gateway VM, subnet, NSG, public IP)

0 commit comments

Comments
 (0)