Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cli/cmd/bootstrap_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func AddBootstrapGcpCmd(parent *cobra.Command, opts *GlobalOptions) {
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPublicKeyPath, "ssh-public-key-path", "~/.ssh/id_rsa.pub", "SSH Public Key Path (default: ~/.ssh/id_rsa.pub)")
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPrivateKeyPath, "ssh-private-key-path", "~/.ssh/id_rsa", "SSH Private Key Path (default: ~/.ssh/id_rsa)")
flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Preemptible, "preemptible", false, "Use preemptible VMs for Codesphere infrastructure (default: false)")
flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Spot, "spot", false, "Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false)")
flags.IntVar(&bootstrapGcpCmd.CodesphereEnv.DatacenterID, "datacenter-id", 1, "Datacenter ID (default: 1)")
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.CustomPgIP, "custom-pg-ip", "", "Custom PostgreSQL IP (optional)")
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.InstallConfigPath, "install-config", "config.yaml", "Path to install config file (optional)")
Expand Down
1 change: 1 addition & 0 deletions docs/oms_beta_bootstrap-gcp.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ oms beta bootstrap-gcp [flags]
--registry-user string Custom Registry username (only for GitHub registry type) (optional)
--secrets-dir string Directory for secrets (default: /etc/codesphere/secrets) (default "/etc/codesphere/secrets")
--secrets-file string Path to secrets files (optional) (default "prod.vault.yaml")
--spot Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false)
--ssh-private-key-path string SSH Private Key Path (default: ~/.ssh/id_rsa) (default "~/.ssh/id_rsa")
--ssh-public-key-path string SSH Public Key Path (default: ~/.ssh/id_rsa.pub) (default "~/.ssh/id_rsa.pub")
--ssh-quiet Suppress SSH command output (default: false)
Expand Down
186 changes: 167 additions & 19 deletions internal/bootstrap/gcp/gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ type CodesphereEnvironment struct {
InstallHash string `json:"install_hash"`
InstallSkipSteps []string `json:"install_skip_steps"`
Preemptible bool `json:"preemptible"`
Spot bool `json:"spot"`
WriteConfig bool `json:"-"`
GatewayIP string `json:"gateway_ip"`
PublicGatewayIP string `json:"public_gateway_ip"`
Expand Down Expand Up @@ -307,9 +308,22 @@ func (b *GCPBootstrapper) ValidateInput() error {
return err
}

err = b.validateVMProvisioningOptions()
if err != nil {
return err
}

return b.validateGithubParams()
}

// validateVMProvisioningOptions ensures the mutually exclusive VM
// provisioning options --spot and --preemptible were not both requested.
func (b *GCPBootstrapper) validateVMProvisioningOptions() error {
	if b.Env.Preemptible && b.Env.Spot {
		return fmt.Errorf("cannot specify both --spot and --preemptible flags; use --spot for the newer spot VM model")
	}
	return nil
}

// validateInstallVersion checks if the specified install version exists and contains the required installer artifact
func (b *GCPBootstrapper) validateInstallVersion() error {
if b.Env.InstallLocal != "" {
Expand Down Expand Up @@ -693,6 +707,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
wg := sync.WaitGroup{}
errCh := make(chan error, len(vmDefs))
resultCh := make(chan vmResult, len(vmDefs))
logCh := make(chan string, len(vmDefs))
rootDiskSize := int64(200)
if b.Env.RegistryType == RegistryTypeGitHub {
rootDiskSize = 50
Expand All @@ -701,6 +716,43 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
wg.Add(1)
go func(vm VMDef) {
defer wg.Done()

existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name)
if err != nil {
if !isNotFoundError(err) {
errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err)
return
}
}
if existingInstance != nil {
instanceStatus := existingInstance.GetStatus()
if instanceStatus == "TERMINATED" || instanceStatus == "STOPPED" || instanceStatus == "SUSPENDED" {
// Start the stopped instance
err = b.GCPClient.StartInstance(projectID, zone, vm.Name)
if err != nil {
errCh <- fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err)
return
}
}

// Wait until the instance is RUNNING and IPs are populated.
readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP)
if err != nil {
errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err)
return
}

internalIP, externalIP := extractInstanceIPs(readyInstance)
resultCh <- vmResult{
vmType: vm.Tags[0],
name: vm.Name,
externalIP: externalIP,
internalIP: internalIP,
}
return
}

// Instance doesn't exist, create it
disks := []*computepb.AttachedDisk{
{
Boot: protoBool(true),
Expand Down Expand Up @@ -744,9 +796,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
Tags: &computepb.Tags{
Items: vm.Tags,
},
Scheduling: &computepb.Scheduling{
Preemptible: &b.Env.Preemptible,
},
Scheduling: b.buildSchedulingConfig(),
NetworkInterfaces: []*computepb.NetworkInterface{
{
Network: protoString(network),
Expand Down Expand Up @@ -774,29 +824,20 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
}
}

err = b.GCPClient.CreateInstance(projectID, zone, instance)
if err != nil && !isAlreadyExistsError(err) {
errCh <- fmt.Errorf("failed to create instance %s: %w", vm.Name, err)
err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name, logCh)
if err != nil {
errCh <- err
return
}

// Find out the IP addresses of the created instance
resp, err := b.GCPClient.GetInstance(projectID, zone, vm.Name)
// Wait for the newly created instance to be RUNNING with IPs assigned
readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP)
if err != nil {
errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err)
errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err)
return
}

externalIP := ""
internalIP := ""
if len(resp.GetNetworkInterfaces()) > 0 {
internalIP = resp.GetNetworkInterfaces()[0].GetNetworkIP()
if len(resp.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 {
externalIP = resp.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP()
}
}

// Send result through channel instead of creating nodes in goroutine
internalIP, externalIP := extractInstanceIPs(readyInstance)
resultCh <- vmResult{
vmType: vm.Tags[0],
name: vm.Name,
Expand All @@ -809,6 +850,11 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {

close(errCh)
close(resultCh)
close(logCh)

for msg := range logCh {
b.stlog.Logf("%s", msg)
}

var errs []error
for err := range errCh {
Expand Down Expand Up @@ -850,6 +896,104 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
return nil
}

// extractInstanceIPs returns the internal and external IPs of the first
// network interface of inst; either value is "" when not present.
func extractInstanceIPs(inst *computepb.Instance) (internalIP, externalIP string) {
	ifaces := inst.GetNetworkInterfaces()
	if len(ifaces) == 0 {
		return "", ""
	}
	internalIP = ifaces[0].GetNetworkIP()
	if cfgs := ifaces[0].GetAccessConfigs(); len(cfgs) > 0 {
		externalIP = cfgs[0].GetNatIP()
	}
	return internalIP, externalIP
}

// buildSchedulingConfig creates the Compute Engine scheduling configuration
// for the selected VM model: spot takes precedence over preemptible; with
// neither flag set an empty (default) scheduling block is used.
func (b *GCPBootstrapper) buildSchedulingConfig() *computepb.Scheduling {
	switch {
	case b.Env.Spot:
		// Spot VMs: terminate on host maintenance, no auto-restart,
		// and stop (rather than delete) the VM on spot reclamation.
		return &computepb.Scheduling{
			ProvisioningModel:         protoString("SPOT"),
			OnHostMaintenance:         protoString("TERMINATE"),
			AutomaticRestart:          protoBool(false),
			InstanceTerminationAction: protoString("STOP"),
		}
	case b.Env.Preemptible:
		// Legacy preemptible VM model.
		return &computepb.Scheduling{
			Preemptible: protoBool(true),
		}
	default:
		return &computepb.Scheduling{}
	}
}

// createInstanceWithFallback attempts to create an instance with the configured settings.
// If spot VMs are enabled and creation fails due to capacity issues, it retries once
// with default (standard) scheduling; an already-existing instance is not an error.
func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string, logCh chan<- string) error {
	createErr := b.GCPClient.CreateInstance(projectID, zone, instance)
	if createErr == nil || isAlreadyExistsError(createErr) {
		return nil
	}

	// Only a spot-capacity failure while spot mode is active triggers the fallback.
	if !b.Env.Spot || !isSpotCapacityError(createErr) {
		return fmt.Errorf("failed to create instance %s: %w", vmName, createErr)
	}

	logCh <- fmt.Sprintf("Spot capacity unavailable for %s, falling back to standard VM", vmName)
	instance.Scheduling = &computepb.Scheduling{}
	if err := b.GCPClient.CreateInstance(projectID, zone, instance); err != nil && !isAlreadyExistsError(err) {
		return fmt.Errorf("failed to create instance %s (fallback to standard VM): %w", vmName, err)
	}
	return nil
}

// waitForInstanceRunning polls GetInstance until the instance status is RUNNING
// and its internal IP (and external IP, when needsExternalIP is true) are populated.
// It returns the ready instance or an error if the deadline is exceeded.
func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, needsExternalIP bool) (*computepb.Instance, error) {
	const (
		maxAttempts  = 60
		pollInterval = 5 * time.Second
	)

	// Reports whether the instance is RUNNING with all required IPs assigned.
	ready := func(inst *computepb.Instance) bool {
		if inst.GetStatus() != "RUNNING" || len(inst.GetNetworkInterfaces()) == 0 {
			return false
		}
		iface := inst.GetNetworkInterfaces()[0]
		if iface.GetNetworkIP() == "" {
			return false
		}
		if !needsExternalIP {
			return true
		}
		return len(iface.GetAccessConfigs()) > 0 && iface.GetAccessConfigs()[0].GetNatIP() != ""
	}

	for attempt := 0; attempt < maxAttempts; attempt++ {
		// Sleep between polls only; 60 checks with 59 intervening waits.
		if attempt > 0 {
			time.Sleep(pollInterval)
		}
		inst, err := b.GCPClient.GetInstance(projectID, zone, name)
		if err != nil {
			return nil, fmt.Errorf("failed to poll instance %s: %w", name, err)
		}
		if ready(inst) {
			return inst, nil
		}
	}
	return nil, fmt.Errorf("timed out waiting for instance %s to be RUNNING with IPs assigned after %s",
		name, time.Duration(maxAttempts)*pollInterval)
}

// isSpotCapacityError reports whether err looks like a spot-VM capacity
// failure, by gRPC code or by well-known substrings in the error message.
func isSpotCapacityError(err error) bool {
	if err == nil {
		return false
	}
	if status.Code(err) == codes.ResourceExhausted {
		return true
	}
	msg := err.Error()
	for _, marker := range []string{
		"ZONE_RESOURCE_POOL_EXHAUSTED",
		"UNSUPPORTED_OPERATION",
		"stockout",
		"does not have enough resources",
	} {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}

// EnsureGatewayIPAddresses reserves 2 static external IP addresses for the ingress
// controllers of the cluster.
func (b *GCPBootstrapper) EnsureGatewayIPAddresses() error {
Expand Down Expand Up @@ -1598,6 +1742,10 @@ func isAlreadyExistsError(err error) bool {
return status.Code(err) == codes.AlreadyExists || strings.Contains(err.Error(), "already exists")
}

// isNotFoundError reports whether err indicates a missing GCP resource,
// either via the gRPC NotFound code or a "not found" message substring.
func isNotFoundError(err error) bool {
	// Guard against nil: err.Error() below would otherwise panic.
	// Mirrors the nil check in isSpotCapacityError.
	if err == nil {
		return false
	}
	return status.Code(err) == codes.NotFound || strings.Contains(strings.ToLower(err.Error()), "not found")
}

// readSSHKey reads an SSH key file, expanding ~ in the path
func (b *GCPBootstrapper) readSSHKey(path string) (string, error) {
realPath := util.ExpandPath(path)
Expand Down
21 changes: 21 additions & 0 deletions internal/bootstrap/gcp/gcp_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ type GCPClientManager interface {
CreateFirewallRule(projectID string, rule *computepb.Firewall) error
CreateInstance(projectID, zone string, instance *computepb.Instance) error
GetInstance(projectID, zone, instanceName string) (*computepb.Instance, error)
StartInstance(projectID, zone, instanceName string) error
CreateAddress(projectID, region string, address *computepb.Address) (string, error)
GetAddress(projectID, region, addressName string) (*computepb.Address, error)
EnsureDNSManagedZone(projectID, zoneName, dnsName, description string) error
Expand Down Expand Up @@ -562,6 +563,26 @@ func (c *GCPClient) GetInstance(projectID, zone, instanceName string) (*computep
})
}

// StartInstance starts a stopped Compute Engine instance in the specified project and zone.
// It blocks until the start operation completes.
func (c *GCPClient) StartInstance(projectID, zone, instanceName string) error {
	instancesClient, err := compute.NewInstancesRESTClient(c.ctx)
	if err != nil {
		return err
	}
	defer util.IgnoreError(instancesClient.Close)

	req := &computepb.StartInstanceRequest{
		Project:  projectID,
		Zone:     zone,
		Instance: instanceName,
	}
	op, err := instancesClient.Start(c.ctx, req)
	if err != nil {
		return err
	}
	// Wait for the long-running operation to finish before reporting success.
	return op.Wait(c.ctx)
}

// CreateAddress creates a new static IP address in the specified project and region.
func (c *GCPClient) CreateAddress(projectID, region string, address *computepb.Address) (string, error) {
client, err := compute.NewAddressesRESTClient(c.ctx)
Expand Down
Loading
Loading