diff --git a/cli/cmd/bootstrap_gcp.go b/cli/cmd/bootstrap_gcp.go index 62e2e22..24e3766 100644 --- a/cli/cmd/bootstrap_gcp.go +++ b/cli/cmd/bootstrap_gcp.go @@ -70,6 +70,7 @@ func AddBootstrapGcpCmd(parent *cobra.Command, opts *GlobalOptions) { flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPublicKeyPath, "ssh-public-key-path", "~/.ssh/id_rsa.pub", "SSH Public Key Path (default: ~/.ssh/id_rsa.pub)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPrivateKeyPath, "ssh-private-key-path", "~/.ssh/id_rsa", "SSH Private Key Path (default: ~/.ssh/id_rsa)") flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Preemptible, "preemptible", false, "Use preemptible VMs for Codesphere infrastructure (default: false)") + flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Spot, "spot", false, "Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false)") flags.IntVar(&bootstrapGcpCmd.CodesphereEnv.DatacenterID, "datacenter-id", 1, "Datacenter ID (default: 1)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.CustomPgIP, "custom-pg-ip", "", "Custom PostgreSQL IP (optional)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.InstallConfigPath, "install-config", "config.yaml", "Path to install config file (optional)") diff --git a/docs/oms_beta_bootstrap-gcp.md b/docs/oms_beta_bootstrap-gcp.md index fab05f1..f397428 100644 --- a/docs/oms_beta_bootstrap-gcp.md +++ b/docs/oms_beta_bootstrap-gcp.md @@ -48,6 +48,7 @@ oms beta bootstrap-gcp [flags] --registry-user string Custom Registry username (only for GitHub registry type) (optional) --secrets-dir string Directory for secrets (default: /etc/codesphere/secrets) (default "/etc/codesphere/secrets") --secrets-file string Path to secrets files (optional) (default "prod.vault.yaml") + --spot Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false) --ssh-private-key-path string SSH Private Key Path (default: ~/.ssh/id_rsa) (default "~/.ssh/id_rsa") --ssh-public-key-path string SSH Public Key Path (default: ~/.ssh/id_rsa.pub) (default "~/.ssh/id_rsa.pub") --ssh-quiet Suppress SSH command output (default: false) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index 229d108..06997f0 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -94,6 +94,7 @@ type CodesphereEnvironment struct { InstallHash string `json:"install_hash"` InstallSkipSteps []string `json:"install_skip_steps"` Preemptible bool `json:"preemptible"` + Spot bool `json:"spot"` WriteConfig bool `json:"-"` GatewayIP string `json:"gateway_ip"` PublicGatewayIP string `json:"public_gateway_ip"` @@ -307,9 +308,22 @@ func (b *GCPBootstrapper) ValidateInput() error { return err } + err = b.validateVMProvisioningOptions() + if err != nil { + return err + } + return b.validateGithubParams() } +// validateVMProvisioningOptions checks that spot and preemptible options are not both set +func (b *GCPBootstrapper) validateVMProvisioningOptions() error { + if b.Env.Spot && b.Env.Preemptible { + return fmt.Errorf("cannot specify both --spot and --preemptible flags; use --spot for the newer spot VM model") + } + return nil +} + // validateInstallVersion checks if the specified install version exists and contains the required installer artifact func (b *GCPBootstrapper) validateInstallVersion() error { if b.Env.InstallLocal != "" { @@ -693,6 +707,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { wg := sync.WaitGroup{} errCh := make(chan error, len(vmDefs)) resultCh := make(chan vmResult, len(vmDefs)) + logCh := make(chan string, len(vmDefs)) rootDiskSize := int64(200) if b.Env.RegistryType == RegistryTypeGitHub { rootDiskSize = 50 @@ -701,6 +716,43 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { wg.Add(1) go func(vm VMDef) { defer wg.Done() + + existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) + if err != nil { + if !isNotFoundError(err) { + errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err) + return + } + } + if existingInstance != nil { + instanceStatus := existingInstance.GetStatus() + if instanceStatus == "TERMINATED" || instanceStatus == "STOPPED" || instanceStatus == "SUSPENDED" { + // Start the stopped instance + err = b.GCPClient.StartInstance(projectID, zone, vm.Name) + if err != nil { + errCh <- fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err) + return + } + } + + // Wait until the instance is RUNNING and IPs are populated. + readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP) + if err != nil { + errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err) + return + } + + internalIP, externalIP := extractInstanceIPs(readyInstance) + resultCh <- vmResult{ + vmType: vm.Tags[0], + name: vm.Name, + externalIP: externalIP, + internalIP: internalIP, + } + return + } + + // Instance doesn't exist, create it disks := []*computepb.AttachedDisk{ { Boot: protoBool(true), @@ -744,9 +796,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { Tags: &computepb.Tags{ Items: vm.Tags, }, - Scheduling: &computepb.Scheduling{ - Preemptible: &b.Env.Preemptible, - }, + Scheduling: b.buildSchedulingConfig(), NetworkInterfaces: []*computepb.NetworkInterface{ { Network: protoString(network), @@ -774,29 +824,20 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { } } - err = b.GCPClient.CreateInstance(projectID, zone, instance) - if err != nil && !isAlreadyExistsError(err) { - errCh <- fmt.Errorf("failed to create instance %s: %w", vm.Name, err) + err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name, logCh) + if err != nil { + errCh <- err return } - // Find out the IP addresses of the created instance - resp, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) + // Wait for the newly created instance to be RUNNING with IPs assigned + readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP) if err != nil { - errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err) + errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err) return } - externalIP := "" - internalIP := "" - if len(resp.GetNetworkInterfaces()) > 0 { - internalIP = resp.GetNetworkInterfaces()[0].GetNetworkIP() - if len(resp.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { - externalIP = resp.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() - } - } - - // Send result through channel instead of creating nodes in goroutine + internalIP, externalIP := extractInstanceIPs(readyInstance) resultCh <- vmResult{ vmType: vm.Tags[0], name: vm.Name, @@ -809,6 +850,11 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { close(errCh) close(resultCh) + close(logCh) + + for msg := range logCh { + b.stlog.Logf("%s", msg) + } var errs []error for err := range errCh { @@ -850,6 +896,104 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { return nil } +// extractInstanceIPs returns the internal and external IPs from a compute instance. +func extractInstanceIPs(inst *computepb.Instance) (internalIP, externalIP string) { + if len(inst.GetNetworkInterfaces()) > 0 { + internalIP = inst.GetNetworkInterfaces()[0].GetNetworkIP() + if len(inst.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { + externalIP = inst.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() + } + } + return +} + +// buildSchedulingConfig creates the scheduling configuration based on spot/preemptible settings +func (b *GCPBootstrapper) buildSchedulingConfig() *computepb.Scheduling { + if b.Env.Spot { + return &computepb.Scheduling{ + ProvisioningModel: protoString("SPOT"), + OnHostMaintenance: protoString("TERMINATE"), + AutomaticRestart: protoBool(false), + InstanceTerminationAction: protoString("STOP"), + } + } + if b.Env.Preemptible { + return &computepb.Scheduling{ + Preemptible: protoBool(true), + } + } + + return &computepb.Scheduling{} +} + +// createInstanceWithFallback attempts to create an instance with the configured settings. +// If spot VMs are enabled and creation fails due to capacity issues, it falls back to standard VMs. +func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string, logCh chan<- string) error { + err := b.GCPClient.CreateInstance(projectID, zone, instance) + if err == nil { + return nil + } + + if isAlreadyExistsError(err) { + return nil + } + + if b.Env.Spot && isSpotCapacityError(err) { + logCh <- fmt.Sprintf("Spot capacity unavailable for %s, falling back to standard VM", vmName) + instance.Scheduling = &computepb.Scheduling{} + err = b.GCPClient.CreateInstance(projectID, zone, instance) + if err != nil && !isAlreadyExistsError(err) { + return fmt.Errorf("failed to create instance %s (fallback to standard VM): %w", vmName, err) + } + return nil + } + + return fmt.Errorf("failed to create instance %s: %w", vmName, err) +} + +// waitForInstanceRunning polls GetInstance until the instance status is RUNNING +// and its internal IP (and external IP, when needsExternalIP is true) are populated. +// It returns the ready instance or an error if the deadline is exceeded. +func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, needsExternalIP bool) (*computepb.Instance, error) { + const ( + maxAttempts = 60 + pollInterval = 5 * time.Second + ) + for attempt := range maxAttempts { + inst, err := b.GCPClient.GetInstance(projectID, zone, name) + if err != nil { + return nil, fmt.Errorf("failed to poll instance %s: %w", name, err) + } + + if inst.GetStatus() == "RUNNING" && + len(inst.GetNetworkInterfaces()) > 0 && + inst.GetNetworkInterfaces()[0].GetNetworkIP() != "" && + (!needsExternalIP || (len(inst.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 && + inst.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() != "")) { + return inst, nil + } + + if attempt < maxAttempts-1 { + time.Sleep(pollInterval) + } + } + return nil, fmt.Errorf("timed out waiting for instance %s to be RUNNING with IPs assigned after %s", + name, time.Duration(maxAttempts)*pollInterval) +} + +// isSpotCapacityError checks if the error is related to spot VM capacity issues +func isSpotCapacityError(err error) bool { + if err == nil { + return false + } + errStr := err.Error() + return strings.Contains(errStr, "ZONE_RESOURCE_POOL_EXHAUSTED") || + strings.Contains(errStr, "UNSUPPORTED_OPERATION") || + strings.Contains(errStr, "stockout") || + strings.Contains(errStr, "does not have enough resources") || + status.Code(err) == codes.ResourceExhausted +} + // EnsureGatewayIPAddresses reserves 2 static external IP addresses for the ingress // controllers of the cluster. func (b *GCPBootstrapper) EnsureGatewayIPAddresses() error { @@ -1598,6 +1742,10 @@ func isAlreadyExistsError(err error) bool { return status.Code(err) == codes.AlreadyExists || strings.Contains(err.Error(), "already exists") } +func isNotFoundError(err error) bool { + return status.Code(err) == codes.NotFound || strings.Contains(strings.ToLower(err.Error()), "not found") +} + // readSSHKey reads an SSH key file, expanding ~ in the path func (b *GCPBootstrapper) readSSHKey(path string) (string, error) { realPath := util.ExpandPath(path) diff --git a/internal/bootstrap/gcp/gcp_client.go b/internal/bootstrap/gcp/gcp_client.go index 2b45232..4411f5c 100644 --- a/internal/bootstrap/gcp/gcp_client.go +++ b/internal/bootstrap/gcp/gcp_client.go @@ -49,6 +49,7 @@ type GCPClientManager interface { CreateFirewallRule(projectID string, rule *computepb.Firewall) error CreateInstance(projectID, zone string, instance *computepb.Instance) error GetInstance(projectID, zone, instanceName string) (*computepb.Instance, error) + StartInstance(projectID, zone, instanceName string) error CreateAddress(projectID, region string, address *computepb.Address) (string, error) GetAddress(projectID, region, addressName string) (*computepb.Address, error) EnsureDNSManagedZone(projectID, zoneName, dnsName, description string) error @@ -562,6 +563,26 @@ func (c *GCPClient) GetInstance(projectID, zone, instanceName string) (*computep }) } +// StartInstance starts a stopped Compute Engine instance in the specified project and zone. +func (c *GCPClient) StartInstance(projectID, zone, instanceName string) error { + client, err := compute.NewInstancesRESTClient(c.ctx) + if err != nil { + return err + } + defer util.IgnoreError(client.Close) + + op, err := client.Start(c.ctx, &computepb.StartInstanceRequest{ + Project: projectID, + Zone: zone, + Instance: instanceName, + }) + if err != nil { + return err + } + + return op.Wait(c.ctx) +} + // CreateAddress creates a new static IP address in the specified project and region. func (c *GCPClient) CreateAddress(projectID, region string, address *computepb.Address) (string, error) { client, err := compute.NewAddressesRESTClient(c.ctx) diff --git a/internal/bootstrap/gcp/gcp_test.go b/internal/bootstrap/gcp/gcp_test.go index 20521dd..ff74936 100644 --- a/internal/bootstrap/gcp/gcp_test.go +++ b/internal/bootstrap/gcp/gcp_test.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "strings" + "sync" "time" "os" @@ -170,9 +171,11 @@ var _ = Describe("GCP Bootstrapper", func() { gc.EXPECT().CreateFirewallRule(projectId, mock.Anything).Return(nil).Times(5) // 11. EnsureComputeInstances - gc.EXPECT().CreateInstance(projectId, "us-central1-a", mock.Anything).Return(nil).Times(9) - // GetInstance calls to retrieve IPs + // Track GetInstance calls per VM name + instanceCalls := make(map[string]int) + var instanceMu sync.Mutex ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), NetworkInterfaces: []*computepb.NetworkInterface{ { NetworkIP: protoString("10.0.0.1"), @@ -182,9 +185,19 @@ var _ = Describe("GCP Bootstrapper", func() { }, }, } - - gc.EXPECT().GetInstance(projectId, "us-central1-a", mock.Anything).Return(ipResp, nil).Times(9) + gc.EXPECT().GetInstance(projectId, "us-central1-a", mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + instanceMu.Lock() + defer instanceMu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call, instance doesn't exist + return nil, fmt.Errorf("not found") + } + // Second call, return instance with IPs + return ipResp, nil + }).Times(18) fw.EXPECT().ReadFile(mock.Anything).Return([]byte("fake-key"), nil).Times(9) + gc.EXPECT().CreateInstance(projectId, "us-central1-a", mock.Anything).Return(nil).Times(9) // 12. EnsureGatewayIPAddresses gc.EXPECT().GetAddress(projectId, "us-central1", "gateway").Return(nil, fmt.Errorf("not found")) @@ -387,6 +400,17 @@ var _ = Describe("GCP Bootstrapper", func() { }) + Context("When both --spot and --preemptible are specified", func() { + BeforeEach(func() { + csEnv.Spot = true + csEnv.Preemptible = true + }) + It("fails with a clear error message", func() { + err := bs.ValidateInput() + Expect(err).To(MatchError(MatchRegexp("cannot specify both --spot and --preemptible"))) + }) + }) + }) Describe("EnsureInstallConfig", func() { @@ -948,14 +972,10 @@ var _ = Describe("GCP Bootstrapper", func() { }) Describe("Valid EnsureComputeInstances", func() { It("creates all instances", func() { - // Mock ReadFile for SSH key (called 9 times in parallel) - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Times(9) - - // Mock CreateInstance (9 times) - gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) - - // Mock GetInstance (9 times) + instanceCalls := make(map[string]int) + var mu sync.Mutex ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), NetworkInterfaces: []*computepb.NetworkInterface{ { NetworkIP: protoString("10.0.0.x"), @@ -965,7 +985,19 @@ var _ = Describe("GCP Bootstrapper", func() { }, }, } - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(ipResp, nil).Times(9) + // First call per VM: not found (triggers creation). Second call: running with IPs. + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return ipResp, nil + }).Times(18) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) err := bs.EnsureComputeInstances() Expect(err).NotTo(HaveOccurred()) @@ -978,7 +1010,8 @@ var _ = Describe("GCP Bootstrapper", func() { Describe("Invalid cases", func() { It("fails when SSH key read fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return(nil, fmt.Errorf("read error")).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("not found")).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return(nil, fmt.Errorf("read error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) @@ -986,7 +1019,8 @@ var _ = Describe("GCP Bootstrapper", func() { }) It("fails when CreateInstance fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("not found")).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(fmt.Errorf("create error")).Maybe() err := bs.EnsureComputeInstances() @@ -995,15 +1029,245 @@ var _ = Describe("GCP Bootstrapper", func() { }) It("fails when GetInstance fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Maybe() + instanceCalls := make(map[string]int) + var mu sync.Mutex + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn( + func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return nil, fmt.Errorf("get error") + }, + ).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Maybe() - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("get error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("error ensuring compute instances")) }) }) + + Describe("Spot VM functionality", func() { + It("creates spot VMs when spot flag is enabled", func() { + csEnv.Spot = true + + // Track GetInstance calls per VM name + instanceCalls := make(map[string]int) + var mu sync.Mutex + ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return ipResp, nil + }).Times(18) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + + // Verify CreateInstance is called with SPOT provisioning model + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.MatchedBy(func(instance *computepb.Instance) bool { + return instance.Scheduling != nil && + instance.Scheduling.ProvisioningModel != nil && + *instance.Scheduling.ProvisioningModel == "SPOT" + })).Return(nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("falls back to standard VM when spot capacity is exhausted", func() { + csEnv.Spot = true + + // Track GetInstance calls per VM name + instanceCalls := make(map[string]int) + var mu sync.Mutex + ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return ipResp, nil + }).Times(18) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + + createCalls := make(map[string]int) + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone string, instance *computepb.Instance) error { + mu.Lock() + defer mu.Unlock() + name := *instance.Name + createCalls[name]++ + if createCalls[name] == 1 { + return fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED") + } + return nil + }).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("restarts stopped VMs instead of creating new ones", func() { + instanceCalls := make(map[string]int) + var mu sync.Mutex + stoppedResp := &computepb.Instance{ + Status: protoString("TERMINATED"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + runningResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call, VM exists but is stopped + return stoppedResp, nil + } + // After StartInstance, VM is running + return runningResp, nil + }).Times(18) + + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("uses existing running VMs without starting them", func() { + runningResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + // 9 VMs × 2 GetInstance calls each (initial check + waitForInstanceRunning poll) + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(runningResp, nil).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("handles VMs in intermediate states (STAGING/PROVISIONING)", func() { + instanceCalls := make(map[string]int) + var mu sync.Mutex + stagingResp := &computepb.Instance{ + Status: protoString("STAGING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + runningResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call: instance exists but is still staging + return stagingResp, nil + } + // Second call via waitForInstanceRunning: now running + return runningResp, nil + }).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("fails when StartInstance returns an error", func() { + stoppedResp := &computepb.Instance{ + Status: protoString("TERMINATED"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(stoppedResp, nil).Maybe() + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(fmt.Errorf("start error")).Maybe() + + err := bs.EnsureComputeInstances() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to start stopped instance")) + }) + + It("fails when initial GetInstance returns a non-NotFound error", func() { + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("permission denied")).Maybe() + + err := bs.EnsureComputeInstances() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get instance")) + }) + }) }) Describe("EnsureGatewayIPAddresses", func() { diff --git a/internal/bootstrap/gcp/mocks.go b/internal/bootstrap/gcp/mocks.go index fd90494..0092134 100644 --- a/internal/bootstrap/gcp/mocks.go +++ b/internal/bootstrap/gcp/mocks.go @@ -1327,3 +1327,66 @@ func (_c *MockGCPClientManager_GetProjectByName_Call) RunAndReturn(run func(fold _c.Call.Return(run) return _c } + +// StartInstance provides a mock function for the type MockGCPClientManager +func (_mock *MockGCPClientManager) StartInstance(projectID string, zone string, instanceName string) error { + ret := _mock.Called(projectID, zone, instanceName) + + if len(ret) == 0 { + panic("no return value specified for StartInstance") + } + + var r0 error + if returnFunc, ok := ret.Get(0).(func(string, string, string) error); ok { + r0 = returnFunc(projectID, zone, instanceName) + } else { + r0 = ret.Error(0) + } + return r0 +} + +// MockGCPClientManager_StartInstance_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'StartInstance' +type MockGCPClientManager_StartInstance_Call struct { + *mock.Call +} + +// StartInstance is a helper method to define mock.On call +// - projectID string +// - zone string +// - instanceName string +func (_e *MockGCPClientManager_Expecter) StartInstance(projectID interface{}, zone interface{}, instanceName interface{}) *MockGCPClientManager_StartInstance_Call { + return &MockGCPClientManager_StartInstance_Call{Call: _e.mock.On("StartInstance", projectID, zone, instanceName)} +} + +func (_c *MockGCPClientManager_StartInstance_Call) Run(run func(projectID string, zone string, instanceName string)) *MockGCPClientManager_StartInstance_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + var arg1 string + if args[1] != nil { + arg1 = args[1].(string) + } + var arg2 string + if args[2] != nil { + arg2 = args[2].(string) + } + run( + arg0, + arg1, + arg2, + ) + }) + return _c +} + +func (_c *MockGCPClientManager_StartInstance_Call) Return(err error) *MockGCPClientManager_StartInstance_Call { + _c.Call.Return(err) + return _c +} + +func (_c *MockGCPClientManager_StartInstance_Call) RunAndReturn(run func(projectID string, zone string, instanceName string) error) *MockGCPClientManager_StartInstance_Call { + _c.Call.Return(run) + return _c +}