diff --git a/v1/providers/sfcompute/instance.go b/v1/providers/sfcompute/instance.go index 3442834..dfaf244 100644 --- a/v1/providers/sfcompute/instance.go +++ b/v1/providers/sfcompute/instance.go @@ -29,8 +29,10 @@ func (c *SFCClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceA return nil, errors.WrapAndTrace(err) } - // Create a name for the node - name := brevDataToSFCName(attrs.RefID, attrs.Name) + // Pack cloud cred ref ID, brev stage, instance ref ID, and name into the SFC node name. + // SFC has no tags API, so the node name is the only place to persist this metadata. + stage := getStageFromTags(attrs.Tags) + name := brevDataToSFCName(c.refID, stage, attrs.RefID, attrs.Name) // Create the node resp, err := c.client.Nodes.New(ctx, sfcnodes.NodeNewParams{ @@ -231,11 +233,15 @@ type sfcNodeInfo struct { } func (c *SFCClient) sfcNodeToBrevInstance(node sfcNodeInfo) (*v1.Instance, error) { - // Get the refID and name from the node name - refID, name, err := sfcNameToBrevData(node.name) + // Parse cloud cred ref ID, brev stage, instance ref ID, and name from the node name. + // Old-format names (refID_name) return empty cloudCredRefID — fall back to c.refID. 
+	cloudCredRefID, _, refID, name, err := sfcNameToBrevData(node.name)
 	if err != nil {
 		return nil, errors.WrapAndTrace(err)
 	}
+	if cloudCredRefID == "" {
+		cloudCredRefID = c.refID
+	}
 
 	// Get the instance type for the zone
 	instanceType, err := getInstanceTypeForZone(*node.zone)
@@ -270,7 +276,7 @@ func (c *SFCClient) sfcNodeToBrevInstance(node sfcNodeInfo) (*v1.Instance, error
 		Spot: false,
 		Stoppable: false,
 		Rebootable: false,
-		CloudCredRefID: c.refID, // TODO: this should be pulled from the node itself
+		CloudCredRefID: cloudCredRefID,
 	}
 	return inst, nil
 }
@@ -448,16 +454,61 @@ func (c *SFCClient) getSSHHostnameFromVM(ctx context.Context, vmID string, vmSta
 	return sshResponse.SSHHostname, nil
 }
 
-func brevDataToSFCName(refID string, name string) string {
-	return fmt.Sprintf("%s_%s", refID, name)
+// brevDataToSFCName packs cloud credential ref ID, brev stage, instance ref ID, and instance
+// name into a single SFC node name, separated by underscores. This is necessary because SFC
+// has no tags/labels API — the node name is the only place to store metadata.
+//
+// Format: {cloudCredRefID}_{brevStage}_{refID}_{name}
+func brevDataToSFCName(cloudCredRefID string, brevStage string, refID string, name string) string {
+	return fmt.Sprintf("%s_%s_%s_%s", cloudCredRefID, brevStage, refID, name)
+}
+
+// sfcNameToBrevData parses an SFC node name back into its components.
+//
+// Supports two formats for backward compatibility:
+//   - New: {cloudCredRefID}_{brevStage}_{refID}_{name}
+//   - Old: {refID}_{name} — cloudCredRefID and brevStage returned empty
+//
+// The name is split on at most the first three underscores, so the trailing instance
+// name may itself contain underscores. A name that yields fewer than four parts is
+// treated as old format and split on its first underscore only, so old names such as
+// "ref_my_box" still parse as refID "ref" and name "my_box" instead of being rejected.
+//
+// NOTE: an old-format name containing three or more underscores cannot be told apart
+// from a new-format name and is parsed as new format. The new format also does not
+// round-trip when cloudCredRefID, brevStage, or refID contains an underscore.
+func sfcNameToBrevData(name string) (cloudCredRefID string, brevStage string, refID string, instanceName string, err error) {
+	if parts := strings.SplitN(name, "_", 4); len(parts) == 4 {
+		// New format: cloudCredRefID_brevStage_refID_name
+		return parts[0], parts[1], parts[2], parts[3], nil
+	}
+
+	// Old format: refID_name (backward compat — cloudCredRefID and stage unknown).
+	// TODO: remove this fallback once all old-format nodes have been cleaned up
+	parts := strings.SplitN(name, "_", 2)
+	if len(parts) != 2 {
+		return "", "", "", "", errors.WrapAndTrace(fmt.Errorf("invalid node name %q: expected {refID}_{name} or {cloudCredRefID}_{brevStage}_{refID}_{name}", name))
+	}
+	return "", "", parts[0], parts[1], nil
 }
 
-func sfcNameToBrevData(name string) (string, string, error) {
-	parts := strings.SplitN(name, "_", 2)
-	if len(parts) != 2 {
-		return "", "", errors.WrapAndTrace(fmt.Errorf("invalid node name %s", name))
+// getStageFromTags extracts the control plane stage value from instance tags.
+// The tag key is prefixed by the control plane, so we match any key ending with
+// "-stage" to avoid coupling to a specific prefix. Map iteration order is random, so
+// when several keys match, the lexicographically smallest key wins to keep the
+// result stable. Returns "unknown" when no key matches.
+func getStageFromTags(tags v1.Tags) string {
+	stage, stageKey := "unknown", ""
+	for k, v := range tags {
+		if !strings.HasSuffix(k, "-stage") {
+			continue
+		}
+		// Every matching key is non-empty, so "" safely marks "nothing chosen yet".
+		if stageKey == "" || k < stageKey {
+			stage, stageKey = v, k
+		}
 	}
-	return parts[0], parts[1], nil
+	return stage
 }
 
 // Optional if supported:
diff --git a/v1/providers/sfcompute/instancetype.go b/v1/providers/sfcompute/instancetype.go
index 515f5ef..bbaca04 100644
--- a/v1/providers/sfcompute/instancetype.go
+++ b/v1/providers/sfcompute/instancetype.go
@@ -21,6 +21,11 @@ const (
 	interconnectInfiniband = "infiniband"
 	formFactorSXM5 = "sxm5"
 	diskTypeSSD = "ssd"
+
+	// Currently only 8xH100/H200 instance types are available
+	// so it's safe to hardcode vCPU and GPU count.
+ sfcVCPU = 112 + sfcGPUCount = 8 ) func makeDefaultInstanceTypePrice(amount string, currencyCode string) currency.Amount { @@ -112,6 +117,7 @@ func getInstanceTypeForZone(zone sfcnodes.ZoneListResponseData) (*v1.InstanceTyp Type: makeInstanceTypeName(zone), Memory: ram, MemoryBytes: gpuMetadata.memoryBytes, + VCPU: gpuMetadata.vcpu, Location: zoneToLocation(zone).Name, Stoppable: false, Rebootable: false, @@ -216,6 +222,7 @@ type sfcInstanceTypeMetadata struct { architecture v1.Architecture memoryBytes v1.Bytes diskBytes v1.Bytes + vcpu int32 gpuCount int32 gpuManufacturer v1.Manufacturer gpuVRAM v1.Bytes @@ -240,7 +247,8 @@ var h100InstanceTypeMetadata = sfcInstanceTypeMetadata{ architecture: v1.ArchitectureX86_64, memoryBytes: v1.NewBytes(960, v1.Gigabyte), diskBytes: v1.NewBytes(1500, v1.Gigabyte), - gpuCount: 8, + vcpu: sfcVCPU, + gpuCount: sfcGPUCount, gpuManufacturer: v1.ManufacturerNVIDIA, gpuVRAM: v1.NewBytes(80, v1.Gigabyte), estimatedDeployTime: 14 * time.Minute, @@ -253,7 +261,8 @@ var h200InstanceTypeMetadata = sfcInstanceTypeMetadata{ architecture: v1.ArchitectureX86_64, memoryBytes: v1.NewBytes(960, v1.Gigabyte), diskBytes: v1.NewBytes(1500, v1.Gigabyte), - gpuCount: 8, + vcpu: sfcVCPU, + gpuCount: sfcGPUCount, gpuManufacturer: v1.ManufacturerNVIDIA, gpuVRAM: v1.NewBytes(141, v1.Gigabyte), estimatedDeployTime: 14 * time.Minute,