From 478db4758b004820e01d84ae1a6a865dee40e009 Mon Sep 17 00:00:00 2001 From: Tyler Fong Date: Fri, 27 Feb 2026 09:11:39 -0800 Subject: [PATCH 1/2] fixed sf compute tagging for v0 until tags --- v1/providers/sfcompute/instance.go | 58 ++++++++++++++++++++------ v1/providers/sfcompute/instancetype.go | 8 ++++ 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/v1/providers/sfcompute/instance.go b/v1/providers/sfcompute/instance.go index 3442834d..451b6e70 100644 --- a/v1/providers/sfcompute/instance.go +++ b/v1/providers/sfcompute/instance.go @@ -29,8 +29,10 @@ func (c *SFCClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceA return nil, errors.WrapAndTrace(err) } - // Create a name for the node - name := brevDataToSFCName(attrs.RefID, attrs.Name) + // Pack cloud cred ref ID, brev stage, instance ref ID, and name into the SFC node name. + // SFC has no tags API, so the node name is the only place to persist this metadata. + stage := getStageFromTags(attrs.Tags) + name := brevDataToSFCName(c.refID, stage, attrs.RefID, attrs.Name) // Create the node resp, err := c.client.Nodes.New(ctx, sfcnodes.NodeNewParams{ @@ -231,11 +233,15 @@ type sfcNodeInfo struct { } func (c *SFCClient) sfcNodeToBrevInstance(node sfcNodeInfo) (*v1.Instance, error) { - // Get the refID and name from the node name - refID, name, err := sfcNameToBrevData(node.name) + // Parse cloud cred ref ID, brev stage, instance ref ID, and name from the node name. + // Old-format names (refID_name) return empty cloudCredRefID — fall back to c.refID. + cloudCredRefID, _, refID, name, err := sfcNameToBrevData(node.name) if err != nil { return nil, errors.WrapAndTrace(err) } + if cloudCredRefID == "" { + cloudCredRefID = c.refID + } // Get the instance type for the zone instanceType, err := getInstanceTypeForZone(*node.zone) @@ -270,7 +276,7 @@ func (c *SFCClient) sfcNodeToBrevInstance(node sfcNodeInfo) (*v1.Instance, error Spot: false, Stoppable: false, Rebootable: false, - CloudCredRefID: c.refID, // TODO: this should be pulled from the node itself + CloudCredRefID: cloudCredRefID, } return inst, nil } @@ -448,16 +454,44 @@ func (c *SFCClient) getSSHHostnameFromVM(ctx context.Context, vmID string, vmSta return sshResponse.SSHHostname, nil } -func brevDataToSFCName(refID string, name string) string { - return fmt.Sprintf("%s_%s", refID, name) +// brevDataToSFCName packs cloud credential ref ID, brev stage, instance ref ID, and instance +// name into a single SFC node name, separated by underscores. This is necessary because SFC +// has no tags/labels API — the node name is the only place to store metadata. +// +// Format: {cloudCredRefID}_{brevStage}_{refID}_{name} +func brevDataToSFCName(cloudCredRefID string, brevStage string, refID string, name string) string { + return fmt.Sprintf("%s_%s_%s_%s", cloudCredRefID, brevStage, refID, name) +} + +// sfcNameToBrevData parses an SFC node name back into its components. +// +// Supports two formats for backward compatibility: +// - New (4+ parts): {cloudCredRefID}_{brevStage}_{refID}_{name} +// - Old (2 parts): {refID}_{name} — cloudCredRefID and brevStage returned empty +func sfcNameToBrevData(name string) (cloudCredRefID string, brevStage string, refID string, instanceName string, err error) { + parts := strings.SplitN(name, "_", 4) + switch len(parts) { + case 4: + // New format: cloudCredRefID_brevStage_refID_name + return parts[0], parts[1], parts[2], parts[3], nil + case 2: + // Old format: refID_name (backward compat — cloudCredRefID and stage unknown) + return "", "", parts[0], parts[1], nil + default: + return "", "", "", "", errors.WrapAndTrace(fmt.Errorf("invalid node name %s: expected 2 or 4 underscore-separated parts", name)) + } } -func sfcNameToBrevData(name string) (string, string, error) { - parts := strings.SplitN(name, "_", 2) - if len(parts) != 2 { - return "", "", errors.WrapAndTrace(fmt.Errorf("invalid node name %s", name)) +// getStageFromTags extracts the control plane stage value from instance tags. +// The tag key is prefixed by the control plane +// so we match any key ending with "-stage" to avoid coupling to a specific prefix. +func getStageFromTags(tags v1.Tags) string { + for k, v := range tags { + if strings.HasSuffix(k, "-stage") { + return v + } } - return parts[0], parts[1], nil + return "unknown" } // Optional if supported: diff --git a/v1/providers/sfcompute/instancetype.go b/v1/providers/sfcompute/instancetype.go index 515f5efa..da0016fc 100644 --- a/v1/providers/sfcompute/instancetype.go +++ b/v1/providers/sfcompute/instancetype.go @@ -112,6 +112,7 @@ func getInstanceTypeForZone(zone sfcnodes.ZoneListResponseData) (*v1.InstanceTyp Type: makeInstanceTypeName(zone), Memory: ram, MemoryBytes: gpuMetadata.memoryBytes, + VCPU: gpuMetadata.vcpu, Location: zoneToLocation(zone).Name, Stoppable: false, Rebootable: false, @@ -216,6 +217,7 @@ type sfcInstanceTypeMetadata struct { architecture v1.Architecture memoryBytes v1.Bytes diskBytes v1.Bytes + vcpu int32 gpuCount int32 gpuManufacturer v1.Manufacturer gpuVRAM v1.Bytes @@ -234,12 +236,17 @@ func getInstanceTypeMetadata(gpuType string) (*sfcInstanceTypeMetadata, error) { } } +// vCPU count provided by SF Compute. Currently only 8xH100/H200 instance types are +// available so it's safe to hardcode. +const sfcVCPU = 112 + var h100InstanceTypeMetadata = sfcInstanceTypeMetadata{ gpuType: gpuTypeH100, formFactor: formFactorSXM5, architecture: v1.ArchitectureX86_64, memoryBytes: v1.NewBytes(960, v1.Gigabyte), diskBytes: v1.NewBytes(1500, v1.Gigabyte), + vcpu: sfcVCPU, gpuCount: 8, gpuManufacturer: v1.ManufacturerNVIDIA, gpuVRAM: v1.NewBytes(80, v1.Gigabyte), @@ -253,6 +260,7 @@ var h200InstanceTypeMetadata = sfcInstanceTypeMetadata{ architecture: v1.ArchitectureX86_64, memoryBytes: v1.NewBytes(960, v1.Gigabyte), diskBytes: v1.NewBytes(1500, v1.Gigabyte), + vcpu: sfcVCPU, gpuCount: 8, gpuManufacturer: v1.ManufacturerNVIDIA, gpuVRAM: v1.NewBytes(141, v1.Gigabyte), From 31f84fdd3de235c57743e2b2c5e8a0af0446958b Mon Sep 17 00:00:00 2001 From: Tyler Fong Date: Fri, 27 Feb 2026 10:27:42 -0800 Subject: [PATCH 2/2] fixed comments --- v1/providers/sfcompute/instance.go | 1 + v1/providers/sfcompute/instancetype.go | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/v1/providers/sfcompute/instance.go b/v1/providers/sfcompute/instance.go index 451b6e70..dfaf2447 100644 --- a/v1/providers/sfcompute/instance.go +++ b/v1/providers/sfcompute/instance.go @@ -476,6 +476,7 @@ func sfcNameToBrevData(name string) (cloudCredRefID string, brevStage string, re return parts[0], parts[1], parts[2], parts[3], nil case 2: // Old format: refID_name (backward compat — cloudCredRefID and stage unknown) + // TODO: remove this case once all old-format nodes have been cleaned up return "", "", parts[0], parts[1], nil default: return "", "", "", "", errors.WrapAndTrace(fmt.Errorf("invalid node name %s: expected 2 or 4 underscore-separated parts", name)) diff --git a/v1/providers/sfcompute/instancetype.go b/v1/providers/sfcompute/instancetype.go index da0016fc..bbaca048 100644 --- a/v1/providers/sfcompute/instancetype.go +++ b/v1/providers/sfcompute/instancetype.go @@ -21,6 +21,11 @@ const ( interconnectInfiniband = "infiniband" formFactorSXM5 = "sxm5" diskTypeSSD = "ssd" + + // Currently only 8xH100/H200 instance types are available + // so it's safe to hardcode vCPU and GPU count. + sfcVCPU = 112 + sfcGPUCount = 8 ) func makeDefaultInstanceTypePrice(amount string, currencyCode string) currency.Amount { @@ -236,10 +241,6 @@ func getInstanceTypeMetadata(gpuType string) (*sfcInstanceTypeMetadata, error) { } } -// vCPU count provided by SF Compute. Currently only 8xH100/H200 instance types are -// available so it's safe to hardcode. -const sfcVCPU = 112 - var h100InstanceTypeMetadata = sfcInstanceTypeMetadata{ gpuType: gpuTypeH100, formFactor: formFactorSXM5, @@ -247,7 +248,7 @@ var h100InstanceTypeMetadata = sfcInstanceTypeMetadata{ memoryBytes: v1.NewBytes(960, v1.Gigabyte), diskBytes: v1.NewBytes(1500, v1.Gigabyte), vcpu: sfcVCPU, - gpuCount: 8, + gpuCount: sfcGPUCount, gpuManufacturer: v1.ManufacturerNVIDIA, gpuVRAM: v1.NewBytes(80, v1.Gigabyte), estimatedDeployTime: 14 * time.Minute, @@ -261,7 +262,7 @@ var h200InstanceTypeMetadata = sfcInstanceTypeMetadata{ memoryBytes: v1.NewBytes(960, v1.Gigabyte), diskBytes: v1.NewBytes(1500, v1.Gigabyte), vcpu: sfcVCPU, - gpuCount: 8, + gpuCount: sfcGPUCount, gpuManufacturer: v1.ManufacturerNVIDIA, gpuVRAM: v1.NewBytes(141, v1.Gigabyte), estimatedDeployTime: 14 * time.Minute,