From c4549a8b05183b340b3ea15debdb7552f3272b69 Mon Sep 17 00:00:00 2001 From: andreaanez Date: Mon, 4 May 2026 00:46:49 -0700 Subject: [PATCH 1/4] feat: add SFC V2 cloud provider integration Introduces the sfcomputev2 provider package that talks to the V2 SFC API via github.com/sfcompute/sfc-go. Uses capacity-based slot tracking to report availability and native tags for instance metadata instead of name encoding. Co-Authored-By: Claude Sonnet 4.6 --- go.mod | 4 + go.sum | 2 + v1/providers/sfcomputev2/capabilities.go | 23 ++ v1/providers/sfcomputev2/client.go | 108 +++++++++ v1/providers/sfcomputev2/instance.go | 278 +++++++++++++++++++++++ v1/providers/sfcomputev2/instancetype.go | 204 +++++++++++++++++ 6 files changed, 619 insertions(+) create mode 100644 v1/providers/sfcomputev2/capabilities.go create mode 100644 v1/providers/sfcomputev2/client.go create mode 100644 v1/providers/sfcomputev2/instance.go create mode 100644 v1/providers/sfcomputev2/instancetype.go diff --git a/go.mod b/go.mod index 92a51b2..a608c70 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,7 @@ require ( github.com/nebius/gosdk v0.0.0-20250826102719-940ad1dfb5de github.com/pkg/errors v0.9.1 github.com/sfcompute/nodes-go v0.1.0-alpha.4 + github.com/sfcompute/sfc-go v0.0.0-local github.com/stretchr/testify v1.11.1 golang.org/x/crypto v0.47.0 golang.org/x/text v0.33.0 @@ -85,6 +86,7 @@ require ( github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/afero v1.15.0 // indirect github.com/spf13/pflag v1.0.10 // indirect + github.com/spyzhov/ajson v0.8.0 // indirect github.com/tidwall/gjson v1.18.0 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect @@ -113,3 +115,5 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +replace github.com/sfcompute/sfc-go v0.0.0-local => /Users/andreaanez/Documents/sf_compute/sfc-go diff --git a/go.sum b/go.sum index 55a8123..26e4b65 100644 --- a/go.sum +++ b/go.sum @@ -168,6 +168,8 @@ github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spyzhov/ajson v0.8.0 h1:sFXyMbi4Y/BKjrsfkUZHSjA2JM1184enheSjjoT/zCc= +github.com/spyzhov/ajson v0.8.0/go.mod h1:63V+CGM6f1Bu/p4nLIN8885ojBdt88TbLoSFzyqMuVA= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= diff --git a/v1/providers/sfcomputev2/capabilities.go b/v1/providers/sfcomputev2/capabilities.go new file mode 100644 index 0000000..6ca8e19 --- /dev/null +++ b/v1/providers/sfcomputev2/capabilities.go @@ -0,0 +1,23 @@ +package v2 + +import ( + "context" + + v1 "github.com/brevdev/cloud/v1" +) + +func getSFCCapabilitiesV2() v1.Capabilities { + return v1.Capabilities{ + v1.CapabilityCreateInstance, + v1.CapabilityTerminateInstance, + v1.CapabilityCreateTerminateInstance, + } +} + +func (c *SFCClientV2) GetCapabilities(_ context.Context) (v1.Capabilities, error) { + return getSFCCapabilitiesV2(), nil +} + +func (c *SFCCredentialV2) GetCapabilities(_ context.Context) (v1.Capabilities, error) { + return getSFCCapabilitiesV2(), nil +} diff --git a/v1/providers/sfcomputev2/client.go b/v1/providers/sfcomputev2/client.go new file mode 100644 index 0000000..ba0e69f --- /dev/null +++ b/v1/providers/sfcomputev2/client.go @@ -0,0 +1,108 @@ +package v2 + +import ( + "context" + + v1 "github.com/brevdev/cloud/v1" + sfc "github.com/sfcompute/sfc-go" +) + +const CloudProviderID = "sfcompute" + +// SFCCredentialV2 holds only authentication details. Operational config (capacity, image) +// is set on SFCClientV2 at MakeClient time via the Brev credential config. +type SFCCredentialV2 struct { + RefID string + APIKey string `json:"api_key"` + CapacityID string `json:"capacity_id"` + ImageID string `json:"image_id"` +} + +var _ v1.CloudCredential = &SFCCredentialV2{} + +func NewSFCCredentialV2(refID, apiKey, capacityID, imageID string) *SFCCredentialV2 { + return &SFCCredentialV2{ + RefID: refID, + APIKey: apiKey, + CapacityID: capacityID, + ImageID: imageID, + } +} + +func (c *SFCCredentialV2) GetReferenceID() string { + return c.RefID +} + +func (c *SFCCredentialV2) GetAPIType() v1.APIType { + return v1.APITypeGlobal +} + +func (c *SFCCredentialV2) GetCloudProviderID() v1.CloudProviderID { + return CloudProviderID +} + +func (c *SFCCredentialV2) GetTenantID() (string, error) { + return "", nil +} + +type SFCClientV2 struct { + v1.NotImplCloudClient + refID string + location string + capacityID string + imageID string + client *sfc.SDK + logger v1.Logger +} + +var _ v1.CloudClient = &SFCClientV2{} + +type SFCClientV2Option func(c *SFCClientV2) + +func WithLogger(logger v1.Logger) SFCClientV2Option { + return func(c *SFCClientV2) { + c.logger = logger + } +} + +func (c *SFCCredentialV2) MakeClientWithOptions(_ context.Context, location string, opts ...SFCClientV2Option) (v1.CloudClient, error) { + sfcClient := &SFCClientV2{ + refID: c.RefID, + location: location, + capacityID: c.CapacityID, + imageID: c.ImageID, + client: sfc.New(sfc.WithSecurity(c.APIKey)), + logger: &v1.NoopLogger{}, + } + + for _, opt := range opts { + opt(sfcClient) + } + + return sfcClient, nil +} + +func (c *SFCCredentialV2) MakeClient(ctx context.Context, location string) (v1.CloudClient, error) { + return c.MakeClientWithOptions(ctx, location) +} + +func (c *SFCClientV2) GetAPIType() v1.APIType { + return v1.APITypeGlobal +} + +func (c *SFCClientV2) GetCloudProviderID() v1.CloudProviderID { + return CloudProviderID +} + +func (c *SFCClientV2) GetReferenceID() string { + return c.refID +} + +func (c *SFCClientV2) GetTenantID() (string, error) { + return "", nil +} + +func (c *SFCClientV2) MakeClient(_ context.Context, location string) (v1.CloudClient, error) { + c.location = location + return c, nil +} diff --git a/v1/providers/sfcomputev2/instance.go b/v1/providers/sfcomputev2/instance.go new file mode 100644 index 0000000..6834206 --- /dev/null +++ b/v1/providers/sfcomputev2/instance.go @@ -0,0 +1,278 @@ +package v2 + +import ( + "context" + "encoding/base64" + "fmt" + "slices" + "strings" + "time" + + "github.com/alecthomas/units" + "github.com/brevdev/cloud/internal/errors" + v1 "github.com/brevdev/cloud/v1" + "github.com/sfcompute/sfc-go/models/components" + "github.com/sfcompute/sfc-go/models/operations" + "github.com/sfcompute/sfc-go/optionalnullable" +) + +const ( + defaultPort = 22 + defaultSSHUsername = "ubuntu" + + // Tag keys used to persist Brev metadata on V2 instances (native tags replace name encoding). + tagKeyCloudCredRefID = "brev-cloud-cred-ref-id" + tagKeyStage = "brev-stage" + tagKeyRefID = "brev-ref-id" + tagKeyName = "brev-name" +) + +func (c *SFCClientV2) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) { + c.logger.Debug(ctx, "sfcv2: CreateInstance start", + v1.LogField("name", attrs.Name), + v1.LogField("location", attrs.Location), + ) + + stage := getStageFromTags(attrs.Tags) + tags := map[string]string{ + tagKeyCloudCredRefID: c.refID, + tagKeyStage: stage, + tagKeyRefID: attrs.RefID, + tagKeyName: attrs.Name, + } + + cloudInit := sshKeyCloudInit(attrs.PublicKey) + resp, err := c.client.Instances.Create(ctx, components.CreateInstanceRequest{ + Capacity: c.capacityID, + Image: c.imageID, + CloudInitUserData: &cloudInit, + Tags: optionalnullable.From(&tags), + Name: optionalnullable.From(&attrs.Name), + }) + if err != nil { + return nil, errors.WrapAndTrace(err) + } + if resp.InstanceResponse == nil { + return nil, errors.WrapAndTrace(fmt.Errorf("no instance returned from create")) + } + + instance, err := c.sfcInstanceToBrevInstance(resp.InstanceResponse, "") + if err != nil { + return nil, errors.WrapAndTrace(err) + } + + c.logger.Debug(ctx, "sfcv2: CreateInstance end", + v1.LogField("instanceID", resp.InstanceResponse.ID), + ) + + return instance, nil +} + +func sshKeyCloudInit(sshKey string) string { + script := fmt.Sprintf("#cloud-config\nssh_authorized_keys:\n - %s", sshKey) + return base64.StdEncoding.EncodeToString([]byte(script)) +} + +func (c *SFCClientV2) GetInstance(ctx context.Context, id v1.CloudProviderInstanceID) (*v1.Instance, error) { + c.logger.Debug(ctx, "sfcv2: GetInstance start", + v1.LogField("instanceID", id), + ) + + resp, err := c.client.Instances.Fetch(ctx, string(id), nil) + if err != nil { + return nil, errors.WrapAndTrace(err) + } + if resp.InstanceResponse == nil { + return nil, errors.WrapAndTrace(fmt.Errorf("instance %s not found", id)) + } + + sshHostname, err := c.getSSHHostname(ctx, string(id), resp.InstanceResponse.Status) + if err != nil { + return nil, errors.WrapAndTrace(err) + } + + instance, err := c.sfcInstanceToBrevInstance(resp.InstanceResponse, sshHostname) + if err != nil { + return nil, errors.WrapAndTrace(err) + } + + c.logger.Debug(ctx, "sfcv2: GetInstance end", + v1.LogField("instanceID", id), + v1.LogField("status", resp.InstanceResponse.Status), + ) + + return instance, nil +} + +func (c *SFCClientV2) ListInstances(ctx context.Context, args v1.ListInstancesArgs) ([]v1.Instance, error) { + c.logger.Debug(ctx, "sfcv2: ListInstances start", + v1.LogField("location", c.location), + ) + + resp, err := c.client.Instances.List(ctx, operations.ListInstancesRequest{ + Capacity: &c.capacityID, + }) + if err != nil { + return nil, errors.WrapAndTrace(err) + } + if resp.ListInstancesResponse == nil { + return []v1.Instance{}, nil + } + + var instances []v1.Instance + for _, inst := range resp.ListInstancesResponse.Data { + inst := inst // capture loop variable + + // Filter by instance IDs if specified. + if len(args.InstanceIDs) > 0 && !slices.Contains(args.InstanceIDs, v1.CloudProviderInstanceID(inst.ID)) { + continue + } + + sshHostname, err := c.getSSHHostname(ctx, inst.ID, inst.Status) + if err != nil { + c.logger.Error(ctx, err, + v1.LogField("msg", "sfcv2: ListInstances skipping instance due to SSH error"), + v1.LogField("instanceID", inst.ID), + ) + continue + } + + brevInst, err := c.sfcInstanceToBrevInstance(&inst, sshHostname) + if err != nil { + c.logger.Error(ctx, err, + v1.LogField("msg", "sfcv2: ListInstances skipping instance due to conversion error"), + v1.LogField("instanceID", inst.ID), + ) + continue + } + instances = append(instances, *brevInst) + } + + c.logger.Debug(ctx, "sfcv2: ListInstances end", + v1.LogField("instance count", len(instances)), + ) + + return instances, nil +} + +func (c *SFCClientV2) TerminateInstance(ctx context.Context, id v1.CloudProviderInstanceID) error { + c.logger.Debug(ctx, "sfcv2: TerminateInstance start", + v1.LogField("instanceID", id), + ) + + _, err := c.client.Instances.TerminateInstance(ctx, string(id)) + if err != nil { + return errors.WrapAndTrace(err) + } + + c.logger.Debug(ctx, "sfcv2: TerminateInstance end", + v1.LogField("instanceID", id), + ) + + return nil +} + +func (c *SFCClientV2) getSSHHostname(ctx context.Context, id string, status components.InstanceStatus) (string, error) { + if status != components.InstanceStatusRunning { + return "", nil + } + + resp, err := c.client.Instances.GetSSHInfoForInstance(ctx, id) + if err != nil { + return "", errors.WrapAndTrace(err) + } + if resp.InstanceSSHInfo == nil { + return "", nil + } + + return resp.InstanceSSHInfo.Hostname, nil +} + +func (c *SFCClientV2) sfcInstanceToBrevInstance(inst *components.InstanceResponse, sshHostname string) (*v1.Instance, error) { + tags, _ := inst.GetTags().GetOrZero() + + cloudCredRefID := tags[tagKeyCloudCredRefID] + if cloudCredRefID == "" { + cloudCredRefID = c.refID + } + refID := tags[tagKeyRefID] + name := tags[tagKeyName] + if name == "" { + name = inst.Name + } + + status := sfcStatusToLifecycleStatus(inst.Status) + + diskInt64, err := h100InstanceTypeMetadata.diskBytes.ByteCountInUnitInt64(v1.Gibibyte) + if err != nil { + return nil, err + } + diskSize := units.Base2Bytes(diskInt64 * int64(units.Gibibyte)) + + return &v1.Instance{ + Name: name, + CloudID: v1.CloudProviderInstanceID(inst.ID), + RefID: refID, + PublicDNS: sshHostname, + PublicIP: sshHostname, + SSHUser: defaultSSHUsername, + SSHPort: defaultPort, + CreatedAt: time.Unix(inst.CreatedAt, 0), + DiskSize: diskSize, + DiskSizeBytes: h100InstanceTypeMetadata.diskBytes, + Status: v1.Status{ + LifecycleStatus: status, + }, + InstanceTypeID: h100InstanceTypeMetadata.instanceTypeID, + InstanceType: h100InstanceType, + Location: c.location, + Spot: false, + Stoppable: false, + Rebootable: false, + CloudCredRefID: cloudCredRefID, + }, nil +} + +func sfcStatusToLifecycleStatus(status components.InstanceStatus) v1.LifecycleStatus { + switch status { + case components.InstanceStatusAwaitingAllocation: + return v1.LifecycleStatusPending + case components.InstanceStatusRunning: + return v1.LifecycleStatusRunning + case components.InstanceStatusTerminated: + return v1.LifecycleStatusTerminated + case components.InstanceStatusFailed: + return v1.LifecycleStatusFailed + default: + return v1.LifecycleStatusPending + } +} + +func getStageFromTags(tags v1.Tags) string { + for k, v := range tags { + if strings.HasSuffix(k, "-stage") { + return v + } + } + return "unknown" +} + +func (c *SFCClientV2) RebootInstance(_ context.Context, _ v1.CloudProviderInstanceID) error { + return v1.ErrNotImplemented +} + +func (c *SFCClientV2) StopInstance(_ context.Context, _ v1.CloudProviderInstanceID) error { + return v1.ErrNotImplemented +} + +func (c *SFCClientV2) StartInstance(_ context.Context, _ v1.CloudProviderInstanceID) error { + return v1.ErrNotImplemented +} + +func (c *SFCClientV2) MergeInstanceForUpdate(_ v1.Instance, newInst v1.Instance) v1.Instance { + return newInst +} + +func (c *SFCClientV2) MergeInstanceTypeForUpdate(_ v1.InstanceType, newIt v1.InstanceType) v1.InstanceType { + return newIt +} diff --git a/v1/providers/sfcomputev2/instancetype.go b/v1/providers/sfcomputev2/instancetype.go new file mode 100644 index 0000000..21fc3da --- /dev/null +++ b/v1/providers/sfcomputev2/instancetype.go @@ -0,0 +1,204 @@ +package v2 + +import ( + "context" + "fmt" + "time" + + "github.com/alecthomas/units" + "github.com/bojanz/currency" + "github.com/brevdev/cloud/internal/errors" + v1 "github.com/brevdev/cloud/v1" + "github.com/sfcompute/sfc-go/models/components" + "github.com/sfcompute/sfc-go/models/operations" +) + +const ( + h100InstanceType = "h100.ib" + sfcVCPU = 112 + sfcGPUCount = 8 + sfcLocation = "sfc" + diskTypeSSD = "ssd" + formFactorSXM5 = "sxm5" +) + +type sfcInstanceTypeMetadata struct { + diskBytes v1.Bytes + memoryBytes v1.Bytes + gpuVRAM v1.Bytes + vcpu int32 + gpuCount int32 + gpuManufacturer v1.Manufacturer + architecture v1.Architecture + deployTime time.Duration + price currency.Amount + instanceTypeID v1.InstanceTypeID +} + +var h100InstanceTypeMetadata = func() sfcInstanceTypeMetadata { + price, err := currency.NewAmount("16.00", "USD") + if err != nil { + panic(err) + } + m := sfcInstanceTypeMetadata{ + diskBytes: v1.NewBytes(1500, v1.Gigabyte), + memoryBytes: v1.NewBytes(960, v1.Gigabyte), + gpuVRAM: v1.NewBytes(80, v1.Gigabyte), + vcpu: sfcVCPU, + gpuCount: sfcGPUCount, + gpuManufacturer: v1.ManufacturerNVIDIA, + architecture: v1.ArchitectureX86_64, + deployTime: 14 * time.Minute, + price: price, + } + + // Compute the instance type ID from a representative InstanceType so it matches + // what Brev expects when validating or storing the type. + it := buildInstanceType(m, true) + m.instanceTypeID = it.ID + return m +}() + +func buildInstanceType(m sfcInstanceTypeMetadata, isAvailable bool) v1.InstanceType { + ramInt64, _ := m.memoryBytes.ByteCountInUnitInt64(v1.Gibibyte) + ram := units.Base2Bytes(ramInt64 * int64(units.Gibibyte)) + + vramInt64, _ := m.gpuVRAM.ByteCountInUnitInt64(v1.Gibibyte) + vram := units.Base2Bytes(vramInt64 * int64(units.Gibibyte)) + + diskInt64, _ := m.diskBytes.ByteCountInUnitInt64(v1.Gibibyte) + diskSize := units.Base2Bytes(diskInt64 * int64(units.Gibibyte)) + + it := v1.InstanceType{ + IsAvailable: isAvailable, + Type: h100InstanceType, + Memory: ram, + MemoryBytes: m.memoryBytes, + VCPU: m.vcpu, + Location: sfcLocation, + Stoppable: false, + Rebootable: false, + IsContainer: false, + Provider: CloudProviderID, + BasePrice: &m.price, + EstimatedDeployTime: &m.deployTime, + SupportedGPUs: []v1.GPU{{ + Count: m.gpuCount, + Type: "H100", + Manufacturer: m.gpuManufacturer, + Name: "H100", + Memory: vram, + MemoryBytes: m.gpuVRAM, + NetworkDetails: formFactorSXM5, + }}, + SupportedStorage: []v1.Storage{{ + Type: diskTypeSSD, + Count: 1, + Size: diskSize, + SizeBytes: m.diskBytes, + }}, + SupportedArchitectures: []v1.Architecture{m.architecture}, + } + it.ID = v1.MakeGenericInstanceTypeID(it) + return it +} + +func (c *SFCClientV2) GetInstanceTypes(ctx context.Context, args v1.GetInstanceTypeArgs) ([]v1.InstanceType, error) { + c.logger.Debug(ctx, "sfcv2: GetInstanceTypes start", + v1.LogField("location", c.location), + ) + + available, err := c.availableSlots(ctx) + if err != nil { + return nil, errors.WrapAndTrace(err) + } + + if available <= 0 { + c.logger.Debug(ctx, "sfcv2: GetInstanceTypes no available slots") + return []v1.InstanceType{}, nil + } + + instanceType := buildInstanceType(h100InstanceTypeMetadata, true) + + if !v1.IsSelectedByArgs(instanceType, args) { + return []v1.InstanceType{}, nil + } + + c.logger.Debug(ctx, "sfcv2: GetInstanceTypes end", + v1.LogField("available slots", available), + ) + + return []v1.InstanceType{instanceType}, nil +} + +// availableSlots returns how many more instances can be created in the configured capacity. +// It subtracts the count of active (running + awaiting_allocation) instances from the total +// procurement target. +func (c *SFCClientV2) availableSlots(ctx context.Context) (int, error) { + target, err := c.procurementTarget(ctx) + if err != nil { + return 0, errors.WrapAndTrace(err) + } + + active, err := c.activeInstanceCount(ctx) + if err != nil { + return 0, errors.WrapAndTrace(err) + } + + available := target - active + if available < 0 { + available = 0 + } + return available, nil +} + +// procurementTarget sums the Integer targets from all procurements on c.capacityID. +func (c *SFCClientV2) procurementTarget(ctx context.Context) (int, error) { + resp, err := c.client.Procurements.List(ctx, operations.ListProcurementsRequest{ + Capacity: &c.capacityID, + }) + if err != nil { + return 0, errors.WrapAndTrace(err) + } + if resp.ListProcurementsResponse == nil { + return 0, nil + } + + total := 0 + for _, p := range resp.ListProcurementsResponse.Data { + if p.Target.Type == components.ProcurementTargetTypeInteger && p.Target.Integer != nil { + total += int(*p.Target.Integer) + } + } + return total, nil +} + +// activeInstanceCount returns the number of running or awaiting_allocation instances +// in c.capacityID. +func (c *SFCClientV2) activeInstanceCount(ctx context.Context) (int, error) { + activeStatuses := []components.InstanceStatus{ + components.InstanceStatusRunning, + components.InstanceStatusAwaitingAllocation, + } + + resp, err := c.client.Instances.List(ctx, operations.ListInstancesRequest{ + Capacity: &c.capacityID, + Status: activeStatuses, + }) + if err != nil { + return 0, errors.WrapAndTrace(err) + } + if resp.ListInstancesResponse == nil { + return 0, nil + } + + return len(resp.ListInstancesResponse.Data), nil +} + +func (c *SFCClientV2) GetLocations(_ context.Context, _ v1.GetLocationsArgs) ([]v1.Location, error) { + return []v1.Location{{ + Name: sfcLocation, + Description: fmt.Sprintf("sfc_%s_h100", sfcLocation), + Available: true, + }}, nil +} From c9ef4e9ca6f45d78fa4e53f7af4b72cdbb773293 Mon Sep 17 00:00:00 2001 From: andreaanez Date: Tue, 5 May 2026 09:33:48 -0700 Subject: [PATCH 2/4] refactor(sfcomputev2): consolidate Brev-specific constants into brev_constants.go Moves tag keys, SSH defaults, and adds production capacity/image IDs to a dedicated file to separate Brev-specific config from generated SDK usage. Co-Authored-By: Claude Sonnet 4.6 --- v1/providers/sfcomputev2/brev_constants.go | 19 +++++++++++++++++++ v1/providers/sfcomputev2/instance.go | 10 ---------- 2 files changed, 19 insertions(+), 10 deletions(-) create mode 100644 v1/providers/sfcomputev2/brev_constants.go diff --git a/v1/providers/sfcomputev2/brev_constants.go b/v1/providers/sfcomputev2/brev_constants.go new file mode 100644 index 0000000..388479d --- /dev/null +++ b/v1/providers/sfcomputev2/brev_constants.go @@ -0,0 +1,19 @@ +package v2 + +const ( + defaultPort = 22 + defaultSSHUsername = "ubuntu" + + // Tag keys used to persist Brev metadata on SFCompute V2 instances. + tagKeyCloudCredRefID = "brev-cloud-cred-ref-id" + tagKeyStage = "brev-stage" + tagKeyRefID = "brev-ref-id" + tagKeyName = "brev-name" + + // BrevProductionCapacityID is the SFCompute capacity used for Brev-managed instances. + // TODO: replace with dynamic lookup from the Brev credential config; this is a stand-in. + BrevProductionCapacityID = "brev-production-capacity" + + // BrevProductionImageID is the public SFCompute image "ubuntu-24.04.4-cuda-12.8" (vm_images.vm_image_id). + BrevProductionImageID = "vmi_4GwEvmclFURy7ztFQjOdr" +) diff --git a/v1/providers/sfcomputev2/instance.go b/v1/providers/sfcomputev2/instance.go index 6834206..7b96202 100644 --- a/v1/providers/sfcomputev2/instance.go +++ b/v1/providers/sfcomputev2/instance.go @@ -16,16 +16,6 @@ import ( "github.com/sfcompute/sfc-go/optionalnullable" ) -const ( - defaultPort = 22 - defaultSSHUsername = "ubuntu" - - // Tag keys used to persist Brev metadata on V2 instances (native tags replace name encoding). - tagKeyCloudCredRefID = "brev-cloud-cred-ref-id" - tagKeyStage = "brev-stage" - tagKeyRefID = "brev-ref-id" - tagKeyName = "brev-name" -) func (c *SFCClientV2) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) { c.logger.Debug(ctx, "sfcv2: CreateInstance start", From 6fe2bb3e6458c6be9106f831f58eab10efa400b0 Mon Sep 17 00:00:00 2001 From: andreaanez Date: Tue, 5 May 2026 14:46:52 -0700 Subject: [PATCH 3/4] refactor(sfcomputev2): clean up provider for correctness and consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Store user tags from attrs.Tags as real SFC V2 instance tags; populate v1.Instance.Tags on read, filtering internal Brev metadata keys - Remove CapacityID/ImageID from SFCCredentialV2 — pull from constants instead, with a TODO to source from env vars - Replace procurementTarget (Procurements.List) with currentCapacityAllocation (Capacities.Fetch + AllocationSchedule.Total) for available slot counting - Count all non-terminated instances (including failed) against capacity - Fix sfcInstanceToBrevInstance to use sfcLocation constant instead of c.location - Add CapabilityTags to declared capabilities - Remove legacy sfcNameToBrevData fallback logic (V1 and V2 are mutually exclusive) - Remove speculative stageTesting/stageProduction normalization; pass stage through raw - Add validation_test.go following the same pattern as the V1 provider Co-Authored-By: Claude Sonnet 4.6 --- v1/providers/sfcomputev2/brev_constants.go | 16 +++--- v1/providers/sfcomputev2/capabilities.go | 1 + v1/providers/sfcomputev2/client.go | 37 ++++++-------- v1/providers/sfcomputev2/instance.go | 51 ++++++++----------- v1/providers/sfcomputev2/instancetype.go | 56 ++++++++++----------- v1/providers/sfcomputev2/validation_test.go | 50 ++++++++++++++++++ 6 files changed, 123 insertions(+), 88 deletions(-) create mode 100644 v1/providers/sfcomputev2/validation_test.go diff --git a/v1/providers/sfcomputev2/brev_constants.go b/v1/providers/sfcomputev2/brev_constants.go index 388479d..c9ec88f 100644 --- a/v1/providers/sfcomputev2/brev_constants.go +++ b/v1/providers/sfcomputev2/brev_constants.go @@ -1,19 +1,23 @@ package v2 +// Package-internal constants — SSH defaults and internal tag keys. const ( defaultPort = 22 defaultSSHUsername = "ubuntu" - // Tag keys used to persist Brev metadata on SFCompute V2 instances. + // Internal tag keys written to every SFCompute V2 instance. These are stripped from + // v1.Instance.Tags on read so they don't surface as user-facing tags. tagKeyCloudCredRefID = "brev-cloud-cred-ref-id" - tagKeyStage = "brev-stage" tagKeyRefID = "brev-ref-id" - tagKeyName = "brev-name" +) - // BrevProductionCapacityID is the SFCompute capacity used for Brev-managed instances. - // TODO: replace with dynamic lookup from the Brev credential config; this is a stand-in. +// Brev environment config for SFCompute V2. +// TODO: source these from environment variables rather than hardcoding them here. +const ( + // BrevProductionCapacityID is the SFCompute V2 capacity ID for Brev production instances. BrevProductionCapacityID = "brev-production-capacity" - // BrevProductionImageID is the public SFCompute image "ubuntu-24.04.4-cuda-12.8" (vm_images.vm_image_id). + // BrevProductionImageID is the SFCompute image for Brev production instances + // (ubuntu-24.04.4-cuda-12.8, vm_images.vm_image_id). BrevProductionImageID = "vmi_4GwEvmclFURy7ztFQjOdr" ) diff --git a/v1/providers/sfcomputev2/capabilities.go b/v1/providers/sfcomputev2/capabilities.go index 6ca8e19..e9b62d6 100644 --- a/v1/providers/sfcomputev2/capabilities.go +++ b/v1/providers/sfcomputev2/capabilities.go @@ -11,6 +11,7 @@ func getSFCCapabilitiesV2() v1.Capabilities { v1.CapabilityCreateInstance, v1.CapabilityTerminateInstance, v1.CapabilityCreateTerminateInstance, + v1.CapabilityTags, } } diff --git a/v1/providers/sfcomputev2/client.go b/v1/providers/sfcomputev2/client.go index ba0e69f..b5abb9a 100644 --- a/v1/providers/sfcomputev2/client.go +++ b/v1/providers/sfcomputev2/client.go @@ -9,23 +9,18 @@ import ( const CloudProviderID = "sfcompute" -// SFCCredentialV2 holds only authentication details. Operational config (capacity, image) -// is set on SFCClientV2 at MakeClient time via the Brev credential config. +// SFCCredentialV2 holds authentication details for a Brev-managed SFCompute V2 account. type SFCCredentialV2 struct { - RefID string - APIKey string `json:"api_key"` - CapacityID string `json:"capacity_id"` - ImageID string `json:"image_id"` + RefID string + APIKey string `json:"api_key"` } var _ v1.CloudCredential = &SFCCredentialV2{} -func NewSFCCredentialV2(refID, apiKey, capacityID, imageID string) *SFCCredentialV2 { +func NewSFCCredentialV2(refID, apiKey string) *SFCCredentialV2 { return &SFCCredentialV2{ - RefID: refID, - APIKey: apiKey, - CapacityID: capacityID, - ImageID: imageID, + RefID: refID, + APIKey: apiKey, } } @@ -47,12 +42,10 @@ func (c *SFCCredentialV2) GetTenantID() (string, error) { type SFCClientV2 struct { v1.NotImplCloudClient - refID string - location string - capacityID string - imageID string - client *sfc.SDK - logger v1.Logger + refID string + location string + client *sfc.SDK + logger v1.Logger } var _ v1.CloudClient = &SFCClientV2{} @@ -67,12 +60,10 @@ func WithLogger(logger v1.Logger) SFCClientV2Option { func (c *SFCCredentialV2) MakeClientWithOptions(_ context.Context, location string, opts ...SFCClientV2Option) (v1.CloudClient, error) { sfcClient := &SFCClientV2{ - refID: c.RefID, - location: location, - capacityID: c.CapacityID, - imageID: c.ImageID, - client: sfc.New(sfc.WithSecurity(c.APIKey)), - logger: &v1.NoopLogger{}, + refID: c.RefID, + location: location, + client: sfc.New(sfc.WithSecurity(c.APIKey)), + logger: &v1.NoopLogger{}, } for _, opt := range opts { diff --git a/v1/providers/sfcomputev2/instance.go b/v1/providers/sfcomputev2/instance.go index 7b96202..fea8b53 100644 --- a/v1/providers/sfcomputev2/instance.go +++ b/v1/providers/sfcomputev2/instance.go @@ -4,8 +4,8 @@ import ( "context" "encoding/base64" "fmt" + "maps" "slices" - "strings" "time" "github.com/alecthomas/units" @@ -16,25 +16,21 @@ import ( "github.com/sfcompute/sfc-go/optionalnullable" ) - func (c *SFCClientV2) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) { c.logger.Debug(ctx, "sfcv2: CreateInstance start", v1.LogField("name", attrs.Name), v1.LogField("location", attrs.Location), ) - stage := getStageFromTags(attrs.Tags) - tags := map[string]string{ - tagKeyCloudCredRefID: c.refID, - tagKeyStage: stage, - tagKeyRefID: attrs.RefID, - tagKeyName: attrs.Name, - } + tags := make(map[string]string, len(attrs.Tags)+2) + maps.Copy(tags, attrs.Tags) + tags[tagKeyCloudCredRefID] = c.refID + tags[tagKeyRefID] = attrs.RefID cloudInit := sshKeyCloudInit(attrs.PublicKey) resp, err := c.client.Instances.Create(ctx, components.CreateInstanceRequest{ - Capacity: c.capacityID, - Image: c.imageID, + Capacity: BrevProductionCapacityID, + Image: BrevProductionImageID, CloudInitUserData: &cloudInit, Tags: optionalnullable.From(&tags), Name: optionalnullable.From(&attrs.Name), @@ -99,8 +95,9 @@ func (c *SFCClientV2) ListInstances(ctx context.Context, args v1.ListInstancesAr v1.LogField("location", c.location), ) + capacityID := BrevProductionCapacityID resp, err := c.client.Instances.List(ctx, operations.ListInstancesRequest{ - Capacity: &c.capacityID, + Capacity: &capacityID, }) if err != nil { return nil, errors.WrapAndTrace(err) @@ -111,8 +108,6 @@ func (c *SFCClientV2) ListInstances(ctx context.Context, args v1.ListInstancesAr var instances []v1.Instance for _, inst := range resp.ListInstancesResponse.Data { - inst := inst // capture loop variable - // Filter by instance IDs if specified. if len(args.InstanceIDs) > 0 && !slices.Contains(args.InstanceIDs, v1.CloudProviderInstanceID(inst.ID)) { continue @@ -185,10 +180,14 @@ func (c *SFCClientV2) sfcInstanceToBrevInstance(inst *components.InstanceRespons if cloudCredRefID == "" { cloudCredRefID = c.refID } - refID := tags[tagKeyRefID] - name := tags[tagKeyName] - if name == "" { - name = inst.Name + + userTags := make(v1.Tags) + for k, v := range tags { + switch k { + case tagKeyCloudCredRefID, tagKeyRefID: + default: + userTags[k] = v + } } status := sfcStatusToLifecycleStatus(inst.Status) @@ -200,9 +199,9 @@ func (c *SFCClientV2) sfcInstanceToBrevInstance(inst *components.InstanceRespons diskSize := units.Base2Bytes(diskInt64 * int64(units.Gibibyte)) return &v1.Instance{ - Name: name, + Name: inst.Name, CloudID: v1.CloudProviderInstanceID(inst.ID), - RefID: refID, + RefID: tags[tagKeyRefID], PublicDNS: sshHostname, PublicIP: sshHostname, SSHUser: defaultSSHUsername, @@ -215,11 +214,12 @@ func (c *SFCClientV2) sfcInstanceToBrevInstance(inst *components.InstanceRespons }, InstanceTypeID: h100InstanceTypeMetadata.instanceTypeID, InstanceType: h100InstanceType, - Location: c.location, + Location: sfcLocation, Spot: false, Stoppable: false, Rebootable: false, CloudCredRefID: cloudCredRefID, + Tags: userTags, }, nil } @@ -238,15 +238,6 @@ func sfcStatusToLifecycleStatus(status components.InstanceStatus) v1.LifecycleSt } } -func getStageFromTags(tags v1.Tags) string { - for k, v := range tags { - if strings.HasSuffix(k, "-stage") { - return v - } - } - return "unknown" -} - func (c *SFCClientV2) RebootInstance(_ context.Context, _ v1.CloudProviderInstanceID) error { return v1.ErrNotImplemented } diff --git a/v1/providers/sfcomputev2/instancetype.go b/v1/providers/sfcomputev2/instancetype.go index 21fc3da..af06792 100644 --- a/v1/providers/sfcomputev2/instancetype.go +++ b/v1/providers/sfcomputev2/instancetype.go @@ -132,10 +132,9 @@ func (c *SFCClientV2) GetInstanceTypes(ctx context.Context, args v1.GetInstanceT } // availableSlots returns how many more instances can be created in the configured capacity. -// It subtracts the count of active (running + awaiting_allocation) instances from the total -// procurement target. +// It subtracts the count of non-terminated instances from the current capacity allocation. func (c *SFCClientV2) availableSlots(ctx context.Context) (int, error) { - target, err := c.procurementTarget(ctx) + allocated, err := c.currentCapacityAllocation(ctx) if err != nil { return 0, errors.WrapAndTrace(err) } @@ -145,45 +144,38 @@ func (c *SFCClientV2) availableSlots(ctx context.Context) (int, error) { return 0, errors.WrapAndTrace(err) } - available := target - active - if available < 0 { - available = 0 - } - return available, nil + return max(allocated-active, 0), nil } -// procurementTarget sums the Integer targets from all procurements on c.capacityID. -func (c *SFCClientV2) procurementTarget(ctx context.Context) (int, error) { - resp, err := c.client.Procurements.List(ctx, operations.ListProcurementsRequest{ - Capacity: &c.capacityID, - }) +// currentCapacityAllocation returns the NodeAllocation from the most recent schedule entry +// in BrevProductionCapacityID that is currently in effect (EffectiveAt <= now). +func (c *SFCClientV2) currentCapacityAllocation(ctx context.Context) (int, error) { + resp, err := c.client.Capacities.Fetch(ctx, BrevProductionCapacityID, nil, nil) if err != nil { return 0, errors.WrapAndTrace(err) } - if resp.ListProcurementsResponse == nil { + if resp.CapacityResponse == nil { return 0, nil } - total := 0 - for _, p := range resp.ListProcurementsResponse.Data { - if p.Target.Type == components.ProcurementTargetTypeInteger && p.Target.Integer != nil { - total += int(*p.Target.Integer) + now := time.Now().Unix() + allocation := 0 + latestAt := int64(-1) + for _, entry := range resp.CapacityResponse.AllocationSchedule.Total { + if entry.EffectiveAt <= now && entry.EffectiveAt > latestAt { + latestAt = entry.EffectiveAt + allocation = entry.NodeAllocation } } - return total, nil + return allocation, nil } -// activeInstanceCount returns the number of running or awaiting_allocation instances -// in c.capacityID. +// activeInstanceCount returns the number of non-terminated instances in BrevProductionCapacityID. +// All non-terminated instances occupy a slot in the capacity, including failed ones. func (c *SFCClientV2) activeInstanceCount(ctx context.Context) (int, error) { - activeStatuses := []components.InstanceStatus{ - components.InstanceStatusRunning, - components.InstanceStatusAwaitingAllocation, - } - + capacityID := BrevProductionCapacityID resp, err := c.client.Instances.List(ctx, operations.ListInstancesRequest{ - Capacity: &c.capacityID, - Status: activeStatuses, + Capacity: &capacityID, }) if err != nil { return 0, errors.WrapAndTrace(err) @@ -192,7 +184,13 @@ func (c *SFCClientV2) activeInstanceCount(ctx context.Context) (int, error) { return 0, nil } - return len(resp.ListInstancesResponse.Data), nil + count := 0 + for _, inst := range resp.ListInstancesResponse.Data { + if inst.Status != components.InstanceStatusTerminated { + count++ + } + } + return count, nil } func (c *SFCClientV2) GetLocations(_ context.Context, _ v1.GetLocationsArgs) ([]v1.Location, error) { diff --git a/v1/providers/sfcomputev2/validation_test.go b/v1/providers/sfcomputev2/validation_test.go new file mode 100644 index 0000000..db4c265 --- /dev/null +++ b/v1/providers/sfcomputev2/validation_test.go @@ -0,0 +1,50 @@ +package v2 + +import ( + "os" + "testing" + + "github.com/brevdev/cloud/internal/validation" + v1 "github.com/brevdev/cloud/v1" +) + +func TestValidationFunctions(t *testing.T) { + t.Parallel() + checkSkip(t) + + config := validation.ProviderConfig{ + Credential: NewSFCCredentialV2("validation-test", getAPIKey()), + StableIDs: []v1.InstanceTypeID{ + h100InstanceTypeMetadata.instanceTypeID, + }, + } + + validation.RunValidationSuite(t, config) +} + +func TestInstanceLifecycleValidation(t *testing.T) { + t.Parallel() + checkSkip(t) + + config := validation.ProviderConfig{ + Credential: NewSFCCredentialV2("validation-test", getAPIKey()), + Location: sfcLocation, + } + + validation.RunInstanceLifecycleValidation(t, config) +} + +func checkSkip(t *testing.T) { + t.Helper() + apiKey := getAPIKey() + isValidationTest := os.Getenv("VALIDATION_TEST") + if apiKey == "" && isValidationTest != "" { + t.Fatal("SFCOMPUTE_API_KEY not set, but VALIDATION_TEST is set") + } else if apiKey == "" { + t.Skip("SFCOMPUTE_API_KEY not set, skipping sfcomputev2 validation tests") + } +} + +func getAPIKey() string { + return os.Getenv("SFCOMPUTE_API_KEY") +} From 0dd0e23f7b968a58a83906dceccad9644c2c511e Mon Sep 17 00:00:00 2001 From: andreaanez Date: Tue, 5 May 2026 15:06:30 -0700 Subject: [PATCH 4/4] chore: swap sfc-go local replace for tagged release v0.1.0-preview Co-Authored-By: Claude Sonnet 4.6 --- go.mod | 4 +--- go.sum | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index a608c70..5c04c3a 100644 --- a/go.mod +++ b/go.mod @@ -22,7 +22,7 @@ require ( github.com/nebius/gosdk v0.0.0-20250826102719-940ad1dfb5de github.com/pkg/errors v0.9.1 github.com/sfcompute/nodes-go v0.1.0-alpha.4 - github.com/sfcompute/sfc-go v0.0.0-local + github.com/sfcompute/sfc-go v0.1.0-preview github.com/stretchr/testify v1.11.1 golang.org/x/crypto v0.47.0 golang.org/x/text v0.33.0 @@ -115,5 +115,3 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) - -replace github.com/sfcompute/sfc-go v0.0.0-local => /Users/andreaanez/Documents/sf_compute/sfc-go diff --git a/go.sum b/go.sum index 26e4b65..f95ba1b 100644 --- a/go.sum +++ b/go.sum @@ -162,6 +162,8 @@ github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0t github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/sfcompute/nodes-go v0.1.0-alpha.4 h1:oFBWcMPSpqLYm/NDs5I1jTvzgx9rsXDL9Ghsm30Hc0Q= github.com/sfcompute/nodes-go v0.1.0-alpha.4/go.mod h1:nUviHgK+Fgt2hDFcRL3M8VoyiypC8fc0dsY8C30QU8M= +github.com/sfcompute/sfc-go v0.1.0-preview h1:yJ6ICglA/JZal2kauzb2aZlV9XdLPejsvFpsKwwThkQ= +github.com/sfcompute/sfc-go v0.1.0-preview/go.mod h1:vhUpRpAHKitZzzWPg87RjreC+pzK57PGe4ZuSIQSk94= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I=