From 62dc30cd57aeaff4620982f1afff6536a4283b63 Mon Sep 17 00:00:00 2001 From: Alex Timofeyev Date: Wed, 10 Jun 2026 17:44:20 +0000 Subject: [PATCH 1/2] DAOS-18972 control: add addr_format YAML key for fabric IP family The CaRT DAOS-18972 change exposed the fabric address-family preference as the D_ADDR_FORMAT environment variable / cio_addr_format API field (values: unspec, ipv4, ipv6, native). Operators currently have to inject that env var into each engine by hand to bring up an IPv6-only fabric NIC, which is easy to get wrong and undocumented in the server config. Surface the same knob declaratively in daos_server.yml as a per-engine "addr_format" key, mirroring the existing fabric_auth_key -> D_PROVIDER_AUTH_KEY pattern: * engine.FabricConfig grows an AddrFormat string field tagged `yaml:"addr_format,omitempty" cmdEnv:"D_ADDR_FORMAT"`. The existing reflection-based cmdEnv machinery emits D_ADDR_FORMAT (set to the configured value) to the engine environment only when the field is non-empty, so omitting the key preserves the historical (Mercury-default, IPv4-preferring) behavior with no functional change for existing deployments. * FabricConfig.Update() propagates AddrFormat from the server-level fabric config to each engine, consistent with the other fabric fields. * FabricConfig.Validate() rejects unrecognized values up front (rather than relying on CaRT's silent fallback) and enforces one value per provider for multi-provider configs, matching the fabric_iface rule. An empty value remains valid. * Add a WithFabricAddrFormat() builder and an exported FabricAddrFormats slice enumerating the accepted hints, kept in sync with CaRT's crt_str_to_addr_format(). * Document the key in utils/config/daos_server.yml and extend the engine command-line env mapping test to assert D_ADDR_FORMAT=ipv6 is emitted. Validated with `go test ./server/engine/...` (env mapping and FabricConfig validation paths all pass).
Signed-off-by: Alex Timofeyev --- src/control/server/engine/config.go | 50 ++++++++++++++++++++++++ src/control/server/engine/config_test.go | 2 + utils/config/daos_server.yml | 12 ++++++ 3 files changed, 64 insertions(+) diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index e49214ca111..e9f78f8589f 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -48,8 +48,18 @@ type FabricConfig struct { NumSecondaryEndpoints []int `yaml:"secondary_provider_endpoints,omitempty" cmdLongFlag:"--nr_sec_ctx,nonzero" cmdShortFlag:"-S,nonzero"` DisableSRX bool `yaml:"disable_srx,omitempty" cmdEnv:"FI_OFI_RXM_USE_SRX,invertBool,intBool"` AuthKey string `yaml:"fabric_auth_key,omitempty" cmdEnv:"D_PROVIDER_AUTH_KEY"` + // AddrFormat selects the preferred IP address family for fabric init. It is + // forwarded verbatim to CaRT/Mercury via the D_ADDR_FORMAT environment + // variable. An empty value leaves the historical (IPv4-preferring) default + // untouched. For multi-provider configurations it accepts a comma-separated + // list, one entry per provider, matching the provider ordering. + AddrFormat string `yaml:"addr_format,omitempty" cmdEnv:"D_ADDR_FORMAT"` } +// FabricAddrFormats enumerates the address-family hints accepted by addr_format, +// mirroring the values parsed by CaRT's crt_str_to_addr_format(). +var FabricAddrFormats = []string{"unspec", "ipv4", "ipv6", "native"} + // GetPrimaryProvider parses the primary provider from the Provider string. 
func (fc *FabricConfig) GetPrimaryProvider() (string, error) { providers, err := fc.GetProviders() @@ -157,6 +167,9 @@ func (fc *FabricConfig) Update(other FabricConfig) { if fc.AuthKey == "" { fc.AuthKey = other.AuthKey } + if fc.AddrFormat == "" { + fc.AddrFormat = other.AddrFormat + } if len(fc.NumSecondaryEndpoints) == 0 { fc.setNumSecondaryEndpoints(other.NumSecondaryEndpoints) } @@ -200,6 +213,10 @@ func (fc *FabricConfig) Validate() error { return errors.Errorf("provider, fabric_iface and fabric_iface_port must include the same number of items delimited by %q", MultiProviderSeparator) } + if err := fc.validateAddrFormat(numProv); err != nil { + return err + } + numSecProv := numProv - numPrimaryProviders if numSecProv > 0 { if len(fc.NumSecondaryEndpoints) != 0 && len(fc.NumSecondaryEndpoints) != numSecProv { @@ -216,6 +233,31 @@ func (fc *FabricConfig) Validate() error { return nil } +// validateAddrFormat ensures every comma-separated addr_format entry is a +// recognized address-family hint and that the count matches the number of +// providers (one hint per provider, as with fabric_iface). An empty +// addr_format is valid and preserves the default behavior. +func (fc *FabricConfig) validateAddrFormat(numProv int) error { + if fc.AddrFormat == "" { + return nil + } + + formats := splitMultiProviderStr(fc.AddrFormat) + for _, f := range formats { + if !common.Includes(FabricAddrFormats, f) { + return errors.Errorf("invalid addr_format %q, must be one of %v", + f, FabricAddrFormats) + } + } + + if len(formats) != numProv { + return errors.Errorf("addr_format must include one value per provider, delimited by %q", + MultiProviderSeparator) + } + + return nil +} + // cleanEnvVars scrubs the supplied slice of environment // variables by removing all variables not included in the // allow list. 
@@ -711,6 +753,14 @@ func (c *Config) WithFabricAuthKey(key string) *Config { return c } +// WithFabricAddrFormat sets the preferred IP address family for fabric init +// (forwarded to CaRT/Mercury as D_ADDR_FORMAT). Accepted values are listed in +// FabricAddrFormats; an empty value preserves the default behavior. +func (c *Config) WithFabricAddrFormat(format string) *Config { + c.Fabric.AddrFormat = format + return c +} + // WithSrxDisabled disables or enables SRX. func (c *Config) WithSrxDisabled(disable bool) *Config { c.Fabric.DisableSRX = disable diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go index 1c771cfe8b5..60faf94039b 100644 --- a/src/control/server/engine/config_test.go +++ b/src/control/server/engine/config_test.go @@ -693,6 +693,7 @@ func TestConfig_ToCmdVals(t *testing.T) { WithFabricProvider(provider). WithFabricInterface(interfaceName). WithFabricInterfacePort(interfacePort). + WithFabricAddrFormat("ipv6"). WithPinnedNumaNode(pinnedNumaNode). WithBypassHealthChk(&bypass). WithModules(modules). @@ -727,6 +728,7 @@ func TestConfig_ToCmdVals(t *testing.T) { "D_INTERFACE=" + interfaceName, "D_PORT=" + strconv.Itoa(interfacePort), "D_PROVIDER=" + provider, + "D_ADDR_FORMAT=ipv6", "D_LOG_FILE=" + logFile, "D_LOG_MASK=" + logMask, "CRT_TIMEOUT=" + strconv.FormatUint(uint64(crtTimeout), 10), diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index a16693c3564..67ae8e91775 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -136,6 +136,18 @@ #fabric_auth_key: foo:bar # # +## CART: Fabric address family +## Selects the preferred IP address family for fabric initialization, +## forwarded to CaRT/Mercury as D_ADDR_FORMAT. Leave unset to keep the +## historical default (the provider's own preference; IPv4 for verbs/RoCE). +## Set to "ipv6" for IPv6-only fabric NIC deployments, where the default +## IPv4 preference would otherwise hide the only usable interfaces. 
+## Accepted values: unspec (default), ipv4, ipv6, native. For multi-provider +## configurations, supply a comma-separated list, one entry per provider. +# +#addr_format: ipv6 +# +# ## Core Dump Filter ## Optional filter to control which mappings are written to the core ## dump in the event of a crash. See the following URL for more detail: From 8b6143068d3ede0c3655e09c96f3a7ba042b3e27 Mon Sep 17 00:00:00 2001 From: Alex Timofeyev Date: Wed, 10 Jun 2026 18:05:04 +0000 Subject: [PATCH 2/2] DAOS-18972 control: propagate addr_format to attaching clients The addr_format server config key configures the engine fabric address family, but DAOS clients (libdaos) initialize their own fabric and must select the same family to reach a given system. Address family is a property of each system's fabric, and a single client or agent may attach to several systems with different families (e.g. one IPv4 system and one IPv6 system), so the family cannot be a global client setting -- it has to travel per-system. Advertise addr_format to clients through the existing per-system GetAttachInfo network hint: when building each provider's ClientNetHint, append a D_ADDR_FORMAT entry carrying that provider's addr_format value to the hint's env_vars. The client applies hint env_vars in dc_mgmt_net_cfg_init() before crt_init(), where the CaRT D_ADDR_FORMAT handling (added with the cart half of DAOS-18972) picks it up. The agent already caches GetAttachInfo per system and copies the hint env_vars verbatim, so a client attached to multiple systems gets each system's family from that system's own hint, with no agent config. * engine.FabricConfig gains GetAddrFormats(), parsing the comma-separated addr_format into one entry per provider (mirrors GetProviders / GetInterfaces), returning empty when unset. * server.setupGrpc() derives the per-provider address family and injects D_ADDR_FORMAT into that provider's client hint env, copying the shared ClientEnvVars slice rather than mutating it.
An unset addr_format leaves the hint untouched, so existing clients see no change. * Document the client-propagation behavior in utils/config/daos_server.yml and unit-test GetAddrFormats (nil/unset/single/multi/whitespace). No proto or client C changes are required: this reuses the env_vars hint field and the already-merged CaRT D_ADDR_FORMAT support. Validated with `go test ./server/engine/...` (GetAddrFormats and the existing fabric config/env-mapping tests pass). Signed-off-by: Alex Timofeyev --- src/control/server/engine/config.go | 11 +++++++ src/control/server/engine/config_test.go | 41 ++++++++++++++++++++++++ src/control/server/server.go | 18 ++++++++++- utils/config/daos_server.yml | 6 ++++ 4 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index e9f78f8589f..82f122b11c5 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -130,6 +130,17 @@ func (fc *FabricConfig) GetInterfaces() ([]string, error) { return interfaces, nil } +// GetAddrFormats parses the AddrFormat string into zero or more per-provider +// address-family hints, matched one-to-one with the providers. Returns an empty +// slice when addr_format is unset, which preserves the default behavior. +func (fc *FabricConfig) GetAddrFormats() []string { + if fc == nil { + return nil + } + + return splitMultiProviderStr(fc.AddrFormat) +} + // GetInterfacePorts parses the InterfacePort string to one or more ports. 
func (fc *FabricConfig) GetInterfacePorts() ([]int, error) { if fc == nil { diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go index 60faf94039b..c92139f07ad 100644 --- a/src/control/server/engine/config_test.go +++ b/src/control/server/engine/config_test.go @@ -901,6 +901,47 @@ func TestFabricConfig_GetInterfaces(t *testing.T) { } } +func TestFabricConfig_GetAddrFormats(t *testing.T) { + for name, tc := range map[string]struct { + cfg *FabricConfig + expAddrFormats []string + }{ + "nil": { + expAddrFormats: nil, + }, + "unset": { + cfg: &FabricConfig{}, + expAddrFormats: []string{}, + }, + "single": { + cfg: &FabricConfig{ + AddrFormat: "ipv6", + }, + expAddrFormats: []string{"ipv6"}, + }, + "multi": { + cfg: &FabricConfig{ + AddrFormat: multiProviderString("ipv6", "ipv4"), + }, + expAddrFormats: []string{"ipv6", "ipv4"}, + }, + "excessive whitespace": { + cfg: &FabricConfig{ + AddrFormat: multiProviderString(" ipv6 ", "", "ipv4 "), + }, + expAddrFormats: []string{"ipv6", "ipv4"}, + }, + } { + t.Run(name, func(t *testing.T) { + addrFormats := tc.cfg.GetAddrFormats() + + if diff := cmp.Diff(tc.expAddrFormats, addrFormats); diff != "" { + t.Fatalf("(-want, +got):\n%s", diff) + } + }) + } +} + func TestFabricConfig_GetPrimaryInterface(t *testing.T) { for name, tc := range map[string]struct { cfg *FabricConfig diff --git a/src/control/server/server.go b/src/control/server/server.go index 69fa0703a4d..fafb846ebca 100644 --- a/src/control/server/server.go +++ b/src/control/server/server.go @@ -448,15 +448,31 @@ func (srv *server) setupGrpc() error { return err } + // Address family is a property of this system's fabric, so it must travel + // to attaching clients alongside the provider. Surface the operator's + // addr_format choice as a D_ADDR_FORMAT entry in the per-provider client + // hint env, which the client applies before crt_init (see + // dc_mgmt_net_cfg_init). 
A client/agent attached to several systems gets + // each system's family from that system's own hint. Empty addr_format + // leaves the hint untouched, preserving the historical default. + addrFormats := srv.cfg.Fabric.GetAddrFormats() + clientNetHints := make([]*mgmtpb.ClientNetHint, 0, len(providers)) for i, p := range providers { + envVars := srv.cfg.ClientEnvVars + if i < len(addrFormats) && addrFormats[i] != "" { + // Copy rather than append in place to avoid aliasing the shared + // ClientEnvVars slice across providers. + envVars = append(append([]string{}, envVars...), "D_ADDR_FORMAT="+addrFormats[i]) + } + clientNetHints = append(clientNetHints, &mgmtpb.ClientNetHint{ Provider: p, CrtTimeout: srv.cfg.Fabric.CrtTimeout, NetDevClass: uint32(srv.netDevClass[i]), SrvSrxSet: srxSetting, ProviderIdx: uint32(i), - EnvVars: srv.cfg.ClientEnvVars, + EnvVars: envVars, }) } srv.mgmtSvc.clientNetworkHint = clientNetHints diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index 67ae8e91775..0bbd8d610c3 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -144,6 +144,12 @@ ## IPv4 preference would otherwise hide the only usable interfaces. ## Accepted values: unspec (default), ipv4, ipv6, native. For multi-provider ## configurations, supply a comma-separated list, one entry per provider. +## +## This value is also advertised to attaching clients (as D_ADDR_FORMAT in the +## GetAttachInfo network hint), so clients select the same address family as +## this system's engines without separate client configuration. A client or +## agent attached to several systems picks up each system's family from that +## system's hint. # #addr_format: ipv6 #