From 675b6cc5061d100ef29d0eea9ef95de268028a29 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Thu, 4 Jun 2026 20:02:04 -0400 Subject: [PATCH 1/2] DAOS-19102 test: build Go binaries with the Valgrind tag for NLT memcheck Go 1.25+ includes a new valgrind build tag which instruments the Go runtime in a way that valgrind can comprehend. Instead of fighting a constantly losing battle with valgrind, let's work with it and ditch the unstable suppressions that need to be updated for every release of the Go toolchain. This commit introduces scons and Jenkinsfile changes to enable this build mode for NLT, but keeps the -race build for normal non-release testing. Signed-off-by: Michael MacDonald --- Jenkinsfile | 12 + ci/unit/test_nlt.sh | 8 +- ci/unit/test_nlt_node.sh | 6 +- site_scons/prereq_tools/base.py | 4 + site_scons/site_tools/go_builder.py | 16 + src/cart/utils/memcheck-cart.supp | 779 ++++------------------------ src/control/SConscript | 7 + utils/node_local_test.py | 26 + 8 files changed, 166 insertions(+), 692 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index bd4bf753c3c..62cf81a3713 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -579,6 +579,16 @@ pipeline { ' PREFIX=/opt/daos TARGET_TYPE=release')) sh label: 'Generate RPMs', script: './ci/rpm/gen_rpms.sh el9 "' + env.DAOS_RELVAL + '"' + // Valgrind-tagged variant for the NLT memcheck stage, + // stashed separately; the build above keeps -race for ftest. + job_step_update( + sconsBuild(parallel_build: true, + build_deps: 'no', + scons_args: sconsArgs() + + ' BUILD_VALGRIND=1 PREFIX=/opt/daos TARGET_TYPE=release')) + sh label: 'Stash valgrind install tree for NLT', + script: 'tar -C / -cf opt-daos-valgrind.tar opt/daos' + stash(name: 'opt-daos-valgrind', includes: 'opt-daos-valgrind.tar') } } post { @@ -715,6 +725,8 @@ pipeline { label params.CI_NLT_1_LABEL } steps { + // NLT memchecks the valgrind-tagged build, not the shared -race one. 
+ unstash 'opt-daos-valgrind' job_step_update( unitTest(timeout_time: 60, inst_repos: daosRepos(), diff --git a/ci/unit/test_nlt.sh b/ci/unit/test_nlt.sh index 23e3bc8b549..708f25165d3 100755 --- a/ci/unit/test_nlt.sh +++ b/ci/unit/test_nlt.sh @@ -10,8 +10,12 @@ rm -rf dnt.*.memcheck.xml vm_test/ NODE=${NODELIST%%,*} mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" -# Copy over the install tree and some of the build tree. -rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/ +# Copy over the install tree and some of the build tree. The memcheck NLT stage +# ships the valgrind-tagged build (opt-daos-valgrind.tar); the fault-injection +# stage ships the standard opt-daos.tar. Use whichever was unstashed. +opt_tar=opt-daos.tar +[ -f opt-daos-valgrind.tar ] && opt_tar=opt-daos-valgrind.tar +rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* "$opt_tar" utils requirements-utest.txt jenkins@"$NODE":build/ ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \ "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 53630da411a..33b1117d335 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -12,7 +12,11 @@ if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then fi cd build -tar -xf opt-daos.tar +# Memcheck NLT ships opt-daos-valgrind.tar; the fault-injection stage ships +# the standard opt-daos.tar. Extract whichever was shipped. +opt_tar=opt-daos.tar +[ -f opt-daos-valgrind.tar ] && opt_tar=opt-daos-valgrind.tar +tar -xf "$opt_tar" sudo mv opt/daos /opt/ # Setup daos admin etc. 
diff --git a/site_scons/prereq_tools/base.py b/site_scons/prereq_tools/base.py index 67e157a071e..f6ceeb05ecc 100644 --- a/site_scons/prereq_tools/base.py +++ b/site_scons/prereq_tools/base.py @@ -516,6 +516,10 @@ def __init__(self, env, opts): opts.Add(EnumVariable('WARNING_LEVEL', "Set default warning level", 'error', ['warning', 'warn', 'error'], ignorecase=2)) opts.Add(('SANITIZERS', 'Instrument C code with Google Sanitizers', None)) + opts.Add(BoolVariable('BUILD_VALGRIND', + 'Build Go artifacts with the Go "valgrind" tag for Memcheck ' + '(also drops -race; ignored for release)', + False)) opts.Add(BoolVariable('CMOCKA_FILTER_SUPPORTED', 'Allows to filter cmocka tests', False)) opts.Add(BoolVariable('CRT_PP', 'Preprocess CaRT sources', False)) opts.Add(BoolVariable('HEAP_PROFILER', 'Instrument C code with Gperftools Heap Profiler', diff --git a/site_scons/site_tools/go_builder.py b/site_scons/site_tools/go_builder.py index e6657b3b314..6de49301108 100644 --- a/site_scons/site_tools/go_builder.py +++ b/site_scons/site_tools/go_builder.py @@ -11,6 +11,21 @@ include_re = re.compile(r'\#include [<"](\S+[>"])', re.M) +def _is_valgrind_build(env): + """Return True if Go artifacts should be built with the Go 1.25+ "valgrind" tag. + + BUILD_VALGRIND=1 makes the Go runtime cooperate with Memcheck. Ignored for + release builds. 
+ """ + if not env.get('BUILD_VALGRIND'): + return False + if env.get('BUILD_TYPE') == 'release': + return False + if env.get('SANITIZERS'): + Exit('BUILD_VALGRIND=1 is incompatible with SANITIZERS') + return True + + def _scan_go_file(node, env, _path): """Scanner for go code""" src_dir = os.path.dirname(str(node)) @@ -119,6 +134,7 @@ def _check_go_version(context): return 1 env.d_go_bin = env.get("GO_BIN", env.WhereIs(GO_COMPILER, os.environ['PATH'])) + env.AddMethod(_is_valgrind_build, 'd_is_valgrind_build') if GetOption('help') or GetOption('clean'): return diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 5bdf8278a33..20b7bba8541 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -1,542 +1,156 @@ +# Go runtime { - dl init leaks + Go runtime: GC memory (valgrind integration entry point) Memcheck:Leak + match-leak-kinds: definite,indirect,possible,reachable + fun:runtime.valgrindClientRequest* ... - fun:_dl_init -} -{ - dl open leaks - Memcheck:Leak - ... - fun:_dl_open } { - _dl_fini leak + Go runtime: persistentalloc (cgo malloc, never freed) Memcheck:Leak match-leak-kinds: reachable + fun:malloc + fun:_cgo_*_Cfunc__Cmalloc + fun:runtime.asmcgocall.abi0 ... - fun:_dl_fini - ... + fun:runtime.persistentalloc } { - dlerror_run leak + Go runtime: newproc (cgo malloc, never freed) Memcheck:Leak match-leak-kinds: reachable + fun:malloc + fun:_cgo_*_Cfunc__Cmalloc + fun:runtime.asmcgocall.abi0 ... - fun:_dlerror_run - ... + fun:runtime.newproc.abi0 } { - _dl_fixup leak + Go runtime: bootstrap (cgo malloc, never freed) Memcheck:Leak match-leak-kinds: reachable + fun:malloc ... - fun:_dl_fixup - ... -} -{ - FI leak 6 - Memcheck:Leak - match-leak-kinds: possible - ... - fun:rdma_bind_addr - ... -} -{ - FI leak 7 - Memcheck:Leak - ... - fun:fi_ini - ... -} -{ - access-0 - Memcheck:Param - socketcall.sendto(msg) - ... - fun:send -} -{ - - Memcheck:Leak - ... 
- fun:x_cgo_thread_start -} -{ - - Memcheck:Addr8 - ... - fun:bufio.(*Reader).* -} -{ - - Memcheck:Value8 - ... - fun:bufio.(*Reader).* -} -{ - - Memcheck:Addr1 - ... - fun:bufio.(*Reader).* -} -{ - - Memcheck:Addr8 - fun:os.* -} -{ - - Memcheck:Addr1 - fun:bytes.* -} -{ - - Memcheck:Addr8 - fun:bytes.* -} -{ - - Memcheck:Addr1 - fun:internal/bytealg*IndexByte* -} -{ - - Memcheck:Addr4 - fun:internal/bytealg*IndexByte* -} -{ - - Memcheck:Addr8 - fun:internal/bytealg*IndexByte* -} -{ - - Memcheck:Addr1 - fun:strings.(*Builder).* -} -{ - - Memcheck:Addr8 - fun:strings.(*Builder).* -} -{ - - Memcheck:Cond - ... - fun:runtime.* -} -{ - - Memcheck:Addr1 - ... - fun:runtime.* -} -{ - - Memcheck:Addr2 - ... - fun:runtime.* -} -{ - - Memcheck:Addr4 + fun:runtime.asmcgocall.abi0 ... - fun:runtime.* + fun:runtime.rt0_go.abi0 } +# bytealg over-reads buffer ends by design; golang/go#27610 +# recommends this suppression { - + golang/go#27610: bytealg indexbytebody over-read Memcheck:Addr8 ... - fun:runtime.* + fun:indexbytebody } { - + golang/go#27610: bytealg indexbytebody over-read Memcheck:Addr16 ... - fun:runtime.* -} -{ - - Memcheck:Addr32 - ... - fun:runtime.* -} -{ - - Memcheck:Value8 - ... - fun:runtime.* -} -{ - Go conditional - Memcheck:Cond - src:*.go -} -{ - Go addr1 - Memcheck:Addr1 - src:*.go -} -{ - Go addr 2 - Memcheck:Addr2 - src:*.go -} -{ - Go addr 4 - Memcheck:Addr4 - src:*.go -} -{ - Go addr 8 - Memcheck:Addr8 - src:*.go -} -{ - Go addr 16 - Memcheck:Addr16 - src:*.go + fun:indexbytebody } { - Go addr 32 + golang/go#27610: bytealg indexbytebody over-read Memcheck:Addr32 - src:*.go -} -{ - Ga value 8 - Memcheck:Value8 - src:*.go -} -{ - - Memcheck:Addr8 - ... - fun:internal* -} -{ - - Memcheck:Addr8 - ... - fun:math.* -} -{ - - Memcheck:Addr1 - ... - fun:regexp* -} -{ - - Memcheck:Addr2 - ... - fun:regexp* -} -{ - - Memcheck:Addr4 - ... - fun:regexp* -} -{ - - Memcheck:Addr8 - ... - fun:regexp* -} -{ - - Memcheck:Addr16 ... 
- fun:regexp* -} -{ - - Memcheck:Addr8 - ... - fun:sort.* -} -{ - - Memcheck:Addr1 - ... - fun:encoding/* -} -{ - - Memcheck:Addr8 - ... - fun:encoding/* -} -{ - - Memcheck:Addr16 - ... - fun:encoding/* -} -{ - - Memcheck:Addr1 - ... - fun:hash/* -} -{ - - Memcheck:Addr4 - ... - fun:hash/* -} -{ - - Memcheck:Addr16 - ... - fun:hash/* -} -{ - - Memcheck:Value8 - ... - fun:hash/* -} -{ - - Memcheck:Addr1 - ... - fun:compress/* -} -{ - - Memcheck:Addr8 - ... - fun:compress/* -} -{ - - Memcheck:Addr8 - ... - fun:sync.* -} -{ - - Memcheck:Addr16 - ... - fun:sync.* -} -{ - - Memcheck:Addr16 - fun:aeshash* -} -{ - - Memcheck:Addr8 - fun:memeqbody + fun:indexbytebody } { - + golang/go#27610: bytealg indexbytebody over-read Memcheck:Cond - fun:memeqbody -} -{ - - Memcheck:Addr1 ... - fun:main.* -} -{ - - Memcheck:Addr8 - ... - fun:main.* -} -{ - - Memcheck:Addr16 - ... - fun:main.* -} -{ - - Memcheck:Addr8 - fun:fmt.* -} -{ - - Memcheck:Addr1 - ... - fun:github.com/* -} -{ - - Memcheck:Addr4 - ... - fun:github.com/* + fun:indexbytebody } + +# glibc / loader / NSS { - - Memcheck:Addr8 + glibc: per-thread TLS (dtv) + Memcheck:Leak + match-leak-kinds: possible,reachable + fun:calloc ... - fun:github.com/* -} -{ - - Memcheck:Addr16 + fun:_dl_allocate_tls ... - fun:github.com/* -} -{ - - Memcheck:Cond + fun:pthread_create* ... - fun:*golang.org/* } { - - Memcheck:Addr1 + glibc: dl_init + Memcheck:Leak ... - fun:*golang.org/* + fun:_dl_init } { - - Memcheck:Addr4 + glibc: dl_open + Memcheck:Leak ... - fun:*golang.org/* + fun:_dl_open } { - - Memcheck:Addr8 + glibc: dl_fini + Memcheck:Leak + match-leak-kinds: reachable ... - fun:*golang.org/* -} -{ - - Memcheck:Addr16 + fun:_dl_fini ... - fun:*golang.org/* } { - - Memcheck:Value8 + glibc: dlerror_run + Memcheck:Leak + match-leak-kinds: reachable ... 
- fun:*golang.org/* -} -{ - - Memcheck:Addr8 - fun:reflect.* -} -{ - - Memcheck:Addr16 - fun:reflect.* -} -{ - - Memcheck:Addr8 - fun:unicode/utf8* -} -{ - - Memcheck:Addr8 - fun:strconv.Unquote -} -{ - - Memcheck:Addr8 - fun:racecall -} -{ - - Memcheck:Cond - fun:racecalladdr -} -{ - - Memcheck:Addr8 + fun:_dlerror_run ... - fun:indexbytebody } { - - Memcheck:Addr32 + glibc: dl_fixup + Memcheck:Leak + match-leak-kinds: reachable ... - fun:indexbytebody -} -{ - - Memcheck:Cond + fun:_dl_fixup ... - fun:indexbytebody } { - go-cond-racecall - Memcheck:Cond + glibc: getpwnam_r (NSS) + Memcheck:Leak + fun:*alloc ... - fun:racecall -} -{ - go-value8-write_racecall - Memcheck:Value8 - fun:__tsan_write - fun:racecall -} -{ - go-value8-racecall - Memcheck:Value8 - fun:_ZN6__tsan9ShadowSetEPNS_9RawShadowES1_S0_ - fun:racecall -} -{ - MemoryRangeSet ShadowSet - Memcheck:Value8 - fun:ShadowSet - fun:_ZN6__tsanL14MemoryRangeSetEmmNS_9RawShadowE - fun:racecall + fun:getpwnam_r* } { - FI leak 8 + glibc: getpwuid_r (NSS) Memcheck:Leak - match-leak-kinds: reachable fun:calloc - fun:_dlerror_run - fun:dlopen* - fun:_goboringcrypto_DLOPEN_OPENSSL - fun:_cgo_*_Cfunc__goboringcrypto_DLOPEN_OPENSSL - fun:runtime.asmcgocall + ... + fun:getpwuid_r* } { - + glibc: localtime / tz data Memcheck:Leak - match-leak-kinds: reachable fun:malloc - fun:hg_dlog_mkcount32 ... + fun:__tz_convert } + +# libfabric (OFI) { - + libfabric: provider init (fi_ini) Memcheck:Leak - match-leak-kinds: reachable - fun:malloc - fun:hg_dlog_mkcount64 + ... + fun:fi_ini ... } { - FI leak 9 + libfabric: rdma_bind_addr init Memcheck:Leak match-leak-kinds: possible - fun:calloc - fun:_dl_allocate_tls - fun:pthread_create* ... - fun:na_ofi_initialize - fun:NA_Initialize_opt - fun:hg_core_init - fun:HG_Core_init_opt - fun:HG_Init_opt - fun:crt_hg_class_init + fun:rdma_bind_addr + ... } { - Tcp provider with ofi rxm + libfabric: tcp/rxm sends uninitialized bytes (msg_iov[1]) Memcheck:Param sendmsg(msg.msg_iov[1]) ... 
@@ -544,7 +158,7 @@ ... } { - Tcp provider with ofi rxm 2 + libfabric: tcp/rxm sends uninitialized bytes (msg_iov[2]) Memcheck:Param sendmsg(msg.msg_iov[2]) ... @@ -552,253 +166,40 @@ ... } { - Go syscall write - Memcheck:Param - write(buf) - fun:internal/runtime/*Syscall6 -} -{ - Go syscall read + libfabric: send() uninitialized bytes Memcheck:Param - read(buf) - fun:internal/runtime/*Syscall6 -} -{ - context Err() - Memcheck:Addr8 - fun:context.(*valueCtx).Err -} -{ - Racecall cgo malloc - Memcheck:Leak - match-leak-kinds: reachable - fun:malloc - fun:_cgo_*_Cfunc__Cmalloc - fun:runtime.asmcgocall.abi0 - ... - fun:racecall -} -{ - DAOS-14680-2 - Memcheck:Value8 - fun:memeqbody -} -{ - DAOS-14680-3 - Memcheck:Cond - fun:aeshashbody -} -{ - DAOS-14680-4 - Memcheck:Value8 - fun:aeshashbody -} -{ - DAOS-15548 - Memcheck:Addr1 - fun:racecallatomic -} -{ - __tsan_go_atomic64_load - Memcheck:Addr8 - ... - fun:__tsan_go_atomic64_load - fun:racecall -} -{ - __tsan_go_atomic64_store - Memcheck:Addr8 - ... - fun:__tsan_go_atomic64_store - fun:racecall -} -{ - __tsan_go_atomic64_compare_exchange - Memcheck:Addr8 + socketcall.sendto(msg) ... - fun:__tsan_go_atomic64_compare_exchange - fun:racecall + fun:send } + +# Mercury (HG) { - Persistentalloc cgo malloc + mercury: hg_dlog_mkcount32 Memcheck:Leak match-leak-kinds: reachable fun:malloc - fun:_cgo_*_Cfunc__Cmalloc - fun:runtime.asmcgocall.abi0 + fun:hg_dlog_mkcount32 ... - fun:runtime.persistentalloc } { - Newproc cgo malloc + mercury: hg_dlog_mkcount64 Memcheck:Leak match-leak-kinds: reachable fun:malloc - fun:_cgo_*_Cfunc__Cmalloc - fun:runtime.asmcgocall.abi0 - ... - fun:runtime.newproc.abi0 -} -{ - __tsan_write_pc - Memcheck:Value8 - ... - fun:__tsan_write_pc - fun:racecall -} -{ - __tsan_read_pc - Memcheck:Value8 - ... - fun:__tsan_read_pc - fun:racecall -} -{ - tsan::MemoryAccessRange - Memcheck:Value8 - ... - fun:_ZN6__tsan18MemoryAccessRangeTILb0EEEvPNS_11ThreadStateEmmm - ... 
- fun:racecall -} -{ - tsan::MemoryAccessRange - Memcheck:Value8 - ... - fun:_ZN6__tsan18MemoryAccessRangeTILb1EEEvPNS_11ThreadStateEmmm - ... - fun:racecall -} -{ - tsan::TraceRestartMemoryAccess - Memcheck:Value8 - ... - fun:_ZN6__tsan24TraceRestartMemoryAccessEPNS_11ThreadStateEmmmm - ... - fun:racecall -} -{ - __tsan_read - Memcheck:Value8 - ... - fun:__tsan_read - fun:racecall -} -{ - __tsan_write - Memcheck:Value8 - ... - fun:__tsan_write - fun:racecall -} -{ - racecallatomic - Memcheck:Addr8 - fun:racecallatomic -} -{ - racecalladdr - Memcheck:Addr8 - fun:racecalladdr -} -{ - __tsan_go_atomic32_load - Memcheck:Addr4 - ... - fun:__tsan_go_atomic32_load - fun:racecall -} -{ - __tsan_go_atomic32_store - Memcheck:Addr4 - ... - fun:__tsan_go_atomic32_store - fun:racecall -} -{ - __tsan_go_atomic32_compare_exchange - Memcheck:Addr4 + fun:hg_dlog_mkcount64 ... - fun:__tsan_go_atomic32_compare_exchange - fun:racecall } + +# BoringCrypto / OpenSSL { - racefuncenter - Memcheck:Addr8 - fun:racefuncenter -} -{ - Runtime bootstrap memory leak + boringcrypto: dlopen of libssl Memcheck:Leak match-leak-kinds: reachable - fun:malloc - ... - fun:runtime.asmcgocall.abi0 - ... - fun:runtime.rt0_go.abi0 -} -{ - bytealg.cmpbody 32-bit - Memcheck:Addr32 - fun:cmpbody -} -{ - bytealg.cmpbody 16-bit - Memcheck:Addr16 - fun:cmpbody -} -{ - bytealg.cmpbody 8-bit - Memcheck:Addr8 - fun:cmpbody -} -{ - bytealg.cmpbody 1-bit - Memcheck:Addr1 - fun:cmpbody -} -{ - bytealg.indexbytebody - Memcheck:Addr16 - fun:indexbytebody -} -{ - bytealg.countbody - Memcheck:Addr16 - fun:countbody -} -{ - __tsan_go_atomic32_fetch_add - Memcheck:Addr4 - ... - fun:__tsan_go_atomic32_fetch_add - fun:racecall -} -{ - __tsan_go_atomic64_fetch_add - Memcheck:Addr8 - ... - fun:__tsan_go_atomic64_fetch_add - fun:racecall -} -{ - getpwnam_r() leak - Memcheck:Leak - fun:*alloc - ... - fun:getpwnam_r* -} -{ - getpwuid_r() leak - Memcheck:Leak fun:calloc - ... 
- fun:getpwuid_r* -} -{ - localtime() leak - Memcheck:Leak - fun:malloc - ... - fun:__tz_convert + fun:_dlerror_run + fun:dlopen* + fun:_goboringcrypto_DLOPEN_OPENSSL + fun:_cgo_*_Cfunc__goboringcrypto_DLOPEN_OPENSSL + fun:runtime.asmcgocall } diff --git a/src/control/SConscript b/src/control/SConscript index 754ebfd3463..3af67828687 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -27,6 +27,8 @@ def get_build_tags(benv): if is_server_build(benv): print("Building server go binary: adding 'server' build tag") tags.append("server") + if benv.d_is_valgrind_build(): + tags.append("valgrind") return f"-tags {','.join(tags)}" @@ -52,6 +54,11 @@ def get_build_flags(benv): return '-buildmode=pie' # Disable optimizations and inlining for debugger support flags = '-gcflags "all=-N -l"' + # Valgrind variant: instrument the Go runtime for Memcheck and leave the + # race detector off because it does not stack with memcheck. + # + if benv.d_is_valgrind_build(): + return flags # enable AddressSanitizer to detect memory safety issues at runtime if 'SANITIZERS' in benv and benv['SANITIZERS'] != "": return f'-asan {flags}' diff --git a/utils/node_local_test.py b/utils/node_local_test.py index c51b26ad33f..ce341838406 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -456,6 +456,21 @@ def get_base_env(clean=False): return env +def check_memcheck_build(conf): + """Fail early if the daos binary is not valgrind-tagged for a memcheck run. + + memcheck-cart.supp no longer suppresses the Go runtime; that relies on the + binary being built with the Go 1.25+ "valgrind" tag (BUILD_VALGRIND=1). + """ + daos_bin = join(conf['PREFIX'], 'bin', 'daos') + with open(daos_bin, 'rb') as fd: + if b'runtime.valgrindRegisterStack' not in fd.read(): + raise NLTestFail( + f'{daos_bin} is not built with the Go "valgrind" tag (needs ' + 'Go 1.25+ and BUILD_VALGRIND=1), to run under memcheck.' 
+ ' Rebuild with: scons install BUILD_VALGRIND=1') + + class DaosPool(): """Class to store data about daos pools""" @@ -1051,6 +1066,7 @@ def run_daos_client_cmd(self, cmd): exec_cmd.extend(cmd) cmd_env = get_base_env() + valgrind_hdl.add_memcheck_env(cmd_env) with tempfile.NamedTemporaryFile(prefix=f'dnt_cmd_{get_inc_id()}_', suffix='.log', @@ -1294,6 +1310,13 @@ def get_cmd_prefix(self): return cmd + def add_memcheck_env(self, env): + """Disable Go async preemption for a command run under memcheck.""" + if not self.use_valgrind: + return + godebug = env.get('GODEBUG') + env['GODEBUG'] = f'{godebug},asyncpreemptoff=1' if godebug else 'asyncpreemptoff=1' + def convert_xml(self): """Modify the xml file""" if not self.use_valgrind: @@ -1731,6 +1754,7 @@ def run_daos_cmd(conf, exec_cmd.extend(daos_cmd) cmd_env = get_base_env() + valgrind_hdl.add_memcheck_env(cmd_env) if conf.args.client_debug: cmd_env['D_LOG_MASK'] = conf.args.client_debug @@ -6657,6 +6681,8 @@ def run(wf, args): conf.set_wf(wf) conf.set_args(args) + if args.memcheck != 'no': + check_memcheck_build(conf) setup_log_test(conf) fi_test = False From 283a0c07e28590a413983de6b1c46453628f1932 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Mon, 8 Jun 2026 18:36:25 -0400 Subject: [PATCH 2/2] DAOS-19102 test: add --repeat/--failfast to NLT By default, NLT runs each test once. Add a --repeat N flag to allow for multiple iterations to really soak a change. Add a --failfast flag to allow the loop to be broken if any iteration fails; otherwise the loop will keep going until the requested number of loops has completed.
New test pragmas: - NLT-repeat: N - NLT-repeat-failfast: true Signed-off-by: Michael MacDonald --- Jenkinsfile | 8 ++- utils/node_local_test.py | 134 ++++++++++++++++++++++++--------------- 2 files changed, 90 insertions(+), 52 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 62cf81a3713..bac919cb484 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -728,7 +728,7 @@ pipeline { // NLT memchecks the valgrind-tagged build, not the shared -race one. unstash 'opt-daos-valgrind' job_step_update( - unitTest(timeout_time: 60, + unitTest(timeout_time: 60 * cachedCommitPragma(pragma: 'NLT-repeat', def_val: '1').toInteger(), inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh' + ' --system-ram-reserved 4' + @@ -736,7 +736,11 @@ pipeline { ' --dfuse-dir /localhome/jenkins/' + ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + - ' --class-name nlt all', + ' --class-name nlt' + + " --repeat ${cachedCommitPragma(pragma: 'NLT-repeat', def_val: '1')}" + + /* groovylint-disable-next-line LineLength */ + (cachedCommitPragma(pragma: 'NLT-repeat-failfast', def_val: 'false').toLowerCase() == 'true' ? 
' --failfast' : '') + + ' all', with_valgrind: 'memcheck', valgrind_pattern: '*memcheck.xml', always_script: 'ci/unit/test_nlt_post.sh', diff --git a/utils/node_local_test.py b/utils/node_local_test.py index ce341838406..0a4eab7d7ef 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -565,7 +565,7 @@ class DaosServer(): """Manage a DAOS server instance""" def __init__(self, conf, test_class=None, valgrind=False, wf=None, fatal_errors=None, - enable_fi=False): + enable_fi=False, wipe_on_exit=False): self.running = False self._file = __file__.lstrip('./') self._sp = None @@ -622,6 +622,8 @@ def __init__(self, conf, test_class=None, valgrind=False, wf=None, fatal_errors= self.network_provider = None self.fuse_procs = [] + self.wipe_on_exit = wipe_on_exit + self.scm_mounts = [] def __enter__(self): self._start() @@ -631,6 +633,10 @@ def __exit__(self, _type, _value, _traceback): rc = self._stop(self.wf) if rc != 0 and self.fatal_errors is not None: self.fatal_errors.fail() + if self.wipe_on_exit: + for mount in self.scm_mounts: + ret = subprocess.run(['sudo', 'umount', mount], check=False) + print(f'rc from umount {mount}: {ret.returncode}') return False def add_fuse(self, fuse): @@ -792,6 +798,7 @@ def _start(self): engine['first_core'] = ref_engine['targets'] * idx engine['fabric_iface_port'] += server_port_count * idx engine['storage'][0]['scm_mount'] = f'{ref_engine["storage"][0]["scm_mount"]}_{idx}' + self.scm_mounts.append(engine['storage'][0]['scm_mount']) scyaml['engines'].append(engine) self._yaml_file = tempfile.NamedTemporaryFile(prefix='nlt-server-config-', suffix='.yaml') self._yaml_file.write(yaml.dump(scyaml, encoding='utf-8')) @@ -6659,6 +6666,59 @@ def expand_test_list(raw_test_list, excluded_name_dict): return test_variants +def _run_test_pass(conf, args, server, fatal_errors, special_list, test_dict, excluded_dict): + """Run one pass of the requested tests against server; return whether FI/dfuse is wanted.""" + fi_test_dfuse = 
False + if args.mode == 'launch': + run_in_fg(server, conf, args) + elif args.mode == 'overlay' and 'special_dfuse_overlay' in special_list: + fatal_errors.add_result(run_duns_overlay_test(server, conf)) + elif args.mode == 'set-fi': + fatal_errors.add_result(server.set_fi()) + elif args.mode == 'all': + fi_test_dfuse = True + fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) + if 'special_dfuse_multi' in special_list: + fatal_errors.add_result(run_dfuse(server, conf)) + if 'special_dfuse_overlay' in special_list: + fatal_errors.add_result(run_duns_overlay_test(server, conf)) + test_pydaos_kv(server, conf) + test_pydaos_kv_obj_class(server, conf) + fatal_errors.add_result(server.set_fi()) + elif args.test == 'all': + fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) + elif args.test: + special_list = [x for x in args.test if is_special_testname(x)] + despecialed_list = ['test_' + x for x in args.test if not is_special_testname(x)] + custom_test_dict = expand_input_list(despecialed_list) + custom_exclusions = explicit_list_to_exclusion_list(custom_test_dict) + exclusion_union = {} + for key in custom_test_dict: + exclusion_list = \ + list(set(custom_exclusions.get(key, [])).union( + set(excluded_dict.get(key, [])))) + if len(exclusion_list) > 0: + exclusion_union[key] = exclusion_list + needs_dfuse_with_opt.record_exclusions(exclusion_union) + custom_filtered_dict = expand_test_list(custom_test_dict.keys(), exclusion_union) + if len(custom_filtered_dict) == 0 and len(special_list) == 0: + print('No tests to run!') + sys.exit(1) + if len(custom_filtered_dict) > 0: + fatal_errors.add_result( + run_posix_tests(server, conf, custom_filtered_dict.keys())) + if 'special_dfuse_multi' in special_list: + fatal_errors.add_result(run_dfuse(server, conf)) + if 'special_dfuse_overlay' in special_list: + fatal_errors.add_result(run_duns_overlay_test(server, conf)) + else: + fatal_errors.add_result(run_posix_tests(server, conf, 
test_dict.keys())) + if 'special_dfuse_multi' in special_list: + fatal_errors.add_result(run_dfuse(server, conf)) + fatal_errors.add_result(server.set_fi()) + return fi_test_dfuse + + def run(wf, args): """Main entry point""" # pylint: disable=too-many-branches @@ -6693,55 +6753,25 @@ def run(wf, args): if args.mode == 'fi': fi_test = True else: - with DaosServer(conf, test_class='first', wf=wf_server, - fatal_errors=fatal_errors) as server: - if args.mode == 'launch': - run_in_fg(server, conf, args) - elif args.mode == 'overlay' and 'special_dfuse_overlay' in special_list: - fatal_errors.add_result(run_duns_overlay_test(server, conf)) - elif args.mode == 'set-fi': - fatal_errors.add_result(server.set_fi()) - elif args.mode == 'all': - fi_test_dfuse = True - fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) - if 'special_dfuse_multi' in special_list: - fatal_errors.add_result(run_dfuse(server, conf)) - if 'special_dfuse_overlay' in special_list: - fatal_errors.add_result(run_duns_overlay_test(server, conf)) - test_pydaos_kv(server, conf) - test_pydaos_kv_obj_class(server, conf) - fatal_errors.add_result(server.set_fi()) - elif args.test == 'all': - fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) - elif args.test: - special_list = [x for x in args.test if is_special_testname(x)] - despecialed_list = ['test_' + x for x in args.test if not is_special_testname(x)] - custom_test_dict = expand_input_list(despecialed_list) - custom_exclusions = explicit_list_to_exclusion_list(custom_test_dict) - exclusion_union = {} - for key in custom_test_dict: - exclusion_list = \ - list(set(custom_exclusions.get(key, [])).union( - set(excluded_dict.get(key, [])))) - if len(exclusion_list) > 0: - exclusion_union[key] = exclusion_list - needs_dfuse_with_opt.record_exclusions(exclusion_union) - custom_filtered_dict = expand_test_list(custom_test_dict.keys(), exclusion_union) - if len(custom_filtered_dict) == 0 and len(special_list) == 0: - 
print('No tests to run!') - sys.exit(1) - if len(custom_filtered_dict) > 0: - fatal_errors.add_result( - run_posix_tests(server, conf, custom_filtered_dict.keys())) - if 'special_dfuse_multi' in special_list: - fatal_errors.add_result(run_dfuse(server, conf)) - if 'special_dfuse_overlay' in special_list: - fatal_errors.add_result(run_duns_overlay_test(server, conf)) - else: - fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) - if 'special_dfuse_multi' in special_list: - fatal_errors.add_result(run_dfuse(server, conf)) - fatal_errors.add_result(server.set_fi()) + for rep in range(args.repeat): + if args.repeat > 1: + print(f'=== NLT repeat iteration {rep + 1}/{args.repeat} ===') + + try: + # reset after each iteration, except on the last one + with DaosServer(conf, test_class='first', wf=wf_server, + fatal_errors=fatal_errors, + wipe_on_exit=rep < args.repeat - 1) as server: + fi_test_dfuse = _run_test_pass(conf, args, server, fatal_errors, + special_list, test_dict, excluded_dict) + except Exception as error: # pylint: disable=broad-exception-caught + if args.repeat == 1: + raise + print(f'NLT repeat iteration {rep + 1} raised: {error}') + fatal_errors.add_result(True) + if args.failfast and fatal_errors.errors and rep < args.repeat - 1: + print(f'--failfast set; stopping after iteration {rep + 1}/{args.repeat}') + break if args.mode == 'all': with DaosServer(conf, test_class='restart', wf=wf_server, @@ -6877,6 +6907,10 @@ def main(): parser.add_argument('--no-root', action='store_true') parser.add_argument('--max-log-size', default=None) parser.add_argument('--engine-count', type=int, default=1, help='Number of daos engines to run') + parser.add_argument('--repeat', type=int, default=1, + help='Repeat the test execution N times (soak/stability testing)') + parser.add_argument('--failfast', action='store_true', + help='With --repeat, stop after the first failing iteration') parser.add_argument('--system-ram-reserved', type=int, default=None, 
help='GiB reserved RAM') parser.add_argument('--dfuse-dir', default='/tmp', help='parent directory for all dfuse mounts') parser.add_argument('--perf-check', action='store_true')