diff --git a/Jenkinsfile b/Jenkinsfile index bd4bf753c3c..bac919cb484 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -579,6 +579,16 @@ pipeline { ' PREFIX=/opt/daos TARGET_TYPE=release')) sh label: 'Generate RPMs', script: './ci/rpm/gen_rpms.sh el9 "' + env.DAOS_RELVAL + '"' + // Valgrind-tagged variant for the NLT memcheck stage, + // stashed separately; the build above keeps -race for ftest. + job_step_update( + sconsBuild(parallel_build: true, + build_deps: 'no', + scons_args: sconsArgs() + + ' BUILD_VALGRIND=1 PREFIX=/opt/daos TARGET_TYPE=release')) + sh label: 'Stash valgrind install tree for NLT', + script: 'tar -C / -cf opt-daos-valgrind.tar opt/daos' + stash(name: 'opt-daos-valgrind', includes: 'opt-daos-valgrind.tar') } } post { @@ -715,8 +725,10 @@ pipeline { label params.CI_NLT_1_LABEL } steps { + // NLT memchecks the valgrind-tagged build, not the shared -race one. + unstash 'opt-daos-valgrind' job_step_update( - unitTest(timeout_time: 60, + unitTest(timeout_time: 60 * cachedCommitPragma(pragma: 'NLT-repeat', def_val: '1').toInteger(), inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh' + ' --system-ram-reserved 4' + @@ -724,7 +736,11 @@ pipeline { ' --dfuse-dir /localhome/jenkins/' + ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + - ' --class-name nlt all', + ' --class-name nlt' + + " --repeat ${cachedCommitPragma(pragma: 'NLT-repeat', def_val: '1')}" + + /* groovylint-disable-next-line LineLength */ + (cachedCommitPragma(pragma: 'NLT-repeat-failfast', def_val: 'false').toLowerCase() == 'true' ? 
' --failfast' : '') + + ' all', with_valgrind: 'memcheck', valgrind_pattern: '*memcheck.xml', always_script: 'ci/unit/test_nlt_post.sh', diff --git a/ci/unit/test_nlt.sh b/ci/unit/test_nlt.sh index 23e3bc8b549..708f25165d3 100755 --- a/ci/unit/test_nlt.sh +++ b/ci/unit/test_nlt.sh @@ -10,8 +10,12 @@ rm -rf dnt.*.memcheck.xml vm_test/ NODE=${NODELIST%%,*} mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" -# Copy over the install tree and some of the build tree. -rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/ +# Copy over the install tree and some of the build tree. The memcheck NLT stage +# ships the valgrind-tagged build (opt-daos-valgrind.tar); the fault-injection +# stage ships the standard opt-daos.tar. Use whichever was unstashed. +opt_tar=opt-daos.tar +[ -f opt-daos-valgrind.tar ] && opt_tar=opt-daos-valgrind.tar +rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* "$opt_tar" utils requirements-utest.txt jenkins@"$NODE":build/ ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \ "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 53630da411a..33b1117d335 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -12,7 +12,11 @@ if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then fi cd build -tar -xf opt-daos.tar +# Memcheck NLT ships opt-daos-valgrind.tar; the fault-injection stage ships +# the standard opt-daos.tar. Extract whichever was shipped. +opt_tar=opt-daos.tar +[ -f opt-daos-valgrind.tar ] && opt_tar=opt-daos-valgrind.tar +tar -xf "$opt_tar" sudo mv opt/daos /opt/ # Setup daos admin etc. 
diff --git a/site_scons/prereq_tools/base.py b/site_scons/prereq_tools/base.py index 67e157a071e..f6ceeb05ecc 100644 --- a/site_scons/prereq_tools/base.py +++ b/site_scons/prereq_tools/base.py @@ -516,6 +516,10 @@ def __init__(self, env, opts): opts.Add(EnumVariable('WARNING_LEVEL', "Set default warning level", 'error', ['warning', 'warn', 'error'], ignorecase=2)) opts.Add(('SANITIZERS', 'Instrument C code with Google Sanitizers', None)) + opts.Add(BoolVariable('BUILD_VALGRIND', + 'Build Go artifacts with the Go "valgrind" tag for Memcheck ' + '(also drops -race; ignored for release)', + False)) opts.Add(BoolVariable('CMOCKA_FILTER_SUPPORTED', 'Allows to filter cmocka tests', False)) opts.Add(BoolVariable('CRT_PP', 'Preprocess CaRT sources', False)) opts.Add(BoolVariable('HEAP_PROFILER', 'Instrument C code with Gperftools Heap Profiler', diff --git a/site_scons/site_tools/go_builder.py b/site_scons/site_tools/go_builder.py index e6657b3b314..6de49301108 100644 --- a/site_scons/site_tools/go_builder.py +++ b/site_scons/site_tools/go_builder.py @@ -11,6 +11,21 @@ include_re = re.compile(r'\#include [<"](\S+[>"])', re.M) +def _is_valgrind_build(env): + """Return True if Go artifacts should be built with the Go 1.25+ "valgrind" tag. + + BUILD_VALGRIND=1 makes the Go runtime cooperate with Memcheck. Ignored for + release builds. 
+ """ + if not env.get('BUILD_VALGRIND'): + return False + if env.get('BUILD_TYPE') == 'release': + return False + if env.get('SANITIZERS'): + Exit('BUILD_VALGRIND=1 is incompatible with SANITIZERS') + return True + + def _scan_go_file(node, env, _path): """Scanner for go code""" src_dir = os.path.dirname(str(node)) @@ -119,6 +134,7 @@ def _check_go_version(context): return 1 env.d_go_bin = env.get("GO_BIN", env.WhereIs(GO_COMPILER, os.environ['PATH'])) + env.AddMethod(_is_valgrind_build, 'd_is_valgrind_build') if GetOption('help') or GetOption('clean'): return diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 5bdf8278a33..20b7bba8541 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -1,542 +1,156 @@ +# Go runtime { - dl init leaks + Go runtime: GC memory (valgrind integration entry point) Memcheck:Leak + match-leak-kinds: definite,indirect,possible,reachable + fun:runtime.valgrindClientRequest* ... - fun:_dl_init -} -{ - dl open leaks - Memcheck:Leak - ... - fun:_dl_open } { - _dl_fini leak + Go runtime: persistentalloc (cgo malloc, never freed) Memcheck:Leak match-leak-kinds: reachable + fun:malloc + fun:_cgo_*_Cfunc__Cmalloc + fun:runtime.asmcgocall.abi0 ... - fun:_dl_fini - ... + fun:runtime.persistentalloc } { - dlerror_run leak + Go runtime: newproc (cgo malloc, never freed) Memcheck:Leak match-leak-kinds: reachable + fun:malloc + fun:_cgo_*_Cfunc__Cmalloc + fun:runtime.asmcgocall.abi0 ... - fun:_dlerror_run - ... + fun:runtime.newproc.abi0 } { - _dl_fixup leak + Go runtime: bootstrap (cgo malloc, never freed) Memcheck:Leak match-leak-kinds: reachable + fun:malloc ... - fun:_dl_fixup - ... -} -{ - FI leak 6 - Memcheck:Leak - match-leak-kinds: possible - ... - fun:rdma_bind_addr - ... -} -{ - FI leak 7 - Memcheck:Leak - ... - fun:fi_ini - ... -} -{ - access-0 - Memcheck:Param - socketcall.sendto(msg) - ... - fun:send -} -{ - - Memcheck:Leak - ... 
- fun:x_cgo_thread_start -} -{ - - Memcheck:Addr8 - ... - fun:bufio.(*Reader).* -} -{ - - Memcheck:Value8 - ... - fun:bufio.(*Reader).* -} -{ - - Memcheck:Addr1 - ... - fun:bufio.(*Reader).* -} -{ - - Memcheck:Addr8 - fun:os.* -} -{ - - Memcheck:Addr1 - fun:bytes.* -} -{ - - Memcheck:Addr8 - fun:bytes.* -} -{ - - Memcheck:Addr1 - fun:internal/bytealg*IndexByte* -} -{ - - Memcheck:Addr4 - fun:internal/bytealg*IndexByte* -} -{ - - Memcheck:Addr8 - fun:internal/bytealg*IndexByte* -} -{ - - Memcheck:Addr1 - fun:strings.(*Builder).* -} -{ - - Memcheck:Addr8 - fun:strings.(*Builder).* -} -{ - - Memcheck:Cond - ... - fun:runtime.* -} -{ - - Memcheck:Addr1 - ... - fun:runtime.* -} -{ - - Memcheck:Addr2 - ... - fun:runtime.* -} -{ - - Memcheck:Addr4 + fun:runtime.asmcgocall.abi0 ... - fun:runtime.* + fun:runtime.rt0_go.abi0 } +# bytealg over-reads buffer ends by design; golang/go#27610 +# recommends this suppression { - + golang/go#27610: bytealg indexbytebody over-read Memcheck:Addr8 ... - fun:runtime.* + fun:indexbytebody } { - + golang/go#27610: bytealg indexbytebody over-read Memcheck:Addr16 ... - fun:runtime.* -} -{ - - Memcheck:Addr32 - ... - fun:runtime.* -} -{ - - Memcheck:Value8 - ... - fun:runtime.* -} -{ - Go conditional - Memcheck:Cond - src:*.go -} -{ - Go addr1 - Memcheck:Addr1 - src:*.go -} -{ - Go addr 2 - Memcheck:Addr2 - src:*.go -} -{ - Go addr 4 - Memcheck:Addr4 - src:*.go -} -{ - Go addr 8 - Memcheck:Addr8 - src:*.go -} -{ - Go addr 16 - Memcheck:Addr16 - src:*.go + fun:indexbytebody } { - Go addr 32 + golang/go#27610: bytealg indexbytebody over-read Memcheck:Addr32 - src:*.go -} -{ - Ga value 8 - Memcheck:Value8 - src:*.go -} -{ - - Memcheck:Addr8 - ... - fun:internal* -} -{ - - Memcheck:Addr8 - ... - fun:math.* -} -{ - - Memcheck:Addr1 - ... - fun:regexp* -} -{ - - Memcheck:Addr2 - ... - fun:regexp* -} -{ - - Memcheck:Addr4 - ... - fun:regexp* -} -{ - - Memcheck:Addr8 - ... - fun:regexp* -} -{ - - Memcheck:Addr16 ... 
- fun:regexp* -} -{ - - Memcheck:Addr8 - ... - fun:sort.* -} -{ - - Memcheck:Addr1 - ... - fun:encoding/* -} -{ - - Memcheck:Addr8 - ... - fun:encoding/* -} -{ - - Memcheck:Addr16 - ... - fun:encoding/* -} -{ - - Memcheck:Addr1 - ... - fun:hash/* -} -{ - - Memcheck:Addr4 - ... - fun:hash/* -} -{ - - Memcheck:Addr16 - ... - fun:hash/* -} -{ - - Memcheck:Value8 - ... - fun:hash/* -} -{ - - Memcheck:Addr1 - ... - fun:compress/* -} -{ - - Memcheck:Addr8 - ... - fun:compress/* -} -{ - - Memcheck:Addr8 - ... - fun:sync.* -} -{ - - Memcheck:Addr16 - ... - fun:sync.* -} -{ - - Memcheck:Addr16 - fun:aeshash* -} -{ - - Memcheck:Addr8 - fun:memeqbody + fun:indexbytebody } { - + golang/go#27610: bytealg indexbytebody over-read Memcheck:Cond - fun:memeqbody -} -{ - - Memcheck:Addr1 ... - fun:main.* -} -{ - - Memcheck:Addr8 - ... - fun:main.* -} -{ - - Memcheck:Addr16 - ... - fun:main.* -} -{ - - Memcheck:Addr8 - fun:fmt.* -} -{ - - Memcheck:Addr1 - ... - fun:github.com/* -} -{ - - Memcheck:Addr4 - ... - fun:github.com/* + fun:indexbytebody } + +# glibc / loader / NSS { - - Memcheck:Addr8 + glibc: per-thread TLS (dtv) + Memcheck:Leak + match-leak-kinds: possible,reachable + fun:calloc ... - fun:github.com/* -} -{ - - Memcheck:Addr16 + fun:_dl_allocate_tls ... - fun:github.com/* -} -{ - - Memcheck:Cond + fun:pthread_create* ... - fun:*golang.org/* } { - - Memcheck:Addr1 + glibc: dl_init + Memcheck:Leak ... - fun:*golang.org/* + fun:_dl_init } { - - Memcheck:Addr4 + glibc: dl_open + Memcheck:Leak ... - fun:*golang.org/* + fun:_dl_open } { - - Memcheck:Addr8 + glibc: dl_fini + Memcheck:Leak + match-leak-kinds: reachable ... - fun:*golang.org/* -} -{ - - Memcheck:Addr16 + fun:_dl_fini ... - fun:*golang.org/* } { - - Memcheck:Value8 + glibc: dlerror_run + Memcheck:Leak + match-leak-kinds: reachable ... 
- fun:*golang.org/* -} -{ - - Memcheck:Addr8 - fun:reflect.* -} -{ - - Memcheck:Addr16 - fun:reflect.* -} -{ - - Memcheck:Addr8 - fun:unicode/utf8* -} -{ - - Memcheck:Addr8 - fun:strconv.Unquote -} -{ - - Memcheck:Addr8 - fun:racecall -} -{ - - Memcheck:Cond - fun:racecalladdr -} -{ - - Memcheck:Addr8 + fun:_dlerror_run ... - fun:indexbytebody } { - - Memcheck:Addr32 + glibc: dl_fixup + Memcheck:Leak + match-leak-kinds: reachable ... - fun:indexbytebody -} -{ - - Memcheck:Cond + fun:_dl_fixup ... - fun:indexbytebody } { - go-cond-racecall - Memcheck:Cond + glibc: getpwnam_r (NSS) + Memcheck:Leak + fun:*alloc ... - fun:racecall -} -{ - go-value8-write_racecall - Memcheck:Value8 - fun:__tsan_write - fun:racecall -} -{ - go-value8-racecall - Memcheck:Value8 - fun:_ZN6__tsan9ShadowSetEPNS_9RawShadowES1_S0_ - fun:racecall -} -{ - MemoryRangeSet ShadowSet - Memcheck:Value8 - fun:ShadowSet - fun:_ZN6__tsanL14MemoryRangeSetEmmNS_9RawShadowE - fun:racecall + fun:getpwnam_r* } { - FI leak 8 + glibc: getpwuid_r (NSS) Memcheck:Leak - match-leak-kinds: reachable fun:calloc - fun:_dlerror_run - fun:dlopen* - fun:_goboringcrypto_DLOPEN_OPENSSL - fun:_cgo_*_Cfunc__goboringcrypto_DLOPEN_OPENSSL - fun:runtime.asmcgocall + ... + fun:getpwuid_r* } { - + glibc: localtime / tz data Memcheck:Leak - match-leak-kinds: reachable fun:malloc - fun:hg_dlog_mkcount32 ... + fun:__tz_convert } + +# libfabric (OFI) { - + libfabric: provider init (fi_ini) Memcheck:Leak - match-leak-kinds: reachable - fun:malloc - fun:hg_dlog_mkcount64 + ... + fun:fi_ini ... } { - FI leak 9 + libfabric: rdma_bind_addr init Memcheck:Leak match-leak-kinds: possible - fun:calloc - fun:_dl_allocate_tls - fun:pthread_create* ... - fun:na_ofi_initialize - fun:NA_Initialize_opt - fun:hg_core_init - fun:HG_Core_init_opt - fun:HG_Init_opt - fun:crt_hg_class_init + fun:rdma_bind_addr + ... } { - Tcp provider with ofi rxm + libfabric: tcp/rxm sends uninitialized bytes (msg_iov[1]) Memcheck:Param sendmsg(msg.msg_iov[1]) ... 
@@ -544,7 +158,7 @@ ... } { - Tcp provider with ofi rxm 2 + libfabric: tcp/rxm sends uninitialized bytes (msg_iov[2]) Memcheck:Param sendmsg(msg.msg_iov[2]) ... @@ -552,253 +166,40 @@ ... } { - Go syscall write - Memcheck:Param - write(buf) - fun:internal/runtime/*Syscall6 -} -{ - Go syscall read + libfabric: send() uninitialized bytes Memcheck:Param - read(buf) - fun:internal/runtime/*Syscall6 -} -{ - context Err() - Memcheck:Addr8 - fun:context.(*valueCtx).Err -} -{ - Racecall cgo malloc - Memcheck:Leak - match-leak-kinds: reachable - fun:malloc - fun:_cgo_*_Cfunc__Cmalloc - fun:runtime.asmcgocall.abi0 - ... - fun:racecall -} -{ - DAOS-14680-2 - Memcheck:Value8 - fun:memeqbody -} -{ - DAOS-14680-3 - Memcheck:Cond - fun:aeshashbody -} -{ - DAOS-14680-4 - Memcheck:Value8 - fun:aeshashbody -} -{ - DAOS-15548 - Memcheck:Addr1 - fun:racecallatomic -} -{ - __tsan_go_atomic64_load - Memcheck:Addr8 - ... - fun:__tsan_go_atomic64_load - fun:racecall -} -{ - __tsan_go_atomic64_store - Memcheck:Addr8 - ... - fun:__tsan_go_atomic64_store - fun:racecall -} -{ - __tsan_go_atomic64_compare_exchange - Memcheck:Addr8 + socketcall.sendto(msg) ... - fun:__tsan_go_atomic64_compare_exchange - fun:racecall + fun:send } + +# Mercury (HG) { - Persistentalloc cgo malloc + mercury: hg_dlog_mkcount32 Memcheck:Leak match-leak-kinds: reachable fun:malloc - fun:_cgo_*_Cfunc__Cmalloc - fun:runtime.asmcgocall.abi0 + fun:hg_dlog_mkcount32 ... - fun:runtime.persistentalloc } { - Newproc cgo malloc + mercury: hg_dlog_mkcount64 Memcheck:Leak match-leak-kinds: reachable fun:malloc - fun:_cgo_*_Cfunc__Cmalloc - fun:runtime.asmcgocall.abi0 - ... - fun:runtime.newproc.abi0 -} -{ - __tsan_write_pc - Memcheck:Value8 - ... - fun:__tsan_write_pc - fun:racecall -} -{ - __tsan_read_pc - Memcheck:Value8 - ... - fun:__tsan_read_pc - fun:racecall -} -{ - tsan::MemoryAccessRange - Memcheck:Value8 - ... - fun:_ZN6__tsan18MemoryAccessRangeTILb0EEEvPNS_11ThreadStateEmmm - ... 
- fun:racecall -} -{ - tsan::MemoryAccessRange - Memcheck:Value8 - ... - fun:_ZN6__tsan18MemoryAccessRangeTILb1EEEvPNS_11ThreadStateEmmm - ... - fun:racecall -} -{ - tsan::TraceRestartMemoryAccess - Memcheck:Value8 - ... - fun:_ZN6__tsan24TraceRestartMemoryAccessEPNS_11ThreadStateEmmmm - ... - fun:racecall -} -{ - __tsan_read - Memcheck:Value8 - ... - fun:__tsan_read - fun:racecall -} -{ - __tsan_write - Memcheck:Value8 - ... - fun:__tsan_write - fun:racecall -} -{ - racecallatomic - Memcheck:Addr8 - fun:racecallatomic -} -{ - racecalladdr - Memcheck:Addr8 - fun:racecalladdr -} -{ - __tsan_go_atomic32_load - Memcheck:Addr4 - ... - fun:__tsan_go_atomic32_load - fun:racecall -} -{ - __tsan_go_atomic32_store - Memcheck:Addr4 - ... - fun:__tsan_go_atomic32_store - fun:racecall -} -{ - __tsan_go_atomic32_compare_exchange - Memcheck:Addr4 + fun:hg_dlog_mkcount64 ... - fun:__tsan_go_atomic32_compare_exchange - fun:racecall } + +# BoringCrypto / OpenSSL { - racefuncenter - Memcheck:Addr8 - fun:racefuncenter -} -{ - Runtime bootstrap memory leak + boringcrypto: dlopen of libssl Memcheck:Leak match-leak-kinds: reachable - fun:malloc - ... - fun:runtime.asmcgocall.abi0 - ... - fun:runtime.rt0_go.abi0 -} -{ - bytealg.cmpbody 32-bit - Memcheck:Addr32 - fun:cmpbody -} -{ - bytealg.cmpbody 16-bit - Memcheck:Addr16 - fun:cmpbody -} -{ - bytealg.cmpbody 8-bit - Memcheck:Addr8 - fun:cmpbody -} -{ - bytealg.cmpbody 1-bit - Memcheck:Addr1 - fun:cmpbody -} -{ - bytealg.indexbytebody - Memcheck:Addr16 - fun:indexbytebody -} -{ - bytealg.countbody - Memcheck:Addr16 - fun:countbody -} -{ - __tsan_go_atomic32_fetch_add - Memcheck:Addr4 - ... - fun:__tsan_go_atomic32_fetch_add - fun:racecall -} -{ - __tsan_go_atomic64_fetch_add - Memcheck:Addr8 - ... - fun:__tsan_go_atomic64_fetch_add - fun:racecall -} -{ - getpwnam_r() leak - Memcheck:Leak - fun:*alloc - ... - fun:getpwnam_r* -} -{ - getpwuid_r() leak - Memcheck:Leak fun:calloc - ... 
- fun:getpwuid_r* -} -{ - localtime() leak - Memcheck:Leak - fun:malloc - ... - fun:__tz_convert + fun:_dlerror_run + fun:dlopen* + fun:_goboringcrypto_DLOPEN_OPENSSL + fun:_cgo_*_Cfunc__goboringcrypto_DLOPEN_OPENSSL + fun:runtime.asmcgocall } diff --git a/src/control/SConscript b/src/control/SConscript index 754ebfd3463..3af67828687 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -27,6 +27,8 @@ def get_build_tags(benv): if is_server_build(benv): print("Building server go binary: adding 'server' build tag") tags.append("server") + if benv.d_is_valgrind_build(): + tags.append("valgrind") return f"-tags {','.join(tags)}" @@ -52,6 +54,11 @@ def get_build_flags(benv): return '-buildmode=pie' # Disable optimizations and inlining for debugger support flags = '-gcflags "all=-N -l"' + # Valgrind variant: instrument the Go runtime for Memcheck and leave the + # race detector off because it does not stack with memcheck. + # + if benv.d_is_valgrind_build(): + return flags # enable AddressSanitizer to detect memory safety issues at runtime if 'SANITIZERS' in benv and benv['SANITIZERS'] != "": return f'-asan {flags}' diff --git a/utils/node_local_test.py b/utils/node_local_test.py index c51b26ad33f..0a4eab7d7ef 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -456,6 +456,21 @@ def get_base_env(clean=False): return env +def check_memcheck_build(conf): + """Fail early if the daos binary is not valgrind-tagged for a memcheck run. + + memcheck-cart.supp no longer suppresses the Go runtime; that relies on the + binary being built with the Go 1.25+ "valgrind" tag (BUILD_VALGRIND=1). + """ + daos_bin = join(conf['PREFIX'], 'bin', 'daos') + with open(daos_bin, 'rb') as fd: + if b'runtime.valgrindRegisterStack' not in fd.read(): + raise NLTestFail( + f'{daos_bin} is not built with the Go "valgrind" tag (needs ' + 'Go 1.25+ and BUILD_VALGRIND=1), to run under memcheck.' 
+ ' Rebuild with: scons install BUILD_VALGRIND=1') + + class DaosPool(): """Class to store data about daos pools""" @@ -550,7 +565,7 @@ class DaosServer(): """Manage a DAOS server instance""" def __init__(self, conf, test_class=None, valgrind=False, wf=None, fatal_errors=None, - enable_fi=False): + enable_fi=False, wipe_on_exit=False): self.running = False self._file = __file__.lstrip('./') self._sp = None @@ -607,6 +622,8 @@ def __init__(self, conf, test_class=None, valgrind=False, wf=None, fatal_errors= self.network_provider = None self.fuse_procs = [] + self.wipe_on_exit = wipe_on_exit + self.scm_mounts = [] def __enter__(self): self._start() @@ -616,6 +633,10 @@ def __exit__(self, _type, _value, _traceback): rc = self._stop(self.wf) if rc != 0 and self.fatal_errors is not None: self.fatal_errors.fail() + if self.wipe_on_exit: + for mount in self.scm_mounts: + ret = subprocess.run(['sudo', 'umount', mount], check=False) + print(f'rc from umount {mount}: {ret.returncode}') return False def add_fuse(self, fuse): @@ -777,6 +798,7 @@ def _start(self): engine['first_core'] = ref_engine['targets'] * idx engine['fabric_iface_port'] += server_port_count * idx engine['storage'][0]['scm_mount'] = f'{ref_engine["storage"][0]["scm_mount"]}_{idx}' + self.scm_mounts.append(engine['storage'][0]['scm_mount']) scyaml['engines'].append(engine) self._yaml_file = tempfile.NamedTemporaryFile(prefix='nlt-server-config-', suffix='.yaml') self._yaml_file.write(yaml.dump(scyaml, encoding='utf-8')) @@ -1051,6 +1073,7 @@ def run_daos_client_cmd(self, cmd): exec_cmd.extend(cmd) cmd_env = get_base_env() + valgrind_hdl.add_memcheck_env(cmd_env) with tempfile.NamedTemporaryFile(prefix=f'dnt_cmd_{get_inc_id()}_', suffix='.log', @@ -1294,6 +1317,13 @@ def get_cmd_prefix(self): return cmd + def add_memcheck_env(self, env): + """Disable Go async preemption for a command run under memcheck.""" + if not self.use_valgrind: + return + godebug = env.get('GODEBUG') + env['GODEBUG'] = 
f'{godebug},asyncpreemptoff=1' if godebug else 'asyncpreemptoff=1' + def convert_xml(self): """Modify the xml file""" if not self.use_valgrind: @@ -1731,6 +1761,7 @@ def run_daos_cmd(conf, exec_cmd.extend(daos_cmd) cmd_env = get_base_env() + valgrind_hdl.add_memcheck_env(cmd_env) if conf.args.client_debug: cmd_env['D_LOG_MASK'] = conf.args.client_debug @@ -6635,6 +6666,59 @@ def expand_test_list(raw_test_list, excluded_name_dict): return test_variants +def _run_test_pass(conf, args, server, fatal_errors, special_list, test_dict, excluded_dict): + """Run one pass of the requested tests against server; return whether FI/dfuse is wanted.""" + fi_test_dfuse = False + if args.mode == 'launch': + run_in_fg(server, conf, args) + elif args.mode == 'overlay' and 'special_dfuse_overlay' in special_list: + fatal_errors.add_result(run_duns_overlay_test(server, conf)) + elif args.mode == 'set-fi': + fatal_errors.add_result(server.set_fi()) + elif args.mode == 'all': + fi_test_dfuse = True + fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) + if 'special_dfuse_multi' in special_list: + fatal_errors.add_result(run_dfuse(server, conf)) + if 'special_dfuse_overlay' in special_list: + fatal_errors.add_result(run_duns_overlay_test(server, conf)) + test_pydaos_kv(server, conf) + test_pydaos_kv_obj_class(server, conf) + fatal_errors.add_result(server.set_fi()) + elif args.test == 'all': + fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) + elif args.test: + special_list = [x for x in args.test if is_special_testname(x)] + despecialed_list = ['test_' + x for x in args.test if not is_special_testname(x)] + custom_test_dict = expand_input_list(despecialed_list) + custom_exclusions = explicit_list_to_exclusion_list(custom_test_dict) + exclusion_union = {} + for key in custom_test_dict: + exclusion_list = \ + list(set(custom_exclusions.get(key, [])).union( + set(excluded_dict.get(key, [])))) + if len(exclusion_list) > 0: + exclusion_union[key] = 
exclusion_list + needs_dfuse_with_opt.record_exclusions(exclusion_union) + custom_filtered_dict = expand_test_list(custom_test_dict.keys(), exclusion_union) + if len(custom_filtered_dict) == 0 and len(special_list) == 0: + print('No tests to run!') + sys.exit(1) + if len(custom_filtered_dict) > 0: + fatal_errors.add_result( + run_posix_tests(server, conf, custom_filtered_dict.keys())) + if 'special_dfuse_multi' in special_list: + fatal_errors.add_result(run_dfuse(server, conf)) + if 'special_dfuse_overlay' in special_list: + fatal_errors.add_result(run_duns_overlay_test(server, conf)) + else: + fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) + if 'special_dfuse_multi' in special_list: + fatal_errors.add_result(run_dfuse(server, conf)) + fatal_errors.add_result(server.set_fi()) + return fi_test_dfuse + + def run(wf, args): """Main entry point""" # pylint: disable=too-many-branches @@ -6657,6 +6741,8 @@ def run(wf, args): conf.set_wf(wf) conf.set_args(args) + if args.memcheck != 'no': + check_memcheck_build(conf) setup_log_test(conf) fi_test = False @@ -6667,55 +6753,25 @@ def run(wf, args): if args.mode == 'fi': fi_test = True else: - with DaosServer(conf, test_class='first', wf=wf_server, - fatal_errors=fatal_errors) as server: - if args.mode == 'launch': - run_in_fg(server, conf, args) - elif args.mode == 'overlay' and 'special_dfuse_overlay' in special_list: - fatal_errors.add_result(run_duns_overlay_test(server, conf)) - elif args.mode == 'set-fi': - fatal_errors.add_result(server.set_fi()) - elif args.mode == 'all': - fi_test_dfuse = True - fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) - if 'special_dfuse_multi' in special_list: - fatal_errors.add_result(run_dfuse(server, conf)) - if 'special_dfuse_overlay' in special_list: - fatal_errors.add_result(run_duns_overlay_test(server, conf)) - test_pydaos_kv(server, conf) - test_pydaos_kv_obj_class(server, conf) - fatal_errors.add_result(server.set_fi()) - elif 
args.test == 'all': - fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) - elif args.test: - special_list = [x for x in args.test if is_special_testname(x)] - despecialed_list = ['test_' + x for x in args.test if not is_special_testname(x)] - custom_test_dict = expand_input_list(despecialed_list) - custom_exclusions = explicit_list_to_exclusion_list(custom_test_dict) - exclusion_union = {} - for key in custom_test_dict: - exclusion_list = \ - list(set(custom_exclusions.get(key, [])).union( - set(excluded_dict.get(key, [])))) - if len(exclusion_list) > 0: - exclusion_union[key] = exclusion_list - needs_dfuse_with_opt.record_exclusions(exclusion_union) - custom_filtered_dict = expand_test_list(custom_test_dict.keys(), exclusion_union) - if len(custom_filtered_dict) == 0 and len(special_list) == 0: - print('No tests to run!') - sys.exit(1) - if len(custom_filtered_dict) > 0: - fatal_errors.add_result( - run_posix_tests(server, conf, custom_filtered_dict.keys())) - if 'special_dfuse_multi' in special_list: - fatal_errors.add_result(run_dfuse(server, conf)) - if 'special_dfuse_overlay' in special_list: - fatal_errors.add_result(run_duns_overlay_test(server, conf)) - else: - fatal_errors.add_result(run_posix_tests(server, conf, test_dict.keys())) - if 'special_dfuse_multi' in special_list: - fatal_errors.add_result(run_dfuse(server, conf)) - fatal_errors.add_result(server.set_fi()) + for rep in range(args.repeat): + if args.repeat > 1: + print(f'=== NLT repeat iteration {rep + 1}/{args.repeat} ===') + + try: + # reset after each iteration, except on the last one + with DaosServer(conf, test_class='first', wf=wf_server, + fatal_errors=fatal_errors, + wipe_on_exit=rep < args.repeat - 1) as server: + fi_test_dfuse = _run_test_pass(conf, args, server, fatal_errors, + special_list, test_dict, excluded_dict) + except Exception as error: # pylint: disable=broad-exception-caught + if args.repeat == 1: + raise + print(f'NLT repeat iteration {rep + 1} raised: 
{error}') + fatal_errors.add_result(True) + if args.failfast and fatal_errors.errors and rep < args.repeat - 1: + print(f'--failfast set; stopping after iteration {rep + 1}/{args.repeat}') + break if args.mode == 'all': with DaosServer(conf, test_class='restart', wf=wf_server, @@ -6851,6 +6907,10 @@ def main(): parser.add_argument('--no-root', action='store_true') parser.add_argument('--max-log-size', default=None) parser.add_argument('--engine-count', type=int, default=1, help='Number of daos engines to run') + parser.add_argument('--repeat', type=int, default=1, + help='Repeat the test execution N times (soak/stability testing)') + parser.add_argument('--failfast', action='store_true', + help='With --repeat, stop after the first failing iteration') parser.add_argument('--system-ram-reserved', type=int, default=None, help='GiB reserved RAM') parser.add_argument('--dfuse-dir', default='/tmp', help='parent directory for all dfuse mounts') parser.add_argument('--perf-check', action='store_true')