I was able to reproduce the issue, but even with a Debug build of ROOT and LLVM, the symbols in the stack trace are not resolved (memory corruption?). The last call from Python is self.engine.GetTrainBatch(). See the Python trace output and stack trace:
...
_ml_dataloader.py(837): for cls in [RDataLoader, FormattedLoader]:
_facade.py(632): return ns
--- modulename: _cpython_cppyy, funcname: __call__
_cpython_cppyy.py(118): if args:
_cpython_cppyy.py(119): args0 = args[0]
_cpython_cppyy.py(120): if args0 and isinstance(args0, (tuple, list)):
_cpython_cppyy.py(131): if args0 and isinstance(args0, dict):
_cpython_cppyy.py(146): return self.__getitem__(*args)
--- modulename: _cpython_cppyy, funcname: __getitem__
_cpython_cppyy.py(74): if args and isinstance(args[0], tuple):
_cpython_cppyy.py(78): try:
_cpython_cppyy.py(79): return self._instantiations[args]
_ml_dataloader.py(233): self.noded_rdfs,
_ml_dataloader.py(234): batch_size,
_ml_dataloader.py(235): batches_in_memory,
_ml_dataloader.py(236): self.given_columns,
_ml_dataloader.py(237): max_vec_sizes_list,
_ml_dataloader.py(238): vec_padding,
_ml_dataloader.py(239): test_size,
_ml_dataloader.py(240): shuffle,
_ml_dataloader.py(241): drop_remainder,
_ml_dataloader.py(242): set_seed,
_ml_dataloader.py(243): load_eager,
_ml_dataloader.py(244): sampling_type,
_ml_dataloader.py(245): sampling_ratio,
_ml_dataloader.py(246): replacement,
_ml_dataloader.py(232): self.engine = ROOT.Experimental.Internal.ML.RDataLoaderEngine(template)(
_ml_dataloader.py(249): atexit.register(self.DeActivate)
_ml_dataloader.py(687): self._test_size = test_size
_ml_dataloader.py(710): return FormattedLoader(self._internal, self._internal.ConvertBatchToNumpy, self._is_training)
--- modulename: _ml_dataloader, funcname: __init__
_ml_dataloader.py(533): self._internal = internal
_ml_dataloader.py(534): self._conversion_fn = conversion_fn
_ml_dataloader.py(535): self._is_training = is_training
--- modulename: _ml_dataloader, funcname: __iter__
_ml_dataloader.py(549): return self._make_gen()
ml_dataloader.py(3960): for _ in range(3):
ml_dataloader.py(3962): x, y = next(train_iter)
--- modulename: _ml_dataloader, funcname: _make_gen
_ml_dataloader.py(538): ctx_cls = _TrainingEpochContext if self._is_training else _ValidationEpochContext
_ml_dataloader.py(539): get_batch = self._internal.GetTrainBatch if self._is_training else self._internal.GetValidationBatch
_ml_dataloader.py(541): with ctx_cls(self._internal):
--- modulename: _ml_dataloader, funcname: __init__
_ml_dataloader.py(492): self._internal = internal
_ml_dataloader.py(494): internal.Activate()
--- modulename: _ml_dataloader, funcname: Activate
_ml_dataloader.py(263): self.engine.Activate()
[New Thread 0x7c45eb8dd6c0 (LWP 29875)]
_ml_dataloader.py(495): internal.CreateTrainBatches()
--- modulename: _ml_dataloader, funcname: CreateTrainBatches
_ml_dataloader.py(287): self.engine.CreateTrainBatches()
--- modulename: _ml_dataloader, funcname: __enter__
_ml_dataloader.py(498): self._internal.ActivateTrainingEpoch()
--- modulename: _ml_dataloader, funcname: ActivateTrainingEpoch
_ml_dataloader.py(271): self.engine.ActivateTrainingEpoch()
_ml_dataloader.py(499): return self
_ml_dataloader.py(542): while True:
_ml_dataloader.py(543): batch = get_batch()
--- modulename: _ml_dataloader, funcname: GetTrainBatch
_ml_dataloader.py(475): batch = self.engine.GetTrainBatch()
/opt/rh/gcc-toolset-15/root//usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/stl_vector.h:1263: reference std::vector<unsigned long long>::operator[](size_type) [_Tp = unsigned long long, _Alloc = std::allocator<unsigned long long>]: Assertion '__n < this->size()' failed.
Thread 17 "python3.12" received signal SIGABRT, Aborted.
[Switching to Thread 0x7c45eb8dd6c0 (LWP 29875)]
0x00007c463ed26dd4 in __pthread_kill_implementation () from /lib64/libc.so.6
Missing rpms, try: dnf --enablerepo='*debug*' install python3-libs-debuginfo-3.12.13-2.el10_2.x86_64 glibc-debuginfo-2.39-121.el10_2.alma.1.x86_64 openssl-libs-debuginfo-3.5.5-2.el10_2.alma.1.x86_64 zlib-ng-compat-debuginfo-2.2.3-3.el10_1.x86_64 libstdc++-debuginfo-14.3.1-4.4.el10.alma.2.x86_64 libgcc-debuginfo-14.3.1-4.4.el10.alma.2.x86_64 libffi-debuginfo-3.4.4-10.el10.x86_64 bzip2-libs-debuginfo-1.0.8-25.el10.x86_64 xz-libs-debuginfo-5.6.2-4.el10_0.x86_64 tbb-debuginfo-2021.11.0-7.el10.x86_64 sqlite-libs-debuginfo-3.46.1-5.el10_1.x86_64 libuuid-debuginfo-2.40.2-18.el10.x86_64
(gdb) bt
#0 0x00007c463ed26dd4 in __pthread_kill_implementation () from /lib64/libc.so.6
#1 0x00007c463eccd57e in raise () from /lib64/libc.so.6
#2 0x00007c463ecb4905 in abort () from /lib64/libc.so.6
#3 0x00007c463b8d8d30 in std::__glibcxx_assert_fail(char const*, int, char const*, char const*) () from /lib64/libstdc++.so.6
#4 0x00007c45ec51df78 in ?? ()
#5 0x01007c45eb8db348 in ?? ()
#6 0x00007c45eb8dc920 in ?? ()
#7 0x00007c45ec5103c8 in ?? ()
#8 0x0000000000000005 in ?? ()
#9 0x00007c45eb8dca50 in ?? ()
#10 0x00007c45ec03b640 in ?? ()
#11 0x0000000000000000 in ?? ()
Check duplicate issues.
Description
The ml-dataloader test is persistently failing on alma10 with auto-registration off, on both 6.40 and main.
I was able to reproduce the issue, but even with a Debug build of ROOT and LLVM, the symbols in the stack trace are not resolved (memory corruption?). The last call from Python is self.engine.GetTrainBatch(). See the Python trace output and stack trace:
Reproducer
Follow the instructions to reproduce at https://github.com/root-project/root/actions/runs/26613028722/job/78465105221
ROOT version
6.40 and main
Installation method
build from source
Operating system
alma10
Additional context
No response