From 0dc8459732b4b0c62832ade38343914e4e51108e Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Sat, 6 Jun 2026 04:18:37 +0000 Subject: [PATCH 1/7] DAOS-18881 test: fix the csum_error_logging test Since #17828 faults injected by this test do not increase the NVMe device error counter. Instead, a RAS event is emitted. The test has to be adjusted accordingly. Test-tag: test_csum_error_logging Skip-unit-tests:true Skip-NLT: true Skip-unit-test-memcheck: true Skip-func-vm: true Skip-fault-injection-test: true Skip-test-rpms: true Signed-off-by: Jan Michalski --- .../ftest/checksum/csum_error_logging.py | 99 ++++++------------- .../ftest/checksum/csum_error_logging.yaml | 2 + 2 files changed, 31 insertions(+), 70 deletions(-) diff --git a/src/tests/ftest/checksum/csum_error_logging.py b/src/tests/ftest/checksum/csum_error_logging.py index 567f81e9702..c0e22b75527 100644 --- a/src/tests/ftest/checksum/csum_error_logging.py +++ b/src/tests/ftest/checksum/csum_error_logging.py @@ -1,102 +1,61 @@ """ (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ from avocado import fail_on from daos_core_base import DaosCoreBase -from dmg_utils import get_dmg_smd_info from exception_utils import CommandFailure -from general_utils import get_log_file +from general_utils import get_host_data, get_journalctl_command, journalctl_time class CsumErrorLog(DaosCoreBase): """ - Test Class Description: This test runs - daos_test -z (Checksum tests) and verifies - whether Checksum Error Counters are incremented - in the NVME device due to checksum fault injection. + Test Class Description: Test checksum error logging. + :avocado: recursive """ - # pylint: disable=too-many-instance-attributes @fail_on(CommandFailure) - def get_checksum_error_value(self, dmg, device_id): - """Get checksum error value from dmg storage_query_list_devices with health. 
+ def get_checksum_error_value(self, t_start, t_end): + """Query journalctl logs and count checksum error occurrences. Args: - dmg (DmgCommand): the DmgCommand object used to call storage_query_list_devices() - device_id (str): Device UUID. + t_start (str): The start time for the journalctl query. + t_end (str): The end time for the journalctl query. Returns: - int: the number of checksum errors on the device + int: the number of checksum errors found """ - info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=device_id, - health=True) - for devices in info.values(): - for device in devices: - try: - if device['uuid'] == device_id: - return device['ctrlr']['health_stats']['checksum_errs'] - except KeyError as error: - self.fail( - 'Error parsing dmg storage query list-devices --health output: {}'.format( - error)) - return 0 + cmd = get_journalctl_command(t_start, t_end, system=True, units="daos_server") + results = get_host_data(self.hostlist_servers, cmd, text="journalctl", + error="Error gathering system log events") + self.log.debug(results) + str_to_match = "CSUM error" + occurrence = 0 + for host_result in results: + occurrence += host_result["data"].count(str_to_match) + return occurrence @fail_on(CommandFailure) def test_csum_error_logging(self): - """Jira ID: DAOS-3927. + """Jira ID: DAOS-3927, DAOS-18881. - Test Description: Write Avocado Test to verify single data after - pool/container disconnect/reconnect. + Test Description: Inject checksum errors using daos_test -z and verify that the errors are + logged in the system journal. 
:avocado: tags=all,daily_regression :avocado: tags=hw,medium :avocado: tags=checksum,faults,daos_test :avocado: tags=CsumErrorLog,test_csum_error_logging """ - self.log_step('Detecting server devices (dmg storage query list-devices)') - test_run = False - dmg = self.get_dmg_command() - dmg.hostlist = self.hostlist_servers[0] - host_devices = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices') - for host, devices in host_devices.items(): - for device in devices: - for entry in ('uuid', 'tgt_ids', 'role_bits'): - if entry not in device: - self.fail( - 'Missing {} info from dmg storage query list devices'.format(entry)) - self.log.info( - 'Host %s device: uuid=%s, targets=%s, role_bits=%s', - host, device['uuid'], device['tgt_ids'], device['role_bits']) - if not device['tgt_ids']: - self.log_step('Skipping device without targets on {}'.format(device['uuid'])) - continue - if (int(device['role_bits']) > 0) and not int(device['role_bits']) & 1: - self.log_step( - 'Skipping {} device without data on {}'.format( - device['role_bits'], device['uuid'])) - continue - if not device['uuid']: - self.fail('Device uuid undefined') - self.log_step( - 'Get checksum errors before running the test (dmg storage query list-devices ' - '--health)') - check_sum = self.get_checksum_error_value(dmg, device['uuid']) - dmg.copy_certificates(get_log_file("daosCA/certs"), self.hostlist_clients) - dmg.copy_configuration(self.hostlist_clients) - self.log.info("Checksum Errors before: %d", check_sum) - self.log_step('Run the test (daos_test -z)') - self.run_subtest() - test_run = True - self.log_step( - 'Get checksum errors after running the test (dmg storage query list-devices ' - '--health)') - check_sum_latest = self.get_checksum_error_value(dmg, device['uuid']) - self.log.info('Checksum Errors after: %d', check_sum_latest) - self.assertTrue(check_sum_latest > check_sum, 'Checksum Error Log not incremented') - if not test_run: - self.fail('No tests run for the devices found') - 
self.log_step('Test Passed') + t_start = journalctl_time() + self.log_step('Run the test (daos_test -z)') + self.run_subtest() + t_end = journalctl_time() + self.log_step('Check checksum error logs') + checksum_errs = self.get_checksum_error_value(t_start, t_end) + self.log.info('Checksum Errors reported: %d', checksum_errs) + self.assertTrue(checksum_errs > 0, 'Checksum Errors not detected') diff --git a/src/tests/ftest/checksum/csum_error_logging.yaml b/src/tests/ftest/checksum/csum_error_logging.yaml index 2a3a790b656..5bd0966010a 100644 --- a/src/tests/ftest/checksum/csum_error_logging.yaml +++ b/src/tests/ftest/checksum/csum_error_logging.yaml @@ -13,6 +13,8 @@ server_config: targets: 1 nr_xs_helpers: 1 storage: auto + env_vars: + - DAOS_DMA_INIT_PCT=20 # workaround for DAOS-18194 # Run only checksum tests daos_tests: num_clients: 1 From 56701a1a7a91f86db558a10112d6bb026cb77276 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Sat, 6 Jun 2026 05:11:37 +0000 Subject: [PATCH 2/7] SRE-3827 test: workaround Should not be landed. Master Jenkinsfile is not able to run a single test employing fault injection. Please see the ticket for more details. Test-tag: test_csum_error_logging Skip-unit-tests:true Skip-NLT: true Skip-unit-test-memcheck: true Skip-func-vm: true Skip-fault-injection-test: true Skip-test-rpms: true Signed-off-by: Jan Michalski --- src/tests/ftest/util/launch_utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index cd8bf8eeed2..3b2e8b2384c 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -44,14 +44,16 @@ def fault_injection_enabled(logger): Returns: bool: whether or not fault injection is enabled """ - logger.debug("-" * 80) - logger.debug("Checking for fault injection enablement via 'fault_status':") - if run_local(logger, "fault_status").passed: - logger.debug(" Fault injection is enabled") - return True - # Command failed or yielded a non-zero return status - logger.debug(" Fault injection is disabled") - return False + # Workaround for SRE-3827. Needs to be reverted before the landing. + # + # logger.debug("-" * 80) + # logger.debug("Checking for fault injection enablement via 'fault_status':") + # if run_local(logger, "fault_status").passed: + # logger.debug(" Fault injection is enabled") + # return True + # # Command failed or yielded a non-zero return status + # logger.debug(" Fault injection is disabled") + return True def setup_fuse_config(logger, hosts): From ad8ce0094b387649b257b3fc27f0c526ac941684 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Sun, 7 Jun 2026 07:14:18 +0000 Subject: [PATCH 3/7] Revert "SRE-3827 test: workaround" This reverts commit 56701a1a7a91f86db558a10112d6bb026cb77276. Doc-only: true --- src/tests/ftest/util/launch_utils.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index 3b2e8b2384c..cd8bf8eeed2 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. 
- (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -44,16 +44,14 @@ def fault_injection_enabled(logger): Returns: bool: whether or not fault injection is enabled """ - # Workaround for SRE-3827. Needs to be reverted before the landing. - # - # logger.debug("-" * 80) - # logger.debug("Checking for fault injection enablement via 'fault_status':") - # if run_local(logger, "fault_status").passed: - # logger.debug(" Fault injection is enabled") - # return True - # # Command failed or yielded a non-zero return status - # logger.debug(" Fault injection is disabled") - return True + logger.debug("-" * 80) + logger.debug("Checking for fault injection enablement via 'fault_status':") + if run_local(logger, "fault_status").passed: + logger.debug(" Fault injection is enabled") + return True + # Command failed or yielded a non-zero return status + logger.debug(" Fault injection is disabled") + return False def setup_fuse_config(logger, hosts): From 608a91b1fc419535d35c1d821f5bc60eac2acff7 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Mon, 8 Jun 2026 15:39:10 +0000 Subject: [PATCH 4/7] DAOS-18881 test: remove DAOS-18194 WA Signed-off-by: Jan Michalski --- src/tests/ftest/checksum/csum_error_logging.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tests/ftest/checksum/csum_error_logging.yaml b/src/tests/ftest/checksum/csum_error_logging.yaml index 5bd0966010a..2a3a790b656 100644 --- a/src/tests/ftest/checksum/csum_error_logging.yaml +++ b/src/tests/ftest/checksum/csum_error_logging.yaml @@ -13,8 +13,6 @@ server_config: targets: 1 nr_xs_helpers: 1 storage: auto - env_vars: - - DAOS_DMA_INIT_PCT=20 # workaround for DAOS-18194 # Run only checksum tests daos_tests: num_clients: 1 From 5d5acffe8ab0350b0c0ac8d335690626a6c17cb8 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Mon, 8 Jun 2026 15:42:47 +0000 Subject: [PATCH 5/7] DAOS-18881 
test: assertTrue -> assertGreater Signed-off-by: Jan Michalski --- src/tests/ftest/checksum/csum_error_logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/checksum/csum_error_logging.py b/src/tests/ftest/checksum/csum_error_logging.py index c0e22b75527..a3430fbe801 100644 --- a/src/tests/ftest/checksum/csum_error_logging.py +++ b/src/tests/ftest/checksum/csum_error_logging.py @@ -58,4 +58,4 @@ def test_csum_error_logging(self): self.log_step('Check checksum error logs') checksum_errs = self.get_checksum_error_value(t_start, t_end) self.log.info('Checksum Errors reported: %d', checksum_errs) - self.assertTrue(checksum_errs > 0, 'Checksum Errors not detected') + self.assertGreater(checksum_errs, 0, 'Checksum Errors not detected') From 15ffb8db2e012b44a3da2c535015178aade46fba Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Sat, 6 Jun 2026 05:11:37 +0000 Subject: [PATCH 6/7] SRE-3827 test: workaround Should not be landed. Master Jenkinsfile is not able to run a single test employing fault injection. Please see the ticket for more details. Test-tag: test_csum_error_logging Skip-unit-tests:true Skip-NLT: true Skip-unit-test-memcheck: true Skip-func-vm: true Skip-fault-injection-test: true Skip-test-rpms: true Signed-off-by: Jan Michalski --- src/tests/ftest/util/launch_utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index cd8bf8eeed2..3b2e8b2384c 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. 
- (C) Copyright 2025 Hewlett Packard Enterprise Development LP + (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -44,14 +44,16 @@ def fault_injection_enabled(logger): Returns: bool: whether or not fault injection is enabled """ - logger.debug("-" * 80) - logger.debug("Checking for fault injection enablement via 'fault_status':") - if run_local(logger, "fault_status").passed: - logger.debug(" Fault injection is enabled") - return True - # Command failed or yielded a non-zero return status - logger.debug(" Fault injection is disabled") - return False + # Workaround for SRE-3827. Needs to be reverted before the landing. + # + # logger.debug("-" * 80) + # logger.debug("Checking for fault injection enablement via 'fault_status':") + # if run_local(logger, "fault_status").passed: + # logger.debug(" Fault injection is enabled") + # return True + # # Command failed or yielded a non-zero return status + # logger.debug(" Fault injection is disabled") + return True def setup_fuse_config(logger, hosts): From a954c075734cde81f7042f0ba2eb12c9e7fec9c8 Mon Sep 17 00:00:00 2001 From: Jan Michalski Date: Mon, 8 Jun 2026 16:41:36 +0000 Subject: [PATCH 7/7] Revert "SRE-3827 test: workaround" This reverts commit 15ffb8db2e012b44a3da2c535015178aade46fba. Doc-only: true Skip-build: true --- src/tests/ftest/util/launch_utils.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index 3b2e8b2384c..cd8bf8eeed2 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -1,6 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. 
- (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -44,16 +44,14 @@ def fault_injection_enabled(logger): Returns: bool: whether or not fault injection is enabled """ - # Workaround for SRE-3827. Needs to be reverted before the landing. - # - # logger.debug("-" * 80) - # logger.debug("Checking for fault injection enablement via 'fault_status':") - # if run_local(logger, "fault_status").passed: - # logger.debug(" Fault injection is enabled") - # return True - # # Command failed or yielded a non-zero return status - # logger.debug(" Fault injection is disabled") - return True + logger.debug("-" * 80) + logger.debug("Checking for fault injection enablement via 'fault_status':") + if run_local(logger, "fault_status").passed: + logger.debug(" Fault injection is enabled") + return True + # Command failed or yielded a non-zero return status + logger.debug(" Fault injection is disabled") + return False def setup_fuse_config(logger, hosts):