"""
  (C) Copyright 2020-2024 Intel Corporation.
  (C) Copyright 2026 Hewlett Packard Enterprise Development LP

  SPDX-License-Identifier: BSD-2-Clause-Patent
"""
from avocado import fail_on
from daos_core_base import DaosCoreBase
from exception_utils import CommandFailure
from general_utils import get_host_data, get_journalctl_command, journalctl_time


class CsumErrorLog(DaosCoreBase):
    """
    Test Class Description: Test checksum error logging.

    :avocado: recursive
    """

    @fail_on(CommandFailure)
    def get_checksum_error_value(self, t_start, t_end):
        """Query journalctl logs and count checksum error occurrences.

        Builds a journalctl command for the "daos_server" unit bounded by the given time window,
        runs it against self.hostlist_servers, and sums the number of times the "CSUM error"
        marker appears in the "data" field of every returned entry.

        Args:
            t_start (str): The start time for the journalctl query.
            t_end (str): The end time for the journalctl query.

        Returns:
            int: the number of checksum errors found; 0 when no entry contains the marker or no
                entries are returned
        """
        cmd = get_journalctl_command(t_start, t_end, system=True, units="daos_server")
        results = get_host_data(
            self.hostlist_servers, cmd, text="journalctl",
            error="Error gathering system log events")
        # Keep the raw gathered output in the test log to help diagnose an unexpected count.
        self.log.debug(results)
        str_to_match = "CSUM error"
        # NOTE(review): presumably each entry's "data" is the journalctl text for one host (or
        # one group of hosts) - confirm against get_host_data().
        return sum(host_result["data"].count(str_to_match) for host_result in results)

    @fail_on(CommandFailure)
    def test_csum_error_logging(self):
        """Jira ID: DAOS-3927, DAOS-18881.

        Test Description: Inject checksum errors using daos_test -z and verify that the errors are
            logged in the system journal.

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium
        :avocado: tags=checksum,faults,daos_test
        :avocado: tags=CsumErrorLog,test_csum_error_logging
        """
        # Bracket the subtest with timestamps so only journal entries logged during this run are
        # counted.
        t_start = journalctl_time()
        self.log_step('Run the test (daos_test -z)')
        self.run_subtest()
        t_end = journalctl_time()
        self.log_step('Check checksum error logs')
        checksum_errs = self.get_checksum_error_value(t_start, t_end)
        self.log.info('Checksum Errors reported: %d', checksum_errs)
        self.assertGreater(checksum_errs, 0, 'Checksum Errors not detected')