Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 29 additions & 70 deletions src/tests/ftest/checksum/csum_error_logging.py
Original file line number Diff line number Diff line change
@@ -1,102 +1,61 @@
"""
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2026 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""

from avocado import fail_on
from daos_core_base import DaosCoreBase
from dmg_utils import get_dmg_smd_info
from exception_utils import CommandFailure
from general_utils import get_log_file
from general_utils import get_host_data, get_journalctl_command, journalctl_time


class CsumErrorLog(DaosCoreBase):
"""
Test Class Description: This test runs
daos_test -z (Checksum tests) and verifies
whether Checksum Error Counters are incremented
in the NVME device due to checksum fault injection.
Test Class Description: Test checksum error logging.

:avocado: recursive
"""
# pylint: disable=too-many-instance-attributes

@fail_on(CommandFailure)
def get_checksum_error_value(self, dmg, device_id):
"""Get checksum error value from dmg storage_query_list_devices with health.
def get_checksum_error_value(self, t_start, t_end):
"""Query journalctl logs and count checksum error occurrences.

Args:
dmg (DmgCommand): the DmgCommand object used to call storage_query_list_devices()
device_id (str): Device UUID.
t_start (str): The start time for the journalctl query.
t_end (str): The end time for the journalctl query.

Returns:
int: the number of checksum errors on the device
int: the number of checksum errors found
"""
info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=device_id,
health=True)
for devices in info.values():
for device in devices:
try:
if device['uuid'] == device_id:
return device['ctrlr']['health_stats']['checksum_errs']
except KeyError as error:
self.fail(
'Error parsing dmg storage query list-devices --health output: {}'.format(
error))
return 0
cmd = get_journalctl_command(t_start, t_end, system=True, units="daos_server")
results = get_host_data(self.hostlist_servers, cmd, text="journalctl",
error="Error gathering system log events")
self.log.debug(results)
str_to_match = "CSUM error"
occurrence = 0
for host_result in results:
occurrence += host_result["data"].count(str_to_match)
return occurrence

@fail_on(CommandFailure)
def test_csum_error_logging(self):
"""Jira ID: DAOS-3927.
"""Jira ID: DAOS-3927, DAOS-18881.

Test Description: Write Avocado Test to verify single data after
pool/container disconnect/reconnect.
Test Description: Inject checksum errors using daos_test -z and verify that the errors are
logged in the system journal.

:avocado: tags=all,daily_regression
:avocado: tags=hw,medium
:avocado: tags=checksum,faults,daos_test
:avocado: tags=CsumErrorLog,test_csum_error_logging
"""
self.log_step('Detecting server devices (dmg storage query list-devices)')
test_run = False
dmg = self.get_dmg_command()
dmg.hostlist = self.hostlist_servers[0]
host_devices = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices')
for host, devices in host_devices.items():
for device in devices:
for entry in ('uuid', 'tgt_ids', 'role_bits'):
if entry not in device:
self.fail(
'Missing {} info from dmg storage query list devices'.format(entry))
self.log.info(
'Host %s device: uuid=%s, targets=%s, role_bits=%s',
host, device['uuid'], device['tgt_ids'], device['role_bits'])
if not device['tgt_ids']:
self.log_step('Skipping device without targets on {}'.format(device['uuid']))
continue
if (int(device['role_bits']) > 0) and not int(device['role_bits']) & 1:
self.log_step(
'Skipping {} device without data on {}'.format(
device['role_bits'], device['uuid']))
continue
if not device['uuid']:
self.fail('Device uuid undefined')
self.log_step(
'Get checksum errors before running the test (dmg storage query list-devices '
'--health)')
check_sum = self.get_checksum_error_value(dmg, device['uuid'])
dmg.copy_certificates(get_log_file("daosCA/certs"), self.hostlist_clients)
dmg.copy_configuration(self.hostlist_clients)
self.log.info("Checksum Errors before: %d", check_sum)
self.log_step('Run the test (daos_test -z)')
self.run_subtest()
test_run = True
self.log_step(
'Get checksum errors after running the test (dmg storage query list-devices '
'--health)')
check_sum_latest = self.get_checksum_error_value(dmg, device['uuid'])
self.log.info('Checksum Errors after: %d', check_sum_latest)
self.assertTrue(check_sum_latest > check_sum, 'Checksum Error Log not incremented')
if not test_run:
self.fail('No tests run for the devices found')
self.log_step('Test Passed')
t_start = journalctl_time()
self.log_step('Run the test (daos_test -z)')
self.run_subtest()
t_end = journalctl_time()
self.log_step('Check checksum error logs')
checksum_errs = self.get_checksum_error_value(t_start, t_end)
self.log.info('Checksum Errors reported: %d', checksum_errs)
self.assertGreater(checksum_errs, 0, 'Checksum Errors not detected')
Loading