Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
63ae7a8
DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS si…
jgmoore-or Dec 18, 2025
7da4dd7
DAOS-17931 engine: engine: Terminate engine process upon receipt of S…
jgmoore-or Jan 16, 2026
101717a
DAOS-17381 engine: Terminate engine process upon receipt of SIGBUS si…
jgmoore-or Feb 5, 2026
efb4239
DAOS-17391 engine: Terminate engine process upon receipt of SIGBUS si…
jgmoore-or Feb 12, 2026
fa625d3
Merge branch 'release/2.6' into jgm/DAOS-17931-2.6
jgmoore-or May 5, 2026
caf2e98
DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS si…
jgmoore-or May 5, 2026
901d47c
DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS si…
jgmoore-or May 5, 2026
d5c324b
DAOS-19001 vos: set dth_need_validation when evict active DTX - b26
Nasf-Fan May 25, 2026
c4fa87f
DAOS-19059 vos: cache vos object after DTX commit - b26
Nasf-Fan Jun 3, 2026
2f6af62
DAOS-19088 ci: Pin mercury to latest 15.5 version
grom72 Jun 8, 2026
fb8e762
Fix: narrow solution to only mercury-devel RPM
grom72 Jun 8, 2026
921dabd
DAOS-19036 dtx: handle DTX race issues - b26
Nasf-Fan Jun 9, 2026
78c4b4b
Merge branch 'Nasf-Fan/DAOS-19059_b26' into rpadma2/daos_265_p1
rpadma2 Jun 9, 2026
8545aa2
Merge branch 'Nasf-Fan/DAOS-19001_b26' into rpadma2/daos_265_p1
rpadma2 Jun 9, 2026
08addb9
Merge branch 'Nasf-Fan/DAOS-19036_1_b26' into rpadma2/daos_265_p1
rpadma2 Jun 9, 2026
ed32678
Merge branch 'jgm/DAOS-17931-2.6' into rpadma2/daos_265_p1
rpadma2 Jun 9, 2026
4ae2929
Merge branch 'grom72/DAOS-19088' into rpadma2/daos_265_p1
rpadma2 Jun 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions src/dtx/dtx_coll.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2023-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -43,6 +43,7 @@ struct dtx_coll_local_args {
struct dtx_id dcla_xid;
daos_epoch_t dcla_epoch;
uint32_t dcla_opc;
uint32_t dcla_ver;
int *dcla_results;
};

Expand Down Expand Up @@ -368,7 +369,7 @@ dtx_coll_local_one(void *args)
rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, false, NULL);
break;
case DTX_COLL_ABORT:
rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch);
rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch, dcla->dcla_ver);
break;
case DTX_COLL_CHECK:
rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, false);
Expand Down Expand Up @@ -404,7 +405,8 @@ dtx_coll_local_one(void *args)

int
dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch,
uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results)
uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap,
int **p_results)
{
struct dtx_coll_local_args dcla = { 0 };
struct dss_coll_ops coll_ops = { 0 };
Expand All @@ -419,6 +421,7 @@ dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epo
uuid_copy(dcla.dcla_co_uuid, co_uuid);
dcla.dcla_xid = *xid;
dcla.dcla_epoch = epoch;
dcla.dcla_ver = version;
dcla.dcla_opc = opc;

coll_ops.co_func = dtx_coll_local_one;
Expand Down
5 changes: 3 additions & 2 deletions src/dtx/dtx_internal.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -303,7 +303,8 @@ int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid,
struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver,
uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce);
int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch,
uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results);
uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap,
int **p_results);
/* clang-format on */

enum dtx_status_handle_result {
Expand Down
19 changes: 16 additions & 3 deletions src/dtx/dtx_resync.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -41,6 +41,7 @@ struct dtx_resync_args {
daos_epoch_t epoch;
uint32_t resync_version;
uint32_t discard_version;
bool for_all;
};

static inline void
Expand Down Expand Up @@ -391,7 +392,8 @@ dtx_status_handle(struct dtx_resync_args *dra)

d_list_for_each_entry_safe(dre, next, &drh->drh_list, dre_link) {
if (dre->dre_dte.dte_ver < dra->discard_version) {
err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch);
err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch,
dre->dre_dte.dte_ver);
if (err == -DER_NONEXIST)
err = 0;
if (err != 0)
Expand Down Expand Up @@ -532,7 +534,17 @@ dtx_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args)
if (dra->resync_version == dra->discard_version)
return 0;

/* Skip unprepared entry which version is at least not older than discard version. */
/*
* The DTX version may be refreshed via obj_handle_resend(). That means either the DTX
* was generated against the latest pool map, or the related IO RPC was resent by the
* client after the pool map changed. In both cases, a DTX resync that is triggered by
* pool map changes (@for_all is false) should not handle such a DTX; otherwise it may
* race with the regular IO handler and make a conflicting commit/abort decision.
*/
if ((ent->ie_dtx_ver > dra->resync_version) ||
(ent->ie_dtx_ver == dra->resync_version && !dra->for_all))
return 0;

if (ent->ie_dtx_tgt_cnt == 0)
return 0;

Expand Down Expand Up @@ -658,6 +670,7 @@ dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool
dra.epoch = d_hlc_get();
D_INIT_LIST_HEAD(&dra.tables.drh_list);
dra.tables.drh_count = 0;
dra.for_all = !block;

/*
* Trigger DTX reindex. That will avoid DTX_CHECK from others being blocked.
Expand Down
19 changes: 10 additions & 9 deletions src/dtx/dtx_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch)
* to resend sometime later.
*/
if (epoch != 0)
rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch);
rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch, dte->dte_ver);
else
rc1 = vos_dtx_set_flags(cont->sc_hdl, &dte->dte_xid, 1, DTE_CORRUPTED);
if (rc1 > 0 || rc1 == -DER_NONEXIST)
Expand Down Expand Up @@ -1227,7 +1227,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
d_list_del(&dsp->dsp_link);
dtx_dsp_free(dsp);
} else {
rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch);
rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch,
dsp->dsp_version);
D_ASSERT(rc1 != -DER_NO_PERM);

if (rc1 == 0 || !for_io) {
Expand Down Expand Up @@ -1643,8 +1644,8 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d
if (dce->dce_bitmap != NULL) {
clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id);
len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0,
DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap,
&results);
dce->dce_ver, DTX_COLL_COMMIT, dce->dce_bitmap_sz,
dce->dce_bitmap, &results);
if (len < 0) {
rc1 = len;
} else {
Expand Down Expand Up @@ -1726,8 +1727,8 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc
if (dce->dce_bitmap != NULL) {
clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id);
len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch,
DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap,
&results);
dce->dce_ver, DTX_COLL_ABORT, dce->dce_bitmap_sz,
dce->dce_bitmap, &results);
if (len < 0) {
rc1 = len;
} else {
Expand All @@ -1747,7 +1748,7 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc
}

if (epoch != 0)
rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch);
rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch, dce->dce_ver);
else
rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED);
if (rc2 > 0 || rc2 == -DER_NONEXIST)
Expand Down Expand Up @@ -1783,8 +1784,8 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc

if (dce->dce_bitmap != NULL) {
len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch,
DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap,
&results);
dce->dce_ver, DTX_COLL_CHECK, dce->dce_bitmap_sz,
dce->dce_bitmap, &results);
if (len < 0) {
rc1 = len;
} else {
Expand Down
6 changes: 3 additions & 3 deletions src/dtx/dtx_srv.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -241,7 +241,7 @@ dtx_handler(crt_rpc_t *rpc)

rc = vos_dtx_abort(cont->sc_hdl,
(struct dtx_id *)din->di_dtx_array.ca_arrays,
din->di_epoch);
din->di_epoch, din->di_version);
} else {
rc = vos_dtx_set_flags(cont->sc_hdl,
(struct dtx_id *)din->di_dtx_array.ca_arrays,
Expand Down Expand Up @@ -464,7 +464,7 @@ dtx_coll_handler(crt_rpc_t *rpc)
}

len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch,
opc, bitmap_sz, bitmap, &results);
dci->dci_version, opc, bitmap_sz, bitmap, &results);
if (len < 0)
D_GOTO(out, rc = len);

Expand Down
15 changes: 11 additions & 4 deletions src/engine/init.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1204,6 +1204,7 @@ int
main(int argc, char **argv)
{
sigset_t set;
bool exit_failure = false;
int sig;
int rc;

Expand Down Expand Up @@ -1238,6 +1239,7 @@ main(int argc, char **argv)

/** wait for shutdown signal */
sigemptyset(&set);
sigaddset(&set, SIGBUS);
sigaddset(&set, SIGINT);
sigaddset(&set, SIGTERM);
sigaddset(&set, SIGUSR1);
Expand All @@ -1248,7 +1250,6 @@ main(int argc, char **argv)
D_ERROR("failed to wait for signals: %d\n", rc);
break;
}

/* open specific file to dump ABT infos and ULTs stacks */
if (sig == SIGUSR1 || sig == SIGUSR2) {
struct timeval tv;
Expand Down Expand Up @@ -1322,12 +1323,18 @@ main(int argc, char **argv)
continue;
}

/* SIGINT/SIGTERM cause server shutdown */
/* Log error for SIGBUS occurrence */
if (sig == SIGBUS) {
D_ERROR("SIGBUS signal received; proceeding to shutdown.\n");
exit_failure = true;
}

/* SIGINT/SIGTERM/SIGBUS cause server shutdown */
break;
}

/** shutdown */
server_fini(true);

exit(EXIT_SUCCESS);
exit(exit_failure ? EXIT_FAILURE : EXIT_SUCCESS);
}
5 changes: 3 additions & 2 deletions src/include/daos_srv/vos.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2015-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -201,11 +201,12 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool keep_act
* \param coh [IN] Container open handle.
* \param dti [IN] The DTX identifiers to be aborted.
* \param epoch [IN] The max epoch for the DTX to be aborted.
* \param version [IN] The max version for the DTX to be aborted.
*
* \return Zero on success, negative value if error.
*/
int
vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch);
vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version);

/**
* Discard the active DTX entry's records if invalid.
Expand Down
Loading
Loading