diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 106205e0c5b..06af0a09a3c 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -43,6 +43,7 @@ struct dtx_coll_local_args { struct dtx_id dcla_xid; daos_epoch_t dcla_epoch; uint32_t dcla_opc; + uint32_t dcla_ver; int *dcla_results; }; @@ -368,7 +369,7 @@ dtx_coll_local_one(void *args) rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, false, NULL); break; case DTX_COLL_ABORT: - rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch, dcla->dcla_ver); break; case DTX_COLL_CHECK: rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, false); @@ -404,7 +405,8 @@ dtx_coll_local_one(void *args) int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results) { struct dtx_coll_local_args dcla = { 0 }; struct dss_coll_ops coll_ops = { 0 }; @@ -419,6 +421,7 @@ dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epo uuid_copy(dcla.dcla_co_uuid, co_uuid); dcla.dcla_xid = *xid; dcla.dcla_epoch = epoch; + dcla.dcla_ver = version; dcla.dcla_opc = opc; coll_ops.co_func = dtx_coll_local_one; diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 9f25c3b19a6..4bdd3aa1a2f 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -303,7 +303,8 @@ int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results); /* clang-format on */ enum dtx_status_handle_result { diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 444e5a143e0..8bcf0de4b98 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -41,6 +41,7 @@ struct dtx_resync_args { daos_epoch_t epoch; uint32_t resync_version; uint32_t discard_version; + bool for_all; }; static inline void @@ -391,7 +392,8 @@ dtx_status_handle(struct dtx_resync_args *dra) d_list_for_each_entry_safe(dre, next, &drh->drh_list, dre_link) { if (dre->dre_dte.dte_ver < dra->discard_version) { - err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch); + err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch, + dre->dre_dte.dte_ver); if (err == -DER_NONEXIST) err = 0; if (err != 0) @@ -532,7 +534,17 @@ dtx_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) if (dra->resync_version == dra->discard_version) return 0; - /* Skip unprepared entry which version is at least not older than discard version. */ + /* + * The DTX version may be refreshed via obj_handle_resend(). 
It means that either the + * DTX is generated against the latest pool map or related IO RPC is resent by client + * after pool map changed. Under both cases, the DTX resync that is triggered for pool + * map changes (@for_all is false) should not handle such DTX to avoid making conflict + * commit/abort decision (against regular IO handler) by race. + */ + if ((ent->ie_dtx_ver > dra->resync_version) || + (ent->ie_dtx_ver == dra->resync_version && !dra->for_all)) + return 0; + if (ent->ie_dtx_tgt_cnt == 0) return 0; @@ -658,6 +670,7 @@ dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool dra.epoch = d_hlc_get(); D_INIT_LIST_HEAD(&dra.tables.drh_list); dra.tables.drh_count = 0; + dra.for_all = !block; /* * Trigger DTX reindex. That will avoid DTX_CHECK from others being blocked. diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index a7cb78ae373..72348cd9d73 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -955,7 +955,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) * to resend sometime later. 
*/ if (epoch != 0) - rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch, dte->dte_ver); else rc1 = vos_dtx_set_flags(cont->sc_hdl, &dte->dte_xid, 1, DTE_CORRUPTED); if (rc1 > 0 || rc1 == -DER_NONEXIST) @@ -1227,7 +1227,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che d_list_del(&dsp->dsp_link); dtx_dsp_free(dsp); } else { - rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch, + dsp->dsp_version); D_ASSERT(rc1 != -DER_NO_PERM); if (rc1 == 0 || !for_io) { @@ -1643,8 +1644,8 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, - DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_COMMIT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1726,8 +1727,8 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_ABORT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1747,7 +1748,7 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } if (epoch != 0) - rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch); + rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch, dce->dce_ver); else rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED); if (rc2 > 0 || rc2 == -DER_NONEXIST) @@ -1783,8 +1784,8 @@ dtx_coll_check(struct ds_cont_child *cont, struct 
dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_CHECK, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index d48ab663d13..8ca87cebc38 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -241,7 +241,7 @@ dtx_handler(crt_rpc_t *rpc) rc = vos_dtx_abort(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, - din->di_epoch); + din->di_epoch, din->di_version); } else { rc = vos_dtx_set_flags(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, @@ -464,7 +464,7 @@ dtx_coll_handler(crt_rpc_t *rpc) } len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch, - opc, bitmap_sz, bitmap, &results); + dci->dci_version, opc, bitmap_sz, bitmap, &results); if (len < 0) D_GOTO(out, rc = len); diff --git a/src/engine/init.c b/src/engine/init.c index dd70dfe721e..2ad123eb161 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1204,6 +1204,7 @@ int main(int argc, char **argv) { sigset_t set; + bool exit_failure = false; int sig; int rc; @@ -1238,6 +1239,7 @@ main(int argc, char **argv) /** wait for shutdown signal */ sigemptyset(&set); + sigaddset(&set, SIGBUS); sigaddset(&set, SIGINT); sigaddset(&set, SIGTERM); sigaddset(&set, SIGUSR1); @@ -1248,7 +1250,6 @@ main(int argc, char **argv) D_ERROR("failed to wait for signals: %d\n", rc); break; } - /* open specific file to dump ABT infos and ULTs stacks */ if (sig == SIGUSR1 || sig == SIGUSR2) { struct timeval tv; @@ -1322,12 +1323,18 @@ main(int argc, char **argv) continue; } - /* SIGINT/SIGTERM cause server shutdown */ + /* Log error for SIGBUS occurrence */ + if (sig == SIGBUS) { + D_ERROR("SIGBUS signal received; proceeding to shutdown.\n"); + exit_failure = true; + } + + /* SIGINT/SIGTERM/SIGBUS cause server shutdown */ break; } /** shutdown */ server_fini(true); - exit(EXIT_SUCCESS); + exit(exit_failure ? EXIT_FAILURE : EXIT_SUCCESS); } diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 714e8c11799..019d6521bce 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -201,11 +201,12 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool keep_act * \param coh [IN] Container open handle. * \param dti [IN] The DTX identifiers to be aborted. * \param epoch [IN] The max epoch for the DTX to be aborted. + * \param version [IN] The max version for the DTX to be aborted. * * \return Zero on success, negative value if error. 
*/ int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch); +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version); /** * Discard the active DTX entry's records if invalid. diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a477da31be2..f0b989791c6 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2710,11 +2710,11 @@ enum obj_resend_status { }; static int -obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t *pm_ver, +obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t pm_ver, uint32_t *flags, struct dtx_memberships *mbs, bool leader, bool dist) { daos_epoch_t e; - uint32_t ver = *pm_ver; + uint32_t ver = pm_ver; int rc; if (!leader || dist || (flags != NULL && *flags & ORF_RESEND)) @@ -2731,16 +2731,13 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui /* For 'prepared' DTX, if pool map has been changed, then DTX membership maybe * changed also. Let's refresh it if necessary. */ - if (ver < *pm_ver) { - rc = vos_dtx_refresh_mbs(coh, dti, mbs, *pm_ver, leader); + if (ver < pm_ver) { + rc = vos_dtx_refresh_mbs(coh, dti, mbs, pm_ver, leader); if (rc < 0) goto out; if (rc > 0) rc = 0; - - if (leader && !dist) - *pm_ver = ver; } if (flags != NULL) { @@ -2768,7 +2765,7 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui D_GOTO(out, rc = -DER_INPROGRESS); /* Abort it if exist but with different epoch, then re-execute with new epoch. */ - rc = vos_dtx_abort(coh, dti, e); + rc = vos_dtx_abort(coh, dti, e, ver); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); /* Fall through */ @@ -2834,7 +2831,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) /* Handle resend. 
*/ if (orw->orw_flags & ORF_RESEND) { rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, - &orw->orw_map_ver, NULL, mbs, false, false); + orw->orw_map_ver, NULL, mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -3019,7 +3016,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) struct obj_pool_metrics *opm; int dti_cos_cnt; uint32_t tgt_cnt; - uint32_t version = 0; uint32_t max_ver = 0; struct dtx_epoch epoch = {0}; int rc; @@ -3132,7 +3128,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (rc != 0) D_GOTO(out, rc); - version = orw->orw_map_ver; max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { @@ -3151,9 +3146,8 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); again: - version = orw->orw_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, + orw->orw_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -3196,9 +3190,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, - version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, orw->orw_map_ver, + &orw->orw_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -3266,7 +3260,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) int rc1; dte.dte_xid = orw->orw_dti; - dte.dte_ver = version; + dte.dte_ver = orw->orw_map_ver;; dte.dte_refs = 1; dte.dte_mbs = mbs; rc1 = dtx_abort(ioc.ioc_coc, &dte, orw->orw_epoch); @@ -3840,7 +3834,7 @@ obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) if (opi->opi_flags & ORF_RESEND) { rc = 
obj_handle_resend(p_ioc->ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, - &opi->opi_map_ver, NULL, otpa->mbs, false, false); + opi->opi_map_ver, NULL, otpa->mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -4007,7 +4001,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) uint32_t tgt_cnt; uint32_t flags = 0; uint32_t dtx_flags = 0; - uint32_t version = 0; uint32_t max_ver = 0; struct dtx_epoch epoch; int rc; @@ -4047,7 +4040,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (rc == PE_OK_LOCAL) opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN; - version = opi->opi_map_ver; max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -4067,9 +4059,8 @@ ds_obj_punch_handler(crt_rpc_t *rpc) /* Handle resend. */ if (opi->opi_flags & ORF_RESEND) { again: - version = opi->opi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, + opi->opi_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -4112,9 +4103,9 @@ ds_obj_punch_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, opi->opi_map_ver, + &opi->opi_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -4176,7 +4167,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) int rc1; dte.dte_xid = opi->opi_dti; - dte.dte_ver = version; + dte.dte_ver = opi->opi_map_ver; dte.dte_refs = 1; dte.dte_mbs = mbs; rc1 = dtx_abort(ioc.ioc_coc, &dte, opi->opi_epoch); @@ -5097,7 +5088,7 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) 
D_ASSERT(epoch != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, &oci->oci_map_ver, + rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, oci->oci_map_ver, NULL, dcsh->dcsh_mbs, false, true); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); @@ -5228,7 +5219,7 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) * that the DTX has been restarted with newer epoch. */ rc = obj_handle_resend(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch.oe_value, &oci->oci_map_ver, &flags, + &dcsh->dcsh_epoch.oe_value, oci->oci_map_ver, &flags, dcsh->dcsh_mbs, true, true); if (rc < 0) goto out; @@ -5812,7 +5803,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) goto out; } - version = ocpi->ocpi_map_ver; max_ver = ocpi->ocpi_map_ver; if (ocpi->ocpi_flags & ORF_DTX_SYNC) @@ -5823,15 +5813,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { again: - version = ocpi->ocpi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, - &version, &flags, odm->odm_mbs, leader, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, &flags, odm->odm_mbs, leader, false); if (rc < 0) goto out; if (rc == ORS_DONE) D_GOTO(out, rc = 0); - - dce->dce_ver = version; } epoch.oe_value = ocpi->ocpi_epoch; @@ -5854,10 +5841,10 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) &exec_arg.coll_cur); rc = dtx_leader_begin(ioc.ioc_vos_coh, &odm->odm_xid, &epoch, - dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, version, + dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, ocpi->ocpi_map_ver, &ocpi->ocpi_oid, NULL /* dti_cos */, 0 /* dti_cos_cnt */, - NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, - dtx_flags, odm->odm_mbs, dce, &dlh); + NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, dtx_flags, + odm->odm_mbs, dce, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for collective punch: 
"DF_RC"\n", DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); @@ -5902,9 +5889,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (max_ver < ioc.ioc_map_ver) max_ver = ioc.ioc_map_ver; - if (max_ver < version) - max_ver = version; - DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index c267c812e81..d8207d84687 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1423,7 +1423,7 @@ dv_dtx_commit_active_entry(daos_handle_t coh, struct dtx_id *dti) int dv_dtx_abort_active_entry(daos_handle_t coh, struct dtx_id *dti) { - return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX); + return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX, 0); } int diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index d83d2356d15..60633dc6156 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -15,6 +16,8 @@ #include #include "vts_io.h" +#define VTS_DTX_VER 3 + static void vts_init_dte(struct dtx_entry *dte) { @@ -33,7 +36,7 @@ vts_init_dte(struct dtx_entry *dte) /** Use unique API so new UUID is generated even on same thread */ daos_dti_gen_unique(&dte->dte_xid); - dte->dte_ver = 1; + dte->dte_ver = VTS_DTX_VER; dte->dte_refs = 1; dte->dte_mbs = mbs; } @@ -357,8 +360,16 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) /* The update DTX is 'prepared'. */ vts_dtx_end(dth); + /* Abort with old epoch should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch - 1, VTS_DTX_VER); + assert_rc_equal(rc, -DER_NONEXIST); + + /* Abort with old version should fail. 
*/ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER - 1); + assert_rc_equal(rc, -DER_NONEXIST); + /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -390,7 +401,7 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) vts_dtx_end(dth); /* Aborted the punch DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -489,7 +500,7 @@ dtx_14(void **state) assert_memory_equal(update_buf, fetch_buf, UPDATE_BUF_SIZE); /* Committed DTX cannot be aborted. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -551,11 +562,11 @@ dtx_15(void **state) vts_dtx_end(dth); /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); /* Double aborted the DTX is harmless. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index e08d63fd673..6381831ff9a 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -28,18 +28,19 @@ #define DTX_UMOFF_TYPES (DTX_UMOFF_ILOG | DTX_UMOFF_SVT | DTX_UMOFF_EVT) #define DTX_INDEX_INVAL (int32_t)(-1) -#define dtx_evict_lid(cont, dae) \ - do { \ - if (dae->dae_dth != NULL && dae->dae_dth->dth_ent != NULL) { \ - D_ASSERT(dae->dae_dth->dth_ent == dae); \ - dae->dae_dth->dth_ent = NULL; \ - } \ - D_DEBUG(DB_IO, "Evicting DTX "DF_DTI": lid=%x\n", \ - DP_DTI(&DAE_XID(dae)), DAE_LID(dae)); \ - d_list_del_init(&dae->dae_link); \ - lrua_evictx(cont->vc_dtx_array, \ - (DAE_LID(dae) & DTX_LID_SOLO_MASK) - DTX_LID_RESERVED, \ - DAE_EPOCH(dae)); \ +#define dtx_evict_lid(cont, dae) \ + do { \ + if (dae->dae_dth != NULL && dae->dae_dth->dth_ent != NULL) { \ + D_ASSERT(dae->dae_dth->dth_ent == dae); \ + dae->dae_dth->dth_need_validation = 1; \ + dae->dae_dth->dth_ent = NULL; \ + } \ + D_DEBUG(DB_IO, "Evicting DTX " DF_DTI ": lid = %x\n", DP_DTI(&DAE_XID(dae)), \ + DAE_LID(dae)); \ + d_list_del_init(&dae->dae_link); \ + lrua_evictx(cont->vc_dtx_array, \ + (DAE_LID(dae) & DTX_LID_SOLO_MASK) - DTX_LID_RESERVED, \ + DAE_EPOCH(dae)); \ } while (0) bool vos_skip_old_partial_dtx; @@ -116,6 +117,9 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_version = DAE_VER(dae); dsp->dsp_dkey_hash = DAE_DKEY_HASH(dae); + if (unlikely(DAE_MBS_DSIZE(dae) == 0)) + goto add; + mbs = (struct dtx_memberships *)(dsp + 1); mbs->dm_tgt_cnt = DAE_TGT_CNT(dae); mbs->dm_grp_cnt = DAE_GRP_CNT(dae); @@ -135,6 +139,7 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_inline_mbs = 1; dsp->dsp_mbs = mbs; +add: d_list_add_tail(&dsp->dsp_link, &dth->dth_share_tbd_list); dth->dth_share_tbd_count++; @@ -1067,15 +1072,17 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) if (dth->dth_mbs != NULL) 
{ DAE_TGT_CNT(dae) = dth->dth_mbs->dm_tgt_cnt; DAE_GRP_CNT(dae) = dth->dth_mbs->dm_grp_cnt; - DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; DAE_MBS_FLAGS(dae) = dth->dth_mbs->dm_flags; } else { DAE_TGT_CNT(dae) = 1; DAE_GRP_CNT(dae) = 1; - DAE_MBS_DSIZE(dae) = 0; DAE_MBS_FLAGS(dae) = 0; } + /* Will set DAE_MBS_DSIZE and DAE_MBS_OFF via vos_dtx_prepared(). */ + DAE_MBS_DSIZE(dae) = 0; + DAE_MBS_OFF(dae) = UMOFF_NULL; + /* Will be set as dbd::dbd_index via vos_dtx_prepared(). */ DAE_INDEX(dae) = DTX_INDEX_INVAL; dae->dae_dth = dth; @@ -1793,20 +1800,23 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) dae->dae_oid_cnt = 1; } - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { - memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, - DAE_MBS_DSIZE(dae)); - } else { - rec_off = umem_zalloc(umm, DAE_MBS_DSIZE(dae)); - if (UMOFF_IS_NULL(rec_off)) { - D_ERROR("No space to store DTX mbs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - return -DER_NOSPACE; - } + if (dth->dth_mbs != NULL) { + if (dth->dth_mbs->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) { + memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + } else { + rec_off = umem_zalloc(umm, dth->dth_mbs->dm_data_size); + if (UMOFF_IS_NULL(rec_off)) { + D_ERROR("No space (%u) to store MBS for DTX " DF_DTI "\n", + dth->dth_mbs->dm_data_size, DP_DTI(&DAE_XID(dae))); + return -DER_NOSPACE; + } - memcpy(umem_off2ptr(umm, rec_off), - dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); - DAE_MBS_OFF(dae) = rec_off; + memcpy(umem_off2ptr(umm, rec_off), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + DAE_MBS_OFF(dae) = rec_off; + } + DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; } if (dae->dae_records != NULL) { @@ -1853,34 +1863,45 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) return rc; } -static struct dtx_memberships * -vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) +static int +vos_dtx_pack_mbs(struct 
umem_instance *umm, struct vos_dtx_act_ent *dae, + struct dtx_memberships **p_mbs) { struct dtx_handle *dth = dae->dae_dth; struct dtx_memberships *tmp; size_t size; - size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (dth != NULL) + size = sizeof(*tmp) + dth->dth_mbs->dm_data_size; + else + size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (unlikely(size == sizeof(*tmp))) + return -DER_NONEXIST; + D_ALLOC(tmp, size); if (tmp == NULL) - return NULL; + return -DER_NOMEM; tmp->dm_tgt_cnt = DAE_TGT_CNT(dae); tmp->dm_grp_cnt = DAE_GRP_CNT(dae); - tmp->dm_data_size = DAE_MBS_DSIZE(dae); tmp->dm_flags = DAE_MBS_FLAGS(dae); tmp->dm_dte_flags = DAE_FLAGS(dae); - /* The DTX is not prepared yet, copy the MBS from DTX handle. */ - if (dth != NULL) + /* The DTX may not be prepared yet; copy the MBS from the DTX handle. */ + if (dth != NULL) { + tmp->dm_data_size = dth->dth_mbs->dm_data_size; memcpy(tmp->dm_data, dth->dth_mbs->dm_data, tmp->dm_data_size); - else if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) - memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); - else - memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), - tmp->dm_data_size); + } else { + tmp->dm_data_size = DAE_MBS_DSIZE(dae); + if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) + memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); + else + memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), + tmp->dm_data_size); + } - return tmp; + *p_mbs = tmp; + return 0; } int @@ -1952,6 +1973,9 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (dae->dae_dth != NULL) return -DER_INPROGRESS; + if (pm_ver != NULL) + *pm_ver = DAE_VER(dae); + if (epoch != NULL) { daos_epoch_t e = *epoch; @@ -1972,11 +1996,8 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (pm_ver == NULL) return DTX_ST_PREPARED; - if (*pm_ver <= cont->vc_dtx_resync_ver) { - if (!for_refresh) - *pm_ver = DAE_VER(dae); + if (*pm_ver <= cont->vc_dtx_resync_ver) return 
DTX_ST_PREPARED; - } /* * Before DTX resync completed, it is not sure whether related DTX is @@ -2012,7 +2033,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, struct dtx_memberships **mbs) { struct vos_container *cont; - struct dtx_memberships *tmp; struct vos_dtx_act_ent *dae; d_iov_t kiov; d_iov_t riov; @@ -2026,14 +2046,9 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); if (rc == 0) { dae = riov.iov_buf; - tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae); - if (tmp == NULL) { - rc = -DER_NOMEM; - } else { - if (oid != NULL) - *oid = DAE_OID(dae); - *mbs = tmp; - } + rc = vos_dtx_pack_mbs(vos_cont2umm(cont), dae, mbs); + if (rc == 0 && oid != NULL) + *oid = DAE_OID(dae); } else if (rc == -DER_NONEXIST) { rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov); if (rc == 0) @@ -2121,6 +2136,7 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership goto out; dae_df->dae_mbs_off = UMOFF_NULL; + dae_df->dae_mbs_dsize = 0; } if (new_inline) { @@ -2143,10 +2159,9 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership out: if (started) { if (rc == 0) { + memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); rc = umem_tx_commit(umm); D_ASSERTF(rc == 0, "local TX commit failure: %d\n", rc); - - memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); } else { rc = umem_tx_abort(umm, rc); } @@ -2300,6 +2315,8 @@ vos_dtx_post_handle(struct vos_container *cont, struct vos_dtx_cmt_ent **dces, int count, bool abort, bool rollback, bool keep_act) { + struct umem_instance *umm = vos_cont2umm(cont); + struct vos_dtx_act_ent_df *dae_df; d_iov_t kiov; int rc; int i; @@ -2354,6 +2371,8 @@ vos_dtx_post_handle(struct vos_container *cont, } for (i = 0; i < count; i++) { + struct vos_dtx_act_ent *dae = NULL; + if (daes[i] == NULL) continue; @@ -2370,9 +2389,18 @@ vos_dtx_post_handle(struct vos_container *cont, } 
d_iov_set(&kiov, &DAE_XID(daes[i]), sizeof(DAE_XID(daes[i]))); - rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_EQ, - &kiov, NULL); + /* + * For abort case, set @args as NULL, then related vos object will be evicted from + * cache via dbtree_delete(). + */ + rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_EQ, &kiov, + abort ? NULL : &dae); if (rc == 0 || rc == -DER_NONEXIST) { + if (dae != NULL) { + D_ASSERT(dae == daes[i]); + dtx_act_ent_cleanup(cont, dae, NULL, false, false); + } + dtx_evict_lid(cont, daes[i]); } else { /* The DTX entry has been committed or aborted, but we @@ -2385,7 +2413,9 @@ vos_dtx_post_handle(struct vos_container *cont, D_ASSERT(daes[i]->dae_preparing == 0); - daes[i]->dae_prepared = 0; + dae_df = umem_off2ptr(umm, daes[i]->dae_df_off); + memcpy(&daes[i]->dae_base, dae_df, sizeof(*dae_df)); + if (abort) { D_ASSERT(daes[i]->dae_committing == 0); @@ -2503,7 +2533,7 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, } int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version) { struct vos_container *cont; struct vos_dtx_act_ent *dae = NULL; @@ -2549,6 +2579,9 @@ vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) if (epoch != DAOS_EPOCH_MAX && epoch != DAE_EPOCH(dae)) D_GOTO(out, rc = -DER_NONEXIST); + if (version != 0 && version < DAE_VER(dae)) + D_GOTO(out, rc = -DER_NONEXIST); + if (unlikely(dae->dae_preparing)) { /* * NOTE: Abort in-preparing DTX entry. 
It may because the non-leader is some slow, diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index d52fa5ba75e..f35904cb6eb 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -11,6 +11,7 @@ %global daos_build_args client test %endif %global mercury_version 2.4.1 +%global mercury_version_max 2.4.1-2 %global libfabric_version 1.20 %global argobots_version 1.2-3 %global __python %{__python3} @@ -37,6 +38,7 @@ BuildRequires: scons >= 2.4 %endif BuildRequires: libfabric-devel >= %{libfabric_version} BuildRequires: mercury-devel >= %{mercury_version} +BuildRequires: mercury-devel <= %{mercury_version_max} BuildRequires: gcc-c++ %if (0%{?rhel} >= 8) %global openmpi openmpi @@ -135,6 +137,7 @@ Requires: openssl Requires: mercury-libfabric >= %{mercury_version} + %description The Distributed Asynchronous Object Storage (DAOS) is an open-source software-defined object store designed from the ground up for