From 33f4da0d47d2abec52f004866e76a3982417bb73 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Mon, 8 Jun 2026 14:06:33 +0800 Subject: [PATCH] DAOS-19036 dtx: handle DTX race issues - b28 Mainly including the following fixes: 1. When DTX leader switch, it is possible that the old DTX leader wanted to abort such DTX but not completed before its eviction. And then the new DTX leader may re-execute related modification successfully and try to commit such DTX. If without control, it is possible that those in-flight DTX ABORT RPC from the old DTX leader may abort the DTX that is to be committed by the new DTX leader, then break DTX semantics. The patch adds @version parameter when abort DTX: when new DTX leader handles resent RPC from client, related DTX version will be refreshed if it has been prepared by old DTX leader; anytime when abort DTX locally, the logic will compare the version from ABORT request with related DTX version and skip stale ABORT RPC. 2. vos_dtx_load_mbs() maybe triggered before related DTX prepared locally. Under such case, related MBS information is empty. We need to handle such case to avoid segmentation fault. 3. Handle race between DTX resync and IO handler for resent RPC. Signed-off-by: Fan Yong --- src/dtx/dtx_coll.c | 23 +++--- src/dtx/dtx_internal.h | 5 +- src/dtx/dtx_resync.c | 37 ++++++--- src/dtx/dtx_rpc.c | 19 +++-- src/dtx/dtx_srv.c | 6 +- src/include/daos_srv/vos.h | 11 +-- src/object/srv_obj.c | 106 +++++++++++------------- src/utils/ddb/ddb_vos.c | 2 +- src/vos/tests/vts_dtx.c | 22 +++-- src/vos/vos_dtx.c | 163 ++++++++++++++++++++++--------------- 10 files changed, 224 insertions(+), 170 deletions(-) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index f4f81e6a2fa..e641a9ade18 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -38,12 +38,13 @@ */ struct dtx_coll_local_args { - uuid_t dcla_po_uuid; - uuid_t dcla_co_uuid; - struct dtx_id dcla_xid; - daos_epoch_t dcla_epoch; - uint32_t dcla_opc; - int *dcla_results; + uuid_t dcla_po_uuid; + uuid_t dcla_co_uuid; + struct dtx_id dcla_xid; + daos_epoch_t dcla_epoch; + uint32_t dcla_ver; + uint32_t dcla_opc; + int *dcla_results; }; void @@ -361,7 +362,7 @@ dtx_coll_local_one(void *args) rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, false, NULL); break; case DTX_COLL_ABORT: - rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch, dcla->dcla_ver); break; case DTX_COLL_CHECK: rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, false); @@ -397,7 +398,8 @@ dtx_coll_local_one(void *args) int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results) { struct dtx_coll_local_args dcla = { 0 }; struct dss_coll_ops coll_ops = { 0 }; @@ -410,9 +412,10 @@ dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epo uuid_copy(dcla.dcla_po_uuid, po_uuid); uuid_copy(dcla.dcla_co_uuid, co_uuid); - dcla.dcla_xid = *xid; + dcla.dcla_xid = *xid; dcla.dcla_epoch = epoch; - dcla.dcla_opc = opc; + dcla.dcla_ver = version; + dcla.dcla_opc = opc; coll_ops.co_func = dtx_coll_local_one; coll_args.ca_func_args = &dcla; diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 1298b4b350a..6930976d08f 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -303,7 +303,8 @@ int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results); /* clang-format on */ enum dtx_status_handle_result { diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index aef563bb8ad..0b131a368d7 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -36,11 +36,12 @@ struct dtx_resync_head { }; struct dtx_resync_args { - struct ds_cont_child *cont; - struct dtx_resync_head tables; - daos_epoch_t epoch; - uint32_t resync_version; - uint32_t discard_version; + struct ds_cont_child *cont; + struct dtx_resync_head tables; + daos_epoch_t epoch; + uint32_t resync_version; + uint32_t discard_version; + bool for_all; }; static inline void @@ -329,8 +330,8 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un else rc = dtx_abort(cont, dte, epoch); - D_DEBUG(DB_TRACE, "As new leader for DTX "DF_DTI", abort it (2): "DF_RC"\n", - DP_DTI(&dte->dte_xid), DP_RC(rc)); + DL_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, rc, + "As new leader for DTX " DF_DTI ", abort it (2)", DP_DTI(&dte->dte_xid)); if (rc < 0) { if (err != NULL) @@ -392,7 +393,8 @@ dtx_status_handle(struct dtx_resync_args *dra) again: d_list_for_each_entry_safe(dre, next, &drh->drh_list, dre_link) { if (dre->dre_dte.dte_ver < dra->discard_version) { - err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch); + err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch, + dre->dre_dte.dte_ver); if (err == -DER_NONEXIST) err = 0; if (err != 0) @@ -538,7 +540,17 @@ dtx_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) if (dra->resync_version == dra->discard_version) return 0; - /* Skip unprepared entry which version is at least not older than discard version. */ + /* + * The DTX version maybe refreshed via obj_handle_resend(). It means that either the + * DTX is generated against the latest pool map or related IO RPC is resent by client + * after pool map changed. Under both cases, the DTX resync that is triggered for pool + * map changes (@for_all is false) should not handle such DTX to avoid making conflict + * commit/abort decision (against regular IO handler) by race. + */ + if ((ent->ie_dtx_ver > dra->resync_version) || + (ent->ie_dtx_ver == dra->resync_version && !dra->for_all)) + return 0; + if (ent->ie_dtx_tgt_cnt == 0) return 0; @@ -665,6 +677,11 @@ dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool D_INIT_LIST_HEAD(&dra.tables.drh_list); dra.tables.drh_count = 0; + if (block) + dra.for_all = false; + else + dra.for_all = true; + /* * Trigger DTX reindex. That will avoid DTX_CHECK from others being blocked. * It is harmless even if (committed) DTX entries have already been re-indexed. diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 140c93f0b7a..e6be4616a38 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -961,7 +961,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) * to resend sometime later. */ if (epoch != 0) - rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch, dte->dte_ver); else rc1 = vos_dtx_set_flags(cont->sc_hdl, &dte->dte_xid, 1, DTE_CORRUPTED); if (rc1 > 0 || rc1 == -DER_NONEXIST) @@ -1233,7 +1233,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che d_list_del(&dsp->dsp_link); dtx_dsp_free(dsp); } else { - rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch, + dsp->dsp_version); D_ASSERT(rc1 != -DER_NO_PERM); if (rc1 == 0 || !for_io) { @@ -1653,8 +1654,8 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, - DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_COMMIT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1736,8 +1737,8 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_ABORT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1757,7 +1758,7 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } if (epoch != 0) - rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch); + rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch, dce->dce_ver); else rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED); if (rc2 > 0 || rc2 == -DER_NONEXIST) @@ -1793,8 +1794,8 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_CHECK, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 5b68645cf54..0a04c47f2e9 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -241,7 +241,7 @@ dtx_handler(crt_rpc_t *rpc) rc = vos_dtx_abort(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, - din->di_epoch); + din->di_epoch, din->di_version); } else { rc = vos_dtx_set_flags(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, @@ -464,7 +464,7 @@ dtx_coll_handler(crt_rpc_t *rpc) } len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch, - opc, bitmap_sz, bitmap, &results); + dci->dci_version, opc, bitmap_sz, bitmap, &results); if (len < 0) D_GOTO(out, rc = len); diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 730c2e88742..647009d388a 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -199,14 +199,15 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool keep_act /** * Abort the specified DTXs. * - * \param coh [IN] Container open handle. - * \param dti [IN] The DTX identifiers to be aborted. - * \param epoch [IN] The max epoch for the DTX to be aborted. + * \param coh [IN] Container open handle. + * \param dti [IN] The DTX identifiers to be aborted. + * \param epoch [IN] The max epoch for the DTX to be aborted. + * \param version [IN] The max version for the DTX to be aborted. * * \return Zero on success, negative value if error. */ int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch); +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version); /** * Discard the active DTX entry's records if invalid. diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index e3c4fcfdb95..da6d682306d 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2712,11 +2712,11 @@ enum obj_resend_status { }; static int -obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t *pm_ver, +obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t pm_ver, uint32_t *flags, struct dtx_memberships *mbs, bool leader, bool dist) { daos_epoch_t e; - uint32_t ver = *pm_ver; + uint32_t ver = pm_ver; int rc; if (!leader || dist || (flags != NULL && *flags & ORF_RESEND)) @@ -2733,16 +2733,13 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui /* For 'prepared' DTX, if pool map has been changed, then DTX membership maybe * changed also. Let's refresh it if necessary. */ - if (ver < *pm_ver) { - rc = vos_dtx_refresh_mbs(coh, dti, mbs, *pm_ver, leader); + if (ver < pm_ver) { + rc = vos_dtx_refresh_mbs(coh, dti, mbs, pm_ver, leader); if (rc < 0) goto out; if (rc > 0) rc = 0; - - if (leader && !dist) - *pm_ver = ver; } if (flags != NULL) { @@ -2770,7 +2767,7 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui D_GOTO(out, rc = -DER_INPROGRESS); /* Abort it if exist but with different epoch, then re-execute with new epoch. */ - rc = vos_dtx_abort(coh, dti, e); + rc = vos_dtx_abort(coh, dti, e, ver); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); /* Fall through */ @@ -2836,7 +2833,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) /* Handle resend. */ if (orw->orw_flags & ORF_RESEND) { rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, - &orw->orw_map_ver, NULL, mbs, false, false); + orw->orw_map_ver, NULL, mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -3019,13 +3016,12 @@ ds_obj_rw_handler(crt_rpc_t *rpc) struct daos_shard_tgt *tgts = NULL; struct dtx_id *dti_cos = NULL; struct obj_pool_metrics *opm; - int dti_cos_cnt; - uint32_t tgt_cnt; - uint32_t version = 0; - uint32_t max_ver = 0; - struct dtx_epoch epoch = {0}; - int rc; - bool need_abort = false; + int dti_cos_cnt; + uint32_t tgt_cnt; + uint32_t max_ver = 0; + struct dtx_epoch epoch = {0}; + int rc; + bool need_abort = false; D_ASSERT(orw != NULL); D_ASSERT(orwo != NULL); @@ -3136,7 +3132,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (rc != 0) D_GOTO(out, rc); - version = orw->orw_map_ver; max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { @@ -3155,9 +3150,8 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); again: - version = orw->orw_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, + orw->orw_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -3200,9 +3194,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, - version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, orw->orw_map_ver, + &orw->orw_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -3269,10 +3263,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) struct dtx_entry dte; int rc1; - dte.dte_xid = orw->orw_dti; - dte.dte_ver = version; + dte.dte_xid = orw->orw_dti; + dte.dte_ver = orw->orw_map_ver; dte.dte_refs = 1; - dte.dte_mbs = mbs; + dte.dte_mbs = mbs; + rc1 = dtx_abort(ioc.ioc_coc, &dte, orw->orw_epoch); if (rc1 != 0 && rc1 != -DER_NONEXIST) D_WARN("Failed to abort DTX "DF_DTI": "DF_RC"\n", @@ -3844,7 +3839,7 @@ obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) if (opi->opi_flags & ORF_RESEND) { rc = obj_handle_resend(p_ioc->ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, - &opi->opi_map_ver, NULL, otpa->mbs, false, false); + opi->opi_map_ver, NULL, otpa->mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -4007,15 +4002,14 @@ ds_obj_punch_handler(crt_rpc_t *rpc) struct dtx_memberships *mbs = NULL; struct daos_shard_tgt *tgts = NULL; struct dtx_id *dti_cos = NULL; - int dti_cos_cnt; - uint32_t tgt_cnt; - uint32_t flags = 0; - uint32_t dtx_flags = 0; - uint32_t version = 0; - uint32_t max_ver = 0; - struct dtx_epoch epoch; - int rc; - bool need_abort = false; + int dti_cos_cnt; + uint32_t tgt_cnt; + uint32_t flags = 0; + uint32_t dtx_flags = 0; + uint32_t max_ver = 0; + struct dtx_epoch epoch; + int rc; + bool need_abort = false; opi = crt_req_get(rpc); D_ASSERT(opi != NULL); @@ -4053,7 +4047,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) dtx_flags |= DTX_EPOCH_OWNER; } - version = opi->opi_map_ver; max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -4073,9 +4066,8 @@ ds_obj_punch_handler(crt_rpc_t *rpc) /* Handle resend. */ if (opi->opi_flags & ORF_RESEND) { again: - version = opi->opi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, + opi->opi_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -4118,9 +4110,9 @@ ds_obj_punch_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, opi->opi_map_ver, + &opi->opi_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -4181,10 +4173,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) struct dtx_entry dte; int rc1; - dte.dte_xid = opi->opi_dti; - dte.dte_ver = version; + dte.dte_xid = opi->opi_dti; + dte.dte_ver = opi->opi_map_ver; dte.dte_refs = 1; - dte.dte_mbs = mbs; + dte.dte_mbs = mbs; + rc1 = dtx_abort(ioc.ioc_coc, &dte, opi->opi_epoch); if (rc1 != 0 && rc1 != -DER_NONEXIST) D_WARN("Failed to abort DTX "DF_DTI": "DF_RC"\n", @@ -5150,7 +5143,7 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) D_ASSERT(epoch != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, &oci->oci_map_ver, + rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, oci->oci_map_ver, NULL, dcsh->dcsh_mbs, false, true); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); @@ -5282,7 +5275,7 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) * that the DTX has been restarted with newer epoch. */ rc = obj_handle_resend(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch.oe_value, &oci->oci_map_ver, &flags, + &dcsh->dcsh_epoch.oe_value, oci->oci_map_ver, &flags, dcsh->dcsh_mbs, true, true); if (rc < 0) goto out; @@ -5865,7 +5858,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) goto out; } - version = ocpi->ocpi_map_ver; max_ver = ocpi->ocpi_map_ver; if (ocpi->ocpi_flags & ORF_DTX_SYNC) @@ -5876,15 +5868,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { again: - version = ocpi->ocpi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, - &version, &flags, odm->odm_mbs, leader, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, &flags, odm->odm_mbs, leader, false); if (rc < 0) goto out; if (rc == ORS_DONE) D_GOTO(out, rc = 0); - - dce->dce_ver = version; } epoch.oe_value = ocpi->ocpi_epoch; @@ -5907,10 +5896,10 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) &exec_arg.coll_cur); rc = dtx_leader_begin(ioc.ioc_vos_coh, &odm->odm_xid, &epoch, - dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, version, + dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, ocpi->ocpi_map_ver, &ocpi->ocpi_oid, NULL /* dti_cos */, 0 /* dti_cos_cnt */, - NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, - dtx_flags, odm->odm_mbs, dce, &dlh); + NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, dtx_flags, + odm->odm_mbs, dce, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for collective punch: "DF_RC"\n", DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); @@ -5955,9 +5944,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (max_ver < ioc.ioc_map_ver) max_ver = ioc.ioc_map_ver; - if (max_ver < version) - max_ver = version; - DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index 55f6651d131..3101d25aa14 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1450,7 +1450,7 @@ dv_dtx_commit_active_entry(daos_handle_t coh, struct dtx_id *dti) int dv_dtx_abort_active_entry(daos_handle_t coh, struct dtx_id *dti) { - return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX); + return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX, 0); } int diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index 57d80412c96..bafbc4a01b3 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -16,6 +16,8 @@ #include #include "vts_io.h" +#define VTS_DTX_VER 3 + static void vts_init_dte(struct dtx_entry *dte) { @@ -34,7 +36,7 @@ vts_init_dte(struct dtx_entry *dte) /** Use unique API so new UUID is generated even on same thread */ daos_dti_gen_unique(&dte->dte_xid); - dte->dte_ver = 1; + dte->dte_ver = VTS_DTX_VER; dte->dte_refs = 1; dte->dte_mbs = mbs; } @@ -334,8 +336,16 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) /* The update DTX is 'prepared'. */ vts_dtx_end(dth); + /* Abort with old epoch should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch - 1, VTS_DTX_VER); + assert_rc_equal(rc, -DER_NONEXIST); + + /* Abort with old version should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER - 1); + assert_rc_equal(rc, -DER_NONEXIST); + /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -367,7 +377,7 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) vts_dtx_end(dth); /* Aborted the punch DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -466,7 +476,7 @@ dtx_14(void **state) assert_memory_equal(update_buf, fetch_buf, UPDATE_BUF_SIZE); /* Committed DTX cannot be aborted. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -528,11 +538,11 @@ dtx_15(void **state) vts_dtx_end(dth); /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); /* Double aborted the DTX is harmless. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index ef5204c3503..d9f7a18197c 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -117,6 +117,9 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_version = DAE_VER(dae); dsp->dsp_dkey_hash = DAE_DKEY_HASH(dae); + if (unlikely(DAE_MBS_DSIZE(dae) == 0)) + goto add; + mbs = (struct dtx_memberships *)(dsp + 1); mbs->dm_tgt_cnt = DAE_TGT_CNT(dae); mbs->dm_grp_cnt = DAE_GRP_CNT(dae); @@ -128,6 +131,9 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, } else { struct umem_instance *umm; + D_ASSERTF(!UMOFF_IS_NULL(DAE_MBS_OFF(dae)), + "Unexpected empty MBS info for DTX " DF_DTI "\n", DP_DTI(&DAE_XID(dae))); + umm = vos_cont2umm(vos_hdl2cont(dth->dth_coh)); memcpy(mbs->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), DAE_MBS_DSIZE(dae)); @@ -136,6 +142,7 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_inline_mbs = 1; dsp->dsp_mbs = mbs; +add: d_list_add_tail(&dsp->dsp_link, &dth->dth_share_tbd_list); dth->dth_share_tbd_count++; @@ -1122,17 +1129,22 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) DAE_VER(dae) = dth->dth_ver; if (dth->dth_mbs != NULL) { - DAE_TGT_CNT(dae) = dth->dth_mbs->dm_tgt_cnt; - DAE_GRP_CNT(dae) = dth->dth_mbs->dm_grp_cnt; - DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; + D_ASSERTF(dth->dth_mbs->dm_data_size != 0, "Invalid MBS size for " DF_DTI "\n", + DP_DTI(&dth->dth_xid)); + + DAE_TGT_CNT(dae) = dth->dth_mbs->dm_tgt_cnt; + DAE_GRP_CNT(dae) = dth->dth_mbs->dm_grp_cnt; DAE_MBS_FLAGS(dae) = dth->dth_mbs->dm_flags; } else { - DAE_TGT_CNT(dae) = 1; - DAE_GRP_CNT(dae) = 1; - DAE_MBS_DSIZE(dae) = 0; + DAE_TGT_CNT(dae) = 1; + DAE_GRP_CNT(dae) = 1; DAE_MBS_FLAGS(dae) = 0; } + /* Will set DAE_MBS_DSIZE and DAE_MBS_OFF via vos_dtx_prepared(). */ + DAE_MBS_DSIZE(dae) = 0; + DAE_MBS_OFF(dae) = UMOFF_NULL; + /* Will be set as dbd::dbd_index via vos_dtx_prepared(). */ DAE_INDEX(dae) = DTX_INDEX_INVAL; dae->dae_dth = dth; @@ -1835,20 +1847,23 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) (dth->dth_modification_cnt > 0)) dth->dth_sync = 1; - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { - memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, - DAE_MBS_DSIZE(dae)); - } else { - rec_off = umem_zalloc(umm, DAE_MBS_DSIZE(dae)); - if (UMOFF_IS_NULL(rec_off)) { - D_ERROR("No space to store DTX mbs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - return -DER_NOSPACE; - } + if (dth->dth_mbs != NULL) { + if (dth->dth_mbs->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) { + memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + } else { + rec_off = umem_zalloc(umm, dth->dth_mbs->dm_data_size); + if (UMOFF_IS_NULL(rec_off)) { + D_ERROR("No space (%u) to store MBS for DTX " DF_DTI "\n", + dth->dth_mbs->dm_data_size, DP_DTI(&DAE_XID(dae))); + return -DER_NOSPACE; + } - memcpy(umem_off2ptr(umm, rec_off), - dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); - DAE_MBS_OFF(dae) = rec_off; + memcpy(umem_off2ptr(umm, rec_off), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + DAE_MBS_OFF(dae) = rec_off; + } + DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; } if (dae->dae_records != NULL) { @@ -1895,34 +1910,51 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) return rc; } -static struct dtx_memberships * -vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) +static int +vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae, + struct dtx_memberships **p_mbs) { struct dtx_handle *dth = dae->dae_dth; struct dtx_memberships *tmp; size_t size; - size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (dth != NULL) + size = sizeof(*tmp) + dth->dth_mbs->dm_data_size; + else + size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (unlikely(size == sizeof(*tmp))) + return -DER_NONEXIST; + D_ALLOC(tmp, size); if (tmp == NULL) - return NULL; + return -DER_NOMEM; - tmp->dm_tgt_cnt = DAE_TGT_CNT(dae); - tmp->dm_grp_cnt = DAE_GRP_CNT(dae); - tmp->dm_data_size = DAE_MBS_DSIZE(dae); - tmp->dm_flags = DAE_MBS_FLAGS(dae); + tmp->dm_tgt_cnt = DAE_TGT_CNT(dae); + tmp->dm_grp_cnt = DAE_GRP_CNT(dae); + tmp->dm_flags = DAE_MBS_FLAGS(dae); tmp->dm_dte_flags = DAE_FLAGS(dae); - /* The DTX is not prepared yet, copy the MBS from DTX handle. */ - if (dth != NULL) + /* The DTX maybe not prepared yet, copy the MBS from DTX handle. */ + if (dth != NULL) { + tmp->dm_data_size = dth->dth_mbs->dm_data_size; memcpy(tmp->dm_data, dth->dth_mbs->dm_data, tmp->dm_data_size); - else if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) - memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); - else - memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), - tmp->dm_data_size); + } else { + tmp->dm_data_size = DAE_MBS_DSIZE(dae); - return tmp; + if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) { + memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); + } else { + D_ASSERTF(!UMOFF_IS_NULL(DAE_MBS_OFF(dae)), + "Unexpected empty MBS info for DTX " DF_DTI "\n", + DP_DTI(&DAE_XID(dae))); + + memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), + tmp->dm_data_size); + } + } + + *p_mbs = tmp; + return 0; } int @@ -1994,6 +2026,9 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (dae->dae_dth != NULL) return -DER_INPROGRESS; + if (pm_ver != NULL) + *pm_ver = DAE_VER(dae); + if (epoch != NULL) { daos_epoch_t e = *epoch; @@ -2014,11 +2049,8 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (pm_ver == NULL) return DTX_ST_PREPARED; - if (*pm_ver <= cont->vc_dtx_resync_ver) { - if (!for_refresh) - *pm_ver = DAE_VER(dae); + if (*pm_ver <= cont->vc_dtx_resync_ver) return DTX_ST_PREPARED; - } /* * Before DTX resync completed, it is not sure whether related DTX is @@ -2053,12 +2085,11 @@ int vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, struct dtx_memberships **mbs) { - struct vos_container *cont; - struct dtx_memberships *tmp; - struct vos_dtx_act_ent *dae; - d_iov_t kiov; - d_iov_t riov; - int rc; + struct vos_container *cont; + struct vos_dtx_act_ent *dae; + d_iov_t kiov; + d_iov_t riov; + int rc; cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); @@ -2068,14 +2099,9 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); if (rc == 0) { dae = riov.iov_buf; - tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae); - if (tmp == NULL) { - rc = -DER_NOMEM; - } else { - if (oid != NULL) - *oid = DAE_OID(dae); - *mbs = tmp; - } + rc = vos_dtx_pack_mbs(vos_cont2umm(cont), dae, mbs); + if (rc == 0 && oid != NULL) + *oid = DAE_OID(dae); } else if (rc == -DER_NONEXIST) { rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov); if (rc == 0) @@ -2162,7 +2188,8 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership if (rc != 0) goto out; - dae_df->dae_mbs_off = UMOFF_NULL; + dae_df->dae_mbs_off = UMOFF_NULL; + dae_df->dae_mbs_dsize = 0; } if (new_inline) { @@ -2185,10 +2212,9 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership out: if (started) { if (rc == 0) { + memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); rc = umem_tx_commit(umm); D_ASSERTF(rc == 0, "local TX commit failure: %d\n", rc); - - memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); } else { rc = umem_tx_abort(umm, rc); } @@ -2346,11 +2372,13 @@ vos_dtx_post_handle(struct vos_container *cont, struct vos_dtx_cmt_ent **dces, int count, bool abort, bool rollback, bool keep_act) { - struct vos_tls *tls = vos_tls_get(false); - d_iov_t kiov; - int rc; - int i; - int j; + struct umem_instance *umm = vos_cont2umm(cont); + struct vos_tls *tls = vos_tls_get(false); + struct vos_dtx_act_ent_df *dae_df; + d_iov_t kiov; + int rc; + int i; + int j; D_ASSERT(daes != NULL); @@ -2451,7 +2479,9 @@ vos_dtx_post_handle(struct vos_container *cont, D_ASSERT(daes[i]->dae_preparing == 0); - daes[i]->dae_prepared = 0; + dae_df = umem_off2ptr(umm, daes[i]->dae_df_off); + memcpy(&daes[i]->dae_base, dae_df, sizeof(*dae_df)); + if (abort) { D_ASSERT(daes[i]->dae_committing == 0); @@ -2748,7 +2778,7 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, } int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version) { struct vos_container *cont; struct vos_dtx_act_ent *dae = NULL; @@ -2794,6 +2824,9 @@ vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) if (epoch != DAOS_EPOCH_MAX && epoch != DAE_EPOCH(dae)) D_GOTO(out, rc = -DER_NONEXIST); + if (version != 0 && version < DAE_VER(dae)) + D_GOTO(out, rc = -DER_NONEXIST); + if (unlikely(dae->dae_preparing)) { /* * NOTE: Abort in-preparing DTX entry. It may because the non-leader is some slow, @@ -2811,7 +2844,9 @@ vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) if (rc == -DER_ALREADY) rc = 0; else if (rc != -DER_NONEXIST) - DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, "Abort the DTX " DF_DTI, DP_DTI(dti)); + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Abort the DTX " DF_DTI ": with epoch " DF_U64 ", version %u", + DP_DTI(dti), epoch, version); return rc; }