diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index f4f81e6a2fa..e641a9ade18 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -38,12 +38,13 @@ */ struct dtx_coll_local_args { - uuid_t dcla_po_uuid; - uuid_t dcla_co_uuid; - struct dtx_id dcla_xid; - daos_epoch_t dcla_epoch; - uint32_t dcla_opc; - int *dcla_results; + uuid_t dcla_po_uuid; + uuid_t dcla_co_uuid; + struct dtx_id dcla_xid; + daos_epoch_t dcla_epoch; + uint32_t dcla_ver; + uint32_t dcla_opc; + int *dcla_results; }; void @@ -361,7 +362,7 @@ dtx_coll_local_one(void *args) rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, false, NULL); break; case DTX_COLL_ABORT: - rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch, dcla->dcla_ver); break; case DTX_COLL_CHECK: rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, false); @@ -397,7 +398,8 @@ dtx_coll_local_one(void *args) int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results) { struct dtx_coll_local_args dcla = { 0 }; struct dss_coll_ops coll_ops = { 0 }; @@ -410,9 +412,10 @@ dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epo uuid_copy(dcla.dcla_po_uuid, po_uuid); uuid_copy(dcla.dcla_co_uuid, co_uuid); - dcla.dcla_xid = *xid; + dcla.dcla_xid = *xid; dcla.dcla_epoch = epoch; - dcla.dcla_opc = opc; + dcla.dcla_ver = version; + dcla.dcla_opc = opc; coll_ops.co_func = dtx_coll_local_one; coll_args.ca_func_args = &dcla; diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 1298b4b350a..6930976d08f 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -303,7 +303,8 @@ int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results); /* clang-format on */ enum dtx_status_handle_result { diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index aef563bb8ad..0b131a368d7 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -36,11 +36,12 @@ struct dtx_resync_head { }; struct dtx_resync_args { - struct ds_cont_child *cont; - struct dtx_resync_head tables; - daos_epoch_t epoch; - uint32_t resync_version; - uint32_t discard_version; + struct ds_cont_child *cont; + struct dtx_resync_head tables; + daos_epoch_t epoch; + uint32_t resync_version; + uint32_t discard_version; + bool for_all; }; static inline void @@ -329,8 +330,8 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un else rc = dtx_abort(cont, dte, epoch); - D_DEBUG(DB_TRACE, "As new leader for DTX "DF_DTI", abort it (2): "DF_RC"\n", - DP_DTI(&dte->dte_xid), DP_RC(rc)); + DL_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, rc, + "As new leader for DTX " DF_DTI ", abort it (2)", DP_DTI(&dte->dte_xid)); if (rc < 0) { if (err != NULL) @@ -392,7 +393,8 @@ dtx_status_handle(struct dtx_resync_args *dra) again: d_list_for_each_entry_safe(dre, next, &drh->drh_list, dre_link) { if (dre->dre_dte.dte_ver < dra->discard_version) { - err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch); + err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch, + dre->dre_dte.dte_ver); if (err == -DER_NONEXIST) err = 0; if (err != 0) @@ -538,7 +540,17 @@ dtx_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) if (dra->resync_version == dra->discard_version) return 0; - /* Skip unprepared entry which version is at least not older than discard version. */ + /* + * The DTX version maybe refreshed via obj_handle_resend(). It means that either the + * DTX is generated against the latest pool map or related IO RPC is resent by client + * after pool map changed. Under both cases, the DTX resync that is triggered for pool + * map changes (@for_all is false) should not handle such DTX to avoid making conflict + * commit/abort decision (against regular IO handler) by race. + */ + if ((ent->ie_dtx_ver > dra->resync_version) || + (ent->ie_dtx_ver == dra->resync_version && !dra->for_all)) + return 0; + if (ent->ie_dtx_tgt_cnt == 0) return 0; @@ -665,6 +677,11 @@ dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool D_INIT_LIST_HEAD(&dra.tables.drh_list); dra.tables.drh_count = 0; + if (block) + dra.for_all = false; + else + dra.for_all = true; + /* * Trigger DTX reindex. That will avoid DTX_CHECK from others being blocked. * It is harmless even if (committed) DTX entries have already been re-indexed. diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 140c93f0b7a..e6be4616a38 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -961,7 +961,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) * to resend sometime later. */ if (epoch != 0) - rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch, dte->dte_ver); else rc1 = vos_dtx_set_flags(cont->sc_hdl, &dte->dte_xid, 1, DTE_CORRUPTED); if (rc1 > 0 || rc1 == -DER_NONEXIST) @@ -1233,7 +1233,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che d_list_del(&dsp->dsp_link); dtx_dsp_free(dsp); } else { - rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch, + dsp->dsp_version); D_ASSERT(rc1 != -DER_NO_PERM); if (rc1 == 0 || !for_io) { @@ -1653,8 +1654,8 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, - DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_COMMIT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1736,8 +1737,8 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_ABORT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1757,7 +1758,7 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } if (epoch != 0) - rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch); + rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch, dce->dce_ver); else rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED); if (rc2 > 0 || rc2 == -DER_NONEXIST) @@ -1793,8 +1794,8 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_CHECK, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 5b68645cf54..0a04c47f2e9 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -241,7 +241,7 @@ dtx_handler(crt_rpc_t *rpc) rc = vos_dtx_abort(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, - din->di_epoch); + din->di_epoch, din->di_version); } else { rc = vos_dtx_set_flags(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, @@ -464,7 +464,7 @@ dtx_coll_handler(crt_rpc_t *rpc) } len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch, - opc, bitmap_sz, bitmap, &results); + dci->dci_version, opc, bitmap_sz, bitmap, &results); if (len < 0) D_GOTO(out, rc = len); diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 730c2e88742..647009d388a 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -199,14 +199,15 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool keep_act /** * Abort the specified DTXs. * - * \param coh [IN] Container open handle. - * \param dti [IN] The DTX identifiers to be aborted. - * \param epoch [IN] The max epoch for the DTX to be aborted. + * \param coh [IN] Container open handle. + * \param dti [IN] The DTX identifiers to be aborted. + * \param epoch [IN] The max epoch for the DTX to be aborted. + * \param version [IN] The max version for the DTX to be aborted. * * \return Zero on success, negative value if error. */ int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch); +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version); /** * Discard the active DTX entry's records if invalid. diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index e3c4fcfdb95..da6d682306d 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2712,11 +2712,11 @@ enum obj_resend_status { }; static int -obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t *pm_ver, +obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t pm_ver, uint32_t *flags, struct dtx_memberships *mbs, bool leader, bool dist) { daos_epoch_t e; - uint32_t ver = *pm_ver; + uint32_t ver = pm_ver; int rc; if (!leader || dist || (flags != NULL && *flags & ORF_RESEND)) @@ -2733,16 +2733,13 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui /* For 'prepared' DTX, if pool map has been changed, then DTX membership maybe * changed also. Let's refresh it if necessary. */ - if (ver < *pm_ver) { - rc = vos_dtx_refresh_mbs(coh, dti, mbs, *pm_ver, leader); + if (ver < pm_ver) { + rc = vos_dtx_refresh_mbs(coh, dti, mbs, pm_ver, leader); if (rc < 0) goto out; if (rc > 0) rc = 0; - - if (leader && !dist) - *pm_ver = ver; } if (flags != NULL) { @@ -2770,7 +2767,7 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui D_GOTO(out, rc = -DER_INPROGRESS); /* Abort it if exist but with different epoch, then re-execute with new epoch. */ - rc = vos_dtx_abort(coh, dti, e); + rc = vos_dtx_abort(coh, dti, e, ver); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); /* Fall through */ @@ -2836,7 +2833,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) /* Handle resend. */ if (orw->orw_flags & ORF_RESEND) { rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, - &orw->orw_map_ver, NULL, mbs, false, false); + orw->orw_map_ver, NULL, mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -3019,13 +3016,12 @@ ds_obj_rw_handler(crt_rpc_t *rpc) struct daos_shard_tgt *tgts = NULL; struct dtx_id *dti_cos = NULL; struct obj_pool_metrics *opm; - int dti_cos_cnt; - uint32_t tgt_cnt; - uint32_t version = 0; - uint32_t max_ver = 0; - struct dtx_epoch epoch = {0}; - int rc; - bool need_abort = false; + int dti_cos_cnt; + uint32_t tgt_cnt; + uint32_t max_ver = 0; + struct dtx_epoch epoch = {0}; + int rc; + bool need_abort = false; D_ASSERT(orw != NULL); D_ASSERT(orwo != NULL); @@ -3136,7 +3132,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (rc != 0) D_GOTO(out, rc); - version = orw->orw_map_ver; max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { @@ -3155,9 +3150,8 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); again: - version = orw->orw_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, + orw->orw_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -3200,9 +3194,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, - version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, orw->orw_map_ver, + &orw->orw_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -3269,10 +3263,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) struct dtx_entry dte; int rc1; - dte.dte_xid = orw->orw_dti; - dte.dte_ver = version; + dte.dte_xid = orw->orw_dti; + dte.dte_ver = orw->orw_map_ver; dte.dte_refs = 1; - dte.dte_mbs = mbs; + dte.dte_mbs = mbs; + rc1 = dtx_abort(ioc.ioc_coc, &dte, orw->orw_epoch); if (rc1 != 0 && rc1 != -DER_NONEXIST) D_WARN("Failed to abort DTX "DF_DTI": "DF_RC"\n", @@ -3844,7 +3839,7 @@ obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) if (opi->opi_flags & ORF_RESEND) { rc = obj_handle_resend(p_ioc->ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, - &opi->opi_map_ver, NULL, otpa->mbs, false, false); + opi->opi_map_ver, NULL, otpa->mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -4007,15 +4002,14 @@ ds_obj_punch_handler(crt_rpc_t *rpc) struct dtx_memberships *mbs = NULL; struct daos_shard_tgt *tgts = NULL; struct dtx_id *dti_cos = NULL; - int dti_cos_cnt; - uint32_t tgt_cnt; - uint32_t flags = 0; - uint32_t dtx_flags = 0; - uint32_t version = 0; - uint32_t max_ver = 0; - struct dtx_epoch epoch; - int rc; - bool need_abort = false; + int dti_cos_cnt; + uint32_t tgt_cnt; + uint32_t flags = 0; + uint32_t dtx_flags = 0; + uint32_t max_ver = 0; + struct dtx_epoch epoch; + int rc; + bool need_abort = false; opi = crt_req_get(rpc); D_ASSERT(opi != NULL); @@ -4053,7 +4047,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) dtx_flags |= DTX_EPOCH_OWNER; } - version = opi->opi_map_ver; max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -4073,9 +4066,8 @@ ds_obj_punch_handler(crt_rpc_t *rpc) /* Handle resend. */ if (opi->opi_flags & ORF_RESEND) { again: - version = opi->opi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, + opi->opi_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -4118,9 +4110,9 @@ ds_obj_punch_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, opi->opi_map_ver, + &opi->opi_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -4181,10 +4173,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) struct dtx_entry dte; int rc1; - dte.dte_xid = opi->opi_dti; - dte.dte_ver = version; + dte.dte_xid = opi->opi_dti; + dte.dte_ver = opi->opi_map_ver; dte.dte_refs = 1; - dte.dte_mbs = mbs; + dte.dte_mbs = mbs; + rc1 = dtx_abort(ioc.ioc_coc, &dte, opi->opi_epoch); if (rc1 != 0 && rc1 != -DER_NONEXIST) D_WARN("Failed to abort DTX "DF_DTI": "DF_RC"\n", @@ -5150,7 +5143,7 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) D_ASSERT(epoch != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, &oci->oci_map_ver, + rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, oci->oci_map_ver, NULL, dcsh->dcsh_mbs, false, true); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); @@ -5282,7 +5275,7 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) * that the DTX has been restarted with newer epoch. */ rc = obj_handle_resend(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch.oe_value, &oci->oci_map_ver, &flags, + &dcsh->dcsh_epoch.oe_value, oci->oci_map_ver, &flags, dcsh->dcsh_mbs, true, true); if (rc < 0) goto out; @@ -5865,7 +5858,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) goto out; } - version = ocpi->ocpi_map_ver; max_ver = ocpi->ocpi_map_ver; if (ocpi->ocpi_flags & ORF_DTX_SYNC) @@ -5876,15 +5868,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { again: - version = ocpi->ocpi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, - &version, &flags, odm->odm_mbs, leader, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, &flags, odm->odm_mbs, leader, false); if (rc < 0) goto out; if (rc == ORS_DONE) D_GOTO(out, rc = 0); - - dce->dce_ver = version; } epoch.oe_value = ocpi->ocpi_epoch; @@ -5907,10 +5896,10 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) &exec_arg.coll_cur); rc = dtx_leader_begin(ioc.ioc_vos_coh, &odm->odm_xid, &epoch, - dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, version, + dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, ocpi->ocpi_map_ver, &ocpi->ocpi_oid, NULL /* dti_cos */, 0 /* dti_cos_cnt */, - NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, - dtx_flags, odm->odm_mbs, dce, &dlh); + NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, dtx_flags, + odm->odm_mbs, dce, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for collective punch: "DF_RC"\n", DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); @@ -5955,9 +5944,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (max_ver < ioc.ioc_map_ver) max_ver = ioc.ioc_map_ver; - if (max_ver < version) - max_ver = version; - DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index 55f6651d131..3101d25aa14 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1450,7 +1450,7 @@ dv_dtx_commit_active_entry(daos_handle_t coh, struct dtx_id *dti) int dv_dtx_abort_active_entry(daos_handle_t coh, struct dtx_id *dti) { - return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX); + return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX, 0); } int diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index 57d80412c96..bafbc4a01b3 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -16,6 +16,8 @@ #include #include "vts_io.h" +#define VTS_DTX_VER 3 + static void vts_init_dte(struct dtx_entry *dte) { @@ -34,7 +36,7 @@ vts_init_dte(struct dtx_entry *dte) /** Use unique API so new UUID is generated even on same thread */ daos_dti_gen_unique(&dte->dte_xid); - dte->dte_ver = 1; + dte->dte_ver = VTS_DTX_VER; dte->dte_refs = 1; dte->dte_mbs = mbs; } @@ -334,8 +336,16 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) /* The update DTX is 'prepared'. */ vts_dtx_end(dth); + /* Abort with old epoch should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch - 1, VTS_DTX_VER); + assert_rc_equal(rc, -DER_NONEXIST); + + /* Abort with old version should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER - 1); + assert_rc_equal(rc, -DER_NONEXIST); + /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -367,7 +377,7 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) vts_dtx_end(dth); /* Aborted the punch DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -466,7 +476,7 @@ dtx_14(void **state) assert_memory_equal(update_buf, fetch_buf, UPDATE_BUF_SIZE); /* Committed DTX cannot be aborted. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -528,11 +538,11 @@ dtx_15(void **state) vts_dtx_end(dth); /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); /* Double aborted the DTX is harmless. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index ef5204c3503..d9f7a18197c 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -117,6 +117,9 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_version = DAE_VER(dae); dsp->dsp_dkey_hash = DAE_DKEY_HASH(dae); + if (unlikely(DAE_MBS_DSIZE(dae) == 0)) + goto add; + mbs = (struct dtx_memberships *)(dsp + 1); mbs->dm_tgt_cnt = DAE_TGT_CNT(dae); mbs->dm_grp_cnt = DAE_GRP_CNT(dae); @@ -128,6 +131,9 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, } else { struct umem_instance *umm; + D_ASSERTF(!UMOFF_IS_NULL(DAE_MBS_OFF(dae)), + "Unexpected empty MBS info for DTX " DF_DTI "\n", DP_DTI(&DAE_XID(dae))); + umm = vos_cont2umm(vos_hdl2cont(dth->dth_coh)); memcpy(mbs->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), DAE_MBS_DSIZE(dae)); @@ -136,6 +142,7 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_inline_mbs = 1; dsp->dsp_mbs = mbs; +add: d_list_add_tail(&dsp->dsp_link, &dth->dth_share_tbd_list); dth->dth_share_tbd_count++; @@ -1122,17 +1129,22 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) DAE_VER(dae) = dth->dth_ver; if (dth->dth_mbs != NULL) { - DAE_TGT_CNT(dae) = dth->dth_mbs->dm_tgt_cnt; - DAE_GRP_CNT(dae) = dth->dth_mbs->dm_grp_cnt; - DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; + D_ASSERTF(dth->dth_mbs->dm_data_size != 0, "Invalid MBS size for " DF_DTI "\n", + DP_DTI(&dth->dth_xid)); + + DAE_TGT_CNT(dae) = dth->dth_mbs->dm_tgt_cnt; + DAE_GRP_CNT(dae) = dth->dth_mbs->dm_grp_cnt; DAE_MBS_FLAGS(dae) = dth->dth_mbs->dm_flags; } else { - DAE_TGT_CNT(dae) = 1; - DAE_GRP_CNT(dae) = 1; - DAE_MBS_DSIZE(dae) = 0; + DAE_TGT_CNT(dae) = 1; + DAE_GRP_CNT(dae) = 1; DAE_MBS_FLAGS(dae) = 0; } + /* Will set DAE_MBS_DSIZE and DAE_MBS_OFF via vos_dtx_prepared(). */ + DAE_MBS_DSIZE(dae) = 0; + DAE_MBS_OFF(dae) = UMOFF_NULL; + /* Will be set as dbd::dbd_index via vos_dtx_prepared(). */ DAE_INDEX(dae) = DTX_INDEX_INVAL; dae->dae_dth = dth; @@ -1835,20 +1847,23 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) (dth->dth_modification_cnt > 0)) dth->dth_sync = 1; - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { - memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, - DAE_MBS_DSIZE(dae)); - } else { - rec_off = umem_zalloc(umm, DAE_MBS_DSIZE(dae)); - if (UMOFF_IS_NULL(rec_off)) { - D_ERROR("No space to store DTX mbs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - return -DER_NOSPACE; - } + if (dth->dth_mbs != NULL) { + if (dth->dth_mbs->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) { + memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + } else { + rec_off = umem_zalloc(umm, dth->dth_mbs->dm_data_size); + if (UMOFF_IS_NULL(rec_off)) { + D_ERROR("No space (%u) to store MBS for DTX " DF_DTI "\n", + dth->dth_mbs->dm_data_size, DP_DTI(&DAE_XID(dae))); + return -DER_NOSPACE; + } - memcpy(umem_off2ptr(umm, rec_off), - dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); - DAE_MBS_OFF(dae) = rec_off; + memcpy(umem_off2ptr(umm, rec_off), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + DAE_MBS_OFF(dae) = rec_off; + } + DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; } if (dae->dae_records != NULL) { @@ -1895,34 +1910,51 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) return rc; } -static struct dtx_memberships * -vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) +static int +vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae, + struct dtx_memberships **p_mbs) { struct dtx_handle *dth = dae->dae_dth; struct dtx_memberships *tmp; size_t size; - size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (dth != NULL) + size = sizeof(*tmp) + dth->dth_mbs->dm_data_size; + else + size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (unlikely(size == sizeof(*tmp))) + return -DER_NONEXIST; + D_ALLOC(tmp, size); if (tmp == NULL) - return NULL; + return -DER_NOMEM; - tmp->dm_tgt_cnt = DAE_TGT_CNT(dae); - tmp->dm_grp_cnt = DAE_GRP_CNT(dae); - tmp->dm_data_size = DAE_MBS_DSIZE(dae); - tmp->dm_flags = DAE_MBS_FLAGS(dae); + tmp->dm_tgt_cnt = DAE_TGT_CNT(dae); + tmp->dm_grp_cnt = DAE_GRP_CNT(dae); + tmp->dm_flags = DAE_MBS_FLAGS(dae); tmp->dm_dte_flags = DAE_FLAGS(dae); - /* The DTX is not prepared yet, copy the MBS from DTX handle. */ - if (dth != NULL) + /* The DTX maybe not prepared yet, copy the MBS from DTX handle. */ + if (dth != NULL) { + tmp->dm_data_size = dth->dth_mbs->dm_data_size; memcpy(tmp->dm_data, dth->dth_mbs->dm_data, tmp->dm_data_size); - else if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) - memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); - else - memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), - tmp->dm_data_size); + } else { + tmp->dm_data_size = DAE_MBS_DSIZE(dae); - return tmp; + if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) { + memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); + } else { + D_ASSERTF(!UMOFF_IS_NULL(DAE_MBS_OFF(dae)), + "Unexpected empty MBS info for DTX " DF_DTI "\n", + DP_DTI(&DAE_XID(dae))); + + memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), + tmp->dm_data_size); + } + } + + *p_mbs = tmp; + return 0; } int @@ -1994,6 +2026,9 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (dae->dae_dth != NULL) return -DER_INPROGRESS; + if (pm_ver != NULL) + *pm_ver = DAE_VER(dae); + if (epoch != NULL) { daos_epoch_t e = *epoch; @@ -2014,11 +2049,8 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (pm_ver == NULL) return DTX_ST_PREPARED; - if (*pm_ver <= cont->vc_dtx_resync_ver) { - if (!for_refresh) - *pm_ver = DAE_VER(dae); + if (*pm_ver <= cont->vc_dtx_resync_ver) return DTX_ST_PREPARED; - } /* * Before DTX resync completed, it is not sure whether related DTX is @@ -2053,12 +2085,11 @@ int vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, struct dtx_memberships **mbs) { - struct vos_container *cont; - struct dtx_memberships *tmp; - struct vos_dtx_act_ent *dae; - d_iov_t kiov; - d_iov_t riov; - int rc; + struct vos_container *cont; + struct vos_dtx_act_ent *dae; + d_iov_t kiov; + d_iov_t riov; + int rc; cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); @@ -2068,14 +2099,9 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); if (rc == 0) { dae = riov.iov_buf; - tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae); - if (tmp == NULL) { - rc = -DER_NOMEM; - } else { - if (oid != NULL) - *oid = DAE_OID(dae); - *mbs = tmp; - } + rc = vos_dtx_pack_mbs(vos_cont2umm(cont), dae, mbs); + if (rc == 0 && oid != NULL) + *oid = DAE_OID(dae); } else if (rc == -DER_NONEXIST) { rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov); if (rc == 0) @@ -2162,7 +2188,8 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership if (rc != 0) goto out; - dae_df->dae_mbs_off = UMOFF_NULL; + dae_df->dae_mbs_off = UMOFF_NULL; + dae_df->dae_mbs_dsize = 0; } if (new_inline) { @@ -2185,10 +2212,9 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership out: if (started) { if (rc == 0) { + memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); rc = umem_tx_commit(umm); D_ASSERTF(rc == 0, "local TX commit failure: %d\n", rc); - - memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); } else { rc = umem_tx_abort(umm, rc); } @@ -2346,11 +2372,13 @@ vos_dtx_post_handle(struct vos_container *cont, struct vos_dtx_cmt_ent **dces, int count, bool abort, bool rollback, bool keep_act) { - struct vos_tls *tls = vos_tls_get(false); - d_iov_t kiov; - int rc; - int i; - int j; + struct umem_instance *umm = vos_cont2umm(cont); + struct vos_tls *tls = vos_tls_get(false); + struct vos_dtx_act_ent_df *dae_df; + d_iov_t kiov; + int rc; + int i; + int j; D_ASSERT(daes != NULL); @@ -2451,7 +2479,9 @@ vos_dtx_post_handle(struct vos_container *cont, D_ASSERT(daes[i]->dae_preparing == 0); - daes[i]->dae_prepared = 0; + dae_df = umem_off2ptr(umm, daes[i]->dae_df_off); + memcpy(&daes[i]->dae_base, dae_df, sizeof(*dae_df)); + if (abort) { D_ASSERT(daes[i]->dae_committing == 0); @@ -2748,7 +2778,7 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, } int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version) { struct vos_container *cont; struct vos_dtx_act_ent *dae = NULL; @@ -2794,6 +2824,9 @@ vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) if (epoch != DAOS_EPOCH_MAX && epoch != DAE_EPOCH(dae)) D_GOTO(out, rc = -DER_NONEXIST); + if (version != 0 && version < DAE_VER(dae)) + D_GOTO(out, rc = -DER_NONEXIST); + if (unlikely(dae->dae_preparing)) { /* * NOTE: Abort in-preparing DTX entry. It may because the non-leader is some slow, @@ -2811,7 +2844,9 @@ vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) if (rc == -DER_ALREADY) rc = 0; else if (rc != -DER_NONEXIST) - DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, "Abort the DTX " DF_DTI, DP_DTI(dti)); + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Abort the DTX " DF_DTI ": with epoch " DF_U64 ", version %u", + DP_DTI(dti), epoch, version); return rc; }