From 63ae7a8d5931130e999ac7b3cfbcbcc6488f10a6 Mon Sep 17 00:00:00 2001 From: Joseph Moore <26410038+jgmoore-or@users.noreply.github.com> Date: Wed, 17 Dec 2025 21:32:26 -0700 Subject: [PATCH 01/11] DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS signal. (#17268) * DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS signal. Signed-off-by: Joseph Moore --- src/engine/init.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/engine/init.c b/src/engine/init.c index dd70dfe721e..5611d1c8b09 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -1204,6 +1204,7 @@ int main(int argc, char **argv) { sigset_t set; + bool exit_failure = false; int sig; int rc; @@ -1238,6 +1239,7 @@ main(int argc, char **argv) /** wait for shutdown signal */ sigemptyset(&set); + sigaddset(&set, SIGBUS); sigaddset(&set, SIGINT); sigaddset(&set, SIGTERM); sigaddset(&set, SIGUSR1); @@ -1248,7 +1250,6 @@ main(int argc, char **argv) D_ERROR("failed to wait for signals: %d\n", rc); break; } - /* open specific file to dump ABT infos and ULTs stacks */ if (sig == SIGUSR1 || sig == SIGUSR2) { struct timeval tv; @@ -1322,12 +1323,18 @@ main(int argc, char **argv) continue; } - /* SIGINT/SIGTERM cause server shutdown */ + /* Log error for SIGBUS occurrence */ + if (sig == SIGBUS) { + D_ERROR("SIGBUS signal received; proceeding to shutdown.\n"); + exit_failure = true; + } + + /* SIGINT/SIGTERM/SIGBUS cause server shutdown */ break; } /** shutdown */ server_fini(true); - exit(EXIT_SUCCESS); + exit(exit_failure ? EXIT_FAILURE : EXIT_SUCCESS); } From 7da4dd7b05b548b3b5737bab64a118d89d2e8cc2 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 16 Jan 2026 16:51:19 +0000 Subject: [PATCH 02/11] DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS signal. 
Allow-unstable-test: true Signed-off-by: Joseph Moore From 101717a2da66dda2dba262f97316d52017365186 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 5 Feb 2026 15:29:00 +0000 Subject: [PATCH 03/11] DAOS-17381 engine: Terminate engine process upon receipt of SIGBUS signal. Signed-off-by: Joseph Moore From efb4239a7af9f0ac583f560d5b1958921c581594 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 12 Feb 2026 15:14:20 +0000 Subject: [PATCH 04/11] DAOS-17391 engine: Terminate engine process upon receipt of SIGBUS signal. Signed-off-by: Joseph Moore --- src/engine/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/init.c b/src/engine/init.c index 5611d1c8b09..2ad123eb161 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ From caf2e988daf1a86f13b617ec44c88d4c6aa4e972 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 5 May 2026 17:10:15 +0000 Subject: [PATCH 05/11] DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS signal. Skip-build-el8-gcc: true Signed-off-by: Joseph Moore From 901d47cbd453c6f5ac36c18526db1d5f1a561c1e Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 5 May 2026 18:43:01 +0000 Subject: [PATCH 06/11] DAOS-17931 engine: Terminate engine process upon receipt of SIGBUS signal. Signed-off-by: Joseph Moore From d5c324b88a62f2369236378a4b4f0cb01dd0255d Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Mon, 25 May 2026 17:57:59 +0800 Subject: [PATCH 07/11] DAOS-19001 vos: set dth_need_validation when evict active DTX - b26 There is race condition between IO RPC handler and DTX resync that may commit or abort the DTX when related DTX leader waiting for non-leader participants. 
To properly handle such a case, whenever an active DTX entry is evicted from the cache, regardless of whether it is for commit or abort, we need to set dtx_handle::dth_need_validation to notify the DTX owner about the event. Signed-off-by: Fan Yong --- src/vos/vos_dtx.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index e08d63fd673..73a54c70ee1 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -28,18 +28,19 @@ #define DTX_UMOFF_TYPES (DTX_UMOFF_ILOG | DTX_UMOFF_SVT | DTX_UMOFF_EVT) #define DTX_INDEX_INVAL (int32_t)(-1) -#define dtx_evict_lid(cont, dae) \ - do { \ - if (dae->dae_dth != NULL && dae->dae_dth->dth_ent != NULL) { \ - D_ASSERT(dae->dae_dth->dth_ent == dae); \ - dae->dae_dth->dth_ent = NULL; \ - } \ - D_DEBUG(DB_IO, "Evicting DTX "DF_DTI": lid=%x\n", \ - DP_DTI(&DAE_XID(dae)), DAE_LID(dae)); \ - d_list_del_init(&dae->dae_link); \ - lrua_evictx(cont->vc_dtx_array, \ - (DAE_LID(dae) & DTX_LID_SOLO_MASK) - DTX_LID_RESERVED, \ - DAE_EPOCH(dae)); \ +#define dtx_evict_lid(cont, dae) \ + do { \ + if (dae->dae_dth != NULL && dae->dae_dth->dth_ent != NULL) { \ + D_ASSERT(dae->dae_dth->dth_ent == dae); \ + dae->dae_dth->dth_need_validation = 1; \ + dae->dae_dth->dth_ent = NULL; \ + } \ + D_DEBUG(DB_IO, "Evicting DTX " DF_DTI ": lid = %x\n", DP_DTI(&DAE_XID(dae)), \ + DAE_LID(dae)); \ + d_list_del_init(&dae->dae_link); \ + lrua_evictx(cont->vc_dtx_array, \ + (DAE_LID(dae) & DTX_LID_SOLO_MASK) - DTX_LID_RESERVED, \ + DAE_EPOCH(dae)); \ } while (0) bool vos_skip_old_partial_dtx; From c4fa87fc9873bbc162fae44595f2fcd12cc41426 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Wed, 3 Jun 2026 17:35:16 +0800 
Subject: [PATCH 08/11] DAOS-19059 vos: cache vos object after DTX commit - b26 It is unnecessary to evict the vos object from the cache after the related DTX is committed; otherwise, other concurrent modifications against the same object shard may be required to retry. Signed-off-by: Fan Yong --- src/vos/vos_dtx.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index e08d63fd673..8eb6a09ea7d 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2354,6 +2354,8 @@ vos_dtx_post_handle(struct vos_container *cont, } for (i = 0; i < count; i++) { + struct vos_dtx_act_ent *dae = NULL; + if (daes[i] == NULL) continue; @@ -2370,9 +2372,18 @@ vos_dtx_post_handle(struct vos_container *cont, } d_iov_set(&kiov, &DAE_XID(daes[i]), sizeof(DAE_XID(daes[i]))); - rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_EQ, - &kiov, NULL); + /* + * For abort case, set @args as NULL, then related vos object will be evicted from + * cache via dbtree_delete(). + */ + rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_EQ, &kiov, + abort ? 
NULL : &dae); if (rc == 0 || rc == -DER_NONEXIST) { + if (dae != NULL) { + D_ASSERT(dae == daes[i]); + dtx_act_ent_cleanup(cont, dae, NULL, false, false); + } + dtx_evict_lid(cont, daes[i]); } else { /* The DTX entry has been committed or aborted, but we From 2f6af62aead982dccb96344690de34760d89ee7c Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 8 Jun 2026 15:04:39 +0200 Subject: [PATCH 09/11] DAOS-19088 ci: Pin mercury to latest 15.5 version The latest available leap 15.5 mercury RPM has version 2.4.1-2. This version must be used for a proper DAOS build on leap 15.5. Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-test-el9: true Skip-func-test-leap15: false Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true Skip-fault-injection-test: true Skip-NLT: true --- utils/rpms/daos.spec | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index d52fa5ba75e..5ccb8a2a420 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -11,6 +11,7 @@ %global daos_build_args client test %endif %global mercury_version 2.4.1 +%global mercury_version_max 2.4.1-2 %global libfabric_version 1.20 %global argobots_version 1.2-3 %global __python %{__python3} @@ -37,6 +38,7 @@ BuildRequires: scons >= 2.4 %endif BuildRequires: libfabric-devel >= %{libfabric_version} BuildRequires: mercury-devel >= %{mercury_version} +BuildRequires: mercury-devel <= %{mercury_version_max} BuildRequires: gcc-c++ %if (0%{?rhel} >= 8) %global openmpi openmpi @@ -133,6 +135,7 @@ Requires: openssl # of mercury, at which time the autoprov shared library version should # suffice Requires: mercury-libfabric >= %{mercury_version} +Requires: mercury-libfabric <= %{mercury_version_max} %description @@ -164,6 +167,7 @@ Requires: libpmemobj >= 2.1.3-2 %endif Requires: libfabric >= %{libfabric_version} 
Requires: mercury-libfabric >= %{mercury_version} +Requires: mercury-libfabric <= %{mercury_version_max} Requires(post): /sbin/ldconfig Requires(postun): /sbin/ldconfig Requires: numactl @@ -185,6 +189,7 @@ This package contains DAOS administrative tools (e.g. dmg). Summary: The DAOS client Requires: %{name}%{?_isa} = %{version}-%{release} Requires: mercury-libfabric >= %{mercury_version} +Requires: mercury-libfabric <= %{mercury_version_max} Requires: libfabric >= %{libfabric_version} %if (0%{?suse_version} >= 1500) Requires: libfabric1 >= %{libfabric_version} From fb8e7624054cb4f96dca4d8189497473b5e8415d Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 8 Jun 2026 19:58:03 +0200 Subject: [PATCH 10/11] Fix: narrow solution to only mercury-devel RPM Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-test-el9: true Skip-func-test-leap15: false Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true Skip-fault-injection-test: true Skip-NLT: true --- utils/rpms/daos.spec | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 5ccb8a2a420..f35904cb6eb 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -135,7 +135,7 @@ Requires: openssl # of mercury, at which time the autoprov shared library version should # suffice Requires: mercury-libfabric >= %{mercury_version} -Requires: mercury-libfabric <= %{mercury_version_max} + %description @@ -167,7 +167,6 @@ Requires: libpmemobj >= 2.1.3-2 %endif Requires: libfabric >= %{libfabric_version} Requires: mercury-libfabric >= %{mercury_version} -Requires: mercury-libfabric <= %{mercury_version_max} Requires(post): /sbin/ldconfig Requires(postun): /sbin/ldconfig Requires: numactl @@ -189,7 +188,6 @@ This package contains DAOS administrative tools (e.g. dmg). 
Summary: The DAOS client Requires: %{name}%{?_isa} = %{version}-%{release} Requires: mercury-libfabric >= %{mercury_version} -Requires: mercury-libfabric <= %{mercury_version_max} Requires: libfabric >= %{libfabric_version} %if (0%{?suse_version} >= 1500) Requires: libfabric1 >= %{libfabric_version} From 921dabd25e62d78272c6cf8a73f7c1feee5e693e Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 9 Jun 2026 12:54:47 +0800 Subject: [PATCH 11/11] DAOS-19036 dtx: handle DTX race issues - b26 Mainly including the following fixes: 1. When the DTX leader switches, it is possible that the old DTX leader wanted to abort a DTX but did not complete the abort before its eviction. The new DTX leader may then re-execute the related modification successfully and try to commit that DTX. Without control, in-flight DTX ABORT RPCs from the old DTX leader may abort the DTX that is about to be committed by the new DTX leader, breaking DTX semantics. The patch adds a @version parameter to DTX abort: when the new DTX leader handles an RPC resent by the client, the related DTX version is refreshed if the DTX has been prepared by the old DTX leader; whenever a DTX is aborted locally, the logic compares the version from the ABORT request with the related DTX version and skips stale ABORT RPCs. 2. vos_dtx_load_mbs() may be triggered before the related DTX is prepared locally. In that case, the related MBS information is empty. We need to handle this case to avoid a segmentation fault. 3. Handle the race between DTX resync and the IO handler for resent RPCs. 
Skip-build-leap15-rpm: true Skip-func-test-leap15: true Signed-off-by: Fan Yong --- src/dtx/dtx_coll.c | 9 ++- src/dtx/dtx_internal.h | 5 +- src/dtx/dtx_resync.c | 19 ++++++- src/dtx/dtx_rpc.c | 19 ++++--- src/dtx/dtx_srv.c | 6 +- src/include/daos_srv/vos.h | 5 +- src/object/srv_obj.c | 68 +++++++++------------- src/utils/ddb/ddb_vos.c | 2 +- src/vos/tests/vts_dtx.c | 23 ++++++-- src/vos/vos_dtx.c | 113 ++++++++++++++++++++++--------------- 10 files changed, 152 insertions(+), 117 deletions(-) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 106205e0c5b..06af0a09a3c 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -43,6 +43,7 @@ struct dtx_coll_local_args { struct dtx_id dcla_xid; daos_epoch_t dcla_epoch; uint32_t dcla_opc; + uint32_t dcla_ver; int *dcla_results; }; @@ -368,7 +369,7 @@ dtx_coll_local_one(void *args) rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, false, NULL); break; case DTX_COLL_ABORT: - rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch, dcla->dcla_ver); break; case DTX_COLL_CHECK: rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, false); @@ -404,7 +405,8 @@ dtx_coll_local_one(void *args) int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results) { struct dtx_coll_local_args dcla = { 0 }; struct dss_coll_ops coll_ops = { 0 }; @@ -419,6 +421,7 @@ dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epo uuid_copy(dcla.dcla_co_uuid, co_uuid); 
dcla.dcla_xid = *xid; dcla.dcla_epoch = epoch; + dcla.dcla_ver = version; dcla.dcla_opc = opc; coll_ops.co_func = dtx_coll_local_one; diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 9f25c3b19a6..4bdd3aa1a2f 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -303,7 +303,8 @@ int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, - uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); + uint32_t version, uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, + int **p_results); /* clang-format on */ enum dtx_status_handle_result { diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 444e5a143e0..8bcf0de4b98 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -41,6 +41,7 @@ struct dtx_resync_args { daos_epoch_t epoch; uint32_t resync_version; uint32_t discard_version; + bool for_all; }; static inline void @@ -391,7 +392,8 @@ dtx_status_handle(struct dtx_resync_args *dra) d_list_for_each_entry_safe(dre, next, &drh->drh_list, dre_link) { if (dre->dre_dte.dte_ver < dra->discard_version) { - err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch); + err = vos_dtx_abort(cont->sc_hdl, &dre->dre_xid, dre->dre_epoch, + dre->dre_dte.dte_ver); if (err == -DER_NONEXIST) err = 0; if (err != 0) @@ -532,7 +534,17 @@ dtx_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) if (dra->resync_version == dra->discard_version) return 0; - /* Skip unprepared entry which version is at least not older than discard version. */ + /* + * The DTX version maybe refreshed via obj_handle_resend(). It means that either the + * DTX is generated against the latest pool map or related IO RPC is resent by client + * after pool map changed. Under both cases, the DTX resync that is triggered for pool + * map changes (@for_all is false) should not handle such DTX to avoid making conflict + * commit/abort decision (against regular IO handler) by race. + */ + if ((ent->ie_dtx_ver > dra->resync_version) || + (ent->ie_dtx_ver == dra->resync_version && !dra->for_all)) + return 0; + if (ent->ie_dtx_tgt_cnt == 0) return 0; @@ -658,6 +670,7 @@ dtx_resync(daos_handle_t po_hdl, struct ds_cont_child *cont, uint32_t ver, bool dra.epoch = d_hlc_get(); D_INIT_LIST_HEAD(&dra.tables.drh_list); dra.tables.drh_count = 0; + dra.for_all = !block; /* * Trigger DTX reindex. That will avoid DTX_CHECK from others being blocked. 
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index a7cb78ae373..72348cd9d73 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -955,7 +955,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) * to resend sometime later. */ if (epoch != 0) - rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dte->dte_xid, epoch, dte->dte_ver); else rc1 = vos_dtx_set_flags(cont->sc_hdl, &dte->dte_xid, 1, DTE_CORRUPTED); if (rc1 > 0 || rc1 == -DER_NONEXIST) @@ -1227,7 +1227,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che d_list_del(&dsp->dsp_link); dtx_dsp_free(dsp); } else { - rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch); + rc1 = vos_dtx_abort(cont->sc_hdl, &dsp->dsp_xid, dsp->dsp_epoch, + dsp->dsp_version); D_ASSERT(rc1 != -DER_NO_PERM); if (rc1 == 0 || !for_io) { @@ -1643,8 +1644,8 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, - DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_COMMIT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1726,8 +1727,8 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_ABORT, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { @@ -1747,7 +1748,7 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } if (epoch != 0) - rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, 
epoch); + rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch, dce->dce_ver); else rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED); if (rc2 > 0 || rc2 == -DER_NONEXIST) @@ -1783,8 +1784,8 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_bitmap != NULL) { len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, - DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap, - &results); + dce->dce_ver, DTX_COLL_CHECK, dce->dce_bitmap_sz, + dce->dce_bitmap, &results); if (len < 0) { rc1 = len; } else { diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index d48ab663d13..8ca87cebc38 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -241,7 +241,7 @@ dtx_handler(crt_rpc_t *rpc) rc = vos_dtx_abort(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, - din->di_epoch); + din->di_epoch, din->di_version); } else { rc = vos_dtx_set_flags(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, @@ -464,7 +464,7 @@ dtx_coll_handler(crt_rpc_t *rpc) } len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch, - opc, bitmap_sz, bitmap, &results); + dci->dci_version, opc, bitmap_sz, bitmap, &results); if (len < 0) D_GOTO(out, rc = len); diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 714e8c11799..019d6521bce 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -201,11 +201,12 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool keep_act * \param coh [IN] Container open handle. * \param dti [IN] The DTX identifiers to be aborted. * \param epoch [IN] The max epoch for the DTX to be aborted. + * \param version [IN] The max version for the DTX to be aborted. * * \return Zero on success, negative value if error. */ int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch); +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version); /** * Discard the active DTX entry's records if invalid. diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a477da31be2..f0b989791c6 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2710,11 +2710,11 @@ enum obj_resend_status { }; static int -obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t *pm_ver, +obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t pm_ver, uint32_t *flags, struct dtx_memberships *mbs, bool leader, bool dist) { daos_epoch_t e; - uint32_t ver = *pm_ver; + uint32_t ver = pm_ver; int rc; if (!leader || dist || (flags != NULL && *flags & ORF_RESEND)) @@ -2731,16 +2731,13 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui /* For 'prepared' DTX, if pool map has been changed, then DTX membership maybe * changed also. Let's refresh it if necessary. 
*/ - if (ver < *pm_ver) { - rc = vos_dtx_refresh_mbs(coh, dti, mbs, *pm_ver, leader); + if (ver < pm_ver) { + rc = vos_dtx_refresh_mbs(coh, dti, mbs, pm_ver, leader); if (rc < 0) goto out; if (rc > 0) rc = 0; - - if (leader && !dist) - *pm_ver = ver; } if (flags != NULL) { @@ -2768,7 +2765,7 @@ obj_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, ui D_GOTO(out, rc = -DER_INPROGRESS); /* Abort it if exist but with different epoch, then re-execute with new epoch. */ - rc = vos_dtx_abort(coh, dti, e); + rc = vos_dtx_abort(coh, dti, e, ver); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); /* Fall through */ @@ -2834,7 +2831,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) /* Handle resend. */ if (orw->orw_flags & ORF_RESEND) { rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, - &orw->orw_map_ver, NULL, mbs, false, false); + orw->orw_map_ver, NULL, mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -3019,7 +3016,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) struct obj_pool_metrics *opm; int dti_cos_cnt; uint32_t tgt_cnt; - uint32_t version = 0; uint32_t max_ver = 0; struct dtx_epoch epoch = {0}; int rc; @@ -3132,7 +3128,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (rc != 0) D_GOTO(out, rc); - version = orw->orw_map_ver; max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { @@ -3151,9 +3146,8 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); again: - version = orw->orw_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &orw->orw_epoch, + orw->orw_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -3196,9 +3190,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, - version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, 
NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, orw->orw_map_ver, + &orw->orw_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -3266,7 +3260,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) int rc1; dte.dte_xid = orw->orw_dti; - dte.dte_ver = version; + dte.dte_ver = orw->orw_map_ver;; dte.dte_refs = 1; dte.dte_mbs = mbs; rc1 = dtx_abort(ioc.ioc_coc, &dte, orw->orw_epoch); @@ -3840,7 +3834,7 @@ obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) if (opi->opi_flags & ORF_RESEND) { rc = obj_handle_resend(p_ioc->ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, - &opi->opi_map_ver, NULL, otpa->mbs, false, false); + opi->opi_map_ver, NULL, otpa->mbs, false, false); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); } @@ -4007,7 +4001,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) uint32_t tgt_cnt; uint32_t flags = 0; uint32_t dtx_flags = 0; - uint32_t version = 0; uint32_t max_ver = 0; struct dtx_epoch epoch; int rc; @@ -4047,7 +4040,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (rc == PE_OK_LOCAL) opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN; - version = opi->opi_map_ver; max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -4067,9 +4059,8 @@ ds_obj_punch_handler(crt_rpc_t *rpc) /* Handle resend. 
*/ if (opi->opi_flags & ORF_RESEND) { again: - version = opi->opi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, &version, - &flags, mbs, true, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &opi->opi_epoch, + opi->opi_map_ver, &flags, mbs, true, false); if (rc < 0) goto out; if (rc == ORS_DONE) @@ -4112,9 +4103,9 @@ ds_obj_punch_handler(crt_rpc_t *rpc) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); + rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, opi->opi_map_ver, + &opi->opi_oid, dti_cos, dti_cos_cnt, tgts, tgt_cnt, dtx_flags, mbs, + NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -4176,7 +4167,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) int rc1; dte.dte_xid = opi->opi_dti; - dte.dte_ver = version; + dte.dte_ver = opi->opi_map_ver; dte.dte_refs = 1; dte.dte_mbs = mbs; rc1 = dtx_abort(ioc.ioc_coc, &dte, opi->opi_epoch); @@ -5097,7 +5088,7 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) D_ASSERT(epoch != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, &oci->oci_map_ver, + rc = obj_handle_resend(ioc->ioc_vos_coh, &dcsh->dcsh_xid, &epoch, oci->oci_map_ver, NULL, dcsh->dcsh_mbs, false, true); if (rc != 0) D_GOTO(out, rc = (rc > 0 ? 0 : rc)); @@ -5228,7 +5219,7 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) * that the DTX has been restarted with newer epoch. 
*/ rc = obj_handle_resend(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch.oe_value, &oci->oci_map_ver, &flags, + &dcsh->dcsh_epoch.oe_value, oci->oci_map_ver, &flags, dcsh->dcsh_mbs, true, true); if (rc < 0) goto out; @@ -5812,7 +5803,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) goto out; } - version = ocpi->ocpi_map_ver; max_ver = ocpi->ocpi_map_ver; if (ocpi->ocpi_flags & ORF_DTX_SYNC) @@ -5823,15 +5813,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { again: - version = ocpi->ocpi_map_ver; - rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, - &version, &flags, odm->odm_mbs, leader, false); + rc = obj_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, &flags, odm->odm_mbs, leader, false); if (rc < 0) goto out; if (rc == ORS_DONE) D_GOTO(out, rc = 0); - - dce->dce_ver = version; } epoch.oe_value = ocpi->ocpi_epoch; @@ -5854,10 +5841,10 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) &exec_arg.coll_cur); rc = dtx_leader_begin(ioc.ioc_vos_coh, &odm->odm_xid, &epoch, - dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, version, + dcts[0].dct_shards[dmi->dmi_tgt_id].dcs_nr, ocpi->ocpi_map_ver, &ocpi->ocpi_oid, NULL /* dti_cos */, 0 /* dti_cos_cnt */, - NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, - dtx_flags, odm->odm_mbs, dce, &dlh); + NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, dtx_flags, + odm->odm_mbs, dce, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for collective punch: "DF_RC"\n", DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); @@ -5902,9 +5889,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (max_ver < ioc.ioc_map_ver) max_ver = ioc.ioc_map_ver; - if (max_ver < version) - max_ver = version; - DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", 
bulk_tgt_sz %u, " diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index c267c812e81..d8207d84687 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1423,7 +1423,7 @@ dv_dtx_commit_active_entry(daos_handle_t coh, struct dtx_id *dti) int dv_dtx_abort_active_entry(daos_handle_t coh, struct dtx_id *dti) { - return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX); + return vos_dtx_abort(coh, dti, DAOS_EPOCH_MAX, 0); } int diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index d83d2356d15..60633dc6156 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -15,6 +16,8 @@ #include #include "vts_io.h" +#define VTS_DTX_VER 3 + static void vts_init_dte(struct dtx_entry *dte) { @@ -33,7 +36,7 @@ vts_init_dte(struct dtx_entry *dte) /** Use unique API so new UUID is generated even on same thread */ daos_dti_gen_unique(&dte->dte_xid); - dte->dte_ver = 1; + dte->dte_ver = VTS_DTX_VER; dte->dte_refs = 1; dte->dte_mbs = mbs; } @@ -357,8 +360,16 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) /* The update DTX is 'prepared'. */ vts_dtx_end(dth); + /* Abort with old epoch should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch - 1, VTS_DTX_VER); + assert_rc_equal(rc, -DER_NONEXIST); + + /* Abort with old version should fail. */ + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER - 1); + assert_rc_equal(rc, -DER_NONEXIST); + /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -390,7 +401,7 @@ vts_dtx_abort_visibility(struct io_test_args *args, bool ext, bool punch_obj) vts_dtx_end(dth); /* Aborted the punch DTX. 
*/ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -489,7 +500,7 @@ dtx_14(void **state) assert_memory_equal(update_buf, fetch_buf, UPDATE_BUF_SIZE); /* Committed DTX cannot be aborted. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); @@ -551,11 +562,11 @@ dtx_15(void **state) vts_dtx_end(dth); /* Aborted the update DTX. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_rc_equal(rc, 0); /* Double aborted the DTX is harmless. */ - rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch); + rc = vos_dtx_abort(args->ctx.tc_co_hdl, &xid, epoch, VTS_DTX_VER); assert_int_not_equal(rc, 0); memset(fetch_buf, 0, UPDATE_BUF_SIZE); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index e08d63fd673..ec11b83f8aa 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -116,6 +116,9 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_version = DAE_VER(dae); dsp->dsp_dkey_hash = DAE_DKEY_HASH(dae); + if (unlikely(DAE_MBS_DSIZE(dae) == 0)) + goto add; + mbs = (struct dtx_memberships *)(dsp + 1); mbs->dm_tgt_cnt = DAE_TGT_CNT(dae); mbs->dm_grp_cnt = DAE_GRP_CNT(dae); @@ -135,6 +138,7 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_inline_mbs = 1; dsp->dsp_mbs = mbs; +add: d_list_add_tail(&dsp->dsp_link, &dth->dth_share_tbd_list); dth->dth_share_tbd_count++; @@ -1067,15 +1071,17 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) if (dth->dth_mbs != NULL) { DAE_TGT_CNT(dae) = dth->dth_mbs->dm_tgt_cnt; DAE_GRP_CNT(dae) = dth->dth_mbs->dm_grp_cnt; - DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; DAE_MBS_FLAGS(dae) = dth->dth_mbs->dm_flags; } else { DAE_TGT_CNT(dae) = 1; DAE_GRP_CNT(dae) = 1; - DAE_MBS_DSIZE(dae) = 0; DAE_MBS_FLAGS(dae) = 0; } + /* Will set DAE_MBS_DSIZE and DAE_MBS_OFF via vos_dtx_prepared(). */ + DAE_MBS_DSIZE(dae) = 0; + DAE_MBS_OFF(dae) = UMOFF_NULL; + /* Will be set as dbd::dbd_index via vos_dtx_prepared(). 
*/ DAE_INDEX(dae) = DTX_INDEX_INVAL; dae->dae_dth = dth; @@ -1793,20 +1799,23 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) dae->dae_oid_cnt = 1; } - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { - memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, - DAE_MBS_DSIZE(dae)); - } else { - rec_off = umem_zalloc(umm, DAE_MBS_DSIZE(dae)); - if (UMOFF_IS_NULL(rec_off)) { - D_ERROR("No space to store DTX mbs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - return -DER_NOSPACE; - } + if (dth->dth_mbs != NULL) { + if (dth->dth_mbs->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) { + memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + } else { + rec_off = umem_zalloc(umm, dth->dth_mbs->dm_data_size); + if (UMOFF_IS_NULL(rec_off)) { + D_ERROR("No space (%u) to store MBS for DTX " DF_DTI "\n", + dth->dth_mbs->dm_data_size, DP_DTI(&DAE_XID(dae))); + return -DER_NOSPACE; + } - memcpy(umem_off2ptr(umm, rec_off), - dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); - DAE_MBS_OFF(dae) = rec_off; + memcpy(umem_off2ptr(umm, rec_off), dth->dth_mbs->dm_data, + dth->dth_mbs->dm_data_size); + DAE_MBS_OFF(dae) = rec_off; + } + DAE_MBS_DSIZE(dae) = dth->dth_mbs->dm_data_size; } if (dae->dae_records != NULL) { @@ -1853,34 +1862,45 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) return rc; } -static struct dtx_memberships * -vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) +static int +vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae, + struct dtx_memberships **p_mbs) { struct dtx_handle *dth = dae->dae_dth; struct dtx_memberships *tmp; size_t size; - size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (dth != NULL) + size = sizeof(*tmp) + dth->dth_mbs->dm_data_size; + else + size = sizeof(*tmp) + DAE_MBS_DSIZE(dae); + if (unlikely(size == sizeof(*tmp))) + return -DER_NONEXIST; + D_ALLOC(tmp, size); if (tmp == NULL) - return NULL; + return -DER_NOMEM; tmp->dm_tgt_cnt = 
DAE_TGT_CNT(dae); tmp->dm_grp_cnt = DAE_GRP_CNT(dae); - tmp->dm_data_size = DAE_MBS_DSIZE(dae); tmp->dm_flags = DAE_MBS_FLAGS(dae); tmp->dm_dte_flags = DAE_FLAGS(dae); - /* The DTX is not prepared yet, copy the MBS from DTX handle. */ - if (dth != NULL) + /* The DTX maybe not prepared yet, copy the MBS from DTX handle. */ + if (dth != NULL) { + tmp->dm_data_size = dth->dth_mbs->dm_data_size; memcpy(tmp->dm_data, dth->dth_mbs->dm_data, tmp->dm_data_size); - else if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) - memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); - else - memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), - tmp->dm_data_size); + } else { + tmp->dm_data_size = DAE_MBS_DSIZE(dae); + if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) + memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size); + else + memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)), + tmp->dm_data_size); + } - return tmp; + *p_mbs = tmp; + return 0; } int @@ -1952,6 +1972,9 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (dae->dae_dth != NULL) return -DER_INPROGRESS; + if (pm_ver != NULL) + *pm_ver = DAE_VER(dae); + if (epoch != NULL) { daos_epoch_t e = *epoch; @@ -1972,11 +1995,8 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, if (pm_ver == NULL) return DTX_ST_PREPARED; - if (*pm_ver <= cont->vc_dtx_resync_ver) { - if (!for_refresh) - *pm_ver = DAE_VER(dae); + if (*pm_ver <= cont->vc_dtx_resync_ver) return DTX_ST_PREPARED; - } /* * Before DTX resync completed, it is not sure whether related DTX is @@ -2012,7 +2032,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, struct dtx_memberships **mbs) { struct vos_container *cont; - struct dtx_memberships *tmp; struct vos_dtx_act_ent *dae; d_iov_t kiov; d_iov_t riov; @@ -2026,14 +2045,9 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = dbtree_lookup(cont->vc_dtx_active_hdl, 
&kiov, &riov); if (rc == 0) { dae = riov.iov_buf; - tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae); - if (tmp == NULL) { - rc = -DER_NOMEM; - } else { - if (oid != NULL) - *oid = DAE_OID(dae); - *mbs = tmp; - } + rc = vos_dtx_pack_mbs(vos_cont2umm(cont), dae, mbs); + if (rc == 0 && oid != NULL) + *oid = DAE_OID(dae); } else if (rc == -DER_NONEXIST) { rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov); if (rc == 0) @@ -2121,6 +2135,7 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership goto out; dae_df->dae_mbs_off = UMOFF_NULL; + dae_df->dae_mbs_dsize = 0; } if (new_inline) { @@ -2143,10 +2158,9 @@ vos_dtx_refresh_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_membership out: if (started) { if (rc == 0) { + memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); rc = umem_tx_commit(umm); D_ASSERTF(rc == 0, "local TX commit failure: %d\n", rc); - - memcpy(&dae->dae_base, dae_df, sizeof(*dae_df)); } else { rc = umem_tx_abort(umm, rc); } @@ -2300,6 +2314,8 @@ vos_dtx_post_handle(struct vos_container *cont, struct vos_dtx_cmt_ent **dces, int count, bool abort, bool rollback, bool keep_act) { + struct umem_instance *umm = vos_cont2umm(cont); + struct vos_dtx_act_ent_df *dae_df; d_iov_t kiov; int rc; int i; @@ -2385,7 +2401,9 @@ vos_dtx_post_handle(struct vos_container *cont, D_ASSERT(daes[i]->dae_preparing == 0); - daes[i]->dae_prepared = 0; + dae_df = umem_off2ptr(umm, daes[i]->dae_df_off); + memcpy(&daes[i]->dae_base, dae_df, sizeof(*dae_df)); + if (abort) { D_ASSERT(daes[i]->dae_committing == 0); @@ -2503,7 +2521,7 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, } int -vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) +vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch, uint32_t version) { struct vos_container *cont; struct vos_dtx_act_ent *dae = NULL; @@ -2549,6 +2567,9 @@ vos_dtx_abort(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t epoch) if 
(epoch != DAOS_EPOCH_MAX && epoch != DAE_EPOCH(dae)) D_GOTO(out, rc = -DER_NONEXIST); + if (version != 0 && version < DAE_VER(dae)) + D_GOTO(out, rc = -DER_NONEXIST); + if (unlikely(dae->dae_preparing)) { /* * NOTE: Abort in-preparing DTX entry. It may because the non-leader is some slow,