From 5705ed6e45ca15e4421eb40b2580a1790d4a6e41 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 21 Feb 2026 17:45:52 +0800 Subject: [PATCH 1/6] DAOS-18544 object: reserved targets for GX object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current DAOS only reserves RF × group_size targets for a GX object, whereas it should reserve targets_per_domain × RF. In addition, when the number of reserved targets is a fixed value, the likelihood of losing those targets grows significantly as the cluster scales, which can cause collocated shards and extra data movement during rebuild. This patch changes the number of reserved targets for a GX object to be no less than 30% of the targets. Signed-off-by: Liang Zhen --- src/object/obj_class.c | 63 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/src/object/obj_class.c b/src/object/obj_class.c index 654a316efe0..3eebd87bbb4 100644 --- a/src/object/obj_class.c +++ b/src/object/obj_class.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -18,6 +18,9 @@ static struct daos_obj_class **oc_resil_array; static int oc_ident_array_sz; static int oc_resil_array_sz; +#define D_GX_RESERVED_ENV "D_GX_RESERVED" +static unsigned int oc_gx_reserved; + static struct daos_obj_class *oclass_ident2cl(daos_oclass_id_t oc_id, uint32_t *nr_grps); static struct daos_obj_class *oclass_resil2cl(struct daos_oclass_attr *ca); @@ -253,17 +256,30 @@ daos_oclass_grp_nr(struct daos_oclass_attr *oc_attr, struct daos_obj_md *md) /** * To honor RF setting during failure cases, let's reserve RF - * groups, so if some targets fail, there will be enough replacement + * domains, so if some targets fail, there will be enough replacement * targets to rebuild, so to avoid putting multiple shards in the same * domain, which may break the RF setting. * - * Though let's keep reserve targets to be less than 30% of the total - * targets. + * Though let's keep reserve targets to be no less than 30% of the total targets, + * because otherwise layout computation will be too expensive. */ static uint32_t -reserve_grp_by_rf(uint32_t target_nr, uint32_t grp_size, uint32_t rf) +reserve_grp_by_rf(uint32_t domain_nr, uint32_t target_nr, uint32_t grp_size, uint32_t rf) { - return min(((target_nr * 3) / 10) / grp_size, rf); + int tgt_per_dom; + int tgt_reserv; + + if (oc_gx_reserved > 0) + return oc_gx_reserved; + + D_ASSERT(target_nr >= domain_nr); /* unless pool map is corrupted */ + tgt_per_dom = target_nr / domain_nr; + + tgt_reserv = (target_nr * 3) / 10; /* 30 percent */ + if (tgt_reserv < tgt_per_dom * rf) + tgt_reserv = tgt_per_dom * rf; + + return (tgt_reserv + grp_size - 1) / grp_size; } int @@ -303,7 +319,7 @@ daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, enum d grp_size = daos_oclass_grp_size(&ca); if (ca.ca_grp_nr == DAOS_OBJ_GRP_MAX) { - uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf_factor); + uint32_t reserve_grp = reserve_grp_by_rf(domain_nr, target_nr, grp_size, rf_factor); ca.ca_grp_nr = max(1, (target_nr / grp_size)); @@ -751,13 +767,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype *ord = OR_RP_2; grp_size = 2; } else if (rdd == DAOS_OCH_RDD_EC) { - if (domain_nr >= 18) { + if (domain_nr >= 25) { *ord = OR_RS_16P1; grp_size = 17; - } else if (domain_nr >= 10) { + } else if (domain_nr >= 13) { *ord = OR_RS_8P1; grp_size = 9; - } else if (domain_nr >= 6) { + } else if (domain_nr >= 8) { *ord = OR_RS_4P1; grp_size = 5; } else { @@ -772,13 +788,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype case DAOS_PROP_CO_REDUN_RF1: if ((rdd == DAOS_OCH_RDD_EC || (rdd == 0 && daos_is_array_type(otype))) && domain_nr >= 3) { - if (domain_nr >= 18) { + if (domain_nr >= 25) { *ord = OR_RS_16P1; grp_size = 17; - } else if (domain_nr >= 10) { + } else if (domain_nr >= 13) { *ord = OR_RS_8P1; grp_size = 9; - } else if (domain_nr >= 6) { + } else if (domain_nr >= 8) { *ord = OR_RS_4P1; grp_size = 5; } else { @@ -793,13 +809,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype case DAOS_PROP_CO_REDUN_RF2: if ((rdd == DAOS_OCH_RDD_EC || (rdd == 0 && daos_is_array_type(otype))) && domain_nr >= 4) { - if (domain_nr >= 20) { + if (domain_nr >= 26) { *ord = OR_RS_16P2; grp_size = 18; - } else if (domain_nr >= 12) { + } else if (domain_nr >= 15) { *ord = OR_RS_8P2; grp_size = 10; - } else if (domain_nr >= 8) { + } else if (domain_nr >= 9) { *ord = OR_RS_4P2; grp_size = 6; } else { @@ -814,10 +830,10 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype case DAOS_PROP_CO_REDUN_RF3: if ((rdd == DAOS_OCH_RDD_EC || (rdd == 0 && daos_is_array_type(otype))) && domain_nr >= 10) { - if (domain_nr >= 22) { + if (domain_nr >= 28) { *ord = OR_RS_16P3; grp_size = 19; - } else if (domain_nr >= 14) { + } else if (domain_nr >= 16) { *ord = OR_RS_8P3; grp_size = 11; } else { @@ -865,7 +881,7 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype grp_nr = max(256, target_nr * 50 / 100); break; case DAOS_OCH_SHD_EXT: - grp_nr = max(1024, target_nr * 80 / 100); + grp_nr = max(1024, target_nr * 70 / 100); break; default: D_ERROR("Invalid sharding hint\n"); @@ -874,7 +890,7 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype if (grp_nr == DAOS_OBJ_GRP_MAX || grp_nr * grp_size > target_nr) { uint32_t max_grp = target_nr / grp_size; - uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf); + uint32_t reserve_grp = reserve_grp_by_rf(domain_nr, target_nr, grp_size, rf); /* search for the highest scalability in the allowed range */ if (max_grp > reserve_grp) @@ -948,6 +964,13 @@ obj_class_init(void) if (oc_ident_array) return 0; + oc_gx_reserved = 0; + d_getenv_uint(D_GX_RESERVED_ENV, &oc_gx_reserved); + if (oc_gx_reserved > 0) { + D_INFO("%s = %u, it should be set only for benchmarking\n", D_GX_RESERVED_ENV, + oc_gx_reserved); + } + D_ALLOC_ARRAY(oc_ident_array, OC_NR); if (!oc_ident_array) return -DER_NOMEM; From b3b5a9765f6f623b6e6fee6d62bc89352bd64b0e Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Wed, 25 Feb 2026 16:32:48 +0800 Subject: [PATCH 2/6] DAOS-18486 placement: fix down2up handling of placement Signed-off-by: Liang Zhen --- src/placement/jump_map.c | 9 ++++++++- src/placement/pl_map_common.c | 24 ++++++++++++++---------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/placement/jump_map.c b/src/placement/jump_map.c index 56f0643555e..a8c6d52153c 100644 --- a/src/placement/jump_map.c +++ b/src/placement/jump_map.c @@ -1,7 +1,7 @@ /** * * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -734,6 +734,13 @@ get_object_layout(struct pl_jump_map *jmap, uint32_t layout_ver, struct pl_obj_l setbit(dom_cur_grp_real, domain - root); if (pool_target_down(target)) layout->ol_shards[k].po_rebuilding = 1; + + if (pool_target_is_down2up(target)) { + if (gen_mode == PRE_REBUILD) + layout->ol_shards[k].po_rebuilding = 1; + else + layout->ol_shards[k].po_reintegrating = 1; + } } if (is_extending != NULL && pool_target_is_up_or_drain(target)) diff --git a/src/placement/pl_map_common.c b/src/placement/pl_map_common.c index 5afe0691a37..52e9ee93371 100644 --- a/src/placement/pl_map_common.c +++ b/src/placement/pl_map_common.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -255,13 +256,7 @@ is_comp_avaible(struct pool_component *comp, uint32_t allow_version, status = PO_COMP_ST_UPIN; } else if (status == PO_COMP_ST_UP) { if (comp->co_flags & PO_COMPF_DOWN2UP) { - /* PO_COMP_ST_UP status with PO_COMPF_DOWN2UP flag - * is the case of delay_rebuild exclude+reint. - * Cannot mark it as UPIN to avoid it be used for - * rebuild enumerate/fetch, as the data will be - * discarded in reintegrate. - */ - /* status = PO_COMP_ST_UPIN; */ + status = PO_COMP_ST_UPIN; } else { if (comp->co_fseq <= 1) status = PO_COMP_ST_NEW; @@ -394,9 +389,14 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, if (spare_avail) { /* The selected spare target is up and ready */ l_shard->po_target = spare_tgt->ta_comp.co_id; - l_shard->po_fseq = f_shard->fs_fseq; - l_shard->po_rank = spare_tgt->ta_comp.co_rank; - l_shard->po_index = spare_tgt->ta_comp.co_index; + l_shard->po_fseq = f_shard->fs_fseq; + l_shard->po_rank = spare_tgt->ta_comp.co_rank; + l_shard->po_index = spare_tgt->ta_comp.co_index; + + if (pool_target_is_down2up(spare_tgt)) + f_shard->fs_down2up = 1; + else + f_shard->fs_down2up = 0; /* * Mark the shard as 'rebuilding' so that read will skip this shard. @@ -406,6 +406,10 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, f_shard->fs_status == PO_COMP_ST_DRAIN || f_shard->fs_down2up || pool_target_down(spare_tgt)) l_shard->po_rebuilding = 1; + + if (f_shard->fs_down2up && gen_mode != PRE_REBUILD) + l_shard->po_reintegrating = 1; + } else { l_shard->po_shard = -1; l_shard->po_target = -1; From 97989f7a1bdb5dd740007977189e2861cf7bdb08 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 27 Feb 2026 00:20:38 +0800 Subject: [PATCH 3/6] DAOS-18487 rebuild: add new debug log Signed-off-by: Liang Zhen --- src/vos/vos_dtx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index d50f2b87cc8..857d63b530a 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1444,8 +1444,11 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry, * to rebuild. Related new IO corresponding to such non-committed DTX * has already been sent to the in-rebuilding target. */ - if (intent == DAOS_INTENT_MIGRATION) + if (intent == DAOS_INTENT_MIGRATION) { + D_INFO("Skip non-committed DTX " DF_DTI " for obj " DF_UOID "\n", + DP_DTI(&DAE_XID(dae)), DP_UOID(DAE_OID(dae))); return ALB_UNAVAILABLE; + } if (intent == DAOS_INTENT_DEFAULT) { if (DAOS_FAIL_CHECK(DAOS_VOS_NON_LEADER)) From 65a0dc745ce5c44ced99fef0c7d62fda75b0f324 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 28 Feb 2026 18:20:36 +0800 Subject: [PATCH 4/6] DAOS-18487 rebuild: don't wait for discard - pool_discard doesn't wait for completion of discard anymore - Make sure no concurrent discards Signed-off-by: Liang Zhen --- src/include/daos_srv/pool.h | 5 +-- src/object/srv_obj_migrate.c | 2 +- src/pool/srv_target.c | 63 ++++++++++++++++++++++++++++-------- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 147e4bb3fc1..c271fceb7b9 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -88,12 +88,13 @@ struct ds_pool { */ uuid_t sp_srv_cont_hdl; uuid_t sp_srv_pool_hdl; - uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_need_discard : 1, - sp_disable_rebuild : 1, sp_disable_dtx_resync : 1, sp_incr_reint : 1; + uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_disable_rebuild : 1, + sp_disable_dtx_resync : 1, sp_incr_reint : 1; /* pool_uuid + map version + leader term + rebuild generation define a * rebuild job. */ uint32_t sp_rebuild_gen; + ATOMIC int sp_need_discard; ATOMIC int sp_rebuilding; /** * someone has already messaged this pool to for rebuild scan, diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 2ccda79e4c2..6f928cdc672 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3249,7 +3249,7 @@ migrate_obj_ult(void *data) * discard, or discard has been done. spc_discard_done means * discarding has been done in the current VOS target. */ - if (tls->mpt_pool->spc_pool->sp_need_discard) { + if (atomic_load(&tls->mpt_pool->spc_pool->sp_need_discard) > 0) { while(!tls->mpt_pool->spc_discard_done) { D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls)); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index b7fe7dfe2c8..f7e543e39b0 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2025 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2435,6 +2435,7 @@ struct tgt_discard_arg { uuid_t pool_uuid; uint64_t epoch; struct pool_target_addr_list tgt_list; + struct ds_pool_child *pool_child; }; struct child_discard_arg { @@ -2586,12 +2587,13 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, return rc; } -static int +static void pool_child_discard(void *data) { struct tgt_discard_arg *arg = data; struct child_discard_arg cont_arg; - struct ds_pool_child *child; + struct ds_pool_child *child = arg->pool_child; + struct ds_pool *pool = child->spc_pool; vos_iter_param_t param = { 0 }; struct vos_iter_anchors anchor = { 0 }; struct pool_target_addr addr; @@ -2599,13 +2601,16 @@ pool_child_discard(void *data) struct d_backoff_seq backoff_seq; int rc; + D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", + DP_UUID(arg->pool_uuid)); + myrank = dss_self_rank(); addr.pta_rank = myrank; addr.pta_target = dss_get_module_info()->dmi_tgt_id; if (!pool_target_addr_found(&arg->tgt_list, &addr)) { D_DEBUG(DB_TRACE, "skip discard %u/%u.\n", addr.pta_rank, addr.pta_target); - return 0; + return; } D_DEBUG(DB_MD, DF_UUID" discard %u/%u\n", DP_UUID(arg->pool_uuid), @@ -2624,10 +2629,6 @@ pool_child_discard(void *data) * It is important to note that manual reintegration may be initiated * before step 3, in which case, the function should return “DER_AGAIN." */ - child = ds_pool_child_lookup(arg->pool_uuid); - if (child == NULL) - return -DER_AGAIN; - param.ip_hdl = child->spc_hdl; rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, @@ -2635,7 +2636,7 @@ pool_child_discard(void *data) D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); cont_arg.tgt_discard = arg; - child->spc_discard_done = 0; + do { rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, cont_discard_cb, NULL, &cont_arg, NULL); @@ -2648,11 +2649,33 @@ pool_child_discard(void *data) } while (1); child->spc_discard_done = 1; - d_backoff_seq_fini(&backoff_seq); + ABT_mutex_lock(pool->sp_mutex); + if (rc && pool->sp_discard_status == 0) + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + ds_pool_child_put(child); +} + +static int +pool_child_discard_async(void *data) +{ + struct tgt_discard_arg *arg = data; + struct ds_pool_child *child; + int rc; + + child = ds_pool_child_lookup(arg->pool_uuid); + if (child == NULL) + return -DER_AGAIN; + /* set barrier to avoid race with rebuild */ + child->spc_discard_done = 0; + arg->pool_child = child; + rc = dss_ult_create(pool_child_discard, arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); + if (rc) + ds_pool_child_put(child); return rc; } @@ -2749,6 +2772,7 @@ ds_pool_tgt_discard_ult(void *data) struct ds_pool *pool; struct tgt_discard_arg *arg = data; uint32_t ex_status; + int ref; int rc; /* If discard failed, let's still go ahead, since reintegration might @@ -2762,10 +2786,15 @@ ds_pool_tgt_discard_ult(void *data) } ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; - ds_pool_thread_collective(arg->pool_uuid, ex_status, pool_child_discard, arg, - DSS_ULT_DEEP_STACK); + /* It returns after pool_child_discard_async() is scheduled on all xstreams, + * it wouldn't wait for completion of discard, but it can guarantee barriers + * are set on all xstreams (ds_pool_child::spc_discard_done = 0). + */ + ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); + + ref = atomic_fetch_sub(&pool->sp_need_discard, 1); + D_ASSERTF(ref >= 0); - pool->sp_need_discard = 0; pool->sp_discard_status = rc; ds_pool_put(pool); free: @@ -2801,7 +2830,13 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) D_GOTO(out, rc = 0); } - pool->sp_need_discard = 1; + if (atomic_fetch_add(&pool->sp_need_discard, 1) > 1) { + atomic_fetch_sub(&pool->sp_need_discard, 1); + D_INFO(DF_UUID " XXX: discard is already in progress, \n", DP_UUID(arg->pool_uuid)); + ds_pool_put(pool); + D_GOTO(out, rc = -DER_BUSY); + } + pool->sp_discard_status = 0; rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); if (rc == 0) From 875092179839d68add5c758e6c7b1f532eb57c5d Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sun, 1 Mar 2026 19:58:35 +0800 Subject: [PATCH 5/6] DAOS-18487 rebuild: code cleanup Signed-off-by: Liang Zhen --- src/include/daos_srv/pool.h | 5 +- src/object/srv_obj.c | 14 +-- src/object/srv_obj_migrate.c | 23 ++-- src/pool/srv_target.c | 222 ++++++++++++++++------------------- 4 files changed, 126 insertions(+), 138 deletions(-) diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index c271fceb7b9..3fbaae93810 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -94,8 +94,8 @@ struct ds_pool { * rebuild job. */ uint32_t sp_rebuild_gen; - ATOMIC int sp_need_discard; ATOMIC int sp_rebuilding; + ATOMIC int sp_discarding; /** * someone has already messaged this pool to for rebuild scan, * NB: all xstreams can do lockless-write on it but it's OK @@ -192,8 +192,7 @@ struct ds_pool_child { int spc_ref; ABT_eventual spc_ref_eventual; - uint64_t spc_discard_done:1, - spc_no_storage:1; /* The pool shard has no storage. */ + uint64_t spc_no_storage : 1; /* The pool shard has no storage. */ uint32_t spc_reint_mode; uint32_t *spc_state; /* Pointer to ds_pool->sp_states[i] */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 2ecd52f1a0f..c41c8f0731e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2450,12 +2450,14 @@ static int obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, uint32_t rpc_map_ver, uint32_t flags) { + struct ds_pool *pool = child->sc_pool->spc_pool; + if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) { /* EC aggregation is still inflight, rebuild should wait until it's paused */ if (ds_cont_child_ec_aggregating(child)) { D_ERROR(DF_CONT " ec aggregate still active, rebuilding %d\n", - DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid), - atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)); + DP_CONT(pool->sp_uuid, child->sc_uuid), + atomic_load(&pool->sp_rebuilding)); return -DER_UPDATE_AGAIN; } } @@ -2463,7 +2465,7 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, if (!obj_is_modification_opc(opc) && (opc != DAOS_OBJ_RPC_CPD || flags & ORF_CPD_RDONLY)) return 0; - if (atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)) { + if (atomic_load(&pool->sp_rebuilding)) { uint32_t version; ds_rebuild_running_query(child->sc_pool_uuid, RB_OP_REBUILD, @@ -2480,10 +2482,8 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, * vos discard to finish, which otherwise might discard these new in-flight * I/O update. */ - if ((flags & ORF_REINTEGRATING_IO) && - (child->sc_pool->spc_pool->sp_need_discard && - child->sc_pool->spc_discard_done == 0)) { - D_ERROR("reintegrating "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); + if ((flags & ORF_REINTEGRATING_IO) && atomic_load(&pool->sp_discarding) > 0) { + D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 6f928cdc672..c7740f5594a 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3230,6 +3230,7 @@ migrate_obj_ult(void *data) { struct iter_obj_arg *arg = data; struct migrate_pool_tls *tls = NULL; + struct ds_pool *pool; daos_epoch_range_t epr; daos_epoch_t stable_epoch = 0; daos_handle_t coh = DAOS_HDL_INVAL; @@ -3249,20 +3250,22 @@ migrate_obj_ult(void *data) * discard, or discard has been done. spc_discard_done means * discarding has been done in the current VOS target. */ - if (atomic_load(&tls->mpt_pool->spc_pool->sp_need_discard) > 0) { - while(!tls->mpt_pool->spc_discard_done) { - D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", - DP_RB_MPT(tls)); - dss_sleep(2 * 1000); - if (tls->mpt_fini) - D_GOTO(free_notls, rc); - } - if (tls->mpt_pool->spc_pool->sp_discard_status) { - rc = tls->mpt_pool->spc_pool->sp_discard_status; + pool = tls->mpt_pool->spc_pool; + while (atomic_load(&pool->sp_discarding) > 0) { + D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls)); + dss_sleep(2 * 1000); + if (tls->mpt_fini) + D_GOTO(free_notls, rc); + + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status) { + rc = pool->sp_discard_status; + ABT_mutex_unlock(pool->sp_mutex); D_DEBUG(DB_REBUILD, DF_RB ": discard failure: " DF_RC "\n", DP_RB_MPT(tls), DP_RC(rc)); D_GOTO(out, rc); } + ABT_mutex_unlock(pool->sp_mutex); } if (tls->mpt_reintegrating) { diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index f7e543e39b0..7bd4a0e52dc 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -2435,12 +2435,14 @@ struct tgt_discard_arg { uuid_t pool_uuid; uint64_t epoch; struct pool_target_addr_list tgt_list; - struct ds_pool_child *pool_child; }; struct child_discard_arg { - struct tgt_discard_arg *tgt_discard; - uuid_t cont_uuid; + struct ds_pool_child *ca_child; + struct pool_target_addr ca_addr; + uint64_t ca_epoch; + uuid_t ca_po_uuid; + uuid_t ca_co_uuid; }; static struct tgt_discard_arg* @@ -2489,7 +2491,7 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, 1 << 10 /* max (ms) */); D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - epr.epr_hi = arg->tgt_discard->epoch; + epr.epr_hi = arg->ca_epoch; epr.epr_lo = 0; do { /* Inform the iterator and delete the object */ @@ -2506,9 +2508,8 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, d_backoff_seq_fini(&backoff_seq); if (rc != 0) - D_ERROR("discard object pool/object "DF_UUID"/"DF_UOID" rc: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UOID(ent->ie_oid), - DP_RC(rc)); + D_ERROR("discard object pool/object " DF_UUID "/" DF_UOID " rc: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UOID(ent->ie_oid), DP_RC(rc)); return rc; } @@ -2527,14 +2528,12 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, int rc; D_ASSERT(type == VOS_ITER_COUUID); - if (uuid_compare(arg->cont_uuid, entry->ie_couuid) == 0) { - D_DEBUG(DB_REBUILD, DF_UUID" already discard\n", - DP_UUID(arg->cont_uuid)); + if (uuid_compare(arg->ca_co_uuid, entry->ie_couuid) == 0) { + D_DEBUG(DB_REBUILD, DF_UUID " already discard\n", DP_UUID(arg->ca_co_uuid)); return 0; } - rc = ds_cont_child_lookup(arg->tgt_discard->pool_uuid, entry->ie_couuid, - &cont); + rc = ds_cont_child_lookup(arg->ca_po_uuid, entry->ie_couuid, &cont); if (rc != DER_SUCCESS) { D_ERROR("Lookup container '"DF_UUIDF"' failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); @@ -2554,8 +2553,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, param.ip_hdl = coh; param.ip_epr.epr_lo = 0; - param.ip_epr.epr_hi = arg->tgt_discard->epoch; - uuid_copy(arg->cont_uuid, entry->ie_couuid); + param.ip_epr.epr_hi = arg->ca_epoch; + uuid_copy(arg->ca_co_uuid, entry->ie_couuid); do { /* Inform the iterator and delete the object */ *acts |= VOS_ITER_CB_DELETE; @@ -2571,9 +2570,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, d_backoff_seq_fini(&backoff_seq); vos_cont_close(coh); - D_DEBUG(DB_TRACE, DF_UUID"/"DF_UUID" discard cont done: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid), - DP_RC(rc)); + D_DEBUG(DB_TRACE, DF_UUID "/" DF_UUID " discard cont done: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UUID(entry->ie_couuid), DP_RC(rc)); put: ds_cont_child_put(cont); @@ -2590,92 +2588,103 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, static void pool_child_discard(void *data) { - struct tgt_discard_arg *arg = data; - struct child_discard_arg cont_arg; - struct ds_pool_child *child = arg->pool_child; - struct ds_pool *pool = child->spc_pool; + struct child_discard_arg *cont_arg = data; + struct ds_pool *pool = cont_arg->ca_child->spc_pool; vos_iter_param_t param = { 0 }; - struct vos_iter_anchors anchor = { 0 }; - struct pool_target_addr addr; - uint32_t myrank; + struct vos_iter_anchors anchor = {0}; struct d_backoff_seq backoff_seq; int rc; - D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", - DP_UUID(arg->pool_uuid)); - - myrank = dss_self_rank(); - addr.pta_rank = myrank; - addr.pta_target = dss_get_module_info()->dmi_tgt_id; - if (!pool_target_addr_found(&arg->tgt_list, &addr)) { - D_DEBUG(DB_TRACE, "skip discard %u/%u.\n", addr.pta_rank, - addr.pta_target); - return; - } - - D_DEBUG(DB_MD, DF_UUID" discard %u/%u\n", DP_UUID(arg->pool_uuid), - myrank, addr.pta_target); - - /** - * When a faulty device is replaced with a new one using the - * “dmg storage replace nvme” command, the reintegration of - * affected pool targets is automatically triggered. - * The following steps outline the device replacement process on the engine side: - * - * 1) Replace the old device with the new device in the SMD. - * 2) Setup all SPDK related stuff for the new device. - * 3) Start ds_pool_child - * - * It is important to note that manual reintegration may be initiated - * before step 3, in which case, the function should return “DER_AGAIN." - */ - param.ip_hdl = child->spc_hdl; - rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, 1 << 10 /* max (ms) */); D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - cont_arg.tgt_discard = arg; - + param.ip_hdl = cont_arg->ca_child->spc_hdl; do { rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, cont_discard_cb, NULL, &cont_arg, NULL); if (rc != -DER_BUSY && rc != -DER_INPROGRESS) break; - D_DEBUG(DB_REBUILD, "retry by "DF_RC"/"DF_UUID"\n", - DP_RC(rc), DP_UUID(arg->pool_uuid)); + D_DEBUG(DB_REBUILD, "retry by " DF_RC "/" DF_UUID "\n", DP_RC(rc), + DP_UUID(cont_arg->ca_po_uuid)); dss_sleep(d_backoff_seq_next(&backoff_seq)); } while (1); - child->spc_discard_done = 1; d_backoff_seq_fini(&backoff_seq); + if (rc) { + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status == 0) + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + } + D_INFO(DF_UUID " discard completed rank/target=%u/%d\n", DP_UUID(cont_arg->ca_po_uuid), + cont_arg->ca_addr.pta_rank, cont_arg->ca_addr.pta_target); - ABT_mutex_lock(pool->sp_mutex); - if (rc && pool->sp_discard_status == 0) - pool->sp_discard_status = rc; - ABT_mutex_unlock(pool->sp_mutex); - - ds_pool_child_put(child); + atomic_fetch_sub(&pool->sp_discarding, 1); + ds_pool_child_put(cont_arg->ca_child); + D_FREE(cont_arg); } static int pool_child_discard_async(void *data) { - struct tgt_discard_arg *arg = data; - struct ds_pool_child *child; - int rc; + struct tgt_discard_arg *arg = data; + struct ds_pool *pool; + struct child_discard_arg *cont_arg; + struct pool_target_addr addr; + int rc = 0; - child = ds_pool_child_lookup(arg->pool_uuid); - if (child == NULL) - return -DER_AGAIN; + addr.pta_rank = dss_self_rank(); + addr.pta_target = dss_get_module_info()->dmi_tgt_id; + if (!pool_target_addr_found(&arg->tgt_list, &addr)) { + D_INFO(DF_UUID "discard skipped rank/target=%u/%u.\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); + return 0; + } + + D_ALLOC_PTR(cont_arg); + if (!cont_arg) + return -DER_NOMEM; + + /** + * When a faulty device is replaced with a new one using the + * “dmg storage replace nvme” command, the reintegration of + * affected pool targets is automatically triggered. + * The following steps outline the device replacement process on the engine side: + * + * 1) Replace the old device with the new device in the SMD. + * 2) Setup all SPDK related stuff for the new device. + * 3) Start ds_pool_child + * + * It is important to note that manual reintegration may be initiated + * before step 3, in which case, the function should return “DER_AGAIN." + */ + cont_arg->ca_child = ds_pool_child_lookup(arg->pool_uuid); + if (cont_arg->ca_child == NULL) + D_GOTO(out, rc = -DER_AGAIN); /* set barrier to avoid race with rebuild */ - child->spc_discard_done = 0; - arg->pool_child = child; - rc = dss_ult_create(pool_child_discard, arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); + cont_arg->ca_addr = addr; + cont_arg->ca_epoch = arg->epoch; + uuid_copy(cont_arg->ca_po_uuid, arg->pool_uuid); + + pool = cont_arg->ca_child->spc_pool; + D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", + DP_UUID(arg->pool_uuid)); + D_INFO(DF_UUID " discard started rank/target=%u/%u\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); + + atomic_fetch_add(&pool->sp_discarding, 1); + rc = dss_ult_create(pool_child_discard, cont_arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); if (rc) - ds_pool_child_put(child); + atomic_fetch_sub(&pool->sp_discarding, 1); +out: + if (rc) { + if (cont_arg->ca_child) + ds_pool_child_put(cont_arg->ca_child); + D_FREE(cont_arg); + } return rc; } @@ -2765,43 +2774,6 @@ ds_pool_task_collective(uuid_t pool_uuid, uint32_t ex_status, int (*coll_func)(v return ds_pool_collective(pool_uuid, ex_status, coll_func, arg, flags, false); } -/* Discard the objects by epoch in this pool */ -static int -ds_pool_tgt_discard_ult(void *data) -{ - struct ds_pool *pool; - struct tgt_discard_arg *arg = data; - uint32_t ex_status; - int ref; - int rc; - - /* If discard failed, let's still go ahead, since reintegration might - * still succeed, though it might leave some garbage on the reintegration - * target, the future scrub tool might fix it. XXX - */ - rc = ds_pool_lookup(arg->pool_uuid, &pool); - if (pool == NULL) { - D_INFO(DF_UUID" can not be found: %d\n", DP_UUID(arg->pool_uuid), rc); - D_GOTO(free, rc = 0); - } - - ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; - /* It returns after pool_child_discard_async() is scheduled on all xstreams, - * it wouldn't wait for completion of discard, but it can guarantee barriers - * are set on all xstreams (ds_pool_child::spc_discard_done = 0). - */ - ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); - - ref = atomic_fetch_sub(&pool->sp_need_discard, 1); - D_ASSERTF(ref >= 0); - - pool->sp_discard_status = rc; - ds_pool_put(pool); -free: - tgt_discard_arg_free(arg); - return rc; -} - void ds_pool_tgt_discard_handler(crt_rpc_t *rpc) { @@ -2810,6 +2782,7 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) struct pool_target_addr_list pta_list; struct tgt_discard_arg *arg = NULL; struct ds_pool *pool; + uint32_t ex_status; int rc; pta_list.pta_number = in->ptdi_addrs.ca_count; @@ -2830,25 +2803,38 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) D_GOTO(out, rc = 0); } - if (atomic_fetch_add(&pool->sp_need_discard, 1) > 1) { - atomic_fetch_sub(&pool->sp_need_discard, 1); + if (atomic_fetch_add(&pool->sp_discarding, 1) > 0) { + atomic_fetch_sub(&pool->sp_discarding, 1); D_INFO(DF_UUID " XXX: discard is already in progress, \n", DP_UUID(arg->pool_uuid)); ds_pool_put(pool); D_GOTO(out, rc = -DER_BUSY); } + D_INFO(DF_UUID " discard started\n", DP_UUID(arg->pool_uuid)); + ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = 0; - rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); - if (rc == 0) - rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ + ABT_mutex_unlock(pool->sp_mutex); + + ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; + rc = ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); + if (rc != 0) { + ABT_mutex_lock(pool->sp_mutex); + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + D_GOTO(out_put, rc); + } + rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ +out_put: + /* all targets should have already started to discard, I can release my refcount */ + atomic_fetch_sub(&pool->sp_discarding, 1); ds_pool_put(pool); out: out->ptdo_rc = rc; D_DEBUG(DB_MD, DF_UUID": replying rpc "DF_RC"\n", DP_UUID(in->ptdi_uuid), DP_RC(rc)); crt_reply_send(rpc); - if (rc != 0 && arg != NULL) + if (arg != NULL) tgt_discard_arg_free(arg); } From f0cf9711470dd6b1dd0d1542df2e412f666f10b3 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Mon, 2 Mar 2026 09:10:38 +0800 Subject: [PATCH 6/6] DAOS-18487 rebuild: move assert from ULT to RPC handler Signed-off-by: Liang Zhen --- src/pool/srv_target.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 7bd4a0e52dc..a8f3fd097fb 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -2612,15 +2612,16 @@ pool_child_discard(void *data) } while (1); d_backoff_seq_fini(&backoff_seq); + D_INFO(DF_UUID " discard completed rank/target=%u/%d, rc=%d\n", + DP_UUID(cont_arg->ca_po_uuid), cont_arg->ca_addr.pta_rank, + cont_arg->ca_addr.pta_target, rc); + if (rc) { ABT_mutex_lock(pool->sp_mutex); if (pool->sp_discard_status == 0) pool->sp_discard_status = rc; ABT_mutex_unlock(pool->sp_mutex); } - D_INFO(DF_UUID " discard completed rank/target=%u/%d\n", DP_UUID(cont_arg->ca_po_uuid), - cont_arg->ca_addr.pta_rank, cont_arg->ca_addr.pta_target); - atomic_fetch_sub(&pool->sp_discarding, 1); ds_pool_child_put(cont_arg->ca_child); D_FREE(cont_arg); @@ -2669,12 +2670,10 @@ pool_child_discard_async(void *data) cont_arg->ca_epoch = arg->epoch; uuid_copy(cont_arg->ca_po_uuid, arg->pool_uuid); - pool = cont_arg->ca_child->spc_pool; - D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", - DP_UUID(arg->pool_uuid)); D_INFO(DF_UUID " discard started rank/target=%u/%u\n", DP_UUID(arg->pool_uuid), addr.pta_rank, addr.pta_target); + pool = cont_arg->ca_child->spc_pool; atomic_fetch_add(&pool->sp_discarding, 1); rc = dss_ult_create(pool_child_discard, cont_arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); if (rc) @@ -2811,6 +2810,10 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) } D_INFO(DF_UUID " discard started\n", DP_UUID(arg->pool_uuid)); + /* XXX just return EAGAIN/EPERM? */ + D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", + DP_UUID(arg->pool_uuid)); + ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = 0; ABT_mutex_unlock(pool->sp_mutex); @@ -2821,11 +2824,11 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = rc; ABT_mutex_unlock(pool->sp_mutex); - D_GOTO(out_put, rc); + D_GOTO(out_sub, rc); } rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ -out_put: +out_sub: /* all targets should have already started to discard, I can release my refcount */ atomic_fetch_sub(&pool->sp_discarding, 1); ds_pool_put(pool);