From df0f7268632831393e160d50daea63fe7bd1e21d Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 28 Feb 2026 18:20:36 +0800 Subject: [PATCH 1/4] DAOS-18487 rebuild: don't wait for discard - pool_discard doesn't wait for completion of discard anymore - Make sure no concurrent discards Signed-off-by: Liang Zhen --- src/include/daos_srv/pool.h | 5 +-- src/object/srv_obj_migrate.c | 2 +- src/pool/srv_target.c | 63 ++++++++++++++++++++++++++++-------- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 147e4bb3fc1..c271fceb7b9 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -88,12 +88,13 @@ struct ds_pool { */ uuid_t sp_srv_cont_hdl; uuid_t sp_srv_pool_hdl; - uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_need_discard : 1, - sp_disable_rebuild : 1, sp_disable_dtx_resync : 1, sp_incr_reint : 1; + uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_disable_rebuild : 1, + sp_disable_dtx_resync : 1, sp_incr_reint : 1; /* pool_uuid + map version + leader term + rebuild generation define a * rebuild job. */ uint32_t sp_rebuild_gen; + ATOMIC int sp_need_discard; ATOMIC int sp_rebuilding; /** * someone has already messaged this pool to for rebuild scan, diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 2ccda79e4c2..6f928cdc672 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3249,7 +3249,7 @@ migrate_obj_ult(void *data) * discard, or discard has been done. spc_discard_done means * discarding has been done in the current VOS target. */ - if (tls->mpt_pool->spc_pool->sp_need_discard) { + if (atomic_load(&tls->mpt_pool->spc_pool->sp_need_discard) > 0) { while(!tls->mpt_pool->spc_discard_done) { D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls)); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index b7fe7dfe2c8..f7e543e39b0 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2025 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2435,6 +2435,7 @@ struct tgt_discard_arg { uuid_t pool_uuid; uint64_t epoch; struct pool_target_addr_list tgt_list; + struct ds_pool_child *pool_child; }; struct child_discard_arg { @@ -2586,12 +2587,13 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, return rc; } -static int +static void pool_child_discard(void *data) { struct tgt_discard_arg *arg = data; struct child_discard_arg cont_arg; - struct ds_pool_child *child; + struct ds_pool_child *child = arg->pool_child; + struct ds_pool *pool = child->spc_pool; vos_iter_param_t param = { 0 }; struct vos_iter_anchors anchor = { 0 }; struct pool_target_addr addr; @@ -2599,13 +2601,16 @@ pool_child_discard(void *data) struct d_backoff_seq backoff_seq; int rc; + D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", + DP_UUID(arg->pool_uuid)); + myrank = dss_self_rank(); addr.pta_rank = myrank; addr.pta_target = dss_get_module_info()->dmi_tgt_id; if (!pool_target_addr_found(&arg->tgt_list, &addr)) { D_DEBUG(DB_TRACE, "skip discard %u/%u.\n", addr.pta_rank, addr.pta_target); - return 0; + return; } D_DEBUG(DB_MD, DF_UUID" discard %u/%u\n", DP_UUID(arg->pool_uuid), @@ -2624,10 +2629,6 @@ pool_child_discard(void *data) * It is important to note that manual reintegration may be initiated * before step 3, in which case, the function should return “DER_AGAIN." */ - child = ds_pool_child_lookup(arg->pool_uuid); - if (child == NULL) - return -DER_AGAIN; - param.ip_hdl = child->spc_hdl; rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, @@ -2635,7 +2636,7 @@ pool_child_discard(void *data) D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); cont_arg.tgt_discard = arg; - child->spc_discard_done = 0; + do { rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, cont_discard_cb, NULL, &cont_arg, NULL); @@ -2648,11 +2649,33 @@ pool_child_discard(void *data) } while (1); child->spc_discard_done = 1; - d_backoff_seq_fini(&backoff_seq); + ABT_mutex_lock(pool->sp_mutex); + if (rc && pool->sp_discard_status == 0) + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + ds_pool_child_put(child); +} + +static int +pool_child_discard_async(void *data) +{ + struct tgt_discard_arg *arg = data; + struct ds_pool_child *child; + int rc; + + child = ds_pool_child_lookup(arg->pool_uuid); + if (child == NULL) + return -DER_AGAIN; + /* set barrier to avoid race with rebuild */ + child->spc_discard_done = 0; + arg->pool_child = child; + rc = dss_ult_create(pool_child_discard, arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); + if (rc) + ds_pool_child_put(child); return rc; } @@ -2749,6 +2772,7 @@ ds_pool_tgt_discard_ult(void *data) struct ds_pool *pool; struct tgt_discard_arg *arg = data; uint32_t ex_status; + int ref; int rc; /* If discard failed, let's still go ahead, since reintegration might @@ -2762,10 +2786,15 @@ ds_pool_tgt_discard_ult(void *data) } ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; - ds_pool_thread_collective(arg->pool_uuid, ex_status, pool_child_discard, arg, - DSS_ULT_DEEP_STACK); + /* It returns after pool_child_discard_async() is scheduled on all xstreams, + * it wouldn't wait for completion of discard, but it can guarantee barriers + * are set on all xstreams (ds_pool_child::spc_discard_done = 0). + */ + ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); + + ref = atomic_fetch_sub(&pool->sp_need_discard, 1); + D_ASSERTF(ref >= 0); - pool->sp_need_discard = 0; pool->sp_discard_status = rc; ds_pool_put(pool); free: @@ -2801,7 +2830,13 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) D_GOTO(out, rc = 0); } - pool->sp_need_discard = 1; + if (atomic_fetch_add(&pool->sp_need_discard, 1) > 1) { + atomic_fetch_sub(&pool->sp_need_discard, 1); + D_INFO(DF_UUID " XXX: discard is already in progress, \n", DP_UUID(arg->pool_uuid)); + ds_pool_put(pool); + D_GOTO(out, rc = -DER_BUSY); + } + pool->sp_discard_status = 0; rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); if (rc == 0) From 02dd64377dc2c1675edea4a601c5f4bb1af3cca2 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sun, 1 Mar 2026 19:58:35 +0800 Subject: [PATCH 2/4] DAOS-18487 rebuild: code cleanup Signed-off-by: Liang Zhen --- src/include/daos_srv/pool.h | 5 +- src/object/srv_obj.c | 14 +-- src/object/srv_obj_migrate.c | 23 ++-- src/pool/srv_target.c | 222 ++++++++++++++++------------------- 4 files changed, 126 insertions(+), 138 deletions(-) diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index c271fceb7b9..3fbaae93810 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -94,8 +94,8 @@ struct ds_pool { * rebuild job. */ uint32_t sp_rebuild_gen; - ATOMIC int sp_need_discard; ATOMIC int sp_rebuilding; + ATOMIC int sp_discarding; /** * someone has already messaged this pool to for rebuild scan, * NB: all xstreams can do lockless-write on it but it's OK @@ -192,8 +192,7 @@ struct ds_pool_child { int spc_ref; ABT_eventual spc_ref_eventual; - uint64_t spc_discard_done:1, - spc_no_storage:1; /* The pool shard has no storage. */ + uint64_t spc_no_storage : 1; /* The pool shard has no storage. */ uint32_t spc_reint_mode; uint32_t *spc_state; /* Pointer to ds_pool->sp_states[i] */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 2ecd52f1a0f..c41c8f0731e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2450,12 +2450,14 @@ static int obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, uint32_t rpc_map_ver, uint32_t flags) { + struct ds_pool *pool = child->sc_pool->spc_pool; + if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) { /* EC aggregation is still inflight, rebuild should wait until it's paused */ if (ds_cont_child_ec_aggregating(child)) { D_ERROR(DF_CONT " ec aggregate still active, rebuilding %d\n", - DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid), - atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)); + DP_CONT(pool->sp_uuid, child->sc_uuid), + atomic_load(&pool->sp_rebuilding)); return -DER_UPDATE_AGAIN; } } @@ -2463,7 +2465,7 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, if (!obj_is_modification_opc(opc) && (opc != DAOS_OBJ_RPC_CPD || flags & ORF_CPD_RDONLY)) return 0; - if (atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)) { + if (atomic_load(&pool->sp_rebuilding)) { uint32_t version; ds_rebuild_running_query(child->sc_pool_uuid, RB_OP_REBUILD, @@ -2480,10 +2482,8 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, * vos discard to finish, which otherwise might discard these new in-flight * I/O update. */ - if ((flags & ORF_REINTEGRATING_IO) && - (child->sc_pool->spc_pool->sp_need_discard && - child->sc_pool->spc_discard_done == 0)) { - D_ERROR("reintegrating "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); + if ((flags & ORF_REINTEGRATING_IO) && atomic_load(&pool->sp_discarding) > 0) { + D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 6f928cdc672..c7740f5594a 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3230,6 +3230,7 @@ migrate_obj_ult(void *data) { struct iter_obj_arg *arg = data; struct migrate_pool_tls *tls = NULL; + struct ds_pool *pool; daos_epoch_range_t epr; daos_epoch_t stable_epoch = 0; daos_handle_t coh = DAOS_HDL_INVAL; @@ -3249,20 +3250,22 @@ migrate_obj_ult(void *data) * discard, or discard has been done. spc_discard_done means * discarding has been done in the current VOS target. */ - if (atomic_load(&tls->mpt_pool->spc_pool->sp_need_discard) > 0) { - while(!tls->mpt_pool->spc_discard_done) { - D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", - DP_RB_MPT(tls)); - dss_sleep(2 * 1000); - if (tls->mpt_fini) - D_GOTO(free_notls, rc); - } - if (tls->mpt_pool->spc_pool->sp_discard_status) { - rc = tls->mpt_pool->spc_pool->sp_discard_status; + pool = tls->mpt_pool->spc_pool; + while (atomic_load(&pool->sp_discarding) > 0) { + D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls)); + dss_sleep(2 * 1000); + if (tls->mpt_fini) + D_GOTO(free_notls, rc); + + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status) { + rc = pool->sp_discard_status; + ABT_mutex_unlock(pool->sp_mutex); D_DEBUG(DB_REBUILD, DF_RB ": discard failure: " DF_RC "\n", DP_RB_MPT(tls), DP_RC(rc)); D_GOTO(out, rc); } + ABT_mutex_unlock(pool->sp_mutex); } if (tls->mpt_reintegrating) { diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index f7e543e39b0..7bd4a0e52dc 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -2435,12 +2435,14 @@ struct tgt_discard_arg { uuid_t pool_uuid; uint64_t epoch; struct pool_target_addr_list tgt_list; - struct ds_pool_child *pool_child; }; struct child_discard_arg { - struct tgt_discard_arg *tgt_discard; - uuid_t cont_uuid; + struct ds_pool_child *ca_child; + struct pool_target_addr ca_addr; + uint64_t ca_epoch; + uuid_t ca_po_uuid; + uuid_t ca_co_uuid; }; static struct tgt_discard_arg* @@ -2489,7 +2491,7 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, 1 << 10 /* max (ms) */); D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - epr.epr_hi = arg->tgt_discard->epoch; + epr.epr_hi = arg->ca_epoch; epr.epr_lo = 0; do { /* Inform the iterator and delete the object */ @@ -2506,9 +2508,8 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, d_backoff_seq_fini(&backoff_seq); if (rc != 0) - D_ERROR("discard object pool/object "DF_UUID"/"DF_UOID" rc: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UOID(ent->ie_oid), - DP_RC(rc)); + D_ERROR("discard object pool/object " DF_UUID "/" DF_UOID " rc: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UOID(ent->ie_oid), DP_RC(rc)); return rc; } @@ -2527,14 +2528,12 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, int rc; D_ASSERT(type == VOS_ITER_COUUID); - if (uuid_compare(arg->cont_uuid, entry->ie_couuid) == 0) { - D_DEBUG(DB_REBUILD, DF_UUID" already discard\n", - DP_UUID(arg->cont_uuid)); + if (uuid_compare(arg->ca_co_uuid, entry->ie_couuid) == 0) { + D_DEBUG(DB_REBUILD, DF_UUID " already discard\n", DP_UUID(arg->ca_co_uuid)); return 0; } - rc = ds_cont_child_lookup(arg->tgt_discard->pool_uuid, entry->ie_couuid, - &cont); + rc = ds_cont_child_lookup(arg->ca_po_uuid, entry->ie_couuid, &cont); if (rc != DER_SUCCESS) { D_ERROR("Lookup container '"DF_UUIDF"' failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); @@ -2554,8 +2553,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, param.ip_hdl = coh; param.ip_epr.epr_lo = 0; - param.ip_epr.epr_hi = arg->tgt_discard->epoch; - uuid_copy(arg->cont_uuid, entry->ie_couuid); + param.ip_epr.epr_hi = arg->ca_epoch; + uuid_copy(arg->ca_co_uuid, entry->ie_couuid); do { /* Inform the iterator and delete the object */ *acts |= VOS_ITER_CB_DELETE; @@ -2571,9 +2570,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, d_backoff_seq_fini(&backoff_seq); vos_cont_close(coh); - D_DEBUG(DB_TRACE, DF_UUID"/"DF_UUID" discard cont done: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid), - DP_RC(rc)); + D_DEBUG(DB_TRACE, DF_UUID "/" DF_UUID " discard cont done: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UUID(entry->ie_couuid), DP_RC(rc)); put: ds_cont_child_put(cont); @@ -2590,92 +2588,103 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, static void pool_child_discard(void *data) { - struct tgt_discard_arg *arg = data; - struct child_discard_arg cont_arg; - struct ds_pool_child *child = arg->pool_child; - struct ds_pool *pool = child->spc_pool; + struct child_discard_arg *cont_arg = data; + struct ds_pool *pool = cont_arg->ca_child->spc_pool; vos_iter_param_t param = { 0 }; - struct vos_iter_anchors anchor = { 0 }; - struct pool_target_addr addr; - uint32_t myrank; + struct vos_iter_anchors anchor = {0}; struct d_backoff_seq backoff_seq; int rc; - D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", - DP_UUID(arg->pool_uuid)); - - myrank = dss_self_rank(); - addr.pta_rank = myrank; - addr.pta_target = dss_get_module_info()->dmi_tgt_id; - if (!pool_target_addr_found(&arg->tgt_list, &addr)) { - D_DEBUG(DB_TRACE, "skip discard %u/%u.\n", addr.pta_rank, - addr.pta_target); - return; - } - - D_DEBUG(DB_MD, DF_UUID" discard %u/%u\n", DP_UUID(arg->pool_uuid), - myrank, addr.pta_target); - - /** - * When a faulty device is replaced with a new one using the - * “dmg storage replace nvme” command, the reintegration of - * affected pool targets is automatically triggered. - * The following steps outline the device replacement process on the engine side: - * - * 1) Replace the old device with the new device in the SMD. - * 2) Setup all SPDK related stuff for the new device. - * 3) Start ds_pool_child - * - * It is important to note that manual reintegration may be initiated - * before step 3, in which case, the function should return “DER_AGAIN." - */ - param.ip_hdl = child->spc_hdl; - rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, 1 << 10 /* max (ms) */); D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - cont_arg.tgt_discard = arg; - + param.ip_hdl = cont_arg->ca_child->spc_hdl; do { rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, cont_discard_cb, NULL, &cont_arg, NULL); if (rc != -DER_BUSY && rc != -DER_INPROGRESS) break; - D_DEBUG(DB_REBUILD, "retry by "DF_RC"/"DF_UUID"\n", - DP_RC(rc), DP_UUID(arg->pool_uuid)); + D_DEBUG(DB_REBUILD, "retry by " DF_RC "/" DF_UUID "\n", DP_RC(rc), + DP_UUID(cont_arg->ca_po_uuid)); dss_sleep(d_backoff_seq_next(&backoff_seq)); } while (1); - child->spc_discard_done = 1; d_backoff_seq_fini(&backoff_seq); + if (rc) { + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status == 0) + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + } + D_INFO(DF_UUID " discard completed rank/target=%u/%d\n", DP_UUID(cont_arg->ca_po_uuid), + cont_arg->ca_addr.pta_rank, cont_arg->ca_addr.pta_target); - ABT_mutex_lock(pool->sp_mutex); - if (rc && pool->sp_discard_status == 0) - pool->sp_discard_status = rc; - ABT_mutex_unlock(pool->sp_mutex); - - ds_pool_child_put(child); + atomic_fetch_sub(&pool->sp_discarding, 1); + ds_pool_child_put(cont_arg->ca_child); + D_FREE(cont_arg); } static int pool_child_discard_async(void *data) { - struct tgt_discard_arg *arg = data; - struct ds_pool_child *child; - int rc; + struct tgt_discard_arg *arg = data; + struct ds_pool *pool; + struct child_discard_arg *cont_arg; + struct pool_target_addr addr; + int rc = 0; - child = ds_pool_child_lookup(arg->pool_uuid); - if (child == NULL) - return -DER_AGAIN; + addr.pta_rank = dss_self_rank(); + addr.pta_target = dss_get_module_info()->dmi_tgt_id; + if (!pool_target_addr_found(&arg->tgt_list, &addr)) { + D_INFO(DF_UUID "discard skipped rank/target=%u/%u.\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); + return 0; + } + + D_ALLOC_PTR(cont_arg); + if (!cont_arg) + return -DER_NOMEM; + + /** + * When a faulty device is replaced with a new one using the + * “dmg storage replace nvme” command, the reintegration of + * affected pool targets is automatically triggered. + * The following steps outline the device replacement process on the engine side: + * + * 1) Replace the old device with the new device in the SMD. + * 2) Setup all SPDK related stuff for the new device. + * 3) Start ds_pool_child + * + * It is important to note that manual reintegration may be initiated + * before step 3, in which case, the function should return “DER_AGAIN." + */ + cont_arg->ca_child = ds_pool_child_lookup(arg->pool_uuid); + if (cont_arg->ca_child == NULL) + D_GOTO(out, rc = -DER_AGAIN); /* set barrier to avoid race with rebuild */ - child->spc_discard_done = 0; - arg->pool_child = child; - rc = dss_ult_create(pool_child_discard, arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); + cont_arg->ca_addr = addr; + cont_arg->ca_epoch = arg->epoch; + uuid_copy(cont_arg->ca_po_uuid, arg->pool_uuid); + + pool = cont_arg->ca_child->spc_pool; + D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", + DP_UUID(arg->pool_uuid)); + D_INFO(DF_UUID " discard started rank/target=%u/%u\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); + + atomic_fetch_add(&pool->sp_discarding, 1); + rc = dss_ult_create(pool_child_discard, cont_arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); if (rc) - ds_pool_child_put(child); + atomic_fetch_sub(&pool->sp_discarding, 1); +out: + if (rc) { + if (cont_arg->ca_child) + ds_pool_child_put(cont_arg->ca_child); + D_FREE(cont_arg); + } return rc; } @@ -2765,43 +2774,6 @@ ds_pool_task_collective(uuid_t pool_uuid, uint32_t ex_status, int (*coll_func)(v return ds_pool_collective(pool_uuid, ex_status, coll_func, arg, flags, false); } -/* Discard the objects by epoch in this pool */ -static int -ds_pool_tgt_discard_ult(void *data) -{ - struct ds_pool *pool; - struct tgt_discard_arg *arg = data; - uint32_t ex_status; - int ref; - int rc; - - /* If discard failed, let's still go ahead, since reintegration might - * still succeed, though it might leave some garbage on the reintegration - * target, the future scrub tool might fix it. XXX - */ - rc = ds_pool_lookup(arg->pool_uuid, &pool); - if (pool == NULL) { - D_INFO(DF_UUID" can not be found: %d\n", DP_UUID(arg->pool_uuid), rc); - D_GOTO(free, rc = 0); - } - - ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; - /* It returns after pool_child_discard_async() is scheduled on all xstreams, - * it wouldn't wait for completion of discard, but it can guarantee barriers - * are set on all xstreams (ds_pool_child::spc_discard_done = 0). - */ - ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); - - ref = atomic_fetch_sub(&pool->sp_need_discard, 1); - D_ASSERTF(ref >= 0); - - pool->sp_discard_status = rc; - ds_pool_put(pool); -free: - tgt_discard_arg_free(arg); - return rc; -} - void ds_pool_tgt_discard_handler(crt_rpc_t *rpc) { @@ -2810,6 +2782,7 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) struct pool_target_addr_list pta_list; struct tgt_discard_arg *arg = NULL; struct ds_pool *pool; + uint32_t ex_status; int rc; pta_list.pta_number = in->ptdi_addrs.ca_count; @@ -2830,25 +2803,38 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) D_GOTO(out, rc = 0); } - if (atomic_fetch_add(&pool->sp_need_discard, 1) > 1) { - atomic_fetch_sub(&pool->sp_need_discard, 1); + if (atomic_fetch_add(&pool->sp_discarding, 1) > 0) { + atomic_fetch_sub(&pool->sp_discarding, 1); D_INFO(DF_UUID " XXX: discard is already in progress, \n", DP_UUID(arg->pool_uuid)); ds_pool_put(pool); D_GOTO(out, rc = -DER_BUSY); } + D_INFO(DF_UUID " discard started\n", DP_UUID(arg->pool_uuid)); + ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = 0; - rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); - if (rc == 0) - rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ + ABT_mutex_unlock(pool->sp_mutex); + + ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; + rc = ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); + if (rc != 0) { + ABT_mutex_lock(pool->sp_mutex); + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + D_GOTO(out_put, rc); + } + rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ +out_put: + /* all targets should have already started to discard, I can release my refcount */ + atomic_fetch_sub(&pool->sp_discarding, 1); ds_pool_put(pool); out: out->ptdo_rc = rc; D_DEBUG(DB_MD, DF_UUID": replying rpc "DF_RC"\n", DP_UUID(in->ptdi_uuid), DP_RC(rc)); crt_reply_send(rpc); - if (rc != 0 && arg != NULL) + if (arg != NULL) tgt_discard_arg_free(arg); } From e4e354573898683ce88af97ae0fd515a1644278d Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Mon, 2 Mar 2026 09:10:38 +0800 Subject: [PATCH 3/4] DAOS-18487 rebuild: move assert from ULT to RPC handler Signed-off-by: Liang Zhen --- src/pool/srv_target.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 7bd4a0e52dc..a8f3fd097fb 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -2612,15 +2612,16 @@ pool_child_discard(void *data) } while (1); d_backoff_seq_fini(&backoff_seq); + D_INFO(DF_UUID " discard completed rank/target=%u/%d, rc=%d\n", + DP_UUID(cont_arg->ca_po_uuid), cont_arg->ca_addr.pta_rank, + cont_arg->ca_addr.pta_target, rc); + if (rc) { ABT_mutex_lock(pool->sp_mutex); if (pool->sp_discard_status == 0) pool->sp_discard_status = rc; ABT_mutex_unlock(pool->sp_mutex); } - D_INFO(DF_UUID " discard completed rank/target=%u/%d\n", DP_UUID(cont_arg->ca_po_uuid), - cont_arg->ca_addr.pta_rank, cont_arg->ca_addr.pta_target); - atomic_fetch_sub(&pool->sp_discarding, 1); ds_pool_child_put(cont_arg->ca_child); D_FREE(cont_arg); @@ -2669,12 +2670,10 @@ pool_child_discard_async(void *data) cont_arg->ca_epoch = arg->epoch; uuid_copy(cont_arg->ca_po_uuid, arg->pool_uuid); - pool = cont_arg->ca_child->spc_pool; - D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", - DP_UUID(arg->pool_uuid)); D_INFO(DF_UUID " discard started rank/target=%u/%u\n", DP_UUID(arg->pool_uuid), addr.pta_rank, addr.pta_target); + pool = cont_arg->ca_child->spc_pool; atomic_fetch_add(&pool->sp_discarding, 1); rc = dss_ult_create(pool_child_discard, cont_arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); if (rc) @@ -2811,6 +2810,10 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) } D_INFO(DF_UUID " discard started\n", DP_UUID(arg->pool_uuid)); + /* XXX just return EAGAIN/EPERM? */ + D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", + DP_UUID(arg->pool_uuid)); + ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = 0; ABT_mutex_unlock(pool->sp_mutex); @@ -2821,11 +2824,11 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = rc; ABT_mutex_unlock(pool->sp_mutex); - D_GOTO(out_put, rc); + D_GOTO(out_sub, rc); } rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ -out_put: +out_sub: /* all targets should have already started to discard, I can release my refcount */ atomic_fetch_sub(&pool->sp_discarding, 1); ds_pool_put(pool); From 33e12bf115511fea26a5cafa5fd78978b7dfd678 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Mon, 2 Mar 2026 17:15:38 +0800 Subject: [PATCH 4/4] DAOS-18487 rebuild: remove the assertion Signed-off-by: Liang Zhen --- src/pool/srv_target.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index a8f3fd097fb..fbb9207c87b 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -2805,14 +2805,14 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) if (atomic_fetch_add(&pool->sp_discarding, 1) > 0) { atomic_fetch_sub(&pool->sp_discarding, 1); D_INFO(DF_UUID " XXX: discard is already in progress, \n", DP_UUID(arg->pool_uuid)); - ds_pool_put(pool); - D_GOTO(out, rc = -DER_BUSY); + D_GOTO(out_put, rc = -DER_BUSY); } D_INFO(DF_UUID " discard started\n", DP_UUID(arg->pool_uuid)); - /* XXX just return EAGAIN/EPERM? */ - D_ASSERTF(!ds_pool_is_rebuilding(pool), DF_UUID " is already being reintegrated!\n", - DP_UUID(arg->pool_uuid)); + if (ds_pool_is_rebuilding(pool)) { + D_INFO(DF_UUID " is already being reintegrated!\n", DP_UUID(arg->pool_uuid)); + D_GOTO(out_put, rc = -DER_BUSY); + } ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = 0; @@ -2831,6 +2831,7 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) out_sub: /* all targets should have already started to discard, I can release my refcount */ atomic_fetch_sub(&pool->sp_discarding, 1); +out_put: ds_pool_put(pool); out: out->ptdo_rc = rc;