diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 147e4bb3fc1..3fbaae93810 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -88,13 +88,14 @@ struct ds_pool { */ uuid_t sp_srv_cont_hdl; uuid_t sp_srv_pool_hdl; - uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_need_discard : 1, - sp_disable_rebuild : 1, sp_disable_dtx_resync : 1, sp_incr_reint : 1; + uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_disable_rebuild : 1, + sp_disable_dtx_resync : 1, sp_incr_reint : 1; /* pool_uuid + map version + leader term + rebuild generation define a * rebuild job. */ uint32_t sp_rebuild_gen; ATOMIC int sp_rebuilding; + ATOMIC int sp_discarding; /** * someone has already messaged this pool to for rebuild scan, * NB: all xstreams can do lockless-write on it but it's OK @@ -191,8 +192,7 @@ struct ds_pool_child { int spc_ref; ABT_eventual spc_ref_eventual; - uint64_t spc_discard_done:1, - spc_no_storage:1; /* The pool shard has no storage. */ + uint64_t spc_no_storage : 1; /* The pool shard has no storage. */ uint32_t spc_reint_mode; uint32_t *spc_state; /* Pointer to ds_pool->sp_states[i] */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 2ecd52f1a0f..c41c8f0731e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2450,12 +2450,14 @@ static int obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, uint32_t rpc_map_ver, uint32_t flags) { + struct ds_pool *pool = child->sc_pool->spc_pool; + if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) { /* EC aggregation is still inflight, rebuild should wait until it's paused */ if (ds_cont_child_ec_aggregating(child)) { D_ERROR(DF_CONT " ec aggregate still active, rebuilding %d\n", - DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid), - atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)); + DP_CONT(pool->sp_uuid, child->sc_uuid), + atomic_load(&pool->sp_rebuilding)); return -DER_UPDATE_AGAIN; } } @@ -2463,7 +2465,7 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, if (!obj_is_modification_opc(opc) && (opc != DAOS_OBJ_RPC_CPD || flags & ORF_CPD_RDONLY)) return 0; - if (atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)) { + if (atomic_load(&pool->sp_rebuilding)) { uint32_t version; ds_rebuild_running_query(child->sc_pool_uuid, RB_OP_REBUILD, @@ -2480,10 +2482,8 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, * vos discard to finish, which otherwise might discard these new in-flight * I/O update. */ - if ((flags & ORF_REINTEGRATING_IO) && - (child->sc_pool->spc_pool->sp_need_discard && - child->sc_pool->spc_discard_done == 0)) { - D_ERROR("reintegrating "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); + if ((flags & ORF_REINTEGRATING_IO) && atomic_load(&pool->sp_discarding) > 0) { + D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 2ccda79e4c2..c7740f5594a 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3230,6 +3230,7 @@ migrate_obj_ult(void *data) { struct iter_obj_arg *arg = data; struct migrate_pool_tls *tls = NULL; + struct ds_pool *pool; daos_epoch_range_t epr; daos_epoch_t stable_epoch = 0; daos_handle_t coh = DAOS_HDL_INVAL; @@ -3249,20 +3250,22 @@ migrate_obj_ult(void *data) * discard, or discard has been done. spc_discard_done means * discarding has been done in the current VOS target. */ - if (tls->mpt_pool->spc_pool->sp_need_discard) { - while(!tls->mpt_pool->spc_discard_done) { - D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", - DP_RB_MPT(tls)); - dss_sleep(2 * 1000); - if (tls->mpt_fini) - D_GOTO(free_notls, rc); - } - if (tls->mpt_pool->spc_pool->sp_discard_status) { - rc = tls->mpt_pool->spc_pool->sp_discard_status; + pool = tls->mpt_pool->spc_pool; + while (atomic_load(&pool->sp_discarding) > 0) { + D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls)); + dss_sleep(2 * 1000); + if (tls->mpt_fini) + D_GOTO(free_notls, rc); + + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status) { + rc = pool->sp_discard_status; + ABT_mutex_unlock(pool->sp_mutex); D_DEBUG(DB_REBUILD, DF_RB ": discard failure: " DF_RC "\n", DP_RB_MPT(tls), DP_RC(rc)); D_GOTO(out, rc); } + ABT_mutex_unlock(pool->sp_mutex); } if (tls->mpt_reintegrating) { diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index b7fe7dfe2c8..fbb9207c87b 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2016-2025 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2438,8 +2438,11 @@ struct tgt_discard_arg { }; struct child_discard_arg { - struct tgt_discard_arg *tgt_discard; - uuid_t cont_uuid; + struct ds_pool_child *ca_child; + struct pool_target_addr ca_addr; + uint64_t ca_epoch; + uuid_t ca_po_uuid; + uuid_t ca_co_uuid; }; static struct tgt_discard_arg* @@ -2488,7 +2491,7 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, 1 << 10 /* max (ms) */); D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - epr.epr_hi = arg->tgt_discard->epoch; + epr.epr_hi = arg->ca_epoch; epr.epr_lo = 0; do { /* Inform the iterator and delete the object */ @@ -2505,9 +2508,8 @@ obj_discard_cb(daos_handle_t ch, vos_iter_entry_t *ent, d_backoff_seq_fini(&backoff_seq); if (rc != 0) - D_ERROR("discard object pool/object "DF_UUID"/"DF_UOID" rc: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UOID(ent->ie_oid), - DP_RC(rc)); + D_ERROR("discard object pool/object " DF_UUID "/" DF_UOID " rc: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UOID(ent->ie_oid), DP_RC(rc)); return rc; } @@ -2526,14 +2528,12 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, int rc; D_ASSERT(type == VOS_ITER_COUUID); - if (uuid_compare(arg->cont_uuid, entry->ie_couuid) == 0) { - D_DEBUG(DB_REBUILD, DF_UUID" already discard\n", - DP_UUID(arg->cont_uuid)); + if (uuid_compare(arg->ca_co_uuid, entry->ie_couuid) == 0) { + D_DEBUG(DB_REBUILD, DF_UUID " already discard\n", DP_UUID(arg->ca_co_uuid)); return 0; } - rc = ds_cont_child_lookup(arg->tgt_discard->pool_uuid, entry->ie_couuid, - &cont); + rc = ds_cont_child_lookup(arg->ca_po_uuid, entry->ie_couuid, &cont); if (rc != DER_SUCCESS) { D_ERROR("Lookup container '"DF_UUIDF"' failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); @@ -2553,8 +2553,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, param.ip_hdl = coh; param.ip_epr.epr_lo = 0; - param.ip_epr.epr_hi = arg->tgt_discard->epoch; - uuid_copy(arg->cont_uuid, entry->ie_couuid); + param.ip_epr.epr_hi = arg->ca_epoch; + uuid_copy(arg->ca_co_uuid, entry->ie_couuid); do { /* Inform the iterator and delete the object */ *acts |= VOS_ITER_CB_DELETE; @@ -2570,9 +2570,8 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, d_backoff_seq_fini(&backoff_seq); vos_cont_close(coh); - D_DEBUG(DB_TRACE, DF_UUID"/"DF_UUID" discard cont done: "DF_RC"\n", - DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid), - DP_RC(rc)); + D_DEBUG(DB_TRACE, DF_UUID "/" DF_UUID " discard cont done: " DF_RC "\n", + DP_UUID(arg->ca_po_uuid), DP_UUID(entry->ie_couuid), DP_RC(rc)); put: ds_cont_child_put(cont); @@ -2586,30 +2585,68 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry, return rc; } -static int +static void pool_child_discard(void *data) { - struct tgt_discard_arg *arg = data; - struct child_discard_arg cont_arg; - struct ds_pool_child *child; + struct child_discard_arg *cont_arg = data; + struct ds_pool *pool = cont_arg->ca_child->spc_pool; vos_iter_param_t param = { 0 }; - struct vos_iter_anchors anchor = { 0 }; - struct pool_target_addr addr; - uint32_t myrank; + struct vos_iter_anchors anchor = {0}; struct d_backoff_seq backoff_seq; int rc; - myrank = dss_self_rank(); - addr.pta_rank = myrank; + rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, + 1 << 10 /* max (ms) */); + D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); + + param.ip_hdl = cont_arg->ca_child->spc_hdl; + do { + rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, + cont_discard_cb, NULL, &cont_arg, NULL); + if (rc != -DER_BUSY && rc != -DER_INPROGRESS) + break; + + D_DEBUG(DB_REBUILD, "retry by " DF_RC "/" DF_UUID "\n", DP_RC(rc), + DP_UUID(cont_arg->ca_po_uuid)); + dss_sleep(d_backoff_seq_next(&backoff_seq)); + } while (1); + + d_backoff_seq_fini(&backoff_seq); + D_INFO(DF_UUID " discard completed rank/target=%u/%d, rc=%d\n", + DP_UUID(cont_arg->ca_po_uuid), cont_arg->ca_addr.pta_rank, + cont_arg->ca_addr.pta_target, rc); + + if (rc) { + ABT_mutex_lock(pool->sp_mutex); + if (pool->sp_discard_status == 0) + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + } + atomic_fetch_sub(&pool->sp_discarding, 1); + ds_pool_child_put(cont_arg->ca_child); + D_FREE(cont_arg); +} + +static int +pool_child_discard_async(void *data) +{ + struct tgt_discard_arg *arg = data; + struct ds_pool *pool; + struct child_discard_arg *cont_arg; + struct pool_target_addr addr; + int rc = 0; + + addr.pta_rank = dss_self_rank(); addr.pta_target = dss_get_module_info()->dmi_tgt_id; if (!pool_target_addr_found(&arg->tgt_list, &addr)) { - D_DEBUG(DB_TRACE, "skip discard %u/%u.\n", addr.pta_rank, - addr.pta_target); + D_INFO(DF_UUID "discard skipped rank/target=%u/%u.\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); return 0; } - D_DEBUG(DB_MD, DF_UUID" discard %u/%u\n", DP_UUID(arg->pool_uuid), - myrank, addr.pta_target); + D_ALLOC_PTR(cont_arg); + if (!cont_arg) + return -DER_NOMEM; /** * When a faulty device is replaced with a new one using the @@ -2624,35 +2661,29 @@ pool_child_discard(void *data) * It is important to note that manual reintegration may be initiated * before step 3, in which case, the function should return “DER_AGAIN." */ - child = ds_pool_child_lookup(arg->pool_uuid); - if (child == NULL) - return -DER_AGAIN; - - param.ip_hdl = child->spc_hdl; - - rc = d_backoff_seq_init(&backoff_seq, 0 /* nzeros */, 16 /* factor */, 8 /* next (ms) */, - 1 << 10 /* max (ms) */); - D_ASSERTF(rc == 0, "d_backoff_seq_init: "DF_RC"\n", DP_RC(rc)); - - cont_arg.tgt_discard = arg; - child->spc_discard_done = 0; - do { - rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, - cont_discard_cb, NULL, &cont_arg, NULL); - if (rc != -DER_BUSY && rc != -DER_INPROGRESS) - break; - - D_DEBUG(DB_REBUILD, "retry by "DF_RC"/"DF_UUID"\n", - DP_RC(rc), DP_UUID(arg->pool_uuid)); - dss_sleep(d_backoff_seq_next(&backoff_seq)); - } while (1); + cont_arg->ca_child = ds_pool_child_lookup(arg->pool_uuid); + if (cont_arg->ca_child == NULL) + D_GOTO(out, rc = -DER_AGAIN); - child->spc_discard_done = 1; + /* set barrier to avoid race with rebuild */ + cont_arg->ca_addr = addr; + cont_arg->ca_epoch = arg->epoch; + uuid_copy(cont_arg->ca_po_uuid, arg->pool_uuid); - d_backoff_seq_fini(&backoff_seq); - - ds_pool_child_put(child); + D_INFO(DF_UUID " discard started rank/target=%u/%u\n", DP_UUID(arg->pool_uuid), + addr.pta_rank, addr.pta_target); + pool = cont_arg->ca_child->spc_pool; + atomic_fetch_add(&pool->sp_discarding, 1); + rc = dss_ult_create(pool_child_discard, cont_arg, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); + if (rc) + atomic_fetch_sub(&pool->sp_discarding, 1); +out: + if (rc) { + if (cont_arg->ca_child) + ds_pool_child_put(cont_arg->ca_child); + D_FREE(cont_arg); + } return rc; } @@ -2742,37 +2773,6 @@ ds_pool_task_collective(uuid_t pool_uuid, uint32_t ex_status, int (*coll_func)(v return ds_pool_collective(pool_uuid, ex_status, coll_func, arg, flags, false); } -/* Discard the objects by epoch in this pool */ -static int -ds_pool_tgt_discard_ult(void *data) -{ - struct ds_pool *pool; - struct tgt_discard_arg *arg = data; - uint32_t ex_status; - int rc; - - /* If discard failed, let's still go ahead, since reintegration might - * still succeed, though it might leave some garbage on the reintegration - * target, the future scrub tool might fix it. XXX - */ - rc = ds_pool_lookup(arg->pool_uuid, &pool); - if (pool == NULL) { - D_INFO(DF_UUID" can not be found: %d\n", DP_UUID(arg->pool_uuid), rc); - D_GOTO(free, rc = 0); - } - - ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; - ds_pool_thread_collective(arg->pool_uuid, ex_status, pool_child_discard, arg, - DSS_ULT_DEEP_STACK); - - pool->sp_need_discard = 0; - pool->sp_discard_status = rc; - ds_pool_put(pool); -free: - tgt_discard_arg_free(arg); - return rc; -} - void ds_pool_tgt_discard_handler(crt_rpc_t *rpc) { @@ -2781,6 +2781,7 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) struct pool_target_addr_list pta_list; struct tgt_discard_arg *arg = NULL; struct ds_pool *pool; + uint32_t ex_status; int rc; pta_list.pta_number = in->ptdi_addrs.ca_count; @@ -2801,19 +2802,43 @@ ds_pool_tgt_discard_handler(crt_rpc_t *rpc) D_GOTO(out, rc = 0); } - pool->sp_need_discard = 1; + if (atomic_fetch_add(&pool->sp_discarding, 1) > 0) { + atomic_fetch_sub(&pool->sp_discarding, 1); + D_INFO(DF_UUID " XXX: discard is already in progress, \n", DP_UUID(arg->pool_uuid)); + D_GOTO(out_put, rc = -DER_BUSY); + } + D_INFO(DF_UUID " discard started\n", DP_UUID(arg->pool_uuid)); + + if (ds_pool_is_rebuilding(pool)) { + D_INFO(DF_UUID " is already being reintegrated!\n", DP_UUID(arg->pool_uuid)); + D_GOTO(out_put, rc = -DER_BUSY); + } + + ABT_mutex_lock(pool->sp_mutex); pool->sp_discard_status = 0; - rc = dss_ult_execute(ds_pool_tgt_discard_ult, arg, NULL, NULL, DSS_XS_SYS, 0, 0); - if (rc == 0) - rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ + ABT_mutex_unlock(pool->sp_mutex); + + ex_status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN; + rc = ds_pool_collective(arg->pool_uuid, ex_status, pool_child_discard_async, arg, 0, true); + if (rc != 0) { + ABT_mutex_lock(pool->sp_mutex); + pool->sp_discard_status = rc; + ABT_mutex_unlock(pool->sp_mutex); + D_GOTO(out_sub, rc); + } + rc = ds_iv_ns_reint_prep(pool->sp_iv_ns); /* cleanup IV cache */ +out_sub: + /* all targets should have already started to discard, I can release my refcount */ + atomic_fetch_sub(&pool->sp_discarding, 1); +out_put: ds_pool_put(pool); out: out->ptdo_rc = rc; D_DEBUG(DB_MD, DF_UUID": replying rpc "DF_RC"\n", DP_UUID(in->ptdi_uuid), DP_RC(rc)); crt_reply_send(rpc); - if (rc != 0 && arg != NULL) + if (arg != NULL) tgt_discard_arg_free(arg); }