Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/include/daos_srv/pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,14 @@ struct ds_pool {
*/
uuid_t sp_srv_cont_hdl;
uuid_t sp_srv_pool_hdl;
uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_need_discard : 1,
sp_disable_rebuild : 1, sp_disable_dtx_resync : 1, sp_incr_reint : 1;
uint32_t sp_stopping : 1, sp_cr_checked : 1, sp_immutable : 1, sp_disable_rebuild : 1,
sp_disable_dtx_resync : 1, sp_incr_reint : 1;
/* pool_uuid + map version + leader term + rebuild generation define a
* rebuild job.
*/
uint32_t sp_rebuild_gen;
ATOMIC int sp_rebuilding;
ATOMIC int sp_discarding;
/**
* someone has already messaged this pool for rebuild scan,
* NB: all xstreams can do lockless-write on it but it's OK
Expand Down Expand Up @@ -191,8 +192,7 @@ struct ds_pool_child {
int spc_ref;
ABT_eventual spc_ref_eventual;

uint64_t spc_discard_done:1,
spc_no_storage:1; /* The pool shard has no storage. */
uint64_t spc_no_storage : 1; /* The pool shard has no storage. */

uint32_t spc_reint_mode;
uint32_t *spc_state; /* Pointer to ds_pool->sp_states[i] */
Expand Down
63 changes: 43 additions & 20 deletions src/object/obj_class.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -18,6 +18,9 @@ static struct daos_obj_class **oc_resil_array;
static int oc_ident_array_sz;
static int oc_resil_array_sz;

#define D_GX_RESERVED_ENV "D_GX_RESERVED"
static unsigned int oc_gx_reserved;

static struct daos_obj_class *oclass_ident2cl(daos_oclass_id_t oc_id,
uint32_t *nr_grps);
static struct daos_obj_class *oclass_resil2cl(struct daos_oclass_attr *ca);
Expand Down Expand Up @@ -253,17 +256,30 @@ daos_oclass_grp_nr(struct daos_oclass_attr *oc_attr, struct daos_obj_md *md)

/**
* To honor RF setting during failure cases, let's reserve RF
* groups, so if some targets fail, there will be enough replacement
* domains, so if some targets fail, there will be enough replacement
* targets to rebuild, and to avoid putting multiple shards in the same
* domain, which may break the RF setting.
*
* Though let's keep reserve targets to be less than 30% of the total
* targets.
* Though let's keep the reserved targets at no less than 30% of the total
* targets, because otherwise layout computation will be too expensive.
*/
static uint32_t
reserve_grp_by_rf(uint32_t target_nr, uint32_t grp_size, uint32_t rf)
reserve_grp_by_rf(uint32_t domain_nr, uint32_t target_nr, uint32_t grp_size, uint32_t rf)
{
return min(((target_nr * 3) / 10) / grp_size, rf);
int tgt_per_dom;
int tgt_reserv;

if (oc_gx_reserved > 0)
return oc_gx_reserved;

D_ASSERT(target_nr >= domain_nr); /* unless pool map is corrupted */
tgt_per_dom = target_nr / domain_nr;

tgt_reserv = (target_nr * 3) / 10; /* 30 percent */
if (tgt_reserv < tgt_per_dom * rf)
tgt_reserv = tgt_per_dom * rf;

return (tgt_reserv + grp_size - 1) / grp_size;
}

int
Expand Down Expand Up @@ -303,7 +319,7 @@ daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, enum d

grp_size = daos_oclass_grp_size(&ca);
if (ca.ca_grp_nr == DAOS_OBJ_GRP_MAX) {
uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf_factor);
uint32_t reserve_grp = reserve_grp_by_rf(domain_nr, target_nr, grp_size, rf_factor);

ca.ca_grp_nr = max(1, (target_nr / grp_size));

Expand Down Expand Up @@ -751,13 +767,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype
*ord = OR_RP_2;
grp_size = 2;
} else if (rdd == DAOS_OCH_RDD_EC) {
if (domain_nr >= 18) {
if (domain_nr >= 25) {
*ord = OR_RS_16P1;
grp_size = 17;
} else if (domain_nr >= 10) {
} else if (domain_nr >= 13) {
*ord = OR_RS_8P1;
grp_size = 9;
} else if (domain_nr >= 6) {
} else if (domain_nr >= 8) {
*ord = OR_RS_4P1;
grp_size = 5;
} else {
Expand All @@ -772,13 +788,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype
case DAOS_PROP_CO_REDUN_RF1:
if ((rdd == DAOS_OCH_RDD_EC || (rdd == 0 && daos_is_array_type(otype))) &&
domain_nr >= 3) {
if (domain_nr >= 18) {
if (domain_nr >= 25) {
*ord = OR_RS_16P1;
grp_size = 17;
} else if (domain_nr >= 10) {
} else if (domain_nr >= 13) {
*ord = OR_RS_8P1;
grp_size = 9;
} else if (domain_nr >= 6) {
} else if (domain_nr >= 8) {
*ord = OR_RS_4P1;
grp_size = 5;
} else {
Expand All @@ -793,13 +809,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype
case DAOS_PROP_CO_REDUN_RF2:
if ((rdd == DAOS_OCH_RDD_EC || (rdd == 0 && daos_is_array_type(otype))) &&
domain_nr >= 4) {
if (domain_nr >= 20) {
if (domain_nr >= 26) {
*ord = OR_RS_16P2;
grp_size = 18;
} else if (domain_nr >= 12) {
} else if (domain_nr >= 15) {
*ord = OR_RS_8P2;
grp_size = 10;
} else if (domain_nr >= 8) {
} else if (domain_nr >= 9) {
*ord = OR_RS_4P2;
grp_size = 6;
} else {
Expand All @@ -814,10 +830,10 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype
case DAOS_PROP_CO_REDUN_RF3:
if ((rdd == DAOS_OCH_RDD_EC || (rdd == 0 && daos_is_array_type(otype))) &&
domain_nr >= 10) {
if (domain_nr >= 22) {
if (domain_nr >= 28) {
*ord = OR_RS_16P3;
grp_size = 19;
} else if (domain_nr >= 14) {
} else if (domain_nr >= 16) {
*ord = OR_RS_8P3;
grp_size = 11;
} else {
Expand Down Expand Up @@ -865,7 +881,7 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype
grp_nr = max(256, target_nr * 50 / 100);
break;
case DAOS_OCH_SHD_EXT:
grp_nr = max(1024, target_nr * 80 / 100);
grp_nr = max(1024, target_nr * 70 / 100);
break;
default:
D_ERROR("Invalid sharding hint\n");
Expand All @@ -874,7 +890,7 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype

if (grp_nr == DAOS_OBJ_GRP_MAX || grp_nr * grp_size > target_nr) {
uint32_t max_grp = target_nr / grp_size;
uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf);
uint32_t reserve_grp = reserve_grp_by_rf(domain_nr, target_nr, grp_size, rf);

/* search for the highest scalability in the allowed range */
if (max_grp > reserve_grp)
Expand Down Expand Up @@ -948,6 +964,13 @@ obj_class_init(void)
if (oc_ident_array)
return 0;

oc_gx_reserved = 0;
d_getenv_uint(D_GX_RESERVED_ENV, &oc_gx_reserved);
if (oc_gx_reserved > 0) {
D_INFO("%s = %u, it should be set only for benchmarking\n", D_GX_RESERVED_ENV,
oc_gx_reserved);
}

D_ALLOC_ARRAY(oc_ident_array, OC_NR);
if (!oc_ident_array)
return -DER_NOMEM;
Expand Down
14 changes: 7 additions & 7 deletions src/object/srv_obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -2450,20 +2450,22 @@ static int
obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc,
uint32_t rpc_map_ver, uint32_t flags)
{
struct ds_pool *pool = child->sc_pool->spc_pool;

if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) {
/* EC aggregation is still inflight, rebuild should wait until it's paused */
if (ds_cont_child_ec_aggregating(child)) {
D_ERROR(DF_CONT " ec aggregate still active, rebuilding %d\n",
DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid),
atomic_load(&child->sc_pool->spc_pool->sp_rebuilding));
DP_CONT(pool->sp_uuid, child->sc_uuid),
atomic_load(&pool->sp_rebuilding));
return -DER_UPDATE_AGAIN;
}
}

if (!obj_is_modification_opc(opc) && (opc != DAOS_OBJ_RPC_CPD || flags & ORF_CPD_RDONLY))
return 0;

if (atomic_load(&child->sc_pool->spc_pool->sp_rebuilding)) {
if (atomic_load(&pool->sp_rebuilding)) {
uint32_t version;

ds_rebuild_running_query(child->sc_pool_uuid, RB_OP_REBUILD,
Expand All @@ -2480,10 +2482,8 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc,
* vos discard to finish, which otherwise might discard these new in-flight
* I/O updates.
*/
if ((flags & ORF_REINTEGRATING_IO) &&
(child->sc_pool->spc_pool->sp_need_discard &&
child->sc_pool->spc_discard_done == 0)) {
D_ERROR("reintegrating "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid));
if ((flags & ORF_REINTEGRATING_IO) && atomic_load(&pool->sp_discarding) > 0) {
D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid));
return -DER_UPDATE_AGAIN;
}

Expand Down
23 changes: 13 additions & 10 deletions src/object/srv_obj_migrate.c
Original file line number Diff line number Diff line change
Expand Up @@ -3230,6 +3230,7 @@ migrate_obj_ult(void *data)
{
struct iter_obj_arg *arg = data;
struct migrate_pool_tls *tls = NULL;
struct ds_pool *pool;
daos_epoch_range_t epr;
daos_epoch_t stable_epoch = 0;
daos_handle_t coh = DAOS_HDL_INVAL;
Expand All @@ -3249,20 +3250,22 @@ migrate_obj_ult(void *data)
* discard, or discard has been done. sp_discarding > 0 means
* discard is still in progress on some local VOS target.
*/
if (tls->mpt_pool->spc_pool->sp_need_discard) {
while(!tls->mpt_pool->spc_discard_done) {
D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n",
DP_RB_MPT(tls));
dss_sleep(2 * 1000);
if (tls->mpt_fini)
D_GOTO(free_notls, rc);
}
if (tls->mpt_pool->spc_pool->sp_discard_status) {
rc = tls->mpt_pool->spc_pool->sp_discard_status;
pool = tls->mpt_pool->spc_pool;
while (atomic_load(&pool->sp_discarding) > 0) {
D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", DP_RB_MPT(tls));
dss_sleep(2 * 1000);
if (tls->mpt_fini)
D_GOTO(free_notls, rc);

ABT_mutex_lock(pool->sp_mutex);
if (pool->sp_discard_status) {
rc = pool->sp_discard_status;
ABT_mutex_unlock(pool->sp_mutex);
D_DEBUG(DB_REBUILD, DF_RB ": discard failure: " DF_RC "\n", DP_RB_MPT(tls),
DP_RC(rc));
D_GOTO(out, rc);
}
ABT_mutex_unlock(pool->sp_mutex);
}

if (tls->mpt_reintegrating) {
Expand Down
9 changes: 8 additions & 1 deletion src/placement/jump_map.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -734,6 +734,13 @@ get_object_layout(struct pl_jump_map *jmap, uint32_t layout_ver, struct pl_obj_l
setbit(dom_cur_grp_real, domain - root);
if (pool_target_down(target))
layout->ol_shards[k].po_rebuilding = 1;

if (pool_target_is_down2up(target)) {
if (gen_mode == PRE_REBUILD)
layout->ol_shards[k].po_rebuilding = 1;
else
layout->ol_shards[k].po_reintegrating = 1;
}
}

if (is_extending != NULL && pool_target_is_up_or_drain(target))
Expand Down
24 changes: 14 additions & 10 deletions src/placement/pl_map_common.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -255,13 +256,7 @@ is_comp_avaible(struct pool_component *comp, uint32_t allow_version,
status = PO_COMP_ST_UPIN;
} else if (status == PO_COMP_ST_UP) {
if (comp->co_flags & PO_COMPF_DOWN2UP) {
/* PO_COMP_ST_UP status with PO_COMPF_DOWN2UP flag
* is the case of delay_rebuild exclude+reint.
* Cannot mark it as UPIN to avoid it be used for
* rebuild enumerate/fetch, as the data will be
* discarded in reintegrate.
*/
/* status = PO_COMP_ST_UPIN; */
status = PO_COMP_ST_UPIN;
} else {
if (comp->co_fseq <= 1)
status = PO_COMP_ST_NEW;
Expand Down Expand Up @@ -394,9 +389,14 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md,
if (spare_avail) {
/* The selected spare target is up and ready */
l_shard->po_target = spare_tgt->ta_comp.co_id;
l_shard->po_fseq = f_shard->fs_fseq;
l_shard->po_rank = spare_tgt->ta_comp.co_rank;
l_shard->po_index = spare_tgt->ta_comp.co_index;
l_shard->po_fseq = f_shard->fs_fseq;
l_shard->po_rank = spare_tgt->ta_comp.co_rank;
l_shard->po_index = spare_tgt->ta_comp.co_index;

if (pool_target_is_down2up(spare_tgt))
f_shard->fs_down2up = 1;
else
f_shard->fs_down2up = 0;

/*
* Mark the shard as 'rebuilding' so that read will skip this shard.
Expand All @@ -406,6 +406,10 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md,
f_shard->fs_status == PO_COMP_ST_DRAIN ||
f_shard->fs_down2up || pool_target_down(spare_tgt))
l_shard->po_rebuilding = 1;

if (f_shard->fs_down2up && gen_mode != PRE_REBUILD)
l_shard->po_reintegrating = 1;

} else {
l_shard->po_shard = -1;
l_shard->po_target = -1;
Expand Down
Loading
Loading