Skip to content

Commit

Permalink
Merge branch 'master' into jvolivie/minor_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
jolivier23 committed Jan 21, 2025
2 parents 91d41a8 + fcaa560 commit 0b5b1bc
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 53 deletions.
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ pipeline {
}
steps {
job_step_update(
unitTest(timeout_time: 60,
unitTest(timeout_time: 160,
unstash_opt: true,
ignore_failure: true,
inst_repos: prRepos(),
Expand Down
10 changes: 3 additions & 7 deletions src/engine/sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi)
{
struct sched_info *info = &dx->dx_sched_info;
struct vos_pool_space vps = { 0 };
uint64_t scm_left, nvme_left, ne_left, ne_sys;
uint64_t scm_left, nvme_left, ne_left;
struct pressure_ratio *pr;
int orig_pressure, rc;

Expand Down Expand Up @@ -807,12 +807,8 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi)
if (vps.vps_ne_total == 0) {
ne_left = UINT64_MAX;
} else {
D_ASSERT(vps.vps_ne_total < SCM_TOTAL(&vps));
ne_sys = SCM_SYS(&vps) * vps.vps_ne_total / SCM_TOTAL(&vps);
if (vps.vps_ne_free > ne_sys)
ne_left = vps.vps_ne_free - ne_sys;
else
ne_left = 0;
ne_left = vps.vps_ne_free;
D_ASSERT(ne_left <= vps.vps_ne_total);
}

if (NVME_TOTAL(&vps) == 0) /* NVMe not enabled */
Expand Down
127 changes: 89 additions & 38 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ struct pool_svc {
rdb_path_t ps_handles; /* pool handle KVS */
rdb_path_t ps_user; /* pool user attributes KVS */
rdb_path_t ps_ops; /* metadata ops KVS */
int ps_error; /* in DB data (see pool_svc_lookup_leader) */
struct pool_svc_events ps_events;
uint32_t ps_global_version;
int ps_svc_rf;
Expand Down Expand Up @@ -1951,8 +1952,11 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out,

rc = ds_pool_svc_load(&tx, svc->ps_uuid, &svc->ps_root, &svc->ps_global_version, &map_buf,
&map_version);
if (rc != 0)
if (rc != 0) {
if (rc == -DER_DF_INCOMPT)
svc->ps_error = rc;
goto out_lock;
}

rc = pool_prop_read(&tx, svc, DAOS_PO_QUERY_PROP_ALL, &prop);
if (rc != 0) {
Expand Down Expand Up @@ -2264,6 +2268,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
d_rank_t rank = dss_self_rank();
int rc;

D_ASSERTF(svc->ps_error == 0, "ps_error: " DF_RC "\n", DP_RC(svc->ps_error));

/*
* If this is the only voting replica, it may have become the leader
* without doing any RPC. The primary group may have yet to be
Expand Down Expand Up @@ -2329,9 +2335,9 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
}

rc = ds_pool_iv_prop_update(svc->ps_pool, prop);
if (rc) {
D_ERROR("ds_pool_iv_prop_update failed: " DF_RC "\n", DP_RC(rc));
D_GOTO(out, rc);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": ds_pool_iv_prop_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) {
Expand All @@ -2347,11 +2353,10 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid);
}

rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid,
cont_hdl_uuid);
if (rc) {
D_ERROR("ds_pool_iv_srv_hdl_update failed: " DF_RC "\n", DP_RC(rc));
D_GOTO(out, rc);
rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, cont_hdl_uuid);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": ds_pool_iv_srv_hdl_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

/* resume pool upgrade if needed */
Expand Down Expand Up @@ -2386,27 +2391,39 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
D_FREE(map_buf);
if (prop != NULL)
daos_prop_free(prop);
if (rc < 0)
ds_pool_failed_add(svc->ps_uuid, rc);
else if (rc == 0)
ds_pool_failed_remove(svc->ps_uuid);
if (svc->ps_error != 0) {
/*
* Step up with the error anyway, so that RPCs to the PS
* receive an error instead of timeouts.
*/
DS_POOL_NOTE_PRINT(DF_UUID": rank %u became pool service leader "DF_U64
" with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank,
svc->ps_rsvc.s_term, DP_RC(svc->ps_error));
rc = 0;
}
return rc;
}

static void
pool_svc_step_down_cb(struct ds_rsvc *rsvc)
{
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();

pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
ds_cont_svc_step_down(svc->ps_cont_svc);
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();

DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n",
DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term);
if (svc->ps_error == 0) {
pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
ds_cont_svc_step_down(svc->ps_cont_svc);
DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n",
DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term);
} else {
DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64
" with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank,
svc->ps_rsvc.s_term, DP_RC(svc->ps_error));
svc->ps_error = 0;
}
}

static void
Expand Down Expand Up @@ -2477,12 +2494,13 @@ ds_pool_rsvc_class_unregister(void)
ds_rsvc_class_unregister(DS_RSVC_CLASS_POOL);
}

/* Use pool_svc_lookup_leader instead. */
static int
pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp)
{
struct ds_rsvc *rsvc;
d_iov_t id;
int rc;
d_iov_t id;
int rc;

d_iov_set(&id, uuid, sizeof(uuid_t));
rc = ds_rsvc_lookup(DS_RSVC_CLASS_POOL, &id, &rsvc);
Expand All @@ -2493,25 +2511,48 @@ pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp)
}

static int
pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp,
struct rsvc_hint *hint)
pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, struct rsvc_hint *hint)
{
struct ds_rsvc *rsvc;
d_iov_t id;
int rc;
struct ds_rsvc *rsvc;
d_iov_t id;
struct pool_svc *svc;
int rc;

rc = ds_pool_failed_lookup(uuid);
if (rc) {
D_ERROR(DF_UUID": failed to start: "DF_RC"\n",
DP_UUID(uuid), DP_RC(rc));
if (rc != 0) {
D_DEBUG(DB_MD, DF_UUID ": failed: " DF_RC "\n", DP_UUID(uuid), DP_RC(rc));
return -DER_NO_SERVICE;
}

d_iov_set(&id, uuid, sizeof(uuid_t));
rc = ds_rsvc_lookup_leader(DS_RSVC_CLASS_POOL, &id, &rsvc, hint);
if (rc != 0)
return rc;
*svcp = pool_svc_obj(rsvc);

/*
* The svc->ps_error field stores a persistent error, usually in the DB
* data, if any. For instance, "the layout of the DB data is
* incompatible with the software version". This mustn't be a replica
* error, because there may be a majority of replicas working. We let the
* PS step up with this error so that it can serve all requests by
* returning the error. PS clients therefore get a quick error response
* instead of a timeout.
*
* Checking svc->ps_error here without confirming our leadership via
* rdb_raft_verify_leadership may cause some requests to get
* unnecessary errors, if there is a newer leader whose svc->ps_error
* is zero and is able to serve those requests. Such a state won't last
* much longer than an election timeout though, because we will step
* down due to inability to maintain a majority lease.
*/
svc = pool_svc_obj(rsvc);
if (svc->ps_error != 0) {
rc = svc->ps_error;
ds_rsvc_put_leader(rsvc);
return rc;
}

*svcp = svc;
return 0;
}

Expand Down Expand Up @@ -2583,6 +2624,7 @@ int ds_pool_failed_add(uuid_t uuid, int rc)
uuid_copy(psf->psf_uuid, uuid);
psf->psf_error = rc;
d_list_add_tail(&psf->psf_link, &pool_svc_failed_list);
DL_ERROR(rc, DF_UUID ": added to list of failed pools", DP_UUID(uuid));
out:
D_RWLOCK_UNLOCK(&psfl_rwlock);
return ret;
Expand All @@ -2597,6 +2639,8 @@ void ds_pool_failed_remove(uuid_t uuid)
d_list_for_each_entry_safe(psf, tmp, &pool_svc_failed_list, psf_link) {
if (uuid_compare(psf->psf_uuid, uuid) == 0) {
d_list_del_init(&psf->psf_link);
DL_INFO(psf->psf_error, DF_UUID ": removed from list of failed pools",
DP_UUID(uuid));
D_FREE(psf);
break;
}
Expand Down Expand Up @@ -3432,7 +3476,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid
int rc = 0;

if (!svc) {
rc = pool_svc_lookup(pool_uuid, &svc);
rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */);
if (rc != 0) {
DL_ERROR(rc, "pool_svc lookup failed");
goto out;
Expand Down Expand Up @@ -3480,7 +3524,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid
D_FREE(op_key_enc.iov_buf);
out_svc:
if (need_put_svc)
pool_svc_put(svc);
pool_svc_put_leader(svc);
out:
if (rc == 0) {
*is_dup = duplicate;
Expand Down Expand Up @@ -3527,7 +3571,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
int rc = 0;

if (!svc) {
rc = pool_svc_lookup(pool_uuid, &svc);
rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */);
if (rc != 0) {
DL_ERROR(rc, "pool_svc lookup failed");
goto out;
Expand Down Expand Up @@ -3589,7 +3633,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
D_FREE(op_key_enc.iov_buf);
out_svc:
if (need_put_svc)
pool_svc_put(svc);
pool_svc_put_leader(svc);
out:
return rc;
}
Expand Down Expand Up @@ -3682,6 +3726,13 @@ ds_pool_create_handler(crt_rpc_t *rpc)
ABT_rwlock_wrlock(svc->ps_lock);
ds_cont_wrlock_metadata(svc->ps_cont_svc);

if (svc->ps_error != 0) {
DL_ERROR(svc->ps_error, DF_UUID ": encountered pool service leader with error",
DP_UUID(svc->ps_uuid));
rc = svc->ps_error;
goto out_tx;
}

/* See if the DB has already been initialized. */
d_iov_set(&value, NULL /* buf */, 0 /* size */);
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_map_buffer,
Expand Down
29 changes: 22 additions & 7 deletions src/vos/vos_aggregate.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -177,10 +178,24 @@ struct vos_agg_param {
};

static inline void
credits_set(struct vos_agg_credits *vac, bool tight)
credits_set(struct vos_pool *pool, struct vos_agg_credits *vac, bool tight)
{
vac->vac_creds_scan = tight ? AGG_CREDS_SCAN_TIGHT : AGG_CREDS_SCAN_SLACK;
vac->vac_creds_del = tight ? AGG_CREDS_DEL_TIGHT : AGG_CREDS_DEL_SLACK;
unsigned int multiplier = 1;

/*
* When md-on-ssd phase2 pool runs into space pressure, larger SCAN credits will
* be used to reduce yield & reprobe on iterating, larger DEL credits will be used
* to drop more punched objects to GC in one batch, so that GC will likely reclaim
* more objects when reclaiming a bucket.
*
* Though larger aggregation credits will lower front end I/O performance, it can
* greatly reduce page misses for GC when free space/page is tight.
*/
if (tight && vos_pool_is_evictable(pool))
multiplier = 100;

vac->vac_creds_scan = (tight ? AGG_CREDS_SCAN_TIGHT : AGG_CREDS_SCAN_SLACK) * multiplier;
vac->vac_creds_del = (tight ? AGG_CREDS_DEL_TIGHT : AGG_CREDS_DEL_SLACK) * multiplier;
vac->vac_creds_merge = tight ? AGG_CREDS_MERGE_TIGHT : AGG_CREDS_MERGE_SLACK;
}

Expand Down Expand Up @@ -323,7 +338,7 @@ vos_aggregate_yield(struct vos_agg_param *agg_param)

if (agg_param->ap_yield_func == NULL) {
bio_yield(agg_param->ap_umm);
credits_set(&agg_param->ap_credits, true);
credits_set(cont->vc_pool, &agg_param->ap_credits, true);
return false;
}

Expand All @@ -333,7 +348,7 @@ vos_aggregate_yield(struct vos_agg_param *agg_param)
return true;

/* rc == 0: tight mode; rc == 1: slack mode */
credits_set(&agg_param->ap_credits, rc == 0);
credits_set(cont->vc_pool, &agg_param->ap_credits, rc == 0);

return false;
}
Expand Down Expand Up @@ -2702,7 +2717,7 @@ vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr,
/* Set aggregation parameters */
ad->ad_agg_param.ap_umm = &cont->vc_pool->vp_umm;
ad->ad_agg_param.ap_coh = coh;
credits_set(&ad->ad_agg_param.ap_credits, true);
credits_set(cont->vc_pool, &ad->ad_agg_param.ap_credits, true);
ad->ad_agg_param.ap_discard = 0;
ad->ad_agg_param.ap_yield_func = yield_func;
ad->ad_agg_param.ap_yield_arg = yield_arg;
Expand Down Expand Up @@ -2822,7 +2837,7 @@ vos_discard(daos_handle_t coh, daos_unit_oid_t *oidp, daos_epoch_range_t *epr,
/* Set aggregation parameters */
ad->ad_agg_param.ap_umm = &cont->vc_pool->vp_umm;
ad->ad_agg_param.ap_coh = coh;
credits_set(&ad->ad_agg_param.ap_credits, true);
credits_set(cont->vc_pool, &ad->ad_agg_param.ap_credits, true);
ad->ad_agg_param.ap_discard = 1;
ad->ad_agg_param.ap_yield_func = yield_func;
ad->ad_agg_param.ap_yield_arg = yield_arg;
Expand Down

0 comments on commit 0b5b1bc

Please sign in to comment.