Skip to content

Commit

Permalink
ZAP: Massively switch to _by_dnode() interfaces
Browse files Browse the repository at this point in the history
Before this change ZAP called dnode_hold() for almost every block
access, that was clearly visible in profiler under heavy load, such
as BRT.  This patch makes it always hold the dnode reference between
zap_lockdir() and zap_unlockdir().  It allows to avoid most of dnode
operations between those.  It also adds several new _by_dnode() APIs
to ZAP and uses them in BRT code.  Also adds dmu_prefetch_by_dnode()
variant and uses it in the ZAP code.

Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
  • Loading branch information
amotin committed Mar 22, 2024
1 parent c28f94f commit 687ae94
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 149 deletions.
2 changes: 2 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,8 @@ extern uint_t zfs_max_recordsize;
*/
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);

typedef struct dmu_object_info {
Expand Down
8 changes: 8 additions & 0 deletions include/sys/zap.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key,
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);

/*
* Set the attribute with the given name to the given value. If an
Expand All @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);

/*
* Get the length (in integers) and the integer size of the specified
Expand All @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx);
int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, dmu_tx_t *tx);

/*
* Returns (in *count) the number of attributes in the specified zap
Expand Down
1 change: 1 addition & 0 deletions include/sys/zap_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ typedef struct zap {
dmu_buf_user_t zap_dbu;
objset_t *zap_objset;
uint64_t zap_object;
dnode_t *zap_dnode;
struct dmu_buf *zap_dbuf;
krwlock_t zap_rwlock;
boolean_t zap_ismicro;
Expand Down
72 changes: 14 additions & 58 deletions module/zfs/brt.c
Original file line number Diff line number Diff line change
Expand Up @@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
if (mos_entries == 0)
return;

BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
(u_longlong_t)mos_entries, (u_longlong_t)vdevid,
(u_longlong_t)bre->bre_offset);
(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}

static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
int error;

ASSERT(RW_LOCK_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_mos_entries != 0);
ASSERT(bre->bre_refcount > 0);

error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
"error=%d", (u_longlong_t)brtvd->bv_mos_entries,
(u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
(u_longlong_t)bre->bre_refcount, error);

return (error);
}

static int
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
int error;

ASSERT(RW_LOCK_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_mos_entries != 0);
ASSERT0(bre->bre_refcount);

error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
"error=%d", (u_longlong_t)brtvd->bv_mos_entries,
(u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
(u_longlong_t)bre->bre_refcount, error);

return (error);
}

/*
* Return TRUE if we _can_ have BRT entry for this bp. It might be false
* positive, but gives us quick answer if we should look into BRT, which
Expand Down Expand Up @@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
}

static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
{

ASSERT(RW_WRITE_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_mos_entries != 0);

if (bre->bre_refcount == 0) {
int error;

error = brt_entry_remove(brt, brtvd, bre, tx);
ASSERT(error == 0 || error == ENOENT);
/*
* If error == ENOENT then zfs_clone_range() was done from a
* removed (but opened) file (open(), unlink()).
*/
ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
BRT_KEY_WORDS, tx);
VERIFY(error == 0 || error == ENOENT);
} else {
VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
&bre->bre_refcount, tx));
}
}

Expand All @@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
brt_vdev_t *brtvd;
brt_entry_t *bre;
dnode_t *dn;
uint64_t vdevid;
void *c;

Expand All @@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
if (brtvd->bv_mos_brtvdev == 0)
brt_vdev_create(brt, brtvd, tx);

VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
FTAG, &dn));

c = NULL;
while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
brt_sync_entry(brt, brtvd, bre, tx);
brt_sync_entry(dn, bre, tx);
brt_entry_free(bre);
ASSERT(brt->brt_nentries > 0);
brt->brt_nentries--;
}

dnode_rele(dn, FTAG);

brt_vdev_sync(brt, brtvd, tx);

if (brtvd->bv_totalcount == 0)
Expand Down
18 changes: 14 additions & 4 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
int64_t level2 = level;
uint64_t start, end, start2, end2;

if (dmu_prefetch_max == 0 || len == 0) {
dmu_prefetch_dnode(os, object, pri);
Expand All @@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
if (dnode_hold(os, object, FTAG, &dn) != 0)
return;

dmu_prefetch_by_dnode(dn, level, offset, len, pri);

dnode_rele(dn, FTAG);
}

void
dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
int64_t level2 = level;
uint64_t start, end, start2, end2;

/*
* Depending on len we may do two prefetches: blocks [start, end) at
* level, and following blocks [start2, end2) at higher level2.
Expand Down Expand Up @@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
for (uint64_t i = start2; i < end2; i++)
dbuf_prefetch(dn, level2, i, pri, 0);
rw_exit(&dn->dn_struct_rwlock);

dnode_rele(dn, FTAG);
}

/*
Expand Down Expand Up @@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
EXPORT_SYMBOL(dmu_buf_rele_array);
EXPORT_SYMBOL(dmu_prefetch);
EXPORT_SYMBOL(dmu_prefetch_by_dnode);
EXPORT_SYMBOL(dmu_prefetch_dnode);
EXPORT_SYMBOL(dmu_free_range);
EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_long_object);
Expand Down
43 changes: 16 additions & 27 deletions module/zfs/zap.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
* set up block 1 - the first leaf
*/
dmu_buf_t *db;
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);

Expand Down Expand Up @@ -182,7 +182,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied);
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
dmu_prefetch_by_dnode(zap->zap_dnode, 0,
tbl->zt_blk << bs, tbl->zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
}
Expand All @@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,

uint64_t b = tbl->zt_blks_copied;
dmu_buf_t *db_old;
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);

/* first half of entries in old[b] go to new[2*b+0] */
dmu_buf_t *db_new;
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
(newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);

/* second half of entries in old[b] go to new[2*b+1] */
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
(newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
Expand Down Expand Up @@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
uint64_t off = idx & ((1<<(bs-3))-1);

dmu_buf_t *db;
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
Expand All @@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
uint64_t off2 = idx2 & ((1<<(bs-3))-1);
dmu_buf_t *db2;

err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
DMU_READ_NO_PREFETCH);
if (err != 0) {
Expand Down Expand Up @@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
uint64_t blk = idx >> (bs-3);
uint64_t off = idx & ((1<<(bs-3))-1);

/*
* Note: this is equivalent to dmu_buf_hold(), but we use
* _dnode_enter / _by_dnode because it's faster because we don't
* have to hold the dnode.
*/
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
dmu_buf_t *db;
int err = dmu_buf_hold_by_dnode(dn,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err != 0)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
Expand All @@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
*/
blk = (idx*2) >> (bs-3);

dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err == 0)
dmu_buf_rele(db, FTAG);
}
Expand Down Expand Up @@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)

uint64_t newblk = zap_allocate_blocks(zap, 1);
dmu_buf_t *db_new;
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
DMU_READ_NO_PREFETCH);
if (err != 0)
Expand Down Expand Up @@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;

VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
DMU_READ_NO_PREFETCH));
dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
Expand Down Expand Up @@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
return (SET_ERROR(ENOENT));

int bs = FZAP_BLOCK_SHIFT(zap);
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
int err = dmu_buf_hold_by_dnode(dn,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err != 0)
return (err);

Expand Down Expand Up @@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
int bs = FZAP_BLOCK_SHIFT(zap);
dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
ZIO_PRIORITY_SYNC_READ);
}

Expand Down Expand Up @@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
*/
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
ZIO_PRIORITY_ASYNC_READ);
}
Expand Down Expand Up @@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
} else {
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
dmu_prefetch_by_dnode(zap->zap_dnode, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
Expand All @@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
dmu_buf_t *db;
int err;

err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (err == 0) {
Expand Down
Loading

0 comments on commit 687ae94

Please sign in to comment.