Skip to content

Commit

Permalink
Livelist logic should handle dedup blkptrs
Browse files Browse the repository at this point in the history
= Problem

When the livelist logic was designed it didn't take into
account that when dedup is enabled the sublivelists can
have consecutive FREE entries for the same block without
an ALLOC entry for it in-between them. This caused panics
in systems that were deleting/condesing clones where dedup
is enabled.

= This patch

Update the logic to handle the dedup-case of consecutive
FREEs in the livelist code. The logic still ensured that
all the FREE entries are matched up with a respective
ALLOC by keeping a refcount for each FREE blkptr that we
encounter and ensuring that this refcount gets to zero
by the time we are done processing the livelist.

= Testing

After I reproduced the issue with a shell script, I added
a variant of that shell script to ZTS. After ensuring that
this new test panics the system the same way as the original
reproducer I tried it against the updated logic in this
patch and verified that the system no longer panics.

= Side Fixes

* zdb -y no longer panics when encountering double frees

Signed-off-by: Serapheim Dimitropoulos <[email protected]>
Closes openzfs#11480
  • Loading branch information
sdimitro committed Jun 1, 2021
1 parent 8d5f211 commit 43791fa
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 64 deletions.
117 changes: 72 additions & 45 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx);

typedef struct sublivelist_verify {
/* all ALLOC'd blkptr_t in one sub-livelist */
zfs_btree_t sv_all_allocs;

/* all FREE'd blkptr_t in one sub-livelist */
zfs_btree_t sv_all_frees;

/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
zfs_btree_t sv_pair;

Expand Down Expand Up @@ -227,29 +221,68 @@ typedef struct sublivelist_verify_block {

static void zdb_print_blkptr(const blkptr_t *bp, int flags);

typedef struct sublivelist_verify_block_refcnt {
/* block pointer entry in livelist being verified */
blkptr_t svbr_blk;

/*
* Refcount gets incremented to 1 when we encounter the first
* FREE entry for the svfbr block pointer and a node for it
* is created in our ZDB verification/tracking metadata.
*
* As we encounter more FREE entries we increment this counter
* and similarly decrement it whenever we find the respective
* ALLOC entries for this block.
*
* When the refcount gets to 0 it means that all the FREE and
* ALLOC entries of this block have paired up and we no longer
* need to track it in our verification logic (e.g. the node
* containing this struct in our verification data structure
* should be freed).
*
* [refer to sublivelist_verify_blkptr() for the actual code]
*/
uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;

static int
sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
{
const sublivelist_verify_block_refcnt_t *l = larg;
const sublivelist_verify_block_refcnt_t *r = rarg;
return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
}

static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx)
{
ASSERT3P(tx, ==, NULL);
struct sublivelist_verify *sv = arg;
char blkbuf[BP_SPRINTF_LEN];
sublivelist_verify_block_refcnt_t current = {
.svbr_blk = *bp,

/*
* Start with 1 in case this is the first free entry.
* This field is not used for our B-Tree comparisons
* anyway.
*/
.svbr_refcnt = 1,
};

zfs_btree_index_t where;
sublivelist_verify_block_refcnt_t *pair =
zfs_btree_find(&sv->sv_pair, &current, &where);
if (free) {
zfs_btree_add(&sv->sv_pair, bp);
/* Check if the FREE is a duplicate */
if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
free);
(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
if (pair == NULL) {
/* first free entry for this block pointer */
zfs_btree_add(&sv->sv_pair, &current);
} else {
zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
pair->svbr_refcnt++;
}
} else {
/* Check if the ALLOC has been freed */
if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
zfs_btree_remove_idx(&sv->sv_pair, &where);
} else {
if (pair == NULL) {
/* block that is currently marked as allocated */
for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
if (DVA_IS_EMPTY(&bp->blk_dva[i]))
break;
Expand All @@ -264,49 +297,39 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
&svb, &where);
}
}
}
/* Check if the ALLOC is a duplicate */
if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
free);
(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
} else {
zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
/* alloc matches a free entry */
pair->svbr_refcnt--;
if (pair->svbr_refcnt == 0) {
/* all allocs and frees have been matched */
zfs_btree_remove_idx(&sv->sv_pair, &where);
}
}
}

return (0);
}

static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
int err;
char blkbuf[BP_SPRINTF_LEN];
struct sublivelist_verify *sv = args;

zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
sizeof (blkptr_t));

zfs_btree_create(&sv->sv_all_frees, livelist_compare,
sizeof (blkptr_t));

zfs_btree_create(&sv->sv_pair, livelist_compare,
sizeof (blkptr_t));
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
sizeof (sublivelist_verify_block_refcnt_t));

err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
sv, NULL);

zfs_btree_clear(&sv->sv_all_allocs);
zfs_btree_destroy(&sv->sv_all_allocs);

zfs_btree_clear(&sv->sv_all_frees);
zfs_btree_destroy(&sv->sv_all_frees);

blkptr_t *e;
sublivelist_verify_block_refcnt_t *e;
zfs_btree_index_t *cookie = NULL;
while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
&e->svbr_blk, B_TRUE);
(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
e->svbr_refcnt, blkbuf);
}
zfs_btree_destroy(&sv->sv_pair);

Expand Down Expand Up @@ -615,10 +638,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
/*
* [Livelist Check]
* Iterate through all the sublivelists and:
* - report leftover frees
* - report double ALLOCs/FREEs
* - report leftover frees (**)
* - record leftover ALLOCs together with their TXG [see Cross Check]
*
* (**) Note: Double ALLOCs are valid in datasets that have dedup
* enabled. Similarly double FREEs are allowed as well but
* only if they pair up with a corresponding ALLOC entry once
* we our done with our sublivelist iteration.
*
* [Spacemap Check]
* for each metaslab:
* - iterate over spacemap and then the metaslab's entries in the
Expand Down
65 changes: 48 additions & 17 deletions module/zfs/dsl_deadlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
}

typedef struct livelist_entry {
const blkptr_t *le_bp;
blkptr_t le_bp;
uint32_t le_refcnt;
avl_node_t le_node;
} livelist_entry_t;

static int
livelist_compare(const void *larg, const void *rarg)
{
const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;

/* Sort them according to dva[0] */
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
Expand All @@ -944,6 +945,11 @@ struct livelist_iter_arg {
* Expects an AVL tree which is incrementally filled will FREE blkptrs
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
* corresponding FREE are stored in the supplied bplist.
*
* Note that multiple FREE and ALLOC entries for the same blkptr may
* be encountered when dedup is involved. For this reason we keep a
* refcount for all the FREE entries of each blkptr and ensure that
* each of those FREE entries has a corresponding ALLOC preceding it.
*/
static int
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
Expand All @@ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,

if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
return (SET_ERROR(EINTR));

livelist_entry_t node;
node.le_bp = *bp;
livelist_entry_t *found = avl_find(avl, &node, NULL);
if (bp_freed) {
livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
KM_SLEEP);
blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
*temp_bp = *bp;
node->le_bp = temp_bp;
avl_add(avl, node);
} else {
livelist_entry_t node;
node.le_bp = bp;
livelist_entry_t *found = avl_find(avl, &node, NULL);
if (found != NULL) {
avl_remove(avl, found);
kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
kmem_free(found, sizeof (livelist_entry_t));
if (found == NULL) {
/* first free entry for this blkptr */
livelist_entry_t *e =
kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
e->le_bp = *bp;
e->le_refcnt = 1;
avl_add(avl, e);
} else {
/* dedup block free */
ASSERT(BP_GET_DEDUP(bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
found->le_refcnt++;
}
} else {
if (found == NULL) {
/* block is currently marked as allocated */
bplist_append(to_free, bp);
} else {
/* alloc matches a free entry */
ASSERT3U(found->le_refcnt, !=, 0);
found->le_refcnt--;
if (found->le_refcnt == 0) {
/* all tracked free pairs have been matched */
avl_remove(avl, found);
kmem_free(found, sizeof (livelist_entry_t));
} else {
/*
* This is definitely a deduped blkptr so
* let's validate it.
*/
ASSERT(BP_GET_DEDUP(bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
}
}
}
return (0);
Expand All @@ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
};
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);

VERIFY0(avl_numnodes(&avl));
avl_destroy(&avl);
return (err);
}
Expand Down
4 changes: 2 additions & 2 deletions tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ tags = ['functional', 'cli_root', 'zfs_create']

[tests/functional/cli_root/zfs_destroy]
tests = ['zfs_clone_livelist_condense_and_disable',
'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright (c) 2021 by Delphix. All rights reserved.
#

# DESCRIPTION
# Verify zfs destroy test for clones with livelists that contain
# dedup blocks. This test is a baseline regression test created
# to ensure that past bugs that we've encountered between dedup
# and the livelist logic don't resurface.

# STRATEGY
# 1. Create a clone from a test filesystem and enable dedup.
# 2. Write some data and create a livelist.
# 3. Copy the data within the clone to create dedup blocks.
# 4. Remove some of the dedup data to create multiple free
# entries for the same block pointers.
# 5. Process all the livelist entries by destroying the clone.

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib

function cleanup
{
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
# Reset the minimum percent shared to 75
set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED
}

function test_dedup
{
# Set a small percent shared threshold so the livelist is not disabled
set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10
clone_dataset $TESTFS1 snap $TESTCLONE

# Enable dedup
log_must zfs set dedup=on $TESTPOOL/$TESTCLONE

# Create some data to be deduped
log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k

# Create dedup blocks
# Note: We sync before and after so all dedup blocks belong to the
# same TXG, otherwise they won't look identical to the livelist
# iterator due to their logical birth TXG being different.
log_must zpool sync $TESTPOOL
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3
log_must zpool sync $TESTPOOL
check_livelist_exists $TESTCLONE

# Introduce "double frees"
# We want to introduce consecutive FREEs of the same block as this
# was what triggered past panics.
# Note: Similarly to the previouys step we sync before and after our
# our deletions so all the entries end up in the same TXG.
log_must zpool sync $TESTPOOL
log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2
log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3
log_must zpool sync $TESTPOOL
check_livelist_exists $TESTCLONE

log_must zfs destroy $TESTPOOL/$TESTCLONE
check_livelist_gone
}

ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED)

log_onexit cleanup
log_must zfs create $TESTPOOL/$TESTFS1
log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
test_dedup

log_pass "Clone's livelist processes dedup blocks as expected."

0 comments on commit 43791fa

Please sign in to comment.