diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5158c23b08e8..8228e98a1daa 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -161,12 +161,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); typedef struct sublivelist_verify { - /* all ALLOC'd blkptr_t in one sub-livelist */ - zfs_btree_t sv_all_allocs; - - /* all FREE'd blkptr_t in one sub-livelist */ - zfs_btree_t sv_all_frees; - /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ zfs_btree_t sv_pair; @@ -225,29 +219,68 @@ typedef struct sublivelist_verify_block { static void zdb_print_blkptr(const blkptr_t *bp, int flags); +typedef struct sublivelist_verify_block_refcnt { + /* block pointer entry in livelist being verified */ + blkptr_t svbr_blk; + + /* + * Refcount gets incremented to 1 when we encounter the first + * FREE entry for the svbr_blk block pointer and a node for it + * is created in our ZDB verification/tracking metadata. + * + * As we encounter more FREE entries we increment this counter + * and similarly decrement it whenever we find the respective + * ALLOC entries for this block. + * + * When the refcount gets to 0 it means that all the FREE and + * ALLOC entries of this block have paired up and we no longer + * need to track it in our verification logic (e.g. the node + * containing this struct in our verification data structure + * should be freed). 
+ * + * [refer to sublivelist_verify_blkptr() for the actual code] + */ + uint32_t svbr_refcnt; +} sublivelist_verify_block_refcnt_t; + +static int +sublivelist_block_refcnt_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_refcnt_t *l = larg; + const sublivelist_verify_block_refcnt_t *r = rarg; + return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); +} + static int sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx) { ASSERT3P(tx, ==, NULL); struct sublivelist_verify *sv = arg; - char blkbuf[BP_SPRINTF_LEN]; + sublivelist_verify_block_refcnt_t current = { + .svbr_blk = *bp, + + /* + * Start with 1 in case this is the first free entry. + * This field is not used for our B-Tree comparisons + * anyway. + */ + .svbr_refcnt = 1, + }; + zfs_btree_index_t where; + sublivelist_verify_block_refcnt_t *pair = + zfs_btree_find(&sv->sv_pair, &current, &where); if (free) { - zfs_btree_add(&sv->sv_pair, bp); - /* Check if the FREE is a duplicate */ - if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) { - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, - free); - (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf); + if (pair == NULL) { + /* first free entry for this block pointer */ + zfs_btree_add(&sv->sv_pair, &current); } else { - zfs_btree_add_idx(&sv->sv_all_frees, bp, &where); + pair->svbr_refcnt++; } } else { - /* Check if the ALLOC has been freed */ - if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) { - zfs_btree_remove_idx(&sv->sv_pair, &where); - } else { + if (pair == NULL) { + /* block that is currently marked as allocated */ for (int i = 0; i < SPA_DVAS_PER_BP; i++) { if (DVA_IS_EMPTY(&bp->blk_dva[i])) break; @@ -262,16 +295,16 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, &svb, &where); } } - } - /* Check if the ALLOC is a duplicate */ - if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) { - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, - free); 
- (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf); } else { - zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where); + /* alloc matches a free entry */ + pair->svbr_refcnt--; + if (pair->svbr_refcnt == 0) { + /* all allocs and frees have been matched */ + zfs_btree_remove_idx(&sv->sv_pair, &where); + } } } + return (0); } @@ -279,32 +312,22 @@ static int sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) { int err; - char blkbuf[BP_SPRINTF_LEN]; struct sublivelist_verify *sv = args; - zfs_btree_create(&sv->sv_all_allocs, livelist_compare, - sizeof (blkptr_t)); - - zfs_btree_create(&sv->sv_all_frees, livelist_compare, - sizeof (blkptr_t)); - - zfs_btree_create(&sv->sv_pair, livelist_compare, - sizeof (blkptr_t)); + zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, + sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, sv, NULL); - zfs_btree_clear(&sv->sv_all_allocs); - zfs_btree_destroy(&sv->sv_all_allocs); - - zfs_btree_clear(&sv->sv_all_frees); - zfs_btree_destroy(&sv->sv_all_frees); - - blkptr_t *e; + sublivelist_verify_block_refcnt_t *e; zfs_btree_index_t *cookie = NULL; while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE); - (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf); + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), + &e->svbr_blk, B_TRUE); + (void) printf("\tERROR: %d unmatched FREE(s): %s\n", + e->svbr_refcnt, blkbuf); } zfs_btree_destroy(&sv->sv_pair); @@ -613,10 +636,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) /* * [Livelist Check] * Iterate through all the sublivelists and: - * - report leftover frees - * - report double ALLOCs/FREEs + * - report leftover frees (**) * - record leftover ALLOCs together with their TXG [see Cross Check] * + * (**) Note: Double ALLOCs are valid in datasets that have dedup + 
+ * enabled. Similarly double FREEs are allowed as well but + * only if they pair up with a corresponding ALLOC entry once + * we are done with our sublivelist iteration. + * * [Spacemap Check] * for each metaslab: * - iterate over spacemap and then the metaslab's entries in the diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index bad2d56eefdd..a77e381520db 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, } typedef struct livelist_entry { - const blkptr_t *le_bp; + blkptr_t le_bp; + uint32_t le_refcnt; avl_node_t le_node; } livelist_entry_t; static int livelist_compare(const void *larg, const void *rarg) { - const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp; - const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp; + const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; + const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; /* Sort them according to dva[0] */ uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); @@ -944,6 +945,11 @@ struct livelist_iter_arg { * Expects an AVL tree which is incrementally filled will FREE blkptrs * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. + * + * Note that multiple FREE and ALLOC entries for the same blkptr may + * be encountered when dedup is involved. For this reason we keep a + * refcount for all the FREE entries of each blkptr and ensure that + * each of those FREE entries has a corresponding ALLOC preceding it. 
*/ static int dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, @@ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) return (SET_ERROR(EINTR)); + + livelist_entry_t node; + node.le_bp = *bp; + livelist_entry_t *found = avl_find(avl, &node, NULL); if (bp_freed) { - livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t), - KM_SLEEP); - blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - *temp_bp = *bp; - node->le_bp = temp_bp; - avl_add(avl, node); - } else { - livelist_entry_t node; - node.le_bp = bp; - livelist_entry_t *found = avl_find(avl, &node, NULL); - if (found != NULL) { - avl_remove(avl, found); - kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t)); - kmem_free(found, sizeof (livelist_entry_t)); + if (found == NULL) { + /* first free entry for this blkptr */ + livelist_entry_t *e = + kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); + e->le_bp = *bp; + e->le_refcnt = 1; + avl_add(avl, e); } else { + /* dedup block free */ + ASSERT(BP_GET_DEDUP(bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); + found->le_refcnt++; + } + } else { + if (found == NULL) { + /* block is currently marked as allocated */ bplist_append(to_free, bp); + } else { + /* alloc matches a free entry */ + ASSERT3U(found->le_refcnt, !=, 0); + found->le_refcnt--; + if (found->le_refcnt == 0) { + /* all tracked free pairs have been matched */ + avl_remove(avl, found); + kmem_free(found, sizeof (livelist_entry_t)); + } else { + /* + * This is definitely a deduped blkptr so + * let's validate it. 
+ */ + ASSERT(BP_GET_DEDUP(bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + } } } return (0); @@ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, }; int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + VERIFY0(avl_numnodes(&avl)); avl_destroy(&avl); return (err); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 22688fdc1ee1..d254c719c430 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -165,8 +165,8 @@ tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', - 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', - 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', + 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', + 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh new file mode 100755 index 000000000000..5f356967a457 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. 
+# + +# DESCRIPTION +# Verify zfs destroy test for clones with livelists that contain +# dedup blocks. This test is a baseline regression test created +# to ensure that past bugs that we've encountered between dedup +# and the livelist logic don't resurface. + +# STRATEGY +# 1. Create a clone from a test filesystem and enable dedup. +# 2. Write some data and create a livelist. +# 3. Copy the data within the clone to create dedup blocks. +# 4. Remove some of the dedup data to create multiple free +# entries for the same block pointers. +# 5. Process all the livelist entries by destroying the clone. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + # Reset the minimum percent shared to 75 + set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED +} + +function test_dedup +{ + # Set a small percent shared threshold so the livelist is not disabled + set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10 + clone_dataset $TESTFS1 snap $TESTCLONE + + # Enable dedup + log_must zfs set dedup=on $TESTPOOL/$TESTCLONE + + # Create some data to be deduped + log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k + + # Create dedup blocks + # Note: We sync before and after so all dedup blocks belong to the + # same TXG, otherwise they won't look identical to the livelist + # iterator due to their logical birth TXG being different. 
+ log_must zpool sync $TESTPOOL + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0 + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1 + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2 + log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3 + log_must zpool sync $TESTPOOL + check_livelist_exists $TESTCLONE + + # Introduce "double frees" + # We want to introduce consecutive FREEs of the same block as this + # was what triggered past panics. + # Note: Similarly to the previous step we sync before and after our + # deletions so all the entries end up in the same TXG. + log_must zpool sync $TESTPOOL + log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2 + log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3 + log_must zpool sync $TESTPOOL + check_livelist_exists $TESTCLONE + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED) + +log_onexit cleanup +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap +test_dedup + +log_pass "Clone's livelist processes dedup blocks as expected."