From 90842f894f10cd8bbfb82713432181a983a64425 Mon Sep 17 00:00:00 2001
From: MigeljanImeri
Date: Thu, 25 Jan 2024 09:43:16 -0700
Subject: [PATCH] Add vdev module parameter to allow I/O to bypass the queue

Added a vdev module parameter (zfs_vdev_queue_bypass_pct) to allow I/O
to bypass the vdev queue when reading from or writing to a vdev. The
intention behind this parameter is to improve performance when using
O_DIRECT.

Signed-off-by: MigeljanImeri
---
 include/sys/vdev_impl.h   |  1 +
 include/sys/zio.h         |  9 ++++++++
 module/zfs/vdev.c         |  2 ++
 module/zfs/vdev_queue.c   | 44 +++++++++++++++++++++++++++++++++++++++
 tests/runfiles/common.run |  2 +-
 5 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..3be570afb978 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -151,6 +151,7 @@ struct vdev_queue {
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	list_t		vq_active_list;	/* List of active I/Os. */
+	kmutex_t	vq_active_list_lock;	/* Protects vq_active_list. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 46f5d68aed4a..175a25d1d4f3 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -249,6 +249,14 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1U << (x)))
 
+#define	ZIO_FLAG_IMPORTANT	\
+	(ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL | \
+	ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB | \
+	ZIO_FLAG_IO_RETRY)
+
+#define	ZIO_IS_NORMAL(zio)	\
+	(!((zio)->io_flags & ZIO_FLAG_IMPORTANT))
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
@@ -449,6 +457,7 @@ enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
+	ZIO_QS_BYPASS,
 };
 
 struct zio {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..a25d4b00f026 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
 			 * if any I/O has been outstanding for longer than
 			 * the spa_deadman_synctime invoke the deadman logic.
 			 */
+			mutex_enter(&vq->vq_active_list_lock);
 			fio = list_head(&vq->vq_active_list);
 			delta = gethrtime() - fio->io_timestamp;
 			if (delta > spa_deadman_synctime(spa))
 				zio_deadman(fio, tag);
+			mutex_exit(&vq->vq_active_list_lock);
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 092b3f375be0..3274ce399615 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
+/*
+ * Percentage of zfs_vdev_max_active below which I/O may bypass the
+ * queue: 0 = never bypass, 100 = bypass whenever the queue is not full.
+ */
+uint_t zfs_vdev_queue_bypass_pct = 10;
+
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
 	list_create(&vq->vq_active_list, sizeof (struct zio),
 	    offsetof(struct zio, io_queue_node.l));
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)
 	list_destroy(&vq->vq_active_list);
 
 	mutex_destroy(&vq->vq_lock);
+	mutex_destroy(&vq->vq_active_list_lock);
 }
 
 static void
@@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 		vq->vq_nia_credit--;
 	}
 	zio->io_queue_state = ZIO_QS_ACTIVE;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_insert_tail(&vq->vq_active_list, zio);
+	mutex_exit(&vq->vq_active_list_lock);
 }
 
 static void
@@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_remove(&vq->vq_active_list, zio);
+	mutex_exit(&vq->vq_active_list_lock);
 	zio->io_queue_state = ZIO_QS_NONE;
 }
 
@@ -946,6 +958,27 @@
 	zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();
 
+	/*
+	 * Bypass the queue if certain conditions are met. Queue bypassing
+	 * requires a non-rotational device. Normal reads and writes may
+	 * bypass the queue, depending on how full it is; repair, self-heal,
+	 * resilver, scrub, and retry I/Os are always queued.
+	 */
+	if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
+		/* Bypass only while the queue depth is below the cutoff. */
+		boolean_t bypass = vdev_queue_length(vq->vq_vdev) <
+		    (zfs_vdev_max_active *
+		    zfs_vdev_queue_bypass_pct) / 100;
+
+		if (bypass) {
+			zio->io_queue_state = ZIO_QS_BYPASS;
+			mutex_enter(&vq->vq_active_list_lock);
+			list_insert_tail(&vq->vq_active_list, zio);
+			mutex_exit(&vq->vq_active_list_lock);
+			return (zio);
+		}
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
@@ -978,6 +1011,14 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = now;
 	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
 
+	if (zio->io_queue_state == ZIO_QS_BYPASS) {
+		mutex_enter(&vq->vq_active_list_lock);
+		list_remove(&vq->vq_active_list, zio);
+		mutex_exit(&vq->vq_active_list_lock);
+		zio->io_queue_state = ZIO_QS_NONE;
+		return;
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_pending_remove(vq, zio);
 
@@ -1163,3 +1204,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
 	"Default queue depth for each allocator");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
+	"Percentage of zfs_vdev_max_active below which I/O may bypass the queue");
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 8a4a4b0f5cb8..ded968dc7051 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -544,7 +544,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
     'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
-    'user_property_001_pos', 'user_property_002_neg',
+    'user_property_001_pos', 'user_property_002_neg',
     'zpool_set_clear_userprop']
 tags = ['functional', 'cli_root', 'zpool_set']
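
For reference, the bypass cutoff computed in vdev_queue_io() above is
(zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100. The following
is a minimal standalone sketch of that arithmetic, assuming the default
zfs_vdev_max_active of 1000; it is illustrative userspace C, not part
of the patch:

/*
 * Sketch of the queue-bypass cutoff: an I/O may bypass the vdev queue
 * while the number of active I/Os on the vdev is below the cutoff.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int max_active = 1000;	/* assumed zfs_vdev_max_active default */
	unsigned int bypass_pct = 10;	/* zfs_vdev_queue_bypass_pct default */
	unsigned int cutoff = (max_active * bypass_pct) / 100;	/* = 100 */

	for (unsigned int active = 0; active <= 120; active += 20)
		printf("active=%3u -> %s\n", active,
		    active < cutoff ? "bypass" : "queue");
	return (0);
}

With these defaults an I/O bypasses the queue while fewer than 100 I/Os
are active on the vdev; raising zfs_vdev_queue_bypass_pct raises the
cutoff proportionally.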