Skip to content

Commit

Permalink
Add vdev module parameter to allow I/O to bypass the queue
Browse files Browse the repository at this point in the history
Added a vdev module parameter to allow bypassing the vdev queue when
reading from or writing to a vdev. The intention behind this
property is to improve performance when using O_DIRECT.

Signed-off-by: MigeljanImeri <[email protected]>
  • Loading branch information
MigeljanImeri committed Jan 21, 2025
1 parent 18c67d2 commit 90842f8
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 1 deletion.
1 change: 1 addition & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ struct vdev_queue {
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
list_t vq_active_list; /* List of active I/Os. */
kmutex_t vq_active_list_lock;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */
Expand Down
9 changes: 9 additions & 0 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,14 @@ typedef uint64_t zio_flag_t;
#define ZIO_CHILD_BIT(x) (1U << (x))
#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x)))

/*
 * Flags marking an I/O as "important": repair, self-heal, resilver, scrub,
 * and retried I/Os.  Important I/Os must never bypass the vdev queue.
 *
 * The replacement list is parenthesized so the macro expands safely inside
 * larger expressions (e.g. `flags & ZIO_FLAG_IMPORTANT`); without the outer
 * parentheses, `&` would bind tighter than `|` and silently mis-associate.
 */
#define ZIO_FLAG_IMPORTANT \
	(ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL | \
	ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB | \
	ZIO_FLAG_IO_RETRY)

/*
 * A "normal" I/O carries none of the important flags.  Only normal I/Os
 * are eligible to bypass the vdev queue.
 */
#define ZIO_IS_NORMAL(zio) \
	(!((zio)->io_flags & ZIO_FLAG_IMPORTANT))

enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
Expand Down Expand Up @@ -449,6 +457,7 @@ enum zio_qstate {
ZIO_QS_NONE = 0,
ZIO_QS_QUEUED,
ZIO_QS_ACTIVE,
ZIO_QS_BYPASS,
};

struct zio {
Expand Down
2 changes: 2 additions & 0 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
mutex_enter(&vq->vq_active_list_lock);
fio = list_head(&vq->vq_active_list);
mutex_exit(&vq->vq_active_list_lock);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
Expand Down
44 changes: 44 additions & 0 deletions module/zfs/vdev_queue.c
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
*/
uint_t zfs_vdev_def_queue_depth = 32;

/*
 * Percentage threshold controlling when a normal (non-repair/resilver/
 * scrub/retry) I/O on a non-rotational vdev may bypass the vdev queue.
 * An I/O bypasses the queue while the number of outstanding I/Os is below
 * (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100.
 * 0 = never bypass; 100 = bypass whenever fewer than zfs_vdev_max_active
 * I/Os are outstanding.
 */
uint_t zfs_vdev_queue_bypass_pct = 10;

static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
Expand Down Expand Up @@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
list_create(&vq->vq_active_list, sizeof (struct zio),
offsetof(struct zio, io_queue_node.l));
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
Expand All @@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)

list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
mutex_destroy(&vq->vq_active_list_lock);
}

static void
Expand Down Expand Up @@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit--;
}
zio->io_queue_state = ZIO_QS_ACTIVE;
mutex_enter(&vq->vq_active_list_lock);
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
}

static void
Expand All @@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
mutex_enter(&vq->vq_active_list_lock);
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
}

Expand Down Expand Up @@ -946,6 +958,27 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();

/*
* Bypass queue if certain conditions are met. Queue bypassing requires
* a non-rotational device. Reads / writes will attempt to bypass queue,
* depending on how full the queue is. Other operations will always
* queue.
*/
if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {

int bypass = vdev_queue_length(vq->vq_vdev) <
(zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
? 1 : 0;

if (bypass) {
zio->io_queue_state = ZIO_QS_BYPASS;
mutex_enter(&vq->vq_active_list_lock);
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
return (zio);
}
}

mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
Expand Down Expand Up @@ -978,6 +1011,14 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;

if (zio->io_queue_state == ZIO_QS_BYPASS) {
mutex_enter(&vq->vq_active_list_lock);
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
return;
}

mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);

Expand Down Expand Up @@ -1163,3 +1204,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
"Default queue depth for each allocator");

/* Expose zfs_vdev_queue_bypass_pct as a runtime-tunable module parameter. */
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
	"Queue bypass percentage per vdev");
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
[tests/functional/cli_root/zpool_set]
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
'user_property_001_pos', 'user_property_002_neg',
'user_property_001_pos', 'user_property_002_neg',
'zpool_set_clear_userprop']
tags = ['functional', 'cli_root', 'zpool_set']

Expand Down

0 comments on commit 90842f8

Please sign in to comment.