Allow bypassing the vdev queue on SSDs
Allow bypassing the vdev queue on SSDs if the vdev queue is less than
zfs_vdev_queue_bypass_pct percent full. This can lead to an over 2x
IOPS speed-up on some benchmarks. The intention behind this tunable
is to improve performance when using O_DIRECT.

Signed-off-by: MigeljanImeri <[email protected]>
MigeljanImeri committed Jan 23, 2025
1 parent 18c67d2 commit e33957a
Showing 5 changed files with 76 additions and 0 deletions.
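At a glance, the patch adds one admission check in vdev_queue_io() plus a dedicated vq_active_list_lock, since bypassed zios are inserted into and removed from vq_active_list without taking the main vq_lock. Below is a minimal standalone sketch of that admission decision; the helper name queue_bypass_ok() and the numeric defaults used in main() are illustrative assumptions, not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified restatement of the check added in vdev_queue_io(): a "normal"
 * read/write on a non-rotational vdev skips the queue while fewer than
 * bypass_pct percent of the maximum active I/Os are outstanding.
 */
static bool
queue_bypass_ok(bool vdev_nonrot, bool zio_is_normal,
    uint32_t queue_length, uint32_t max_active, uint32_t bypass_pct)
{
    if (!vdev_nonrot || !zio_is_normal)
        return (false);
    return (queue_length < (max_active * bypass_pct) / 100);
}

int
main(void)
{
    /* Assumes zfs_vdev_max_active at its default of 1000, bypass_pct at 10. */
    printf("%d\n", queue_bypass_ok(true, true, 50, 1000, 10));   /* 1: bypass */
    printf("%d\n", queue_bypass_ok(true, true, 500, 1000, 10));  /* 0: queue */
    printf("%d\n", queue_bypass_ok(true, false, 50, 1000, 10));  /* 0: scrub etc. */
    return (0);
}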
1 change: 1 addition & 0 deletions include/sys/vdev_impl.h
@@ -151,6 +151,7 @@ struct vdev_queue {
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
list_t vq_active_list; /* List of active I/Os. */
kmutex_t vq_active_list_lock; /* Protects vq_active_list. */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */
19 changes: 19 additions & 0 deletions include/sys/zio.h
@@ -249,6 +249,24 @@ typedef uint64_t zio_flag_t;
#define ZIO_CHILD_BIT(x) (1U << (x))
#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x)))


/*
* ZIOs flagged ZIO_FLAG_IMPORTANT are always queued so that they never get
* starved out. This allows us to bypass the queue for "normal" reads and
* writes when the queues are low, for better IOPS. If the queues get too full
* then we go back to queuing the "normal" reads/writes so as not to starve
* out more important IOs like scrub/resilver/retry. See
* zfs_vdev_queue_bypass_pct for details.
*/

#define ZIO_FLAG_IMPORTANT \
(ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL | \
ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB | \
ZIO_FLAG_IO_RETRY | ZIO_FLAG_NODATA)

#define ZIO_IS_NORMAL(zio) \
(!((zio)->io_flags & ZIO_FLAG_IMPORTANT))

enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
@@ -449,6 +467,7 @@ enum zio_qstate {
ZIO_QS_NONE = 0,
ZIO_QS_QUEUED,
ZIO_QS_ACTIVE,
ZIO_QS_BYPASS,
};

struct zio {
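To make the classification above concrete, here is a small self-contained sketch of how ZIO_IS_NORMAL() behaves; the flag bit values are illustrative stand-ins rather than the real zio_flag_t definitions from this header.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in bits; the real values are defined in sys/zio.h. */
#define ZIO_FLAG_IO_REPAIR  (1ULL << 0)
#define ZIO_FLAG_SELF_HEAL  (1ULL << 1)
#define ZIO_FLAG_RESILVER   (1ULL << 2)
#define ZIO_FLAG_SCRUB      (1ULL << 3)
#define ZIO_FLAG_IO_RETRY   (1ULL << 4)
#define ZIO_FLAG_NODATA     (1ULL << 5)

#define ZIO_FLAG_IMPORTANT \
    (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL | \
    ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB | \
    ZIO_FLAG_IO_RETRY | ZIO_FLAG_NODATA)

#define ZIO_IS_NORMAL(zio) \
    (!((zio)->io_flags & ZIO_FLAG_IMPORTANT))

struct zio { uint64_t io_flags; };

int
main(void)
{
    struct zio normal_read = { .io_flags = 0 };
    struct zio scrub_io = { .io_flags = ZIO_FLAG_SCRUB };

    printf("normal read may bypass: %d\n", ZIO_IS_NORMAL(&normal_read)); /* 1 */
    printf("scrub is always queued: %d\n", ZIO_IS_NORMAL(&scrub_io));    /* 0 */
    return (0);
}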
7 changes: 7 additions & 0 deletions man/man4/zfs.4
@@ -1528,6 +1528,13 @@ Default queue depth for each vdev IO allocator.
Higher values allow for better coalescing of sequential writes before sending
them to the disk, but can increase transaction commit times.
.
.It Sy zfs_vdev_queue_bypass_pct Ns = Ns Sy 10 Pq uint
Allow "normal" (read/write) zios to bypass the vdev's I/O queue when the
queue is less than
.Sy zfs_vdev_queue_bypass_pct
percent full.
This only applies to non-rotational drives (SSDs).
Scrub, resilver, repair, and retry zios are always queued.
Set this to 0 to always queue I/Os, or to 100 to never queue them.
.
.It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
Defines if the driver should retire on a given error type.
The following options may be bitwise-ored together:
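As a worked example for the zfs_vdev_queue_bypass_pct entry above: assuming zfs_vdev_max_active is left at its default of 1000 (an assumption; this patch does not change that tunable), the default value of 10 lets a normal zio bypass the queue only while fewer than 1000 * 10 / 100 = 100 I/Os are active on the vdev, so a heavy scrub or resilver can push the queue past that cutoff and force normal reads/writes back onto the queue.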
2 changes: 2 additions & 0 deletions module/zfs/vdev.c
@@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
mutex_enter(&vq->vq_active_list_lock);
fio = list_head(&vq->vq_active_list);
mutex_exit(&vq->vq_active_list_lock);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
47 changes: 47 additions & 0 deletions module/zfs/vdev_queue.c
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
*/
uint_t zfs_vdev_def_queue_depth = 32;

/*
* Allow I/O to bypass the queue depending on how full the queue is.
* 0 = never bypass, 100 = always bypass.
*/
uint_t zfs_vdev_queue_bypass_pct = 10;

static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
list_create(&vq->vq_active_list, sizeof (struct zio),
offsetof(struct zio, io_queue_node.l));
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)

list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
mutex_destroy(&vq->vq_active_list_lock);
}

static void
@@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit--;
}
zio->io_queue_state = ZIO_QS_ACTIVE;
mutex_enter(&vq->vq_active_list_lock);
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
}

static void
@@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
mutex_enter(&vq->vq_active_list_lock);
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
}

@@ -946,6 +958,30 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();

/*
* Bypass the queue if certain conditions are met. Queue bypassing requires
* a non-rotational device. Reads and writes will attempt to bypass the
* queue, depending on how full the queue is. Other operations will always
* be queued. Bypassing the queue can lead to a 2x IOPS speed-up on some
* benchmarks. If the queue is too full (e.g. due to a scrub or resilver)
* then go back to queuing normal reads/writes so as not to starve out
* the more important IOs.
*/
if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
int bypass = vdev_queue_length(vq->vq_vdev) <
(zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100;

if (bypass) {
zio->io_queue_state = ZIO_QS_BYPASS;
mutex_enter(&vq->vq_active_list_lock);
list_insert_tail(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
return (zio);
}
}

mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
@@ -978,6 +1014,14 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;

if (zio->io_queue_state == ZIO_QS_BYPASS) {
mutex_enter(&vq->vq_active_list_lock);
list_remove(&vq->vq_active_list, zio);
mutex_exit(&vq->vq_active_list_lock);
zio->io_queue_state = ZIO_QS_NONE;
return;
}

mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);

@@ -1163,3 +1207,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
"Default queue depth for each allocator");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
"Queue bypass percentage per vdev");
