diff --git a/scheds/rust/scx_rusty/src/bpf/intf.h b/scheds/rust/scx_rusty/src/bpf/intf.h
index 604548af8..125e8db93 100644
--- a/scheds/rust/scx_rusty/src/bpf/intf.h
+++ b/scheds/rust/scx_rusty/src/bpf/intf.h
@@ -34,6 +34,22 @@ enum consts {
 	LB_LOAD_BUCKETS = 100, /* Must be a factor of LB_MAX_WEIGHT */
 	LB_WEIGHT_PER_BUCKET = LB_MAX_WEIGHT / LB_LOAD_BUCKETS,
 
+	/* Time constants */
+	MSEC_PER_SEC = 1000LLU,
+	USEC_PER_MSEC = 1000LLU,
+	NSEC_PER_USEC = 1000LLU,
+	NSEC_PER_MSEC = USEC_PER_MSEC * NSEC_PER_USEC,
+	USEC_PER_SEC = USEC_PER_MSEC * MSEC_PER_SEC,
+	NSEC_PER_SEC = NSEC_PER_USEC * USEC_PER_SEC,
+
+	/* Constants used for determining a task's deadline */
+	DL_RUNTIME_SCALE = 2,	/* roughly scales average runtime to */
+				/* same order of magnitude as waker */
+				/* and blocked frequencies */
+	DL_MAX_LATENCY_NS = (50 * NSEC_PER_MSEC),
+	DL_FREQ_FT_MAX = 100000,
+	DL_MAX_LAT_PRIO = 39,
+
 	/*
 	 * When userspace load balancer is trying to determine the tasks to push
 	 * out from an overloaded domain, it looks at the the following number
@@ -71,6 +87,10 @@ enum stat_idx {
 	/* Errors */
 	RUSTY_STAT_TASK_GET_ERR,
 
+	/* Deadline related stats */
+	RUSTY_STAT_DL_CLAMP,
+	RUSTY_STAT_DL_PRESET,
+
 	RUSTY_NR_STATS,
 };
 
@@ -84,7 +104,19 @@ struct task_ctx {
 	u32 weight;
 	bool runnable;
 	u64 dom_active_pids_gen;
-	u64 running_at;
+	u64 deadline;
+
+	u64 sum_runtime;
+	u64 avg_runtime;
+	u64 last_run_at;
+
+	/* frequency with which a task is blocked (consumer) */
+	u64 blocked_freq;
+	u64 last_blocked_at;
+
+	/* frequency with which a task wakes other tasks (producer) */
+	u64 waker_freq;
+	u64 last_woke_at;
 
 	/* The task is a workqueue worker thread */
 	bool is_kworker;
@@ -110,11 +142,12 @@ struct bucket_ctx {
 };
 
 struct dom_ctx {
-	u64 vtime_now;
+	u32 id;
 	struct bpf_cpumask __kptr *cpumask;
 	struct bpf_cpumask __kptr *direct_greedy_cpumask;
 	struct bpf_cpumask __kptr *node_cpumask;
-	u32 node_id;
+
+	u64 min_vruntime;
 
 	u64 dbg_dcycle_printed_at;
 	struct bucket_ctx buckets[LB_LOAD_BUCKETS];
diff --git a/scheds/rust/scx_rusty/src/bpf/main.bpf.c b/scheds/rust/scx_rusty/src/bpf/main.bpf.c
index 87e30a7a6..592cab355 100644
--- a/scheds/rust/scx_rusty/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_rusty/src/bpf/main.bpf.c
@@ -76,7 +76,7 @@ const volatile u32 greedy_threshold_x_numa;
 const volatile u32 debug;
 
 /* base slice duration */
-const volatile u64 slice_ns = SCX_SLICE_DFL;
+static u64 slice_ns = SCX_SLICE_DFL;
 
 /*
  * Per-CPU context
@@ -116,6 +116,14 @@ struct lock_wrapper {
 	struct bpf_spin_lock lock;
 };
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct lock_wrapper);
+	__uint(max_entries, MAX_DOMS);
+	__uint(map_flags, 0);
+} dom_vtime_locks SEC(".maps");
+
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, u32);
@@ -144,6 +152,22 @@ struct {
 	__uint(map_flags, 0);
 } task_data SEC(".maps");
 
+static struct dom_ctx *try_lookup_dom_ctx(u32 dom_id)
+{
+	return bpf_map_lookup_elem(&dom_data, &dom_id);
+}
+
+static struct dom_ctx *lookup_dom_ctx(u32 dom_id)
+{
+	struct dom_ctx *domc;
+
+	domc = try_lookup_dom_ctx(dom_id);
+	if (!domc)
+		scx_bpf_error("Failed to lookup dom[%u]", dom_id);
+
+	return domc;
+}
+
 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
 {
 	struct task_ctx *taskc;
@@ -196,7 +220,7 @@ static struct bucket_ctx *lookup_dom_bucket(struct dom_ctx *dom_ctx,
 	return NULL;
 }
 
-static struct lock_wrapper *lookup_dom_lock(u32 dom_id, u32 weight)
+static struct lock_wrapper *lookup_dom_bkt_lock(u32 dom_id, u32 weight)
 {
 	u32 idx = dom_id * LB_LOAD_BUCKETS + weight_to_bucket_idx(weight);
 	struct lock_wrapper *lockw;
@@ -209,6 +233,33 @@ static struct lock_wrapper *lookup_dom_lock(u32 dom_id, u32 weight)
 	return NULL;
 }
 
+static struct lock_wrapper *lookup_dom_vtime_lock(u32 dom_id)
+{
+	struct lock_wrapper *lockw;
+	u32 idx = dom_id;
+
+	lockw = bpf_map_lookup_elem(&dom_vtime_locks, &idx);
+	if (!lockw)
+		scx_bpf_error("Failed to lookup dom lock");
+
+	return lockw;
+}
+
+static inline bool vtime_before(u64 a, u64 b)
+{
+	return (s64)(a - b) < 0;
+}
+
+static u64 scale_up_fair(u64 value, u64 weight)
+{
+	return value * weight / 100;
+}
+
+static u64 scale_inverse_fair(u64 value, u64 weight)
+{
+	return value * 100 / weight;
+}
+
 static void dom_dcycle_adj(u32 dom_id, u32 weight, u64 now, bool runnable)
 {
 	struct dom_ctx *domc;
@@ -217,14 +268,11 @@ static void dom_dcycle_adj(u32 dom_id, u32 weight, u64 now, bool runnable)
 	s64 adj = runnable ? 1 : -1;
 	u32 bucket_idx = 0;
 
-	domc = bpf_map_lookup_elem(&dom_data, &dom_id);
-	if (!domc) {
-		scx_bpf_error("Failed to lookup dom_ctx");
+	if (!(domc = lookup_dom_ctx(dom_id)))
 		return;
-	}
 
 	bucket = lookup_dom_bucket(domc, weight, &bucket_idx);
-	lockw = lookup_dom_lock(dom_id, weight);
+	lockw = lookup_dom_bkt_lock(dom_id, weight);
 	if (!bucket || !lockw)
 		return;
 
@@ -248,24 +296,20 @@ static void dom_dcycle_adj(u32 dom_id, u32 weight, u64 now, bool runnable)
 	}
 }
 
-static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
-			       u32 from_dom_id, u32 to_dom_id, u64 now)
+static void dom_dcycle_xfer_task(struct task_struct *p, struct task_ctx *taskc,
+				 struct dom_ctx *from_domc,
+				 struct dom_ctx *to_domc, u64 now)
 {
 	struct bucket_ctx *from_bucket, *to_bucket;
 	u32 idx = 0, weight = taskc->weight;
-	struct dom_ctx *from_domc, *to_domc;
 	struct lock_wrapper *from_lockw, *to_lockw;
 	struct ravg_data task_dcyc_rd;
 	u64 from_dcycle[2], to_dcycle[2], task_dcycle;
 
-	from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id);
-	from_lockw = lookup_dom_lock(from_dom_id, weight);
-	to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id);
-	to_lockw = lookup_dom_lock(to_dom_id, weight);
-	if (!from_domc || !from_lockw || !to_domc || !to_lockw) {
-		scx_bpf_error("dom_ctx / lock lookup failed");
+	from_lockw = lookup_dom_bkt_lock(from_domc->id, weight);
+	to_lockw = lookup_dom_bkt_lock(to_domc->id, weight);
+	if (!from_lockw || !to_lockw)
 		return;
-	}
 
 	from_bucket = lookup_dom_bucket(from_domc, weight, &idx);
 	to_bucket = lookup_dom_bucket(to_domc, weight, &idx);
@@ -273,8 +317,8 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
 		return;
 
 	/*
-	 * @p is moving from @from_dom_id to @to_dom_id. Its duty cycle
-	 * contribution in the relevant bucket of @from_dom_id should be moved
+	 * @p is moving from @from_domc to @to_domc. Its duty cycle
+	 * contribution in the relevant bucket of @from_domc should be moved
 	 * together to the corresponding bucket in @to_dom_id. We only track
 	 * duty cycle from BPF. Load is computed in user space when performing
 	 * load balancing.
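As a quick illustration (not part of the patch), the two helpers added above encode the weight convention used by the rest of this series: weight 100 is neutral, scale_up_fair() grows a value for heavier tasks, and scale_inverse_fair() shrinks the vtime/deadline charge for them. The standalone userspace sketch below mirrors the helpers with a made-up 20ms runtime delta; everything in it besides the two helper bodies is illustrative.

```c
/*
 * Illustration only -- mirrors scale_up_fair()/scale_inverse_fair() from the
 * patch to show how the same 20ms of CPU time is charged at different weights:
 * weight 100 is neutral, heavier tasks are charged less vtime per unit of
 * runtime, lighter tasks are charged more.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_up_fair(uint64_t value, uint64_t weight)
{
	return value * weight / 100;
}

static uint64_t scale_inverse_fair(uint64_t value, uint64_t weight)
{
	return value * 100 / weight;
}

int main(void)
{
	const uint64_t delta_ns = 20ULL * 1000 * 1000;	/* 20ms of CPU time */
	const uint64_t weights[] = { 50, 100, 200 };

	for (int i = 0; i < 3; i++)
		printf("weight=%3llu vtime_charge=%llums scaled_up=%llums\n",
		       (unsigned long long)weights[i],
		       (unsigned long long)(scale_inverse_fair(delta_ns, weights[i]) / 1000000),
		       (unsigned long long)(scale_up_fair(delta_ns, weights[i]) / 1000000));
	return 0;
}
```

Run standalone, this prints a 40ms/20ms/10ms vtime charge for weights 50/100/200 — the same inverse-weight charging that rusty_stopping() applies later in this patch.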
@@ -284,7 +328,7 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
 	if (debug >= 2)
 		task_dcycle = ravg_read(&task_dcyc_rd, now, load_half_life);
 
-	/* transfer out of @from_dom_id */
+	/* transfer out of @from_domc */
 	bpf_spin_lock(&from_lockw->lock);
 	if (taskc->runnable)
 		from_bucket->dcycle--;
@@ -300,7 +344,7 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
 
 	bpf_spin_unlock(&from_lockw->lock);
 
-	/* transfer into @to_dom_id */
+	/* transfer into @to_domc */
 	bpf_spin_lock(&to_lockw->lock);
 	if (taskc->runnable)
 		to_bucket->dcycle++;
@@ -317,8 +361,8 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
 	bpf_spin_unlock(&to_lockw->lock);
 
 	if (debug >= 2)
-		bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
-			   from_dom_id, to_dom_id,
+		bpf_printk("XFER DCYCLE dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
+			   from_domc->id, to_domc->id,
 			   task_dcycle >> RAVG_FRAC_BITS,
 			   from_dcycle[0] >> RAVG_FRAC_BITS,
 			   from_dcycle[1] >> RAVG_FRAC_BITS,
@@ -326,6 +370,39 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
 			   to_dcycle[1] >> RAVG_FRAC_BITS);
 }
 
+static u64 dom_min_vruntime(struct dom_ctx *domc)
+{
+	/*
+	 * Technically, this is undefined behavior according to the C standard,
+	 * section 5.1.2.4:
+	 *
+	 * The execution of a program contains a data race if it contains two
+	 * conflicting actions in different threads, at least one of which is
+	 * not atomic, and neither happens before the other. Any such data race
+	 * results in undefined behavior.
+	 *
+	 * To get around this we can adopt what's done in the LKMM by using
+	 * READ_ONCE() and WRITE_ONCE() macros.
+	 *
+	 * XXX: Once those macros are added, update this access.
+	 */
+	return domc->min_vruntime;
+}
+
+static void dom_xfer_task(struct task_struct *p, struct task_ctx *taskc,
+			  u32 from_dom_id, u32 to_dom_id, u64 now)
+{
+	struct dom_ctx *from_domc, *to_domc;
+
+	from_domc = lookup_dom_ctx(from_dom_id);
+	to_domc = lookup_dom_ctx(to_dom_id);
+
+	if (!from_domc || !to_domc)
+		return;
+
+	dom_dcycle_xfer_task(p, taskc, from_domc, to_domc, now);
+}
+
 /*
  * Statistics
  */
@@ -364,6 +441,7 @@ struct {
  */
 struct tune_input{
 	u64 gen;
+	u64 slice_ns;
 	u64 direct_greedy_cpumask[MAX_CPUS / 64];
 	u64 kick_greedy_cpumask[MAX_CPUS / 64];
 } tune_input;
@@ -373,11 +451,6 @@ private(A) struct bpf_cpumask __kptr *all_cpumask;
 private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask;
 private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask;
 
-static inline bool vtime_before(u64 a, u64 b)
-{
-	return (s64)(a - b) < 0;
-}
-
 static u32 cpu_to_dom_id(s32 cpu)
 {
 	const volatile u32 *dom_idp;
@@ -405,6 +478,7 @@ static void refresh_tune_params(void)
 		return;
 
 	tune_params_gen = tune_input.gen;
+	slice_ns = tune_input.slice_ns;
 
 	bpf_for(cpu, 0, nr_cpus_possible) {
 		u32 dom_id = cpu_to_dom_id(cpu);
@@ -413,10 +487,8 @@ static void refresh_tune_params(void)
 		if (is_offline_cpu(cpu))
 			continue;
 
-		if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) {
-			scx_bpf_error("Failed to lookup dom[%u]", dom_id);
+		if (!(domc = lookup_dom_ctx(dom_id)))
 			return;
-		}
 
 		if (tune_input.direct_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) {
 			if (direct_greedy_cpumask)
@@ -440,13 +512,253 @@ static void refresh_tune_params(void)
 	}
 }
 
+/*
+ * log2 helper functions taken from scx_lavd
+ */
+static inline __attribute__((always_inline)) u32 bpf_log2(u32 v)
+{
+	u32 r;
+	u32 shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static inline __attribute__((always_inline)) u32 bpf_log2l(u64 v)
+{
+	u32 hi = v >> 32;
+	if (hi)
+		return bpf_log2(hi) + 32 + 1;
+	else
+		return bpf_log2(v) + 1;
+}
+
+static u64 min(u64 a, u64 b)
+{
+	return a <= b ? a : b;
+}
+
+/*
+ * ** Taken directly from fair.c in the Linux kernel **
+ *
+ * We use this table to inversely scale deadline according to a task's
+ * calculated latency factor. We preserve the comment directly from the table
+ * in fair.c:
+ *
+ * "Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)"
+ */
+const int sched_prio_to_weight[DL_MAX_LAT_PRIO] = {
+	/* -20 */ 88761, 71755, 56483, 46273, 36291,
+	/* -15 */ 29154, 23254, 18705, 14949, 11916,
+	/* -10 */ 9548, 7620, 6100, 4904, 3906,
+	/*  -5 */ 3121, 2501, 1991, 1586, 1277,
+	/*   0 */ 1024, 820, 655, 526, 423,
+	/*   5 */ 335, 272, 215, 172, 137,
+	/*  10 */ 110, 87, 70, 56, 45,
+	/*  15 */ 36, 29, 23, 18, 15,
+};
+
+static u64 sched_prio_to_latency_weight(u64 prio)
+{
+	if (prio >= DL_MAX_LAT_PRIO) {
+		scx_bpf_error("Invalid prio index");
+		return 0;
+	}
+
+	return sched_prio_to_weight[DL_MAX_LAT_PRIO - prio - 1];
+}
+
+static u64 task_compute_dl(struct task_struct *p, struct task_ctx *taskc,
+			   u64 enq_flags)
+{
+	u64 waker_freq, blocked_freq;
+	u64 lat_prio, lat_scale, avg_run_raw, avg_run;
+	u64 freq_factor;
+
+	/*
+	 * Determine the latency criticality of a task, and scale a task's
+	 * deadline accordingly. Much of this is inspired by the logic in
+	 * scx_lavd that was originally conceived and implemented by Changwoo
+	 * Min. Though the implementations for determining latency criticality
+	 * are quite different in many ways, individuals familiar with both
+	 * schedulers will feel an eerie sense of deja-vu. The details of
+	 * interactivity boosting for rusty are described below.
+	 */
+
+	/*
+	 * We begin by calculating the following interactivity factors for a
+	 * task:
+	 *
+	 * - waker_freq: The frequency with which a task wakes up other tasks.
+	 *		 A high waker frequency generally implies a producer
+	 *		 task that is at the beginning and/or middle of a work
+	 *		 chain.
+	 *
+	 * - blocked_freq: The frequency with which a task is blocked. A high
+	 *		   blocked frequency indicates a consumer task that is
+	 *		   at the middle and/or end of a work chain.
+	 *
+	 * A task that is high in both frequencies indicates what is often the
+	 * most latency-critical interactive task: a task that functions both
+	 * as a producer and a consumer by being in the _middle_ of a work
+	 * chain.
+	 *
+	 * We want to prioritize running these tasks, as they are likely to
+	 * have a disproportionate impact on the latency (and possibly
+	 * throughput) of the workload they are enabling due to Amdahl's law.
+	 * For example, say that you have a workload where 50% of the workload
+	 * is serialized by a producer and consumer task (25% each), and the
+	 * latter 50% is serviced in parallel by n CPU hogging tasks. If either
If either + * the producer or consumer is improved by a factor of x, it improves + * the latency of the entire workload by: + * + * S_lat(x) = 1 / ((1 - .25) + (.25 / x)) + * + * Say that we improve wakeup latency by 2x for either task, the + * latency improvement would be: + * + * S_lat(2) = 1 / ((1 - .25) + (.25 / 2)) + * = 1 / (.75 + .125) + * = 1 / .875 + * ~= 14.2% + * + * If we instead improve wakeup latency by 2x for all the n parallel + * tasks in the latter 50% of the workload window, the improvement + * would be: + * + * S_lat(2) = 1 / ((1 - .5) + (.5 / 2)) + * = 1 / (.5 + .25) + * = 1 / .75 + * ~= 33% + * + * This is also significant, but the returns are amortized across all + * of those tasks. Thus, by giving a latency boost to the producer / + * consumer tasks, we optimize for the case of scheduling tasks that + * are on the critical path for serial workchains, and have a + * disproportionate impact on the latency of a workload. + * + * We multiply the frequencies of wait_freq and waker_freq somewhat + * arbitrarily, based on observed performance for audio and gaming + * interactive workloads. + */ + waker_freq = min(taskc->waker_freq, DL_FREQ_FT_MAX); + blocked_freq = min(taskc->blocked_freq, DL_FREQ_FT_MAX); + freq_factor = blocked_freq * waker_freq * waker_freq; + + /* + * Scale the frequency factor according to the task's weight. A task + * with higher weight is given a higher frequency factor than a task + * with a lower weight. + */ + freq_factor = scale_up_fair(freq_factor, p->scx.weight); + + /* + * The above frequencies roughly follow an exponential distribution, so + * borrow the bpf_log2l() implementation from lavd to linearize it to a + * boost priority. + */ + lat_prio = bpf_log2l(freq_factor + 1); + lat_prio = min(lat_prio, DL_MAX_LAT_PRIO); + + /* + * Next calculate a task's average runtime, and apply it to deadline + * accordingly. A task with a large runtime is penalized from an + * interactivity standpoint, for obvious reasons. + * + * As with waker and blocked frequencies above, this follows an + * exponential distribution. We inversely scale to account for + * empirical observations which seem to bring it roughly to the same + * order of magnitude as the blocker and waker frequencies above. + * + * We inversely scale the task's averge_runtime to cause tasks with + * lower weight to receive a harsher penalty for long runtimes, and + * vice versa for tasks with lower weight. + */ + avg_run_raw = taskc->avg_runtime / DL_RUNTIME_SCALE; + avg_run_raw = (avg_run_raw, DL_MAX_LATENCY_NS); + avg_run_raw = scale_inverse_fair(avg_run_raw, p->scx.weight); + avg_run = bpf_log2l(avg_run_raw + 1); + + if (avg_run < lat_prio) { + /* Equivalent to lat_prio = log(freq_factor / avg_run_raw) */ + lat_prio -= avg_run; + } else { + lat_prio = 0; + } + + /* + * Ultimately, what we're trying to arrive at is a single value + * 'lat_prio' that we can use to compute the weight that we use to + * scale a task's average runtime as below. + * + * To summarize what we've done above, we compute this lat_prio as the + * sum of a task's frequency factor, minus an average runtime factor. + * Both factors are scaled according to a task's weight. + * + * Today, we're just interpreting lat_prio as a niceness value, but + * this can and almost certainly will likely change to something more + * generic and/or continuous and flexible so that it can also + * accommodate cgroups. 
+
+static void clamp_task_vtime(struct task_struct *p, struct task_ctx *taskc, u64 enq_flags)
+{
+	u64 dom_vruntime, min_vruntime;
+	struct dom_ctx *domc;
+
+	if (!(domc = lookup_dom_ctx(taskc->dom_id)))
+		return;
+
+	dom_vruntime = dom_min_vruntime(domc);
+	min_vruntime = dom_vruntime - slice_ns;
+	/*
+	 * Allow an idling task to accumulate at most one slice worth of
+	 * vruntime budget. This prevents e.g. a task that slept for 1 day
+	 * from coming back and having essentially full use of the CPU for an
+	 * entire day until it's caught up to the other tasks' vtimes.
+	 */
+	if (vtime_before(p->scx.dsq_vtime, min_vruntime)) {
+		p->scx.dsq_vtime = min_vruntime;
+		taskc->deadline = p->scx.dsq_vtime + task_compute_dl(p, taskc, enq_flags);
+		stat_add(RUSTY_STAT_DL_CLAMP, 1);
+	} else {
+		stat_add(RUSTY_STAT_DL_PRESET, 1);
+	}
+}
+
 static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p,
 			    u32 new_dom_id, bool init_dsq_vtime)
 {
 	struct dom_ctx *old_domc, *new_domc;
 	struct bpf_cpumask *d_cpumask, *t_cpumask;
 	u32 old_dom_id = taskc->dom_id;
-	s64 vtime_delta;
 
 	t_cpumask = taskc->cpumask;
 	if (!t_cpumask) {
@@ -454,18 +766,11 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p,
 		return false;
 	}
 
-	old_domc = bpf_map_lookup_elem(&dom_data, &old_dom_id);
-	if (!old_domc) {
-		scx_bpf_error("Failed to lookup old dom%u", old_dom_id);
+	old_domc = lookup_dom_ctx(old_dom_id);
+	if (!old_domc)
 		return false;
-	}
-
-	if (init_dsq_vtime)
-		vtime_delta = 0;
-	else
-		vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now;
 
-	new_domc = bpf_map_lookup_elem(&dom_data, &new_dom_id);
+	new_domc = try_lookup_dom_ctx(new_dom_id);
 	if (!new_domc) {
 		if (new_dom_id == NO_DOM_FOUND) {
 			taskc->offline = true;
@@ -499,10 +804,12 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p,
 			    p->cpus_ptr)) {
 		u64 now = bpf_ktime_get_ns();
 
-		dom_load_xfer_task(p, taskc, taskc->dom_id, new_dom_id, now);
-
-		p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta;
+		if (!init_dsq_vtime)
+			dom_xfer_task(p, taskc, taskc->dom_id, new_dom_id, now);
 		taskc->dom_id = new_dom_id;
+		p->scx.dsq_vtime = dom_min_vruntime(new_domc);
+		taskc->deadline = p->scx.dsq_vtime +
+			scale_inverse_fair(taskc->avg_runtime, taskc->weight);
 		bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask,
 				p->cpus_ptr);
 	}
@@ -550,11 +857,9 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
 	const struct cpumask *idle_cpumask;
 	bool has_idle;
 
-	domc = bpf_map_lookup_elem(&dom_data, &taskc->dom_id);
-	if (!domc) {
-		scx_bpf_error("Failed to find dom%u", taskc->dom_id);
+	domc = lookup_dom_ctx(taskc->dom_id);
+	if (!domc)
 		goto enoent;
-	}
 	d_cpumask = domc->cpumask;
 	if (!d_cpumask) {
 		scx_bpf_error("Failed to acquire dom%u cpumask kptr",
@@ -657,10 +962,8 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
 		struct dom_ctx *domc;
 		struct bpf_cpumask *tmp_direct_greedy, *node_mask;
 
-		if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) {
-			scx_bpf_error("Failed to lookup dom[%u]", dom_id);
+		if (!(domc = lookup_dom_ctx(dom_id)))
 			goto enoent;
-		}
 
 		tmp_direct_greedy = direct_greedy_cpumask;
 		if (!tmp_direct_greedy) {
@@ -772,6 +1075,14 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
 	return -ENOENT;
 }
 
+static void place_task_dl(struct task_struct *p, struct task_ctx *taskc,
+			  u64 enq_flags)
+{
+	clamp_task_vtime(p, taskc, enq_flags);
+	scx_bpf_dispatch_vtime(p, taskc->dom_id, slice_ns, taskc->deadline,
+			       enq_flags);
+}
+
 void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	struct task_ctx *taskc;
@@ -822,28 +1133,10 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags)
 	}
 
 dom_queue:
-	if (fifo_sched) {
+	if (fifo_sched)
 		scx_bpf_dispatch(p, taskc->dom_id, slice_ns, enq_flags);
-	} else {
-		u64 vtime = p->scx.dsq_vtime;
-		u32 dom_id = taskc->dom_id;
-		struct dom_ctx *domc;
-
-		domc = bpf_map_lookup_elem(&dom_data, &dom_id);
-		if (!domc) {
-			scx_bpf_error("Failed to lookup dom[%u]", dom_id);
-			return;
-		}
-
-		/*
-		 * Limit the amount of budget that an idling task can accumulate
-		 * to one slice.
-		 */
-		if (vtime_before(vtime, domc->vtime_now - slice_ns))
-			vtime = domc->vtime_now - slice_ns;
-
-		scx_bpf_dispatch_vtime(p, taskc->dom_id, slice_ns, vtime, enq_flags);
-	}
+	else
+		place_task_dl(p, taskc, enq_flags);
 
 	/*
 	 * If there are CPUs which are idle and not saturated, wake them up to
@@ -885,29 +1178,6 @@ static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
 	return false;
 }
 
-static u32 dom_rr_next(s32 cpu)
-{
-	struct pcpu_ctx *pcpuc;
-	u32 idx, *dom_id;
-
-	pcpuc = lookup_pcpu_ctx(cpu);
-	if (!pcpuc || !pcpuc->nr_node_doms)
-		return 0;
-
-	idx = (pcpuc->dom_rr_cur + 1) % pcpuc->nr_node_doms;
-	dom_id = MEMBER_VPTR(pcpuc->node_doms, [idx]);
-	if (!dom_id) {
-		scx_bpf_error("Failed to lookup dom for %d", cpu);
-		return 0;
-	}
-
-	if (*dom_id == cpu_to_dom_id(cpu))
-		scx_bpf_error("%d found current dom in node_doms array", cpu);
-
-	pcpuc->dom_rr_cur++;
-	return *dom_id;
-}
-
 u32 dom_node_id(u32 dom_id)
 {
 	const volatile u32 *nid_ptr;
@@ -965,23 +1235,73 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
 	}
 }
 
+/*
+ * Exponential weighted moving average
+ *
+ * Copied from scx_lavd. Returns the new average as:
+ *
+ *	new_avg := (old_avg * .75) + (new_val * .25);
+ */
+static u64 calc_avg(u64 old_val, u64 new_val)
+{
+	return (old_val - (old_val >> 2)) + (new_val >> 2);
+}
+
+static u64 update_freq(u64 freq, u64 interval)
+{
+	u64 new_freq;
+
+	new_freq = (100 * NSEC_PER_MSEC) / interval;
+	return calc_avg(freq, new_freq);
+}
+
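calc_avg() and update_freq() above are the only per-task statistics the deadline path keeps. As an illustration only (not part of the patch; the wake intervals are made up), the standalone sketch below feeds a few inter-wake intervals through the same EWMA to show how the estimate — expressed as events per 100ms — converges:

```c
/* Illustration only -- mirrors calc_avg()/update_freq() from the patch. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	(1000ULL * 1000)

static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
	/* new_avg := old_avg * 0.75 + new_val * 0.25 */
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

static uint64_t update_freq(uint64_t freq, uint64_t interval)
{
	/* instantaneous events-per-100ms, folded into the running average */
	return calc_avg(freq, (100 * NSEC_PER_MSEC) / interval);
}

int main(void)
{
	uint64_t freq = 0;
	uint64_t intervals_ns[] = {
		10 * NSEC_PER_MSEC,	/* woke another task 10ms after the last wake */
		5 * NSEC_PER_MSEC,
		2 * NSEC_PER_MSEC,
		2 * NSEC_PER_MSEC,
	};

	for (int i = 0; i < 4; i++) {
		freq = update_freq(freq, intervals_ns[i]);
		printf("after wake %d: waker_freq=%llu\n", i + 1,
		       (unsigned long long)freq);
	}
	return 0;
}
```

Because the old average decays by only a quarter per event, freshly created tasks start out with near-zero frequencies; this is also why rusty_init_task() later in the patch seeds last_woke_at/last_blocked_at with the current time rather than zero.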
 void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags)
 {
-	u64 now = bpf_ktime_get_ns();
-	struct task_ctx *taskc;
+	u64 now = bpf_ktime_get_ns(), interval;
+	struct task_struct *waker;
+	struct task_ctx *wakee_ctx, *waker_ctx;
 
-	if (!(taskc = lookup_task_ctx(p)))
+	if (!(wakee_ctx = lookup_task_ctx(p)))
 		return;
 
-	if (taskc->offline) {
+	if (wakee_ctx->offline) {
 		scx_bpf_error("Offline task [%s](%d) is becoming runnable",
 			      p->comm, p->pid);
 		return;
 	}
 
-	taskc->is_kworker = p->flags & PF_WQ_WORKER;
-	task_load_adj(p, taskc, now, true);
-	dom_dcycle_adj(taskc->dom_id, taskc->weight, now, true);
+	wakee_ctx->is_kworker = p->flags & PF_WQ_WORKER;
+
+	task_load_adj(p, wakee_ctx, now, true);
+	dom_dcycle_adj(wakee_ctx->dom_id, wakee_ctx->weight, now, true);
+
+	if (fifo_sched)
+		return;
+
+	wakee_ctx->sum_runtime = 0;
+
+	waker = bpf_get_current_task_btf();
+	if (!(waker_ctx = lookup_task_ctx(waker)))
+		return;
+
+	interval = now - waker_ctx->last_woke_at;
+	waker_ctx->waker_freq = update_freq(waker_ctx->waker_freq, interval);
+	waker_ctx->last_woke_at = now;
+}
+
+static void running_update_vtime(struct task_struct *p,
+				 struct task_ctx *taskc,
+				 struct dom_ctx *domc)
+{
+	struct lock_wrapper* lockw = lookup_dom_vtime_lock(domc->id);
+
+	if (!lockw)
+		return;
+
+	bpf_spin_lock(&lockw->lock);
+	if (vtime_before(domc->min_vruntime, p->scx.dsq_vtime))
+		domc->min_vruntime = p->scx.dsq_vtime;
+	bpf_spin_unlock(&lockw->lock);
 }
 
 void BPF_STRUCT_OPS(rusty_running, struct task_struct *p)
@@ -993,7 +1313,6 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p)
 	if (!(taskc = lookup_task_ctx(p)))
 		return;
 
-	taskc->running_at = bpf_ktime_get_ns();
 	dom_id = taskc->dom_id;
 	if (dom_id >= MAX_DOMS) {
 		scx_bpf_error("Invalid dom ID");
@@ -1025,25 +1344,38 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p)
 	if (fifo_sched)
 		return;
 
-	domc = bpf_map_lookup_elem(&dom_data, &dom_id);
-	if (!domc) {
-		scx_bpf_error("Failed to lookup dom[%u]", dom_id);
+	domc = lookup_dom_ctx(dom_id);
+	if (!domc)
 		return;
-	}
 
-	/*
-	 * Global vtime always progresses forward as tasks start executing. The
-	 * test and update can be performed concurrently from multiple CPUs and
-	 * thus racy. Any error should be contained and temporary. Let's just
-	 * live with it.
-	 */
-	if (vtime_before(domc->vtime_now, p->scx.dsq_vtime))
-		domc->vtime_now = p->scx.dsq_vtime;
+	running_update_vtime(p, taskc, domc);
+	taskc->last_run_at = bpf_ktime_get_ns();
+}
+
+static void stopping_update_vtime(struct task_struct *p,
+				  struct task_ctx *taskc,
+				  struct dom_ctx *domc)
+{
+	struct lock_wrapper* lockw = lookup_dom_vtime_lock(domc->id);
+	u64 now, delta;
+
+	if (!lockw)
+		return;
+
+	now = bpf_ktime_get_ns();
+	delta = now - taskc->last_run_at;
+
+	taskc->sum_runtime += delta;
+	taskc->avg_runtime = calc_avg(taskc->avg_runtime, taskc->sum_runtime);
+
+	p->scx.dsq_vtime += scale_inverse_fair(delta, p->scx.weight);
+	taskc->deadline = p->scx.dsq_vtime + task_compute_dl(p, taskc, 0);
 }
 
 void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable)
 {
 	struct task_ctx *taskc;
+	struct dom_ctx *domc;
 
 	if (fifo_sched)
 		return;
@@ -1051,21 +1383,33 @@ void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable)
 	if (!(taskc = lookup_task_ctx(p)))
 		return;
 
-	/* scale the execution time by the inverse of the weight and charge */
-	p->scx.dsq_vtime +=
-		(bpf_ktime_get_ns() - taskc->running_at) * 100 / p->scx.weight;
+	if (!(domc = lookup_dom_ctx(taskc->dom_id)))
+		return;
+
+	stopping_update_vtime(p, taskc, domc);
 }
 
 void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags)
 {
-	u64 now = bpf_ktime_get_ns();
+	u64 now = bpf_ktime_get_ns(), interval;
 	struct task_ctx *taskc;
+	struct dom_ctx *domc;
 
 	if (!(taskc = lookup_task_ctx(p)))
 		return;
 
 	task_load_adj(p, taskc, now, false);
 	dom_dcycle_adj(taskc->dom_id, taskc->weight, now, false);
+
+	if (fifo_sched)
+		return;
+
+	if (!(domc = lookup_dom_ctx(taskc->dom_id)))
+		return;
+
+	interval = now - taskc->last_blocked_at;
+	taskc->blocked_freq = update_freq(taskc->blocked_freq, interval);
+	taskc->last_blocked_at = now;
 }
 
 void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight)
@@ -1161,7 +1505,13 @@ static s32 create_save_cpumask(struct bpf_cpumask **kptr)
 s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p,
 		   struct scx_init_task_args *args)
 {
-	struct task_ctx taskc = { .dom_active_pids_gen = -1 };
+	u64 now = bpf_ktime_get_ns();
+	struct task_ctx taskc = {
+		.dom_active_pids_gen = -1,
+		.last_blocked_at = now,
+		.last_woke_at = now,
+
+	};
 	struct task_ctx *map_value;
 	long ret;
 	pid_t pid;
@@ -1292,13 +1642,11 @@ static s32 create_dom(u32 dom_id)
 		return ret;
 	}
 
-	domc = bpf_map_lookup_elem(&dom_data, &dom_id);
-	if (!domc) {
-		/* Should never happen, it's created statically at load time. */
-		scx_bpf_error("No dom%u", dom_id);
+	domc = lookup_dom_ctx(dom_id);
+	if (!domc)
 		return -ENOENT;
-	}
 
+	domc->id = dom_id;
 	ret = create_save_cpumask(&domc->cpumask);
 	if (ret)
@@ -1373,11 +1721,9 @@ static s32 initialize_cpu(s32 cpu)
 	pcpuc->dom_rr_cur = cpu;
 	bpf_for(i, 0, nr_doms) {
-		domc = bpf_map_lookup_elem(&dom_data, &i);
-		if (!domc) {
-			scx_bpf_error("Failed to lookup dom_ctx");
+		domc = lookup_dom_ctx(i);
+		if (!domc)
 			return -ENOENT;
-		}
 		bpf_rcu_read_lock();
 		cpumask = domc->node_cpumask;
 		if (!cpumask) {
@@ -1492,4 +1838,5 @@ SCX_OPS_DEFINE(rusty,
 	       .cpu_offline = (void *)rusty_cpu_offline,
 	       .init = (void *)rusty_init,
 	       .exit = (void *)rusty_exit,
+	       .timeout_ms = 10000,
 	       .name = "rusty");
diff --git a/scheds/rust/scx_rusty/src/main.rs b/scheds/rust/scx_rusty/src/main.rs
index 468942624..1b86ba5d1 100644
--- a/scheds/rust/scx_rusty/src/main.rs
+++ b/scheds/rust/scx_rusty/src/main.rs
@@ -79,9 +79,13 @@ const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 /// limitation will be removed in the future.
 #[derive(Debug, Parser)]
 struct Opts {
-    /// Scheduling slice duration in microseconds.
-    #[clap(short = 's', long, default_value = "20000")]
-    slice_us: u64,
+    /// Scheduling slice duration for under-utilized hosts, in microseconds.
+    #[clap(short = 'u', long, default_value = "20000")]
+    slice_us_underutil: u64,
+
+    /// Scheduling slice duration for over-utilized hosts, in microseconds.
+    #[clap(short = 'o', long, default_value = "1000")]
+    slice_us_overutil: u64,
 
     /// Monitoring and load balance interval in seconds.
     #[clap(short = 'i', long, default_value = "2.0")]
@@ -305,7 +309,6 @@ impl<'a> Scheduler<'a> {
         }
 
         skel.struct_ops.rusty_mut().exit_dump_len = opts.exit_dump_len;
-        skel.rodata_mut().slice_ns = opts.slice_us * 1000;
         skel.rodata_mut().load_half_life = (opts.load_half_life * 1000000000.0) as u32;
         skel.rodata_mut().kthreads_local = opts.kthreads_local;
         skel.rodata_mut().fifo_sched = opts.fifo_sched;
@@ -342,7 +345,11 @@ impl<'a> Scheduler<'a> {
 
             nr_lb_data_errors: 0,
 
-            tuner: Tuner::new(domains, opts.direct_greedy_under, opts.kick_greedy_under)?,
+            tuner: Tuner::new(domains,
+                              opts.direct_greedy_under,
+                              opts.kick_greedy_under,
+                              opts.slice_us_underutil * 1000,
+                              opts.slice_us_overutil * 1000,)?,
         })
     }
 
@@ -489,7 +496,14 @@ impl<'a> Scheduler<'a> {
             stat_pct(bpf_intf::stat_idx_RUSTY_STAT_KICK_GREEDY),
             stat_pct(bpf_intf::stat_idx_RUSTY_STAT_REPATRIATE),
         );
+        info!(
+            "dl_clamped={:5.2} dl_preset={:5.2}",
+            stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DL_CLAMP),
+            stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DL_PRESET),
+
+        );
+        info!("slice_length={}us", self.tuner.slice_ns / 1000);
 
         info!("direct_greedy_cpumask={}", self.tuner.direct_greedy_mask);
         info!(" kick_greedy_cpumask={}", self.tuner.kick_greedy_mask);
diff --git a/scheds/rust/scx_rusty/src/tuner.rs b/scheds/rust/scx_rusty/src/tuner.rs
index e047efcec..0be6ff783 100644
--- a/scheds/rust/scx_rusty/src/tuner.rs
+++ b/scheds/rust/scx_rusty/src/tuner.rs
@@ -70,6 +70,9 @@ pub struct Tuner {
     pub direct_greedy_mask: Cpumask,
     pub kick_greedy_mask: Cpumask,
     pub fully_utilized: bool,
+    pub slice_ns: u64,
+    underutil_slice_ns: u64,
+    overutil_slice_ns: u64,
     dom_group: Arc<DomainGroup>,
     direct_greedy_under: f64,
     kick_greedy_under: f64,
@@ -80,7 +83,9 @@ impl Tuner {
     pub fn new(dom_group: Arc<DomainGroup>,
               direct_greedy_under: f64,
-              kick_greedy_under: f64) -> Result<Self> {
+              kick_greedy_under: f64,
+              underutil_slice_ns: u64,
+              overutil_slice_ns: u64) -> Result<Self> {
         let proc_reader = procfs::ProcReader::new();
         let prev_cpu_stats = proc_reader
             .read_stat()?
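Since slice_ns is no longer read-only BPF rodata, the chosen slice now reaches the BPF side through tune_input together with the greedy cpumasks. As an illustration only, here is a trimmed-down model of that generation-checked handoff in plain C; the names follow the patch, the map plumbing and cpumask fields are elided, and the SCX_SLICE_DFL stand-in value is an assumption.

```c
/* Illustration only -- a minimal model of the tune_input slice handoff. */
#include <stdint.h>
#include <stdio.h>

struct tune_input {
	uint64_t gen;
	uint64_t slice_ns;
	/* direct_greedy_cpumask[] / kick_greedy_cpumask[] elided */
} tune_input;

static uint64_t tune_params_gen;
static uint64_t slice_ns = 20000000;	/* SCX_SLICE_DFL stand-in (assumed 20ms) */

static void refresh_tune_params(void)
{
	/* only copy when userspace has published a new generation */
	if (tune_params_gen == tune_input.gen)
		return;

	tune_params_gen = tune_input.gen;
	slice_ns = tune_input.slice_ns;
}

int main(void)
{
	/* userspace side: the tuner writes the new slice, then bumps gen */
	tune_input.slice_ns = 1000000;	/* over-utilized host: 1ms slice */
	tune_input.gen++;

	/* BPF side: the next hot path that calls refresh_tune_params() picks it up */
	refresh_tune_params();
	printf("slice_ns=%llu\n", (unsigned long long)slice_ns);
	return 0;
}
```

The generation counter means the BPF side never copies a half-written update: userspace fills in every field of tune_input first and only then increments gen, exactly as Tuner::step() does below.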
@@ -95,6 +100,9 @@ impl Tuner {
             kick_greedy_under: kick_greedy_under / 100.0,
             proc_reader,
             prev_cpu_stats,
+            slice_ns: underutil_slice_ns,
+            underutil_slice_ns: underutil_slice_ns,
+            overutil_slice_ns: overutil_slice_ns,
             dom_group,
         })
     }
@@ -161,6 +169,13 @@ impl Tuner {
         write_to_bpf(&mut ti.direct_greedy_cpumask, &self.direct_greedy_mask);
         write_to_bpf(&mut ti.kick_greedy_cpumask, &self.kick_greedy_mask);
 
+        if self.fully_utilized {
+            self.slice_ns = self.overutil_slice_ns;
+        } else {
+            self.slice_ns = self.underutil_slice_ns;
+        }
+        ti.slice_ns = self.slice_ns;
+
         ti.gen += 1;
 
         self.prev_cpu_stats = curr_cpu_stats;
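Finally, for readers reasoning about the new RUSTY_STAT_DL_CLAMP / RUSTY_STAT_DL_PRESET counters: the standalone sketch below (illustration only; the vtimes and slice are made up) shows the one-slice budget that clamp_task_vtime() grants a long-sleeping task relative to its domain's min_vruntime.

```c
/*
 * Illustration only -- not part of the patch. A task that slept for a long
 * time wakes up at most one slice behind the domain's min_vruntime instead of
 * keeping the huge vtime deficit it accumulated while asleep.
 */
#include <stdint.h>
#include <stdio.h>

static int vtime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t slice_ns = 20000000;			/* 20ms slice */
	uint64_t dom_min_vruntime = 900000000000ULL;	/* domain progressed to ~900s */
	uint64_t dsq_vtime = 100000000000ULL;		/* sleeper's vtime stuck at ~100s */
	uint64_t vtime_floor = dom_min_vruntime - slice_ns;

	if (vtime_before(dsq_vtime, vtime_floor))
		dsq_vtime = vtime_floor;		/* RUSTY_STAT_DL_CLAMP case */
	/* else: vtime already within budget -> RUSTY_STAT_DL_PRESET */

	printf("dispatched vtime lags domain by %llu ns\n",
	       (unsigned long long)(dom_min_vruntime - dsq_vtime));
	return 0;
}
```

With these numbers the task is dispatched exactly one slice (20ms of vtime) behind the domain, which is the bounded head start the clamp is meant to provide.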