sched-ext · htejun · Apr 29, 2024 · Apr 29, 2024 · htejun · Apr 29, 2024
diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h
@@ -49,15 +49,16 @@ enum consts {
 	NSEC_PER_USEC			= 1000ULL,
 	NSEC_PER_MSEC			= (1000ULL * NSEC_PER_USEC),
 	LAVD_TIME_ONE_SEC		= (1000ULL * NSEC_PER_MSEC),
+	LAVD_TIME_INFINITY_NS		= 0xFFFFFFFFFFFFFFFFULL,
 	LAVD_MAX_CAS_RETRY		= 8,
 
 	LAVD_TARGETED_LATENCY_NS	= (15 * NSEC_PER_MSEC),
 	LAVD_SLICE_MIN_NS		= (300 * NSEC_PER_USEC),/* min time slice */
 	LAVD_SLICE_MAX_NS		= (3 * NSEC_PER_MSEC),	/* max time slice */
+	LAVD_SLICE_UNDECIDED		= LAVD_TIME_INFINITY_NS,
 	LAVD_SLICE_GREEDY_FT		= 3,
 	LAVD_LOAD_FACTOR_ADJ		= 6,
 	LAVD_LOAD_FACTOR_MAX		= (10 * 1000),
-	LAVD_TIME_INFINITY_NS		= 0xFFFFFFFFFFFFFFFFULL,
 
 	LAVD_LC_FREQ_MAX		= 1000000,
 	LAVD_LC_RUNTIME_MAX		= LAVD_TARGETED_LATENCY_NS,

diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
@@ -1644,7 +1644,7 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc,
 	 * Enqueue the task to the global runqueue based on its virtual
 	 * deadline.
 	 */
-	scx_bpf_dispatch_vtime(p, LAVD_GLOBAL_DSQ, LAVD_SLICE_MAX_NS,
+	scx_bpf_dispatch_vtime(p, LAVD_GLOBAL_DSQ, LAVD_SLICE_UNDECIDED,
 			       vdeadline, enq_flags);
 
 }
@@ -1679,7 +1679,7 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
 	taskc->vdeadline_delta_ns = 0;
 	taskc->eligible_delta_ns = 0;
 	taskc->victim_cpu = (s16)LAVD_CPU_ID_NONE;
-	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, LAVD_SLICE_MAX_NS, enq_flags);
+	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, LAVD_SLICE_UNDECIDED, enq_flags);
 	return true;
 }
 
@@ -1804,6 +1804,27 @@ void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags)
 	waker_taskc->last_runnable_clk = now;
 }
 
+static bool need_to_calc_time_slice(struct task_struct *p)
+{
+	/*
+	 * We need to calculate the task @p's time slice in two cases: 1) if it
+	 * hasn't been calculated (i.e., LAVD_SLICE_UNDECIDED) after the
+	 * enqueue or 2) if the sched_ext kernel assigns the default time slice
+	 * (i.e., SCX_SLICE_DFL).
+	 *
+	 * Calculating and assigning a time slice without checking these two
+	 * conditions could entail pathological behaviors, notably watchdog
+	 * time out. One condition that could trigger a watchdog time-out is as
+	 * follows. 1) a task is preempted by another task, which runs in a
+	 * higher scheduler class (e.g., RT or DL). 2) when the task is
+	 * re-running after, for example, the RT task preempted out, its time
+	 * slice will be replenished again. 3) If these two steps are repeated,
+	 * the task can run forever.
+	 */
+	return p->scx.slice == LAVD_SLICE_UNDECIDED ||
+	       p->scx.slice == SCX_SLICE_DFL;
+}
+
 void BPF_STRUCT_OPS(lavd_running, struct task_struct *p)
 {
 	struct cpu_ctx *cpuc;
@@ -1826,9 +1847,10 @@ void BPF_STRUCT_OPS(lavd_running, struct task_struct *p)
 	cpuc->stopping_tm_est_ns = get_est_stopping_time(taskc);
 
 	/*
-	 * Calcualte task's time slice based on updated load.
+	 * Calculate the task's time slice based on updated load if necessary.
 	 */
-	p->scx.slice = calc_time_slice(p, taskc);
+	if (need_to_calc_time_slice(p))
+		p->scx.slice = calc_time_slice(p, taskc);
 
 	/*
 	 * If there is a relevant introspection command with @p, process it.