From 5f63e0ca3033c4088eb0395d5b1feab56b3ed6d2 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Mon, 29 Apr 2024 12:13:31 +0900 Subject: [PATCH] scx_lavd: replesih time slice at ops.running() only when necessary The current code replenishes the task's time slice whenever the task becomes ops.running(). However, there is a case where such behavior can starve the other tasks, causing the watchdog timeout error. One (if not all) such case is when a task is preempted while running by the higher scheduler class (e.g., RT, DL). In such a case, the task will be transit in a cycle of ops.running() -> ops.stopping() -> ops.running() -> etc. Whenever it becomes re-running, it will be placed at the head of local DSQ and ops.running() will renew its time slice. Hence, in the worst case, the task can run forever since its time slice is never exhausted. The fix is assigning the time slice only once by checking if the time slice is calculated before. Suggested-by: Tejun Heo Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/intf.h | 3 ++- scheds/rust/scx_lavd/src/bpf/main.bpf.c | 30 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h index 9290f3daa..53f445c86 100644 --- a/scheds/rust/scx_lavd/src/bpf/intf.h +++ b/scheds/rust/scx_lavd/src/bpf/intf.h @@ -49,15 +49,16 @@ enum consts { NSEC_PER_USEC = 1000ULL, NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC), LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC), + LAVD_TIME_INFINITY_NS = 0xFFFFFFFFFFFFFFFFULL, LAVD_MAX_CAS_RETRY = 8, LAVD_TARGETED_LATENCY_NS = (15 * NSEC_PER_MSEC), LAVD_SLICE_MIN_NS = (300 * NSEC_PER_USEC),/* min time slice */ LAVD_SLICE_MAX_NS = (3 * NSEC_PER_MSEC), /* max time slice */ + LAVD_SLICE_UNDECIDED = LAVD_TIME_INFINITY_NS, LAVD_SLICE_GREEDY_FT = 3, LAVD_LOAD_FACTOR_ADJ = 6, LAVD_LOAD_FACTOR_MAX = (10 * 1000), - LAVD_TIME_INFINITY_NS = 0xFFFFFFFFFFFFFFFFULL, LAVD_LC_FREQ_MAX = 1000000, LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS, diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index 4cb4645bf..95571beb5 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -1644,7 +1644,7 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc, * Enqueue the task to the global runqueue based on its virtual * deadline. */ - scx_bpf_dispatch_vtime(p, LAVD_GLOBAL_DSQ, LAVD_SLICE_MAX_NS, + scx_bpf_dispatch_vtime(p, LAVD_GLOBAL_DSQ, LAVD_SLICE_UNDECIDED, vdeadline, enq_flags); } @@ -1679,7 +1679,7 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc, taskc->vdeadline_delta_ns = 0; taskc->eligible_delta_ns = 0; taskc->victim_cpu = (s16)LAVD_CPU_ID_NONE; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, LAVD_SLICE_MAX_NS, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, LAVD_SLICE_UNDECIDED, enq_flags); return true; } @@ -1804,6 +1804,27 @@ void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags) waker_taskc->last_runnable_clk = now; } +static bool need_to_calc_time_slice(struct task_struct *p) +{ + /* + * We need to calculate the task @p's time slice in two cases: 1) if it + * hasn't been calculated (i.e., LAVD_SLICE_UNDECIDED) after the + * enqueue or 2) if the sched_ext kernel assigns the default time slice + * (i.e., SCX_SLICE_DFL). + * + * Calculating and assigning a time slice without checking these two + * conditions couldĀ entail pathological behaviors, notably watchdog + * time out. One condition that could trigger a watchdog time-out is as + * follows. 1) a task is preempted by another task, which runs in a + * higher scheduler class (e.g., RT or DL). 2) when the task is + * re-running after, for example, the RT task preempted out, its time + * slice will be replenished again. 3) If these two steps are repeated, + * the task can run forever. + */ + return p->scx.slice == LAVD_SLICE_UNDECIDED || + p->scx.slice == SCX_SLICE_DFL; +} + void BPF_STRUCT_OPS(lavd_running, struct task_struct *p) { struct cpu_ctx *cpuc; @@ -1826,9 +1847,10 @@ void BPF_STRUCT_OPS(lavd_running, struct task_struct *p) cpuc->stopping_tm_est_ns = get_est_stopping_time(taskc); /* - * Calcualte task's time slice based on updated load. + * Calculate the task's time slice based on updated load if necessary. */ - p->scx.slice = calc_time_slice(p, taskc); + if (need_to_calc_time_slice(p)) + p->scx.slice = calc_time_slice(p, taskc); /* * If there is a relevant introspection command with @p, process it.