diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c
index 080f41264..9046b4080 100644
--- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c
@@ -59,10 +59,18 @@ const volatile u64 slice_ns_lag;
 const volatile bool local_kthreads;
 
 /*
- * Threshold of voluntary context switches used to classify a task as
- * interactive.
+ * Maximum threshold of voluntary context switches.
+ *
+ * This limits the range of nvcsw_avg_thresh (see below).
  */
-const volatile u64 nvcsw_thresh = 10ULL;
+const volatile u64 nvcsw_max_thresh = 10ULL;
+
+/*
+ * Global average of voluntary context switches used to classify interactive
+ * tasks: tasks with an average number of voluntary context switches (nvcsw)
+ * greater than or equal to this value will be classified as interactive.
+ */
+volatile u64 nvcsw_avg_thresh;
 
 /*
  * Time threshold to prevent task starvation.
@@ -197,6 +205,15 @@ static inline bool is_kthread(const struct task_struct *p)
 	return !!(p->flags & PF_KTHREAD);
 }
 
+/*
+ * Return true if interactive task classification via voluntary context
+ * switches is enabled, false otherwise.
+ */
+static bool is_nvcsw_enabled(void)
+{
+	return !!nvcsw_max_thresh;
+}
+
 /*
  * Access a cpumask in read-only mode (typically to check bits).
  */
@@ -248,6 +265,14 @@ static u64 calc_avg(u64 old_val, u64 new_val)
 	return (old_val - (old_val >> 2)) + (new_val >> 2);
 }
 
+/*
+ * Evaluate the EWMA limited to the range [low .. high].
+ */
+static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
+{
+	return CLAMP(calc_avg(old_val, new_val), low, high);
+}
+
 /*
  * Compare two vruntime values, returns true if the first value is less than
  * the second one.
@@ -721,13 +746,14 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
 static void update_task_interactive(struct task_ctx *tctx)
 {
 	/*
-	 * Classify interactive tasks based on the average amount of their
-	 * voluntary context switches.
+	 * Classify the task based on its average rate of voluntary context
+	 * switches.
 	 *
-	 * If the average of voluntarily context switches is below
-	 * nvcsw_thresh, the task is classified as regular.
+	 * If the task's average is greater than or equal to the global
+	 * average (nvcsw_avg_thresh), it is classified as interactive;
+	 * otherwise it is classified as regular.
 	 */
-	tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_thresh;
+	tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_avg_thresh;
 }
 
 /*
@@ -772,19 +798,45 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 	 * using an exponentially weighted moving average, see calc_avg().
 	 */
	delta_t = (s64)(now - tctx->nvcsw_ts);
-	if (nvcsw_thresh && delta_t > NSEC_PER_SEC) {
+	if (is_nvcsw_enabled() && delta_t > NSEC_PER_SEC) {
 		u64 delta_nvcsw = p->nvcsw - tctx->nvcsw;
 		u64 avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;
 
-		tctx->avg_nvcsw = calc_avg(tctx->avg_nvcsw, avg_nvcsw);
+		/*
+		 * Evaluate the average nvcsw for the task, limited to the
+		 * range [0 .. nvcsw_max_thresh * 8] to prevent excessive
+		 * spikes.
+		 */
+		tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw,
+						 0, nvcsw_max_thresh << 3);
 		tctx->nvcsw = p->nvcsw;
 		tctx->nvcsw_ts = now;
 
+		/*
+		 * Update the global voluntary context switches average using
+		 * an exponentially weighted moving average (EWMA) with the
+		 * formula:
+		 *
+		 *   avg(t) = avg(t - 1) * 0.75 + task_avg(t) * 0.25
+		 *
+		 * This approach is more efficient than iterating through all
+		 * tasks, and it helps to prevent rapid fluctuations that may
+		 * be caused by bursts of voluntary context switch events.
+		 *
+		 * Additionally, restrict the global nvcsw_avg_thresh average
+		 * to the range [1 .. nvcsw_max_thresh] to always allow the
+		 * classification of some tasks as interactive.
+		 */
+		nvcsw_avg_thresh = calc_avg_clamp(nvcsw_avg_thresh, avg_nvcsw,
+						  1, nvcsw_max_thresh);
+
+		/*
+		 * Refresh the task status: interactive or regular.
+		 */
 		update_task_interactive(tctx);
 
 		dbg_msg("%d (%s) avg_nvcsw = %llu [%s]",
 			p->pid, p->comm, tctx->avg_nvcsw,
-			tctx->avg_nvcsw < nvcsw_thresh ? "regular" : "interactive");
+			tctx->avg_nvcsw < nvcsw_avg_thresh ? "regular" : "interactive");
 	}
 }
 
diff --git a/scheds/rust/scx_bpfland/src/main.rs b/scheds/rust/scx_bpfland/src/main.rs
index 225de85b8..cbb4f8c9d 100644
--- a/scheds/rust/scx_bpfland/src/main.rs
+++ b/scheds/rust/scx_bpfland/src/main.rs
@@ -81,10 +81,10 @@ struct Opts {
     #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
     local_kthreads: bool,
 
-    /// Threshold of voluntary context switch per second, used to classify interactive tasks
-    /// (0 = disable interactive tasks classification).
+    /// Maximum threshold of voluntary context switches per second, used to classify interactive
+    /// tasks (0 = disable interactive task classification).
     #[clap(short = 'c', long, default_value = "10")]
-    nvcsw_thresh: u64,
+    nvcsw_max_thresh: u64,
 
     /// Prevent the starvation making sure that at least one lower priority task is scheduled every
     /// starvation_thresh_us (0 = disable starvation prevention).
@@ -111,6 +111,7 @@ struct Opts {
 struct Metrics {
     nr_running: Gauge,
     nr_interactive: Gauge,
+    nvcsw_avg_thresh: Gauge,
     nr_kthread_dispatches: Gauge,
     nr_direct_dispatches: Gauge,
     nr_prio_dispatches: Gauge,
@@ -126,6 +127,9 @@ impl Metrics {
             nr_interactive: gauge!(
                 "nr_interactive", "info" => "Number of running interactive tasks"
             ),
+            nvcsw_avg_thresh: gauge!(
+                "nvcsw_avg_thresh", "info" => "Global average of voluntary context switches"
+            ),
             nr_kthread_dispatches: gauge!(
                 "nr_kthread_dispatches", "info" => "Number of kthread direct dispatches"
             ),
@@ -193,7 +197,7 @@ impl<'a> Scheduler<'a> {
         skel.rodata_mut().slice_ns_min = opts.slice_us_min * 1000;
         skel.rodata_mut().slice_ns_lag = opts.slice_us_lag * 1000;
         skel.rodata_mut().starvation_thresh_ns = opts.starvation_thresh_us * 1000;
-        skel.rodata_mut().nvcsw_thresh = opts.nvcsw_thresh;
+        skel.rodata_mut().nvcsw_max_thresh = opts.nvcsw_max_thresh;
 
         // Attach the scheduler.
         let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
@@ -218,6 +222,7 @@ impl<'a> Scheduler<'a> {
         let nr_cpus = self.skel.bss().nr_online_cpus;
         let nr_running = self.skel.bss().nr_running;
         let nr_interactive = self.skel.bss().nr_interactive;
+        let nvcsw_avg_thresh = self.skel.bss().nvcsw_avg_thresh;
         let nr_kthread_dispatches = self.skel.bss().nr_kthread_dispatches;
         let nr_direct_dispatches = self.skel.bss().nr_direct_dispatches;
         let nr_prio_dispatches = self.skel.bss().nr_prio_dispatches;
@@ -230,6 +235,8 @@ impl<'a> Scheduler<'a> {
         self.metrics
             .nr_interactive
             .set(nr_interactive as f64);
+        self.metrics
+            .nvcsw_avg_thresh.set(nvcsw_avg_thresh as f64);
         self.metrics
             .nr_kthread_dispatches
             .set(nr_kthread_dispatches as f64);
@@ -244,10 +251,11 @@ impl<'a> Scheduler<'a> {
             .set(nr_shared_dispatches as f64);
 
         // Log scheduling statistics.
-        info!("running: {:>4}/{:<4} interactive: {:>4} | kthread: {:<6} | direct: {:<6} | prio: {:<6} | shared: {:<6}",
+        info!("running: {:>4}/{:<4} interactive: {:>4} | nvcsw: {:<4} | kthread: {:<6} | direct: {:<6} | prio: {:<6} | shared: {:<6}",
             nr_running,
             nr_cpus,
             nr_interactive,
+            nvcsw_avg_thresh,
             nr_kthread_dispatches,
             nr_direct_dispatches,
             nr_prio_dispatches,
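
For reference, the EWMA maintained by calc_avg() is avg(t) = avg(t - 1) * 0.75 + task_avg(t) * 0.25, implemented with shifts as (old - (old >> 2)) + (new >> 2), and a task is flagged interactive when its own clamped average meets or exceeds the global nvcsw_avg_thresh. The standalone userspace sketch below mirrors the two helpers from this diff to show how the [1 .. nvcsw_max_thresh] clamp behaves under a burst of voluntary context switches; the CLAMP macro here is only assumed to match the one from the scheduler's common headers, and the main() driver is purely illustrative, not part of the scheduler.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

/* Assumed to match the CLAMP helper from the scheduler's common headers. */
#define CLAMP(val, lo, hi) \
	((val) < (lo) ? (lo) : (val) > (hi) ? (hi) : (val))

/* EWMA with 0.75/0.25 weights: avg(t) = avg(t - 1) * 0.75 + sample * 0.25 */
static u64 calc_avg(u64 old_val, u64 new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* EWMA limited to the range [low .. high], as in calc_avg_clamp() */
static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
{
	return CLAMP(calc_avg(old_val, new_val), low, high);
}

int main(void)
{
	const u64 nvcsw_max_thresh = 10;
	u64 thresh = 0;
	int i;

	/*
	 * Burst phase: feed 100 nvcsw/sec samples. The clamp caps the
	 * global threshold at nvcsw_max_thresh, so a spike of context
	 * switch activity cannot raise it beyond the configured maximum.
	 */
	for (i = 0; i < 4; i++) {
		thresh = calc_avg_clamp(thresh, 100, 1, nvcsw_max_thresh);
		printf("burst sample %d: nvcsw_avg_thresh = %llu\n",
		       i, (unsigned long long)thresh);
	}

	/*
	 * Idle phase: feed 0 nvcsw/sec samples. The average decays
	 * geometrically, staying within the [1 .. nvcsw_max_thresh] clamp.
	 */
	for (i = 0; i < 8; i++) {
		thresh = calc_avg_clamp(thresh, 0, 1, nvcsw_max_thresh);
		printf("idle sample %d: nvcsw_avg_thresh = %llu\n",
		       i, (unsigned long long)thresh);
	}
	return 0;
}

Under a burst the threshold saturates at nvcsw_max_thresh after the first sample (calc_avg(0, 100) = 25, clamped to 10). When activity stops it decays 10, 8, 6, 5, 4, 3 and then settles at 3 rather than the lower bound, because old_val >> 2 truncates to 0 for values below 4; the integer EWMA stops decaying at that point.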