Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scx_bpfland: introduce dynamic nvcsw threshold #439

Merged
merged 1 commit into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 63 additions & 11 deletions scheds/rust/scx_bpfland/src/bpf/main.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,18 @@ const volatile u64 slice_ns_lag;
const volatile bool local_kthreads;

/*
 * Maximum threshold of voluntary context switches.
 *
 * This limits the range of nvcsw_avg_thresh (see below).
 */
const volatile u64 nvcsw_max_thresh = 10ULL;

/*
 * Global average of voluntary context switches used to classify interactive
 * tasks: tasks with an average amount of voluntary context switches (nvcsw)
 * greater than this value will be classified as interactive.
 */
volatile u64 nvcsw_avg_thresh;

/*
* Time threshold to prevent task starvation.
Expand Down Expand Up @@ -197,6 +205,15 @@ static inline bool is_kthread(const struct task_struct *p)
return !!(p->flags & PF_KTHREAD);
}

/*
 * Check whether classification of interactive tasks via voluntary
 * context switch tracking is enabled (i.e., nvcsw_max_thresh is
 * non-zero); return false when the feature is disabled.
 */
static bool is_nvcsw_enabled(void)
{
	return nvcsw_max_thresh != 0;
}

/*
* Access a cpumask in read-only mode (typically to check bits).
*/
Expand Down Expand Up @@ -248,6 +265,14 @@ static u64 calc_avg(u64 old_val, u64 new_val)
return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/*
 * Evaluate the EWMA (see calc_avg()), clamping the result to the
 * inclusive range [low .. high].
 */
static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
{
	u64 avg = calc_avg(old_val, new_val);

	return CLAMP(avg, low, high);
}

/*
* Compare two vruntime values, returns true if the first value is less than
* the second one.
Expand Down Expand Up @@ -721,13 +746,14 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
/*
 * Refresh the interactive classification of a task.
 *
 * A task is classified as interactive when its average rate of voluntary
 * context switches (tctx->avg_nvcsw) is greater than or equal to the
 * global average (nvcsw_avg_thresh); otherwise it is classified as
 * regular.
 */
static void update_task_interactive(struct task_ctx *tctx)
{
	tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_avg_thresh;
}

/*
Expand Down Expand Up @@ -772,19 +798,45 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* using an exponentially weighted moving average, see calc_avg().
*/
delta_t = (s64)(now - tctx->nvcsw_ts);
if (nvcsw_thresh && delta_t > NSEC_PER_SEC) {
if (is_nvcsw_enabled() && delta_t > NSEC_PER_SEC) {
u64 delta_nvcsw = p->nvcsw - tctx->nvcsw;
u64 avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;

tctx->avg_nvcsw = calc_avg(tctx->avg_nvcsw, avg_nvcsw);
/*
* Evaluate the average nvcsw for the task, limited to the
* range [0 .. nvcsw_max_thresh * 8] to prevent excessive
* spikes.
*/
tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw,
0, nvcsw_max_thresh << 3);
tctx->nvcsw = p->nvcsw;
tctx->nvcsw_ts = now;

/*
* Update the global voluntary context switches average using
* an exponentially weighted moving average (EWMA) with the
* formula:
*
* avg(t) = avg(t - 1) * 0.75 + task_avg(t) * 0.25
*
* This approach is more efficient than iterating through all
* tasks and it helps to prevent rapid fluctuations that may be
* caused by bursts of voluntary context switch events.
*
* Additionally, restrict the global nvcsw_avg_thresh average
* to the range [1 .. nvcsw_max_thresh] to always allow the
* classification of some tasks as interactive.
*/
nvcsw_avg_thresh = calc_avg_clamp(nvcsw_avg_thresh, avg_nvcsw,
1, nvcsw_max_thresh);
/*
* Refresh task status: interactive or regular.
*/
update_task_interactive(tctx);

dbg_msg("%d (%s) avg_nvcsw = %llu [%s]",
p->pid, p->comm, tctx->avg_nvcsw,
tctx->avg_nvcsw < nvcsw_thresh ? "regular" : "interactive");
tctx->avg_nvcsw < nvcsw_avg_thresh ? "regular" : "interactive");
}
}

Expand Down
18 changes: 13 additions & 5 deletions scheds/rust/scx_bpfland/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ struct Opts {
#[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
local_kthreads: bool,

/// Threshold of voluntary context switch per second, used to classify interactive tasks
/// (0 = disable interactive tasks classification).
/// Maximum threshold of voluntary context switch per second, used to classify interactive
/// tasks (0 = disable interactive tasks classification).
#[clap(short = 'c', long, default_value = "10")]
nvcsw_thresh: u64,
nvcsw_max_thresh: u64,

/// Prevent starvation by making sure that at least one lower-priority task is scheduled every
/// starvation_thresh_us (0 = disable starvation prevention).
Expand All @@ -111,6 +111,7 @@ struct Opts {
struct Metrics {
nr_running: Gauge,
nr_interactive: Gauge,
nvcsw_avg_thresh: Gauge,
nr_kthread_dispatches: Gauge,
nr_direct_dispatches: Gauge,
nr_prio_dispatches: Gauge,
Expand All @@ -126,6 +127,9 @@ impl Metrics {
nr_interactive: gauge!(
"nr_interactive", "info" => "Number of running interactive tasks"
),
nvcsw_avg_thresh: gauge!(
"nvcsw_avg_thresh", "info" => "Average of voluntary context switches"
),
nr_kthread_dispatches: gauge!(
"nr_kthread_dispatches", "info" => "Number of kthread direct dispatches"
),
Expand Down Expand Up @@ -193,7 +197,7 @@ impl<'a> Scheduler<'a> {
skel.rodata_mut().slice_ns_min = opts.slice_us_min * 1000;
skel.rodata_mut().slice_ns_lag = opts.slice_us_lag * 1000;
skel.rodata_mut().starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.rodata_mut().nvcsw_thresh = opts.nvcsw_thresh;
skel.rodata_mut().nvcsw_max_thresh = opts.nvcsw_max_thresh;

// Attach the scheduler.
let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
Expand All @@ -218,6 +222,7 @@ impl<'a> Scheduler<'a> {
let nr_cpus = self.skel.bss().nr_online_cpus;
let nr_running = self.skel.bss().nr_running;
let nr_interactive = self.skel.bss().nr_interactive;
let nvcsw_avg_thresh = self.skel.bss().nvcsw_avg_thresh;
let nr_kthread_dispatches = self.skel.bss().nr_kthread_dispatches;
let nr_direct_dispatches = self.skel.bss().nr_direct_dispatches;
let nr_prio_dispatches = self.skel.bss().nr_prio_dispatches;
Expand All @@ -230,6 +235,8 @@ impl<'a> Scheduler<'a> {
self.metrics
.nr_interactive
.set(nr_interactive as f64);
self.metrics
.nvcsw_avg_thresh.set(nvcsw_avg_thresh as f64);
self.metrics
.nr_kthread_dispatches
.set(nr_kthread_dispatches as f64);
Expand All @@ -244,10 +251,11 @@ impl<'a> Scheduler<'a> {
.set(nr_shared_dispatches as f64);

// Log scheduling statistics.
info!("running: {:>4}/{:<4} interactive: {:>4} | kthread: {:<6} | direct: {:<6} | prio: {:<6} | shared: {:<6}",
info!("running: {:>4}/{:<4} interactive: {:>4} | nvcsw: {:<4} | kthread: {:<6} | direct: {:<6} | prio: {:<6} | shared: {:<6}",
nr_running,
nr_cpus,
nr_interactive,
nvcsw_avg_thresh,
nr_kthread_dispatches,
nr_direct_dispatches,
nr_prio_dispatches,
Expand Down