Revert "Merge pull request #305 from sched-ext/rustland-fifo-mode"
This merge included additional commits that were supposed to be included
in a separate pull request and have nothing to do with the fifo-mode
changes.

Therefore, revert the whole pull request and create a separate one with
the correct list of commits required to implement this feature.

Signed-off-by: Andrea Righi <[email protected]>
Andrea Righi committed May 22, 2024
1 parent e79ab40 commit 0d75c80
Showing 4 changed files with 57 additions and 147 deletions.
2 changes: 0 additions & 2 deletions rust/scx_rustland_core/assets/bpf.rs
@@ -183,7 +183,6 @@ impl<'cb> BpfScheduler<'cb> {
exit_dump_len: u32,
full_user: bool,
low_power: bool,
fifo_sched: bool,
debug: bool,
) -> Result<Self> {
// Open the BPF prog first for verification.
@@ -248,7 +247,6 @@ impl<'cb> BpfScheduler<'cb> {
skel.rodata_mut().debug = debug;
skel.rodata_mut().full_user = full_user;
skel.rodata_mut().low_power = low_power;
skel.rodata_mut().fifo_sched = fifo_sched;

// Attach BPF scheduler.
let mut skel = scx_ops_load!(skel, rustland, uei)?;
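For callers of scx_rustland_core, the visible effect of this hunk is that BpfScheduler::init() no longer takes a fifo_sched flag between low_power and debug. A minimal caller sketch, mirroring the scx_rlfifo call updated further down; the comments on the leading positional arguments are assumptions, since their declarations are not shown in this diff:

let bpf = BpfScheduler::init(
    5000,                              // time slice (assumed to be in microseconds)
    topo.nr_cpus_possible() as i32,    // number of possible CPUs
    false,                             // leading flag not shown in this diff
    0,                                 // exit_dump_len
    false,                             // full_user
    false,                             // low_power
    false,                             // debug (fifo_sched used to sit right before this)
)?;
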
158 changes: 39 additions & 119 deletions rust/scx_rustland_core/assets/bpf/main.bpf.c
@@ -105,19 +105,6 @@ const volatile bool full_user;
*/
const volatile bool low_power;

/*
* Automatically switch to simple FIFO scheduling during periods of system
* underutilization to minimize unnecessary scheduling overhead.
*
* 'fifo_sched' can be used by the user-space scheduler to enable/disable this
* behavior.
*
* 'is_fifo_enabled' indicates whether the scheduling has switched to FIFO mode
* or regular scheduling mode.
*/
const volatile bool fifo_sched;
static bool is_fifo_enabled;

/* Allow to use bpf_printk() only when @debug is set */
#define dbg_msg(_fmt, ...) do { \
if (debug) \
@@ -172,11 +159,6 @@ struct task_ctx {
* current task's cpumask.
*/
u64 cpumask_cnt;

/*
* Dispatch immediately to the local DSQ.
*/
bool dispatch_local;
};

/* Map that contains task-local storage. */
@@ -218,13 +200,6 @@ struct {
__type(value, struct usersched_timer);
} usersched_timer SEC(".maps");

/*
* Time period of the scheduler heartbeat, used to periodically kick the
* scheduler and check if we need to switch to FIFO mode or regular
* scheduling (default 100ms).
*/
#define USERSCHED_TIMER_NS (NSEC_PER_SEC / 10)

/*
* Map of allocated CPUs.
*/
@@ -458,6 +433,14 @@ static void dispatch_user_scheduler(void)
bpf_task_release(p);
}

/*
* Return true if we are waking up from a wait event, false otherwise.
*/
static bool is_waking_up(u64 wake_flags)
{
return !!(wake_flags & SCX_WAKE_TTWU);
}

/*
* Select the target CPU where a task can be executed.
*
@@ -473,45 +456,50 @@ static void dispatch_user_scheduler(void)
s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
u64 wake_flags)
{
struct task_ctx *tctx;
bool is_idle = false;
s32 cpu;

tctx = lookup_task_ctx(p);
if (!tctx)
/*
* In full-user mode simply assign the previously used CPU and let the
* user-space scheduler decide where it should be dispatched.
*/
if (full_user)
return prev_cpu;

/*
* If the previously used CPU is still available, keep using it to take
* advantage of the cached working set.
* Always try to stick on the same CPU if it's available, to better
* exploit the cached working set.
*/
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
tctx->dispatch_local = true;
/*
* Using SCX_DSQ_LOCAL ensures that the task will be executed
* directly on the CPU returned by this function.
*/
dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return prev_cpu;
}

/*
* If the task was directly dispatched give it a second chance to
* remain on the current CPU, instead of immediately migrating it.
* If the previously used CPU is not available, check whether the task
* is coming from a wait state. If not, enqueue it on the same CPU
* regardless, without directly dispatching it.
*/
if (tctx->dispatch_local) {
tctx->dispatch_local = false;
if (!is_waking_up(wake_flags))
return prev_cpu;
}

/*
* Find the best CPU relying on the built-in idle selection logic,
* eventually migrating the task and dispatching directly from here if
* a CPU is available.
* If we are coming from a wait state, try to find the best CPU relying
* on the built-in idle selection logic (eventually migrating the
* task).
*/
cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
if (is_idle) {
tctx->dispatch_local = true;
return cpu;
dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
}
tctx->dispatch_local = false;

return prev_cpu;
return cpu;
}

/*
@@ -560,7 +548,6 @@ static void sched_congested(struct task_struct *p)
void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct queued_task_ctx *task;
struct task_ctx *tctx;

/*
* Scheduler is dispatched directly in .dispatch() when needed, so
@@ -578,35 +565,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
* long (i.e., ksoftirqd/N, rcuop/N, etc.).
*/
if (is_kthread(p) && p->nr_cpus_allowed == 1) {
dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns, enq_flags);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return;
}

/*
* Dispatch immediately on the local DSQ if .select_cpu() has found an
* idle CPU.
*
* NOTE: assign a shorter time slice (slice_ns / 4) to tasks directly
* dispatched to prevent them from gaining excessive CPU bandwidth.
*/
tctx = lookup_task_ctx(p);
if (!full_user && tctx && tctx->dispatch_local) {
dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns / 4, enq_flags);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return;
}

/*
* Dispatch directly to the target CPU DSQ if the scheduler is set to
* FIFO mode.
*/
if (!full_user && is_fifo_enabled) {
s32 cpu = scx_bpf_task_cpu(p);

scx_bpf_dispatch(p, cpu_to_dsq(cpu), slice_ns, enq_flags);
scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);

dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, enq_flags);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return;
}
@@ -706,15 +665,15 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
*/
bpf_user_ringbuf_drain(&dispatched, handle_dispatched_task, NULL, 0);

/* Consume all tasks enqueued in the shared DSQ */
/* Consume all tasks enqueued in the current CPU's DSQ first */
bpf_repeat(MAX_ENQUEUED_TASKS) {
if (!scx_bpf_consume(SHARED_DSQ))
if (!scx_bpf_consume(cpu_to_dsq(cpu)))
break;
}

/* Consume all tasks enqueued in the current CPU's DSQ first */
/* Consume all tasks enqueued in the shared DSQ */
bpf_repeat(MAX_ENQUEUED_TASKS) {
if (!scx_bpf_consume(cpu_to_dsq(cpu)))
if (!scx_bpf_consume(SHARED_DSQ))
break;
}
}
@@ -817,6 +776,7 @@ void BPF_STRUCT_OPS(rustland_cpu_release, s32 cpu,
set_usersched_needed();
}


/*
* A new task @p is being created.
*
@@ -868,43 +828,6 @@ void BPF_STRUCT_OPS(rustland_exit_task, struct task_struct *p,
__sync_fetch_and_add(&nr_queued, 1);
}

/*
* Check whether we can switch to FIFO mode if the system is underutilized.
*/
static bool should_enable_fifo(void)
{
/* Moving average of the tasks that are waiting to be scheduled */
static u64 nr_waiting_avg;
/* Current amount of tasks waiting to be scheduled */
u64 nr_waiting = nr_queued + nr_scheduled;

if (!fifo_sched)
return false;

/*
* Exiting from FIFO mode requires almost all the CPUs to be busy.
*/
if (is_fifo_enabled)
return nr_running < num_possible_cpus - 1;

/*
* We are not in FIFO mode, check for tasks waiting to be processed
* by the user-space scheduler.
*
* We want to evaluate a moving average of the waiting tasks to prevent
* bouncing too often between FIFO mode and user-space mode.
*/
nr_waiting_avg = (nr_waiting_avg + nr_waiting) / 2;

/*
* The condition to enter FIFO mode is to have no tasks (on average)
* that are waiting to be scheduled.
*
* Exiting from FIFO mode requires almost all the CPUs to be busy.
*/
return nr_waiting_avg == 0;
}

/*
* Heartbeat scheduler timer callback.
*
@@ -921,11 +844,8 @@ static int usersched_timer_fn(void *map, int *key, struct bpf_timer *timer)
/* Kick the scheduler */
set_usersched_needed();

/* Update flag that determines if FIFO scheduling needs to be enabled */
is_fifo_enabled = should_enable_fifo();

/* Re-arm the timer */
err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
if (err)
scx_bpf_error("Failed to arm stats timer");

@@ -948,7 +868,7 @@ static int usersched_timer_init(void)
}
bpf_timer_init(timer, &usersched_timer, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, usersched_timer_fn);
err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
if (err)
scx_bpf_error("Failed to arm scheduler timer");

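Because the removed and restored lines are interleaved above, the select_cpu path restored by this revert can be hard to follow. The sketch below is a condensed restatement of it (statistics counters and error handling omitted; the hunk above remains the authoritative version):

static s32 select_cpu_sketch(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* In full-user mode the user-space scheduler picks the target CPU. */
	if (full_user)
		return prev_cpu;

	/* Reuse the previous CPU if it is still idle (cached working set). */
	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
		return prev_cpu;
	}

	/* Not waking up from a wait event: stay on the previous CPU. */
	if (!is_waking_up(wake_flags))
		return prev_cpu;

	/* Waking up: rely on the built-in idle selection logic instead. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
	return cpu;
}
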
2 changes: 1 addition & 1 deletion scheds/rust/scx_rlfifo/src/main.rs
@@ -26,7 +26,7 @@ struct Scheduler<'a> {
impl<'a> Scheduler<'a> {
fn init() -> Result<Self> {
let topo = Topology::new().expect("Failed to build host topology");
let bpf = BpfScheduler::init(5000, topo.nr_cpus_possible() as i32, false, 0, false, false, true, false)?;
let bpf = BpfScheduler::init(5000, topo.nr_cpus_possible() as i32, false, 0, false, false, false)?;
Ok(Self { bpf })
}

42 changes: 17 additions & 25 deletions scheds/rust/scx_rustland/src/main.rs
@@ -100,13 +100,13 @@ struct Opts {
#[clap(short = 'b', long, default_value = "100")]
slice_boost: u64,

/// If specified, always enforce the built-in idle selection logic to dispatch tasks.
/// Otherwise allow to dispatch interactive tasks on the first CPU available.
/// If specified, rely on the sched-ext built-in idle selection logic to dispatch tasks.
/// Otherwise dispatch tasks on the first CPU available.
///
/// Relying on the built-in logic can improve throughput (since tasks are more likely to remain
/// on the same CPU when the system is overloaded), but it can reduce system responsiveness.
///
/// By default always dispatch interactive tasks on the first CPU available to increase system
/// By default always dispatch tasks on the first CPU available to increase system
/// responsiveness over throughput, especially when the system is overloaded.
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
builtin_idle: bool,
@@ -137,19 +137,6 @@ struct Opts {
#[clap(short = 'l', long, action = clap::ArgAction::SetTrue)]
low_power: bool,

/// By default the scheduler automatically transitions to FIFO mode when the system is
/// underutilized. This reduces unnecessary scheduling overhead and boosts performance
/// when the system is not running at full capacity.
///
/// Be aware that FIFO mode can lead to less predictable performance. Therefore, use this
/// option if performance predictability is important, such as when running real-time audio
/// applications or during live streaming. Conversely, avoid using this option when you care
/// about maximizing performance, such as gaming.
///
/// Set this option to disable this automatic transition.
#[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
disable_fifo: bool,

/// If specified, only tasks which have their scheduling policy set to
/// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all
/// tasks are switched.
@@ -317,7 +304,6 @@ impl<'a> Scheduler<'a> {
opts.exit_dump_len,
opts.full_user,
opts.low_power,
!opts.disable_fifo,
opts.debug,
)?;
info!("{} scheduler attached - {} CPUs", SCHEDULER_NAME, nr_cpus);
@@ -539,15 +525,21 @@ impl<'a> Scheduler<'a> {
let mut dispatched_task = DispatchedTask::new(&task.qtask);

// Set special dispatch flags.
if task.is_interactive {
if !self.builtin_idle {
dispatched_task.set_flag(RL_CPU_ANY);
}
if !self.no_preemption {
dispatched_task.set_flag(RL_PREEMPT_CPU);
}
if !self.builtin_idle {
dispatched_task.set_flag(RL_CPU_ANY);
}
if task.is_interactive && !self.no_preemption {
// Assign the maximum time slice to this task and allow it to preempt others.
//
// NOTE: considering that, with preemption enabled, interactive tasks can
// preempt each other (for now) and they are also more likely to release
// the CPU before its assigned time slice expires, always give them the
// maximum static time slice allowed.
dispatched_task.set_slice_ns(self.slice_ns);
dispatched_task.set_flag(RL_PREEMPT_CPU);
} else {
dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));
}
dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));

// Send task to the BPF dispatcher.
match self.bpf.dispatch_task(&dispatched_task) {
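As with the BPF hunk above, old and new lines are interleaved here; the flag and time-slice assignment restored by this revert boils down to the sketch below (field and method names as they appear in the hunk, the surrounding dispatch loop omitted):

let mut dispatched_task = DispatchedTask::new(&task.qtask);

// Without the builtin_idle option, let the BPF side pick the first CPU available.
if !self.builtin_idle {
    dispatched_task.set_flag(RL_CPU_ANY);
}

if task.is_interactive && !self.no_preemption {
    // Interactive tasks may preempt others and tend to release the CPU
    // early, so they always get the maximum static time slice.
    dispatched_task.set_slice_ns(self.slice_ns);
    dispatched_task.set_flag(RL_PREEMPT_CPU);
} else {
    // Everyone else gets a slice scaled by the number of waiting tasks.
    dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));
}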
