Revert "Merge pull request #305 from sched-ext/rustland-fifo-mode"
This merge included additional commits that were supposed to be included
in a separate pull request and have nothing to do with the fifo-mode
changes.

Therefore, revert the whole pull request and create a separate one with
the correct list of commits required to implement this feature.

Signed-off-by: Andrea Righi <[email protected]>
Andrea Righi committed May 22, 2024
1 parent e79ab40 commit 0d75c80
Showing 4 changed files with 57 additions and 147 deletions.
2 changes: 0 additions & 2 deletions rust/scx_rustland_core/assets/bpf.rs
@@ -183,7 +183,6 @@ impl<'cb> BpfScheduler<'cb> {
exit_dump_len: u32,
full_user: bool,
low_power: bool,
fifo_sched: bool,
debug: bool,
) -> Result<Self> {
// Open the BPF prog first for verification.
@@ -248,7 +247,6 @@ impl<'cb> BpfScheduler<'cb> {
skel.rodata_mut().debug = debug;
skel.rodata_mut().full_user = full_user;
skel.rodata_mut().low_power = low_power;
skel.rodata_mut().fifo_sched = fifo_sched;

// Attach BPF scheduler.
let mut skel = scx_ops_load!(skel, rustland, uei)?;
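For callers of scx_rustland_core, the visible effect of this hunk is that BpfScheduler::init() no longer takes a fifo_sched flag between low_power and debug. A minimal caller sketch, mirroring the scx_rlfifo call updated further down; the comments on the leading positional arguments are assumptions, since their declarations are not shown in this diff:

let bpf = BpfScheduler::init(
    5000,                              // time slice (assumed to be in microseconds)
    topo.nr_cpus_possible() as i32,    // number of possible CPUs
    false,                             // leading flag not shown in this diff
    0,                                 // exit_dump_len
    false,                             // full_user
    false,                             // low_power
    false,                             // debug (fifo_sched used to sit right before this)
)?;
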
158 changes: 39 additions & 119 deletions rust/scx_rustland_core/assets/bpf/main.bpf.c
@@ -105,19 +105,6 @@ const volatile bool full_user;
*/
const volatile bool low_power;

/*
* Automatically switch to simple FIFO scheduling during periods of system
* underutilization to minimize unnecessary scheduling overhead.
*
* 'fifo_sched' can be used by the user-space scheduler to enable/disable this
* behavior.
*
* 'is_fifo_enabled' indicates whether the scheduling has switched to FIFO mode
* or regular scheduling mode.
*/
const volatile bool fifo_sched;
static bool is_fifo_enabled;

/* Allow to use bpf_printk() only when @debug is set */
#define dbg_msg(_fmt, ...) do { \
if (debug) \
@@ -172,11 +159,6 @@ struct task_ctx {
* current task's cpumask.
*/
u64 cpumask_cnt;

/*
* Dispatch immediately to the local DSQ.
*/
bool dispatch_local;
};

/* Map that contains task-local storage. */
@@ -218,13 +200,6 @@ struct {
__type(value, struct usersched_timer);
} usersched_timer SEC(".maps");

/*
* Time period of the scheduler heartbeat, used to periodically kick the
* scheduler and check if we need to switch to FIFO mode or regular
* scheduling (default 100ms).
*/
#define USERSCHED_TIMER_NS (NSEC_PER_SEC / 10)

/*
* Map of allocated CPUs.
*/
@@ -458,6 +433,14 @@ static void dispatch_user_scheduler(void)
bpf_task_release(p);
}

/*
* Return true if we are waking up from a wait event, false otherwise.
*/
static bool is_waking_up(u64 wake_flags)
{
return !!(wake_flags & SCX_WAKE_TTWU);
}

/*
* Select the target CPU where a task can be executed.
*
@@ -473,45 +456,50 @@ static void dispatch_user_scheduler(void)
s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
u64 wake_flags)
{
struct task_ctx *tctx;
bool is_idle = false;
s32 cpu;

tctx = lookup_task_ctx(p);
if (!tctx)
/*
* In full-user mode simply assign the previously used CPU and let the
* user-space scheduler decide where it should be dispatched.
*/
if (full_user)
return prev_cpu;

/*
* If the previously used CPU is still available, keep using it to take
* advantage of the cached working set.
* Always try to stick on the same CPU if it's available, to better
* exploit the cached working set.
*/
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
tctx->dispatch_local = true;
/*
* Using SCX_DSQ_LOCAL ensures that the task will be executed
* directly on the CPU returned by this function.
*/
dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return prev_cpu;
}

/*
* If the task was directly dispatched give it a second chance to
* remain on the current CPU, instead of immediately migrating it.
* If the previously used CPU is not available, check whether the task
* is coming from a wait state. If not, enqueue it on the same CPU
* regardless, without directly dispatching it.
*/
if (tctx->dispatch_local) {
tctx->dispatch_local = false;
if (!is_waking_up(wake_flags))
return prev_cpu;
}

/*
* Find the best CPU relying on the built-in idle selection logic,
* eventually migrating the task and dispatching directly from here if
* a CPU is available.
* If we are coming from a wait state, try to find the best CPU relying
* on the built-in idle selection logic (eventually migrating the
* task).
*/
cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
if (is_idle) {
tctx->dispatch_local = true;
return cpu;
dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
}
tctx->dispatch_local = false;

return prev_cpu;
return cpu;
}

/*
@@ -560,7 +548,6 @@ static void sched_congested(struct task_struct *p)
void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct queued_task_ctx *task;
struct task_ctx *tctx;

/*
* Scheduler is dispatched directly in .dispatch() when needed, so
@@ -578,35 +565,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
* long (i.e., ksoftirqd/N, rcuop/N, etc.).
*/
if (is_kthread(p) && p->nr_cpus_allowed == 1) {
dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns, enq_flags);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return;
}

/*
* Dispatch immediately on the local DSQ if .select_cpu() has found an
* idle CPU.
*
* NOTE: assign a shorter time slice (slice_ns / 4) to tasks directly
* dispatched to prevent them from gaining excessive CPU bandwidth.
*/
tctx = lookup_task_ctx(p);
if (!full_user && tctx && tctx->dispatch_local) {
dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns / 4, enq_flags);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return;
}

/*
* Dispatch directly to the target CPU DSQ if the scheduler is set to
* FIFO mode.
*/
if (!full_user && is_fifo_enabled) {
s32 cpu = scx_bpf_task_cpu(p);

scx_bpf_dispatch(p, cpu_to_dsq(cpu), slice_ns, enq_flags);
scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);

dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, enq_flags);
__sync_fetch_and_add(&nr_kernel_dispatches, 1);
return;
}
@@ -706,15 +665,15 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
*/
bpf_user_ringbuf_drain(&dispatched, handle_dispatched_task, NULL, 0);

/* Consume all tasks enqueued in the shared DSQ */
/* Consume all tasks enqueued in the current CPU's DSQ first */
bpf_repeat(MAX_ENQUEUED_TASKS) {
if (!scx_bpf_consume(SHARED_DSQ))
if (!scx_bpf_consume(cpu_to_dsq(cpu)))
break;
}

/* Consume all tasks enqueued in the current CPU's DSQ first */
/* Consume all tasks enqueued in the shared DSQ */
bpf_repeat(MAX_ENQUEUED_TASKS) {
if (!scx_bpf_consume(cpu_to_dsq(cpu)))
if (!scx_bpf_consume(SHARED_DSQ))
break;
}
}
@@ -817,6 +776,7 @@ void BPF_STRUCT_OPS(rustland_cpu_release, s32 cpu,
set_usersched_needed();
}


/*
* A new task @p is being created.
*
@@ -868,43 +828,6 @@ void BPF_STRUCT_OPS(rustland_exit_task, struct task_struct *p,
__sync_fetch_and_add(&nr_queued, 1);
}

/*
* Check whether we can switch to FIFO mode if the system is underutilized.
*/
static bool should_enable_fifo(void)
{
/* Moving average of the tasks that are waiting to be scheduled */
static u64 nr_waiting_avg;
/* Current amount of tasks waiting to be scheduled */
u64 nr_waiting = nr_queued + nr_scheduled;

if (!fifo_sched)
return false;

/*
* Exiting from FIFO mode requires almost all the CPUs to be busy.
*/
if (is_fifo_enabled)
return nr_running < num_possible_cpus - 1;

/*
* We are not in FIFO mode, check for tasks waiting to be processed
* by the user-space scheduler.
*
* We want to evaluate a moving average of the waiting tasks to prevent
* bouncing too often between FIFO mode and user-space mode.
*/
nr_waiting_avg = (nr_waiting_avg + nr_waiting) / 2;

/*
* The condition to enter FIFO mode is to have no tasks (on average)
* that are waiting to be scheduled.
*
* Exiting from FIFO mode requires almost all the CPUs to be busy.
*/
return nr_waiting_avg == 0;
}

/*
* Heartbeat scheduler timer callback.
*
@@ -921,11 +844,8 @@ static int usersched_timer_fn(void *map, int *key, struct bpf_timer *timer)
/* Kick the scheduler */
set_usersched_needed();

/* Update flag that determines if FIFO scheduling needs to be enabled */
is_fifo_enabled = should_enable_fifo();

/* Re-arm the timer */
err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
if (err)
scx_bpf_error("Failed to arm stats timer");

@@ -948,7 +868,7 @@ static int usersched_timer_init(void)
}
bpf_timer_init(timer, &usersched_timer, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, usersched_timer_fn);
err = bpf_timer_start(timer, USERSCHED_TIMER_NS, 0);
err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
if (err)
scx_bpf_error("Failed to arm scheduler timer");

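Because the removed and restored lines are interleaved above, the select_cpu path restored by this revert can be hard to follow. The sketch below is a condensed restatement of it (statistics counters and error handling omitted; the hunk above remains the authoritative version):

static s32 select_cpu_sketch(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* In full-user mode the user-space scheduler picks the target CPU. */
	if (full_user)
		return prev_cpu;

	/* Reuse the previous CPU if it is still idle (cached working set). */
	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
		return prev_cpu;
	}

	/* Not waking up from a wait event: stay on the previous CPU. */
	if (!is_waking_up(wake_flags))
		return prev_cpu;

	/* Waking up: rely on the built-in idle selection logic instead. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		dispatch_task(p, SCX_DSQ_LOCAL, 0, 0, 0);
	return cpu;
}
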
2 changes: 1 addition & 1 deletion scheds/rust/scx_rlfifo/src/main.rs
@@ -26,7 +26,7 @@ struct Scheduler<'a> {
impl<'a> Scheduler<'a> {
fn init() -> Result<Self> {
let topo = Topology::new().expect("Failed to build host topology");
let bpf = BpfScheduler::init(5000, topo.nr_cpus_possible() as i32, false, 0, false, false, true, false)?;
let bpf = BpfScheduler::init(5000, topo.nr_cpus_possible() as i32, false, 0, false, false, false)?;
Ok(Self { bpf })
}

42 changes: 17 additions & 25 deletions scheds/rust/scx_rustland/src/main.rs
@@ -100,13 +100,13 @@ struct Opts {
#[clap(short = 'b', long, default_value = "100")]
slice_boost: u64,

/// If specified, always enforce the built-in idle selection logic to dispatch tasks.
/// Otherwise allow to dispatch interactive tasks on the first CPU available.
/// If specified, rely on the sched-ext built-in idle selection logic to dispatch tasks.
/// Otherwise dispatch tasks on the first CPU available.
///
/// Relying on the built-in logic can improve throughput (since tasks are more likely to remain
/// on the same CPU when the system is overloaded), but it can reduce system responsiveness.
///
/// By default always dispatch interactive tasks on the first CPU available to increase system
/// By default always dispatch tasks on the first CPU available to increase system
/// responsiveness over throughput, especially when the system is overloaded.
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
builtin_idle: bool,
@@ -137,19 +137,6 @@ struct Opts {
#[clap(short = 'l', long, action = clap::ArgAction::SetTrue)]
low_power: bool,

/// By default the scheduler automatically transitions to FIFO mode when the system is
/// underutilized. This reduces unnecessary scheduling overhead and boosts performance
/// when the system is not running at full capacity.
///
/// Be aware that FIFO mode can lead to less predictable performance. Therefore, use this
/// option if performance predictability is important, such as when running real-time audio
/// applications or during live streaming. Conversely, avoid using this option when you care
/// about maximizing performance, such as gaming.
///
/// Set this option to disable this automatic transition.
#[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
disable_fifo: bool,

/// If specified, only tasks which have their scheduling policy set to
/// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all
/// tasks are switched.
@@ -317,7 +304,6 @@ impl<'a> Scheduler<'a> {
opts.exit_dump_len,
opts.full_user,
opts.low_power,
!opts.disable_fifo,
opts.debug,
)?;
info!("{} scheduler attached - {} CPUs", SCHEDULER_NAME, nr_cpus);
@@ -539,15 +525,21 @@ impl<'a> Scheduler<'a> {
let mut dispatched_task = DispatchedTask::new(&task.qtask);

// Set special dispatch flags.
if task.is_interactive {
if !self.builtin_idle {
dispatched_task.set_flag(RL_CPU_ANY);
}
if !self.no_preemption {
dispatched_task.set_flag(RL_PREEMPT_CPU);
}
if !self.builtin_idle {
dispatched_task.set_flag(RL_CPU_ANY);
}
if task.is_interactive && !self.no_preemption {
// Assign the maximum time slice to this task and allow it to preempt others.
//
// NOTE: considering that, with preemption enabled, interactive tasks can
// preempt each other (for now) and they are also more likely to release
// the CPU before its assigned time slice expires, always give them the
// maximum static time slice allowed.
dispatched_task.set_slice_ns(self.slice_ns);
dispatched_task.set_flag(RL_PREEMPT_CPU);
} else {
dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));
}
dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));

// Send task to the BPF dispatcher.
match self.bpf.dispatch_task(&dispatched_task) {
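As with the BPF hunk above, old and new lines are interleaved here; the flag and time-slice assignment restored by this revert boils down to the sketch below (field and method names as they appear in the hunk, the surrounding dispatch loop omitted):

let mut dispatched_task = DispatchedTask::new(&task.qtask);

// Without the builtin_idle option, let the BPF side pick the first CPU available.
if !self.builtin_idle {
    dispatched_task.set_flag(RL_CPU_ANY);
}

if task.is_interactive && !self.no_preemption {
    // Interactive tasks may preempt others and tend to release the CPU
    // early, so they always get the maximum static time slice.
    dispatched_task.set_slice_ns(self.slice_ns);
    dispatched_task.set_flag(RL_PREEMPT_CPU);
} else {
    // Everyone else gets a slice scaled by the number of waiting tasks.
    dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));
}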
