Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scx_bpfland: introduce dynamic nvcsw threshold #439

Merged
merged 1 commit into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 63 additions & 11 deletions scheds/rust/scx_bpfland/src/bpf/main.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,18 @@ const volatile u64 slice_ns_lag;
const volatile bool local_kthreads;

/*
 * Maximum threshold of voluntary context switches.
 *
 * This limits the range of nvcsw_avg_thresh (see below).
 */
const volatile u64 nvcsw_max_thresh = 10ULL;

/*
 * Global average of voluntary context switches used to classify interactive
 * tasks: tasks with an average amount of voluntary context switches (nvcsw)
 * greater than this value will be classified as interactive.
 */
volatile u64 nvcsw_avg_thresh;

/*
* Time threshold to prevent task starvation.
Expand Down Expand Up @@ -197,6 +205,15 @@ static inline bool is_kthread(const struct task_struct *p)
return !!(p->flags & PF_KTHREAD);
}

/*
 * Check whether classification of interactive tasks via voluntary
 * context switch tracking is enabled (i.e., nvcsw_max_thresh is
 * non-zero); return false when the feature is disabled.
 */
static bool is_nvcsw_enabled(void)
{
	return nvcsw_max_thresh != 0;
}

/*
* Access a cpumask in read-only mode (typically to check bits).
*/
Expand Down Expand Up @@ -248,6 +265,14 @@ static u64 calc_avg(u64 old_val, u64 new_val)
return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/*
 * Evaluate the EWMA (see calc_avg()), clamping the result to the
 * inclusive range [low .. high].
 */
static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
{
	u64 avg = calc_avg(old_val, new_val);

	return CLAMP(avg, low, high);
}

/*
* Compare two vruntime values, returns true if the first value is less than
* the second one.
Expand Down Expand Up @@ -721,13 +746,14 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
/*
 * Refresh the interactive classification of a task.
 *
 * A task is classified as interactive when its average rate of voluntary
 * context switches (tctx->avg_nvcsw) is greater than or equal to the
 * global average (nvcsw_avg_thresh); otherwise it is classified as
 * regular.
 */
static void update_task_interactive(struct task_ctx *tctx)
{
	tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_avg_thresh;
}

/*
Expand Down Expand Up @@ -772,19 +798,45 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* using an exponentially weighted moving average, see calc_avg().
*/
delta_t = (s64)(now - tctx->nvcsw_ts);
if (nvcsw_thresh && delta_t > NSEC_PER_SEC) {
if (is_nvcsw_enabled() && delta_t > NSEC_PER_SEC) {
u64 delta_nvcsw = p->nvcsw - tctx->nvcsw;
u64 avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;

tctx->avg_nvcsw = calc_avg(tctx->avg_nvcsw, avg_nvcsw);
/*
* Evaluate the average nvcsw for the task, limited to the
* range [0 .. nvcsw_max_thresh * 8] to prevent excessive
* spikes.
*/
tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw,
0, nvcsw_max_thresh << 3);
tctx->nvcsw = p->nvcsw;
tctx->nvcsw_ts = now;

/*
* Update the global voluntary context switches average using
* an exponentially weighted moving average (EWMA) with the
* formula:
*
* avg(t) = avg(t - 1) * 0.75 + task_avg(t) * 0.25
*
* This approach is more efficient than iterating through all
* tasks and it helps to prevent rapid fluctuations that may be
* caused by bursts of voluntary context switch events.
*
* Additionally, restrict the global nvcsw_avg_thresh average
* to the range [1 .. nvcsw_max_thresh] to always allow the
* classification of some tasks as interactive.
*/
nvcsw_avg_thresh = calc_avg_clamp(nvcsw_avg_thresh, avg_nvcsw,
1, nvcsw_max_thresh);
/*
* Refresh task status: interactive or regular.
*/
update_task_interactive(tctx);

dbg_msg("%d (%s) avg_nvcsw = %llu [%s]",
p->pid, p->comm, tctx->avg_nvcsw,
tctx->avg_nvcsw < nvcsw_thresh ? "regular" : "interactive");
tctx->avg_nvcsw < nvcsw_avg_thresh ? "regular" : "interactive");
}
}

Expand Down
18 changes: 13 additions & 5 deletions scheds/rust/scx_bpfland/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ struct Opts {
#[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
local_kthreads: bool,

/// Threshold of voluntary context switch per second, used to classify interactive tasks
/// (0 = disable interactive tasks classification).
/// Maximum threshold of voluntary context switch per second, used to classify interactive
/// tasks (0 = disable interactive tasks classification).
#[clap(short = 'c', long, default_value = "10")]
nvcsw_thresh: u64,
nvcsw_max_thresh: u64,

/// Prevent starvation by making sure that at least one lower-priority task is scheduled every
/// starvation_thresh_us (0 = disable starvation prevention).
Expand All @@ -111,6 +111,7 @@ struct Opts {
struct Metrics {
nr_running: Gauge,
nr_interactive: Gauge,
nvcsw_avg_thresh: Gauge,
nr_kthread_dispatches: Gauge,
nr_direct_dispatches: Gauge,
nr_prio_dispatches: Gauge,
Expand All @@ -126,6 +127,9 @@ impl Metrics {
nr_interactive: gauge!(
"nr_interactive", "info" => "Number of running interactive tasks"
),
nvcsw_avg_thresh: gauge!(
"nvcsw_avg_thresh", "info" => "Average of voluntary context switches"
),
nr_kthread_dispatches: gauge!(
"nr_kthread_dispatches", "info" => "Number of kthread direct dispatches"
),
Expand Down Expand Up @@ -193,7 +197,7 @@ impl<'a> Scheduler<'a> {
skel.rodata_mut().slice_ns_min = opts.slice_us_min * 1000;
skel.rodata_mut().slice_ns_lag = opts.slice_us_lag * 1000;
skel.rodata_mut().starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.rodata_mut().nvcsw_thresh = opts.nvcsw_thresh;
skel.rodata_mut().nvcsw_max_thresh = opts.nvcsw_max_thresh;

// Attach the scheduler.
let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
Expand All @@ -218,6 +222,7 @@ impl<'a> Scheduler<'a> {
let nr_cpus = self.skel.bss().nr_online_cpus;
let nr_running = self.skel.bss().nr_running;
let nr_interactive = self.skel.bss().nr_interactive;
let nvcsw_avg_thresh = self.skel.bss().nvcsw_avg_thresh;
let nr_kthread_dispatches = self.skel.bss().nr_kthread_dispatches;
let nr_direct_dispatches = self.skel.bss().nr_direct_dispatches;
let nr_prio_dispatches = self.skel.bss().nr_prio_dispatches;
Expand All @@ -230,6 +235,8 @@ impl<'a> Scheduler<'a> {
self.metrics
.nr_interactive
.set(nr_interactive as f64);
self.metrics
.nvcsw_avg_thresh.set(nvcsw_avg_thresh as f64);
self.metrics
.nr_kthread_dispatches
.set(nr_kthread_dispatches as f64);
Expand All @@ -244,10 +251,11 @@ impl<'a> Scheduler<'a> {
.set(nr_shared_dispatches as f64);

// Log scheduling statistics.
info!("running: {:>4}/{:<4} interactive: {:>4} | kthread: {:<6} | direct: {:<6} | prio: {:<6} | shared: {:<6}",
info!("running: {:>4}/{:<4} interactive: {:>4} | nvcsw: {:<4} | kthread: {:<6} | direct: {:<6} | prio: {:<6} | shared: {:<6}",
nr_running,
nr_cpus,
nr_interactive,
nvcsw_avg_thresh,
nr_kthread_dispatches,
nr_direct_dispatches,
nr_prio_dispatches,
Expand Down