rusty: Dynamically scale slice according to system util
In user space in rusty, the tuner detects system utilization and uses
it to inform how we do load balancing, set our direct / kick greedy
cpumasks, etc. Something else we could be doing, but currently aren't,
is using system utilization to inform how we dispatch tasks. We
currently use a single, static slice length for the entire runtime of
the program, but no single slice length is efficient for every
scenario.

Giving a task a long slice length does have advantages, such as
decreasing the number of involuntary context switches, reducing the
overhead of preemption by doing it less frequently, and possibly
getting better cache locality because a task stays on a CPU for longer.
On the other hand, long slices can be problematic as well. When a
system is highly utilized, a CPU-hogging task that runs for too long
can harm interactive tasks. When the system is under-utilized, those
interactive tasks can likely find an idle or under-utilized core to run
on. When the system is over-utilized, however, they're likely to have
to wait in a runqueue.
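
As a back-of-the-envelope illustration of that trade-off (not a
measurement, and not part of the patch), here is a tiny Rust snippet
using the two default slice lengths introduced below, 20ms and 1ms:

    // Back-of-the-envelope arithmetic only; illustrates the trade-off,
    // not part of the patch.
    fn main() {
        for slice_us in [20_000u64, 1_000] {
            // Worst case, a newly-woken interactive task queued behind a
            // single CPU hog waits a full slice before it gets to run.
            let worst_wait_ms = slice_us / 1_000;
            // A CPU hog is preempted at most once per slice, so the slice
            // length also bounds the per-CPU preemption rate.
            let max_preempt_per_sec = 1_000_000 / slice_us;
            println!("slice {}us: worst-case wait ~{}ms, <= {} preemptions/s per CPU",
                     slice_us, worst_wait_ms, max_preempt_per_sec);
        }
    }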

Thus, in order to better accommodate such scenarios, this patch
implements a rudimentary slice scaling mechanism in scx_rusty. Rather
than having one global, static slice length, we instead have a dynamic,
global slice length that changes with system utilization: when the
system is over-utilized we use a shorter slice, and when it is
under-utilized we use a longer one. On an AMD Ryzen 9 7950X, this
results in roughly a 50% improvement in mean FPS in Terraria while
Spotify and stress-ng -c $((4 * $(nproc))) run in the background.
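
The core of the change, condensed into a self-contained sketch (the
function and variable names here are illustrative stand-ins, not the
literal code; the real logic is in the tuner.rs and main.bpf.c hunks
below):

    // Condensed sketch of the new behavior: derive the slice from the
    // utilization signal the tuner already computes, then publish it to
    // the BPF side through the existing tune_input map (whose generation
    // counter tells refresh_tune_params() to reload it).
    fn pick_slice_ns(fully_utilized: bool, underutil_ns: u64, overutil_ns: u64) -> u64 {
        if fully_utilized {
            overutil_ns   // short slice: keep CPU hogs from starving interactive tasks
        } else {
            underutil_ns  // long slice: fewer preemptions, better cache locality
        }
    }

    fn main() {
        let (under, over) = (20_000u64 * 1000, 1_000 * 1000); // defaults, us -> ns
        assert_eq!(pick_slice_ns(true, under, over), 1_000_000);   // over-utilized  -> 1ms
        assert_eq!(pick_slice_ns(false, under, over), 20_000_000); // under-utilized -> 20ms
    }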

Signed-off-by: David Vernet <[email protected]>
Byte-Lab committed May 3, 2024
1 parent 7661898 commit 2403f60
Showing 3 changed files with 32 additions and 7 deletions.
4 changes: 3 additions & 1 deletion scheds/rust/scx_rusty/src/bpf/main.bpf.c
@@ -76,7 +76,7 @@ const volatile u32 greedy_threshold_x_numa;
 const volatile u32 debug;
 
 /* base slice duration */
-const volatile u64 slice_ns = SCX_SLICE_DFL;
+static u64 slice_ns = SCX_SLICE_DFL;
 
 /*
  * Per-CPU context
@@ -441,6 +441,7 @@ struct {
  */
 struct tune_input{
         u64 gen;
+        u64 slice_ns;
         u64 direct_greedy_cpumask[MAX_CPUS / 64];
         u64 kick_greedy_cpumask[MAX_CPUS / 64];
 } tune_input;
@@ -477,6 +478,7 @@ static void refresh_tune_params(void)
                 return;
 
         tune_params_gen = tune_input.gen;
+        slice_ns = tune_input.slice_ns;
 
         bpf_for(cpu, 0, nr_cpus_possible) {
                 u32 dom_id = cpu_to_dom_id(cpu);
18 changes: 13 additions & 5 deletions scheds/rust/scx_rusty/src/main.rs
@@ -79,9 +79,13 @@ const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 /// limitation will be removed in the future.
 #[derive(Debug, Parser)]
 struct Opts {
-    /// Scheduling slice duration in microseconds.
-    #[clap(short = 's', long, default_value = "20000")]
-    slice_us: u64,
+    /// Scheduling slice duration for under-utilized hosts, in microseconds.
+    #[clap(short = 'u', long, default_value = "20000")]
+    slice_us_underutil: u64,
+
+    /// Scheduling slice duration for over-utilized hosts, in microseconds.
+    #[clap(short = 'o', long, default_value = "1000")]
+    slice_us_overutil: u64,
 
     /// Monitoring and load balance interval in seconds.
     #[clap(short = 'i', long, default_value = "2.0")]
@@ -305,7 +309,6 @@ impl<'a> Scheduler<'a> {
         }
         skel.struct_ops.rusty_mut().exit_dump_len = opts.exit_dump_len;
 
-        skel.rodata_mut().slice_ns = opts.slice_us * 1000;
         skel.rodata_mut().load_half_life = (opts.load_half_life * 1000000000.0) as u32;
         skel.rodata_mut().kthreads_local = opts.kthreads_local;
         skel.rodata_mut().fifo_sched = opts.fifo_sched;
@@ -342,7 +345,11 @@ impl<'a> Scheduler<'a> {
 
             nr_lb_data_errors: 0,
 
-            tuner: Tuner::new(domains, opts.direct_greedy_under, opts.kick_greedy_under)?,
+            tuner: Tuner::new(domains,
+                              opts.direct_greedy_under,
+                              opts.kick_greedy_under,
+                              opts.slice_us_underutil * 1000,
+                              opts.slice_us_overutil * 1000,)?,
         })
     }
 
@@ -496,6 +503,7 @@ impl<'a> Scheduler<'a> {
 
         );
 
+        info!("slice_length={}us", self.tuner.slice_ns / 1000);
         info!("direct_greedy_cpumask={}", self.tuner.direct_greedy_mask);
         info!("  kick_greedy_cpumask={}", self.tuner.kick_greedy_mask);
 
17 changes: 16 additions & 1 deletion scheds/rust/scx_rusty/src/tuner.rs
@@ -70,6 +70,9 @@ pub struct Tuner {
     pub direct_greedy_mask: Cpumask,
     pub kick_greedy_mask: Cpumask,
     pub fully_utilized: bool,
+    pub slice_ns: u64,
+    underutil_slice_ns: u64,
+    overutil_slice_ns: u64,
     dom_group: Arc<DomainGroup>,
     direct_greedy_under: f64,
     kick_greedy_under: f64,
@@ -80,7 +83,9 @@
 impl Tuner {
     pub fn new(dom_group: Arc<DomainGroup>,
                direct_greedy_under: f64,
-               kick_greedy_under: f64) -> Result<Self> {
+               kick_greedy_under: f64,
+               underutil_slice_ns: u64,
+               overutil_slice_ns: u64) -> Result<Self> {
         let proc_reader = procfs::ProcReader::new();
         let prev_cpu_stats = proc_reader
             .read_stat()?
@@ -95,6 +100,9 @@ impl Tuner {
             kick_greedy_under: kick_greedy_under / 100.0,
             proc_reader,
             prev_cpu_stats,
+            slice_ns: underutil_slice_ns,
+            underutil_slice_ns: underutil_slice_ns,
+            overutil_slice_ns: overutil_slice_ns,
             dom_group,
         })
     }
@@ -161,6 +169,13 @@ impl Tuner {
 
         write_to_bpf(&mut ti.direct_greedy_cpumask, &self.direct_greedy_mask);
         write_to_bpf(&mut ti.kick_greedy_cpumask, &self.kick_greedy_mask);
+        if self.fully_utilized {
+            self.slice_ns = self.overutil_slice_ns;
+        } else {
+            self.slice_ns = self.underutil_slice_ns;
+        }
+        ti.slice_ns = self.slice_ns;
+
         ti.gen += 1;
 
         self.prev_cpu_stats = curr_cpu_stats;
