diff --git a/rust/meson.build b/rust/meson.build index 62e7364a2..7ac2aab45 100644 --- a/rust/meson.build +++ b/rust/meson.build @@ -1 +1,2 @@ subdir('scx_utils') +subdir('scx_rustland_core') diff --git a/rust/scx_rustland_core/.gitignore b/rust/scx_rustland_core/.gitignore new file mode 100644 index 000000000..03314f77b --- /dev/null +++ b/rust/scx_rustland_core/.gitignore @@ -0,0 +1 @@ +Cargo.lock diff --git a/rust/scx_rustland_core/Cargo.toml b/rust/scx_rustland_core/Cargo.toml new file mode 100644 index 000000000..91f40415a --- /dev/null +++ b/rust/scx_rustland_core/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "scx_rustland_core" +version = "0.1.0" +edition = "2021" +authors = ["Andrea Righi "] +license = "GPL-2.0-only" +repository = "https://github.com/sched-ext/scx" +description = "Framework to implement sched_ext schedulers running in user space" + +include = [ + "src/bpf/intf.h", + "src/bpf/main.bpf.c", + "src/bpf.rs", +] + +[dependencies] +anyhow = "1.0" +libbpf-rs = "0.22.0" +libc = "0.2.137" +buddy-alloc = "0.5.1" +scx_utils = { path = "../scx_utils", version = "0.6" } + +[build-dependencies] +tar = "0.4" +walkdir = "2.4" +scx_utils = { path = "../scx_utils", version = "0.6" } diff --git a/rust/scx_rustland_core/LICENSE b/rust/scx_rustland_core/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/rust/scx_rustland_core/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/rust/scx_rustland_core/README.md b/rust/scx_rustland_core/README.md new file mode 100644 index 000000000..f4675d351 --- /dev/null +++ b/rust/scx_rustland_core/README.md @@ -0,0 +1,78 @@ +# Framework to implement sched_ext schedulers running in user-space + +[sched_ext](https://github.com/sched-ext/scx) is a Linux kernel feature +which enables implementing kernel thread schedulers in BPF and dynamically +loading them. + +This crate provides a generic layer that can be used to implement sched-ext +schedulers that run in user-space. + +It provides a generic BPF abstraction that is completely agnostic of the +particular scheduling policy implemented in user-space. + +Developers can use such abstraction to implement schedulers using pure Rust +code, without having to deal with any internal kernel / BPF internal details. + +## API + +The main BPF interface is provided by the `BpfScheduler` struct. When this +object is initialized it will take care of registering and initializing the BPF +component. + +The scheduler then can use `BpfScheduler` instance to receive tasks (in the +form of `QueuedTask` objects) and dispatch tasks (in the form of DispatchedTask +objects), using respectively the methods `dequeue_task()` and `dispatch_task()`. + +Example usage (FIFO scheduler): +``` +struct Scheduler<'a> { + bpf: BpfScheduler<'a>, +} + +impl<'a> Scheduler<'a> { + fn init() -> Result { + let topo = Topology::new().expect("Failed to build host topology"); + let bpf = BpfScheduler::init(5000, topo.nr_cpus() as i32, false, false, false)?; + Ok(Self { bpf }) + } + + fn schedule(&mut self) { + match self.bpf.dequeue_task() { + Ok(Some(task)) => { + // task.cpu < 0 is used to to notify an exiting task, in this + // case we can simply ignore it. + if task.cpu >= 0 { + let _ = self.bpf.dispatch_task(&DispatchedTask { + pid: task.pid, + cpu: task.cpu, + cpumask_cnt: task.cpumask_cnt, + payload: 0, + }); + } + } + Ok(None) => { + // Notify the BPF component that all tasks have been dispatched. + self.bpf.update_tasks(Some(0), Some(0))? + + break; + } + Err(_) => { + break; + } + } + } +``` + +Moreover, a CPU ownership map (that keeps track of which PID runs on which CPU) +can be accessed using the method `get_cpu_pid()`. This also allows to keep +track of the idle and busy CPUs, with the corresponding PIDs associated to +them. + +BPF counters and statistics can be accessed using the methods `nr_*_mut()`, in +particular `nr_queued_mut()` and `nr_scheduled_mut()` can be updated to notify +the BPF component if the user-space scheduler has still some pending work to do +or not. + +Lastly, the methods `exited()` and `shutdown_and_report()` can be used +respectively to test whether the BPF component exited, and to shutdown and +report the exit message. diff --git a/rust/scx_rustland_core/bindings.h b/rust/scx_rustland_core/bindings.h new file mode 100644 index 000000000..024239315 --- /dev/null +++ b/rust/scx_rustland_core/bindings.h @@ -0,0 +1 @@ +#include "bpf_h/vmlinux/vmlinux.h" diff --git a/rust/scx_rustland_core/bpf_h b/rust/scx_rustland_core/bpf_h new file mode 120000 index 000000000..1cf48acda --- /dev/null +++ b/rust/scx_rustland_core/bpf_h @@ -0,0 +1 @@ +../../scheds/include \ No newline at end of file diff --git a/rust/scx_rustland_core/build.rs b/rust/scx_rustland_core/build.rs new file mode 100644 index 000000000..1eaf10aba --- /dev/null +++ b/rust/scx_rustland_core/build.rs @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +use scx_utils::Builder; + +fn main() { + Builder::new().build() +} diff --git a/rust/scx_rustland_core/meson.build b/rust/scx_rustland_core/meson.build new file mode 100644 index 000000000..f24a7ad71 --- /dev/null +++ b/rust/scx_rustland_core/meson.build @@ -0,0 +1,7 @@ +custom_target('scx_rustland_core', + output: '@PLAINNAME@.__PHONY__', + input: 'Cargo.toml', + command: [cargo, 'build', '--manifest-path=@INPUT@', '--target-dir=@OUTDIR@', + cargo_build_args], + env: cargo_env, + build_by_default: true) diff --git a/scheds/rust/scx_rustland/src/bpf/alloc.rs b/rust/scx_rustland_core/src/alloc.rs similarity index 94% rename from scheds/rust/scx_rustland/src/bpf/alloc.rs rename to rust/scx_rustland_core/src/alloc.rs index 1487a7c6c..25fffe6f1 100644 --- a/scheds/rust/scx_rustland/src/bpf/alloc.rs +++ b/rust/scx_rustland_core/src/alloc.rs @@ -15,7 +15,7 @@ const HEAP_SIZE: usize = 64 * 1024 * 1024; // 64M const LEAF_SIZE: usize = 64; #[repr(align(4096))] -pub struct AlignedHeap([u8; N]); +struct AlignedHeap([u8; N]); // Statically pre-allocated memory arena. static mut FAST_HEAP: AlignedHeap = AlignedHeap([0u8; FAST_HEAP_SIZE]); @@ -26,25 +26,25 @@ static mut HEAP: AlignedHeap = AlignedHeap([0u8; HEAP_SIZE]); // To prevent potential deadlock conditions under heavy loads, any scheduler that delegates // scheduling decisions to user-space should avoid triggering page faults. // -// To address this issue, replace the global allocator with a custom one (RustLandAllocator), +// To address this issue, replace the global allocator with a custom one (UserAllocator), // designed to operate on a pre-allocated buffer. This, coupled with the memory locking achieved // through mlockall(), prevents page faults from occurring during the execution of the user-space // scheduler. #[cfg_attr(not(test), global_allocator)] -pub static ALLOCATOR: RustLandAllocator = unsafe { +pub static ALLOCATOR: UserAllocator = unsafe { let fast_param = FastAllocParam::new(FAST_HEAP.0.as_ptr(), FAST_HEAP_SIZE); let buddy_param = BuddyAllocParam::new(HEAP.0.as_ptr(), HEAP_SIZE, LEAF_SIZE); - RustLandAllocator { + UserAllocator { arena: NonThreadsafeAlloc::new(fast_param, buddy_param), } }; // Main allocator class. -pub struct RustLandAllocator { - pub arena: NonThreadsafeAlloc, +pub struct UserAllocator { + arena: NonThreadsafeAlloc, } -impl RustLandAllocator { +impl UserAllocator { pub fn lock_memory(&self) { unsafe { VM.save(); @@ -75,7 +75,7 @@ impl RustLandAllocator { } // Override global allocator methods. -unsafe impl GlobalAlloc for RustLandAllocator { +unsafe impl GlobalAlloc for UserAllocator { unsafe fn alloc(&self, layout: Layout) -> *mut u8 { self.arena.alloc(layout) } diff --git a/rust/scx_rustland_core/src/bindings.rs b/rust/scx_rustland_core/src/bindings.rs new file mode 100644 index 000000000..cd503e4b5 --- /dev/null +++ b/rust/scx_rustland_core/src/bindings.rs @@ -0,0 +1,6 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); diff --git a/scheds/rust/scx_rustland/src/bpf.rs b/rust/scx_rustland_core/src/bpf.rs similarity index 82% rename from scheds/rust/scx_rustland/src/bpf.rs rename to rust/scx_rustland_core/src/bpf.rs index 6cdec059a..090ea1072 100644 --- a/scheds/rust/scx_rustland/src/bpf.rs +++ b/rust/scx_rustland_core/src/bpf.rs @@ -15,13 +15,12 @@ use libbpf_rs::skel::SkelBuilder as _; use libc::{sched_param, sched_setscheduler}; -mod alloc; -use alloc::*; - use scx_utils::init_libbpf_logging; use scx_utils::uei_exited; use scx_utils::uei_report; +use scx_rustland_core::ALLOCATOR; + // Defined in UAPI const SCHED_EXT: i32 = 7; @@ -31,7 +30,7 @@ const SCHED_EXT: i32 = 7; #[allow(dead_code)] pub const NO_CPU: i32 = -1; -/// scx_rustland: provide high-level abstractions to interact with the BPF component. +/// High-level Rust abstraction to interact with a generic sched-ext BPF component. /// /// Overview /// ======== @@ -40,7 +39,7 @@ pub const NO_CPU: i32 = -1; /// initialized it will take care of registering and initializing the BPF component. /// /// The scheduler then can use BpfScheduler() instance to receive tasks (in the form of QueuedTask -/// object) and dispatch tasks (in the form of DispatchedTask objects), using respectively the +/// objects) and dispatch tasks (in the form of DispatchedTask objects), using respectively the /// methods dequeue_task() and dispatch_task(). /// /// The CPU ownership map can be accessed using the method get_cpu_pid(), this also allows to keep @@ -51,85 +50,8 @@ pub const NO_CPU: i32 = -1; /// user-space scheduler has some pending work to do or not. /// /// Finally the methods exited() and shutdown_and_report() can be used respectively to test +/// whether the BPF component exited, and to shutdown and report the exit message. /// whether the BPF component exited, and to shutdown and report exit message. -/// -/// Example -/// ======= -/// -/// Following you can find bare minimum template that can be used to implement a simple FIFO -/// scheduler using the BPF abstraction: -/// -/// mod bpf_skel; -/// pub use bpf_skel::*; -/// mod bpf; -/// pub mod bpf_intf; -/// use bpf::*; -/// -/// use std::thread; -/// -/// use std::sync::atomic::AtomicBool; -/// use std::sync::atomic::Ordering; -/// use std::sync::Arc; -/// -/// use anyhow::Result; -/// -/// struct Scheduler<'a> { -/// bpf: BpfScheduler<'a>, -/// } -/// -/// impl<'a> Scheduler<'a> { -/// fn init() -> Result { -/// let bpf = BpfScheduler::init(20000, false, false)?; -/// Ok(Self { bpf }) -/// } -/// -/// fn dispatch_tasks(&mut self) { -/// loop { -/// match self.bpf.dequeue_task() { -/// Ok(Some(task)) => { -/// if task.cpu >= 0 { -/// let _ = self.bpf.dispatch_task( -/// &DispatchedTask { -/// pid: task.pid, -/// cpu: task.cpu, -/// payload: 0, -/// } -/// ); -/// } -/// } -/// Ok(None) => { -/// *self.bpf.nr_queued_mut() = 0; -/// *self.bpf.nr_scheduled_mut() = 0; -/// break; -/// } -/// Err(_) => { -/// break; -/// } -/// } -/// } -/// } -/// -/// fn run(&mut self, shutdown: Arc) -> Result<()> { -/// while !shutdown.load(Ordering::Relaxed) && !self.bpf.exited() { -/// self.dispatch_tasks(); -/// thread::yield_now(); -/// } -/// -/// Ok(()) -/// } -/// } -/// -/// fn main() -> Result<()> { -/// let mut sched = Scheduler::init()?; -/// let shutdown = Arc::new(AtomicBool::new(false)); -/// let shutdown_clone = shutdown.clone(); -/// ctrlc::set_handler(move || { -/// shutdown_clone.store(true, Ordering::Relaxed); -/// })?; -/// -/// sched.run(shutdown) -/// } -/// // Task queued for scheduling from the BPF component (see bpf_intf::queued_task_ctx). #[derive(Debug)] @@ -241,16 +163,31 @@ impl<'cb> BpfScheduler<'cb> { ALLOCATOR.lock_memory(); // Copy one item from the ring buffer. + // + // # Safety + // + // Each invocation of the callback will trigger the copy of exactly one QueuedTask item to + // BUF. The caller must be synchronize to ensure that multiple invocations of the callback + // are not happening at the same time, but this is implicitly guaranteed by the fact that + // the caller is a single-thread process (for now). + // + // Use of a `str` whose contents are not valid UTF-8 is undefined behavior. fn callback(data: &[u8]) -> i32 { unsafe { + // SAFETY: copying from the BPF ring buffer to BUF is safe, since the size of BUF + // is exactly the size of QueuedTask and the callback operates in chunks of + // QueuedTask items. It also copies exactly one QueuedTask at a time, this is + // guaranteed by the error code returned by this callback (see below). From a + // thread-safety perspective this is also correct, assuming the caller is a + // single-thread process (as it is for now). BUF.0.copy_from_slice(data); } // Return an unsupported error to stop early and consume only one item. // // NOTE: this is quite a hack. I wish libbpf would honor stopping after the first item - // is consumed, upon returnin a non-zero positive value here, but it doesn't seem to be - // the case: + // is consumed, upon returning a non-zero positive value here, but it doesn't seem to + // be the case: // // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/ringbuf.c?h=v6.8-rc5#n260 // @@ -305,6 +242,23 @@ impl<'cb> BpfScheduler<'cb> { } } + // Update the amount of tasks that have been queued to the user-space scheduler and dispatched. + // + // This method is used to notify the BPF component if the user-space scheduler has still some + // pending actions to complete (based on the counter of queued and scheduled tasks). + // + // NOTE: do not set allow(dead_code) for this method, any scheduler must use this method at + // some point, otherwise the BPF component will keep waking-up the user-space scheduler in a + // busy loop, causing unnecessary high CPU consumption. + pub fn update_tasks(&mut self, nr_queued: Option, nr_scheduled: Option) { + if let Some(queued) = nr_queued { + self.skel.bss_mut().nr_queued = queued; + } + if let Some(scheduled) = nr_scheduled { + self.skel.bss_mut().nr_scheduled = scheduled; + } + } + // Override the default scheduler time slice (in us). #[allow(dead_code)] pub fn set_effective_slice_us(&mut self, slice_us: u64) { @@ -324,11 +278,13 @@ impl<'cb> BpfScheduler<'cb> { } // Counter of queued tasks. + #[allow(dead_code)] pub fn nr_queued_mut(&mut self) -> &mut u64 { &mut self.skel.bss_mut().nr_queued } // Counter of scheduled tasks. + #[allow(dead_code)] pub fn nr_scheduled_mut(&mut self) -> &mut u64 { &mut self.skel.bss_mut().nr_scheduled } diff --git a/scheds/rust/scx_rustland/src/bpf/intf.h b/rust/scx_rustland_core/src/bpf/intf.h similarity index 100% rename from scheds/rust/scx_rustland/src/bpf/intf.h rename to rust/scx_rustland_core/src/bpf/intf.h diff --git a/scheds/rust/scx_rustland/src/bpf/main.bpf.c b/rust/scx_rustland_core/src/bpf/main.bpf.c similarity index 96% rename from scheds/rust/scx_rustland/src/bpf/main.bpf.c rename to rust/scx_rustland_core/src/bpf/main.bpf.c index 8c073da51..fe8330cff 100644 --- a/scheds/rust/scx_rustland/src/bpf/main.bpf.c +++ b/rust/scx_rustland_core/src/bpf/main.bpf.c @@ -1,19 +1,13 @@ /* Copyright (c) Andrea Righi */ /* - * scx_rustland: simple user-space scheduler written in Rust + * scx_rustland_core: BPF backend for schedulers running in user-space. * - * The main goal of this scheduler is be an "easy to read" template that can be - * used to quickly test more complex scheduling policies. For this reason this - * scheduler is mostly focused on simplicity and code readability. + * This BPF backend implements the low level sched-ext functionalities for a + * user-space counterpart, that implements the actual scheduling policy. * - * The scheduler is made of a BPF component (dispatcher) that implements the - * low level sched-ext functionalities and a user-space counterpart - * (scheduler), written in Rust, that implements the actual scheduling policy. - * - * The BPF dispatcher collects total cputime and weight from the tasks that - * need to run, then it sends all details to the user-space scheduler that - * decides the best order of execution of the tasks (based on the collected - * metrics). + * The BPF part collects total cputime and weight from the tasks that need to + * run, then it sends all details to the user-space scheduler that decides the + * best order of execution of the tasks (based on the collected metrics). * * The user-space scheduler then returns to the BPF component the list of tasks * to be dispatched in the proper order. diff --git a/rust/scx_rustland_core/src/bpf_intf.rs b/rust/scx_rustland_core/src/bpf_intf.rs new file mode 100644 index 000000000..9db020efd --- /dev/null +++ b/rust/scx_rustland_core/src/bpf_intf.rs @@ -0,0 +1,9 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git a/rust/scx_rustland_core/src/bpf_skel.rs b/rust/scx_rustland_core/src/bpf_skel.rs new file mode 100644 index 000000000..c42af33d6 --- /dev/null +++ b/rust/scx_rustland_core/src/bpf_skel.rs @@ -0,0 +1,4 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git a/rust/scx_rustland_core/src/lib.rs b/rust/scx_rustland_core/src/lib.rs new file mode 100644 index 000000000..64e7c55ab --- /dev/null +++ b/rust/scx_rustland_core/src/lib.rs @@ -0,0 +1,7 @@ +mod bindings; + +mod alloc; +pub use alloc::ALLOCATOR; + +mod rustland_builder; +pub use rustland_builder::RustLandBuilder; diff --git a/rust/scx_rustland_core/src/rustland_builder.rs b/rust/scx_rustland_core/src/rustland_builder.rs new file mode 100644 index 000000000..7e7fd88fd --- /dev/null +++ b/rust/scx_rustland_core/src/rustland_builder.rs @@ -0,0 +1,50 @@ +// Copyright (c) Andrea Righi + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +use anyhow::Result; + +use std::fs::File; +use std::io::Write; +use std::path::Path; + +use scx_utils::BpfBuilder; + +pub struct RustLandBuilder { + inner_builder: BpfBuilder, +} + +impl RustLandBuilder { + pub fn new() -> Result { + Ok(Self { + inner_builder: BpfBuilder::new()?, + }) + } + + fn create_file(&self, file_name: &str, content: &[u8]) { + let path = Path::new(file_name); + let mut file = File::create(&path).expect("Unable to create file"); + file.write_all(content).expect("Unable to write to file"); + } + + pub fn build(&mut self) -> Result<()> { + // Embed the BPF source files. + let intf = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf/intf.h")); + let skel = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf/main.bpf.c")); + let bpf = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf.rs")); + + // Generate BPF backend code (C). + self.create_file("intf.h", intf); + self.create_file("main.bpf.c", skel); + + self.inner_builder.enable_intf("intf.h", "bpf_intf.rs"); + self.inner_builder.enable_skel("main.bpf.c", "bpf"); + + // Generate user-space BPF connector code (Rust). + self.create_file("src/bpf.rs", bpf); + + // Build the scheduler. + self.inner_builder.build() + } +} diff --git a/rust/scx_utils/Cargo.toml b/rust/scx_utils/Cargo.toml index a0c063b6c..305bc4173 100644 --- a/rust/scx_utils/Cargo.toml +++ b/rust/scx_utils/Cargo.toml @@ -18,10 +18,12 @@ hex = "0.4.3" lazy_static = "1.4" libbpf-cargo = "0.22" libbpf-rs = "0.22.0" +buddy-alloc = "0.5.1" log = "0.4.17" regex = "1.10" sscanf = "0.4" tar = "0.4" +walkdir = "2.4" version-compare = "0.1" [build-dependencies] diff --git a/rust/scx_utils/build.rs b/rust/scx_utils/build.rs index ad00c19de..16696e311 100644 --- a/rust/scx_utils/build.rs +++ b/rust/scx_utils/build.rs @@ -3,51 +3,8 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -use std::env; -use std::fs::File; -use std::path::PathBuf; - -const BPF_H: &str = "bpf_h"; - -fn gen_bpf_h() { - let file = - File::create(PathBuf::from(env::var("OUT_DIR").unwrap()).join(format!("{}.tar", BPF_H))) - .unwrap(); - let mut ar = tar::Builder::new(file); - - ar.follow_symlinks(false); - ar.append_dir_all(".", BPF_H).unwrap(); - ar.finish().unwrap(); - - for ent in walkdir::WalkDir::new(BPF_H) { - let ent = ent.unwrap(); - if !ent.file_type().is_dir() { - println!("cargo:rerun-if-changed={}", ent.path().to_string_lossy()); - } - } -} - -fn gen_bindings() { - // FIXME - bindgen's API changed between 0.68 and 0.69 so that - // `bindgen::CargoCallbacks::new()` should be used instead of - // `bindgen::CargoCallbacks`. Unfortunately, as of Dec 2023, fedora is - // shipping 0.68. To accommodate fedora, allow both 0.68 and 0.69 of - // bindgen and suppress deprecation warning. Remove the following once - // fedora can be updated to bindgen >= 0.69. - #[allow(deprecated)] - let bindings = bindgen::Builder::default() - .header("bindings.h") - .allowlist_type("scx_exit_kind") - .parse_callbacks(Box::new(bindgen::CargoCallbacks)) - .generate() - .expect("Unable to generate bindings"); - - bindings - .write_to_file(PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs")) - .expect("Couldn't write bindings"); -} +include!("src/builder.rs"); fn main() { - gen_bpf_h(); - gen_bindings(); + Builder::new().build() } diff --git a/rust/scx_utils/src/builder.rs b/rust/scx_utils/src/builder.rs new file mode 100644 index 000000000..f55b29ad6 --- /dev/null +++ b/rust/scx_utils/src/builder.rs @@ -0,0 +1,61 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +use std::env; +use std::fs::File; +use std::path::PathBuf; + +const BPF_H: &str = "bpf_h"; + +pub struct Builder; + +impl Builder { + pub fn new() -> Self { + Builder + } + + fn gen_bpf_h(&self) { + let out_dir = env::var("OUT_DIR").unwrap(); + let file = File::create(PathBuf::from(&out_dir).join(format!("{}.tar", BPF_H))).unwrap(); + let mut ar = tar::Builder::new(file); + + ar.follow_symlinks(false); + ar.append_dir_all(".", BPF_H).unwrap(); + ar.finish().unwrap(); + + for ent in walkdir::WalkDir::new(BPF_H) { + let ent = ent.unwrap(); + if !ent.file_type().is_dir() { + println!("cargo:rerun-if-changed={}", ent.path().to_string_lossy()); + } + } + } + + fn gen_bindings(&self) { + let out_dir = env::var("OUT_DIR").unwrap(); + // FIXME - bindgen's API changed between 0.68 and 0.69 so that + // `bindgen::CargoCallbacks::new()` should be used instead of + // `bindgen::CargoCallbacks`. Unfortunately, as of Dec 2023, fedora is + // shipping 0.68. To accommodate fedora, allow both 0.68 and 0.69 of + // bindgen and suppress deprecation warning. Remove the following once + // fedora can be updated to bindgen >= 0.69. + #[allow(deprecated)] + let bindings = bindgen::Builder::default() + .header("bindings.h") + .allowlist_type("scx_exit_kind") + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + .generate() + .expect("Unable to generate bindings"); + + bindings + .write_to_file(PathBuf::from(&out_dir).join("bindings.rs")) + .expect("Couldn't write bindings"); + } + + pub fn build(self) { + self.gen_bpf_h(); + self.gen_bindings(); + } +} diff --git a/rust/scx_utils/src/lib.rs b/rust/scx_utils/src/lib.rs index a262d0955..18c7de468 100644 --- a/rust/scx_utils/src/lib.rs +++ b/rust/scx_utils/src/lib.rs @@ -35,6 +35,9 @@ mod bindings; mod bpf_builder; pub use bpf_builder::BpfBuilder; +mod builder; +pub use builder::Builder; + pub mod ravg; mod libbpf_logger; diff --git a/scheds/rust/README.md b/scheds/rust/README.md index 04d3882ff..d35f1e194 100644 --- a/scheds/rust/README.md +++ b/scheds/rust/README.md @@ -15,3 +15,4 @@ main.rs or \*.bpf.c files. - [scx_layered](scx_layered/README.md) - [scx_rusty](scx_rusty/README.md) - [scx_rustland](scx_rustland/README.md) +- [scx_rlfifo](scx_rlfifo/README.md) diff --git a/scheds/rust/meson.build b/scheds/rust/meson.build index ec6d22b86..8750b18e6 100644 --- a/scheds/rust/meson.build +++ b/scheds/rust/meson.build @@ -1,3 +1,4 @@ subdir('scx_layered') subdir('scx_rusty') subdir('scx_rustland') +subdir('scx_rlfifo') diff --git a/scheds/rust/scx_rlfifo/.gitignore b/scheds/rust/scx_rlfifo/.gitignore new file mode 100644 index 000000000..3afb3353f --- /dev/null +++ b/scheds/rust/scx_rlfifo/.gitignore @@ -0,0 +1,6 @@ +src/bpf/.output +intf.h +main.bpf.c +bpf.rs +Cargo.lock +target diff --git a/scheds/rust/scx_rlfifo/Cargo.toml b/scheds/rust/scx_rlfifo/Cargo.toml new file mode 100644 index 000000000..6128db1f1 --- /dev/null +++ b/scheds/rust/scx_rlfifo/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "scx_rlfifo" +version = "0.0.1" +authors = ["Andrea Righi ", "Canonical"] +edition = "2021" +description = "A simple FIFO scheduler in Rust that runs in user-space" +license = "GPL-2.0-only" + +[dependencies] +anyhow = "1.0.65" +ctrlc = { version = "3.1", features = ["termination"] } +libbpf-rs = "0.22.0" +libc = "0.2.137" +scx_utils = { path = "../../../rust/scx_utils", version = "0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" } + +[build-dependencies] +scx_utils = { path = "../../../rust/scx_utils", version = "0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" } + +[features] +enable_backtrace = [] diff --git a/scheds/rust/scx_rlfifo/LICENSE b/scheds/rust/scx_rlfifo/LICENSE new file mode 120000 index 000000000..5853aaea5 --- /dev/null +++ b/scheds/rust/scx_rlfifo/LICENSE @@ -0,0 +1 @@ +../../../LICENSE \ No newline at end of file diff --git a/scheds/rust/scx_rlfifo/README.md b/scheds/rust/scx_rlfifo/README.md new file mode 100644 index 000000000..f57115fc3 --- /dev/null +++ b/scheds/rust/scx_rlfifo/README.md @@ -0,0 +1,20 @@ +# scx_rlfifo + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +scx_rlfifo is a simple FIFO scheduler runs in user-space, based on the +scx_rustland_core framework. + +## Typical Use Case + +This scheduler is provided as a simple template that can be used as a baseline +to test more complex scheduling policies. + +## Production Ready? + +Definitely not. Using this scheduler in a production environment is not +recommended, unless there are specific requirements that necessitate a basic +FIFO scheduling approach. Even then, it's still recommended to use the kernel's +SCHED_FIFO real-time class. diff --git a/scheds/rust/scx_rlfifo/build.rs b/scheds/rust/scx_rlfifo/build.rs new file mode 100644 index 000000000..8b8285790 --- /dev/null +++ b/scheds/rust/scx_rlfifo/build.rs @@ -0,0 +1,9 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +fn main() { + scx_rustland_core::RustLandBuilder::new() + .unwrap() + .build() + .unwrap(); +} diff --git a/scheds/rust/scx_rlfifo/meson.build b/scheds/rust/scx_rlfifo/meson.build new file mode 100644 index 000000000..72ad547d8 --- /dev/null +++ b/scheds/rust/scx_rlfifo/meson.build @@ -0,0 +1,7 @@ +custom_target('scx_rlfifo', + output: '@PLAINNAME@.__PHONY__', + input: 'Cargo.toml', + command: [cargo, 'build', '--manifest-path=@INPUT@', '--target-dir=@OUTDIR@', + cargo_build_args], + env: cargo_env, + build_by_default: true) diff --git a/scheds/rust/scx_rlfifo/rustfmt.toml b/scheds/rust/scx_rlfifo/rustfmt.toml new file mode 100644 index 000000000..b7258ed0a --- /dev/null +++ b/scheds/rust/scx_rlfifo/rustfmt.toml @@ -0,0 +1,8 @@ +# Get help on options with `rustfmt --help=config` +# Please keep these in alphabetical order. +edition = "2021" +group_imports = "StdExternalCrate" +imports_granularity = "Item" +merge_derives = false +use_field_init_shorthand = true +version = "Two" diff --git a/scheds/rust/scx_rlfifo/src/bpf_intf.rs b/scheds/rust/scx_rlfifo/src/bpf_intf.rs new file mode 100644 index 000000000..9db020efd --- /dev/null +++ b/scheds/rust/scx_rlfifo/src/bpf_intf.rs @@ -0,0 +1,9 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git a/scheds/rust/scx_rlfifo/src/bpf_skel.rs b/scheds/rust/scx_rlfifo/src/bpf_skel.rs new file mode 100644 index 000000000..c42af33d6 --- /dev/null +++ b/scheds/rust/scx_rlfifo/src/bpf_skel.rs @@ -0,0 +1,4 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git a/scheds/rust/scx_rlfifo/src/main.rs b/scheds/rust/scx_rlfifo/src/main.rs new file mode 100644 index 000000000..9b61e5b69 --- /dev/null +++ b/scheds/rust/scx_rlfifo/src/main.rs @@ -0,0 +1,117 @@ +// Copyright (c) Andrea Righi + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +mod bpf_skel; +pub use bpf_skel::*; +pub mod bpf_intf; + +mod bpf; +use bpf::*; + +use scx_utils::Topology; + +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use std::time::{Duration, SystemTime}; + +use anyhow::Result; + +struct Scheduler<'a> { + bpf: BpfScheduler<'a>, +} + +impl<'a> Scheduler<'a> { + fn init() -> Result { + let topo = Topology::new().expect("Failed to build host topology"); + let bpf = BpfScheduler::init(5000, topo.nr_cpus() as i32, false, false, false)?; + Ok(Self { bpf }) + } + + fn now() -> u64 { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + } + + fn dispatch_tasks(&mut self) { + loop { + // Get queued taks and dispatch them in order (FIFO). + match self.bpf.dequeue_task() { + Ok(Some(task)) => { + // task.cpu < 0 is used to to notify an exiting task, in this + // case we can simply ignore the task. + if task.cpu >= 0 { + let _ = self.bpf.dispatch_task(&DispatchedTask { + pid: task.pid, + cpu: task.cpu, + cpumask_cnt: task.cpumask_cnt, + payload: 0, + }); + } + // Give the task a chance to run and prevent overflowing the dispatch queue. + std::thread::yield_now(); + } + Ok(None) => { + // Notify the BPF component that all tasks have been scheduled and dispatched. + self.bpf.update_tasks(Some(0), Some(0)); + + // All queued tasks have been dipatched, add a short sleep to reduce + // scheduler's CPU consuption. + std::thread::sleep(Duration::from_millis(1)); + break; + } + Err(_) => { + break; + } + } + } + } + + fn print_stats(&mut self) { + let nr_user_dispatches = *self.bpf.nr_user_dispatches_mut(); + let nr_kernel_dispatches = *self.bpf.nr_kernel_dispatches_mut(); + let nr_cancel_dispatches = *self.bpf.nr_cancel_dispatches_mut(); + let nr_bounce_dispatches = *self.bpf.nr_bounce_dispatches_mut(); + let nr_failed_dispatches = *self.bpf.nr_failed_dispatches_mut(); + let nr_sched_congested = *self.bpf.nr_sched_congested_mut(); + + println!( + "user={} kernel={} cancel={} bounce={} fail={} cong={}", + nr_user_dispatches, nr_kernel_dispatches, + nr_cancel_dispatches, nr_bounce_dispatches, + nr_failed_dispatches, nr_sched_congested, + ); + } + + fn run(&mut self, shutdown: Arc) -> Result<()> { + let mut prev_ts = Self::now(); + + while !shutdown.load(Ordering::Relaxed) && !self.bpf.exited() { + self.dispatch_tasks(); + + let curr_ts = Self::now(); + if curr_ts > prev_ts { + self.print_stats(); + prev_ts = curr_ts; + } + } + + self.bpf.shutdown_and_report() + } +} + +fn main() -> Result<()> { + let mut sched = Scheduler::init()?; + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_clone = shutdown.clone(); + + ctrlc::set_handler(move || { + shutdown_clone.store(true, Ordering::Relaxed); + })?; + + sched.run(shutdown) +} diff --git a/scheds/rust/scx_rustland/.gitignore b/scheds/rust/scx_rustland/.gitignore index 186dba259..3afb3353f 100644 --- a/scheds/rust/scx_rustland/.gitignore +++ b/scheds/rust/scx_rustland/.gitignore @@ -1,3 +1,6 @@ src/bpf/.output +intf.h +main.bpf.c +bpf.rs Cargo.lock target diff --git a/scheds/rust/scx_rustland/Cargo.toml b/scheds/rust/scx_rustland/Cargo.toml index a0d6890d0..51f8bafbc 100644 --- a/scheds/rust/scx_rustland/Cargo.toml +++ b/scheds/rust/scx_rustland/Cargo.toml @@ -8,21 +8,20 @@ license = "GPL-2.0-only" [dependencies] anyhow = "1.0.65" -bitvec = { version = "1.0", features = ["serde"] } clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7.0" -hex = "0.4.3" libbpf-rs = "0.22.0" libc = "0.2.137" -buddy-alloc = "0.5.1" log = "0.4.17" ordered-float = "3.4.0" scx_utils = { path = "../../../rust/scx_utils", version = "0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" } simplelog = "0.12.0" [build-dependencies] scx_utils = { path = "../../../rust/scx_utils", version = "0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_rustland/README.md b/scheds/rust/scx_rustland/README.md index 47d86526b..a3b252799 100644 --- a/scheds/rust/scx_rustland/README.md +++ b/scheds/rust/scx_rustland/README.md @@ -4,25 +4,24 @@ This is a single user-defined scheduler used within [sched_ext](https://github.c ## Overview -scx_rustland is made of a BPF component (dispatcher) that implements the low -level sched-ext functionalities and a user-space counterpart (scheduler), +scx_rustland is made of a BPF component (scx_rustland_core) that implements the +low level sched-ext functionalities and a user-space counterpart (scheduler), written in Rust, that implements the actual scheduling policy. -The BPF dispatcher is completely agnostic of the particular scheduling policy -implemented in user-space. For this reason developers that are willing to use -this scheduler to experiment scheduling policies should be able to simply -modify the Rust component, without having to deal with any internal kernel / -BPF details. - ## How To Install Available as a [Rust crate](https://crates.io/crates/scx_rustland): `cargo add scx_rustland` ## Typical Use Case -scx_rustland is designed to be "easy to read" template that can be used by any -developer to quickly experiment more complex scheduling policies, that can be -fully implemented in Rust. +scx_rustland is designed to prioritize interactive workloads over background +CPU-intensive workloads. For this reason the typical use case of this scheduler +involves low-latency interactive applications, such as gaming, video +conferencing and live streaming. + +scx_rustland is also designed to be an "easy to read" template that can be used +by any developer to quickly experiment more complex scheduling policies fully +implemented in Rust. ## Production Ready? @@ -32,20 +31,17 @@ with a certain cost. However, a scheduler entirely implemented in user-space holds the potential for seamless integration with sophisticated libraries, tracing tools, external -services (e.g., AI), etc. Hence, there might be situations where the benefits -outweigh the overhead, justifying the use of this scheduler in a production -environment. +services (e.g., AI), etc. + +Hence, there might be situations where the benefits outweigh the overhead, +justifying the use of this scheduler in a production environment. ## Demo [scx_rustland-terraria](https://github.com/sched-ext/scx/assets/1051723/42ec3bf2-9f1f-4403-80ab-bf5d66b7c2d5) -For this demo the scheduler includes an extra patch to impose a "time slice -penalty" on new short-lived tasks. While this approach might not be suitable -for general usage, it can yield significant advantages in this specific -scenario. - -The key takeaway is to demonstrate the ease and safety of conducting -experiments like this, as we operate in user-space, and we can accomplish -everything simply by modifying the Rust code, that is completely abstracted -from the underlying BPF/kernel internal details. +The key takeaway of this demo is to demonstrate that , despite the overhead of +running a scheduler in user-space, we can still obtain interesting results and, +in this particular case, even outperform the default Linux scheduler (EEVDF) in +terms of application responsiveness (fps), while a CPU intensive workload +(parallel kernel build) is running in the background. diff --git a/scheds/rust/scx_rustland/build.rs b/scheds/rust/scx_rustland/build.rs index 42e96b711..8b8285790 100644 --- a/scheds/rust/scx_rustland/build.rs +++ b/scheds/rust/scx_rustland/build.rs @@ -2,10 +2,8 @@ // GNU General Public License version 2. fn main() { - scx_utils::BpfBuilder::new() + scx_rustland_core::RustLandBuilder::new() .unwrap() - .enable_intf("src/bpf/intf.h", "bpf_intf.rs") - .enable_skel("src/bpf/main.bpf.c", "bpf") .build() .unwrap(); } diff --git a/scheds/rust/scx_rustland/src/main.rs b/scheds/rust/scx_rustland/src/main.rs index f0ffcf240..a43b132f0 100644 --- a/scheds/rust/scx_rustland/src/main.rs +++ b/scheds/rust/scx_rustland/src/main.rs @@ -33,15 +33,18 @@ use log::warn; const SCHEDULER_NAME: &'static str = "RustLand"; -/// scx_rustland: simple user-space scheduler written in Rust +/// scx_rustland: user-space scheduler written in Rust /// -/// The main goal of this scheduler is be an "easy to read" template that can be used to quickly -/// test more complex scheduling policies. For this reason this scheduler is mostly focused on -/// simplicity and code readability. +/// scx_rustland is designed to prioritize interactive workloads over background CPU-intensive +/// workloads. For this reason the typical use case of this scheduler involves low-latency +/// interactive applications, such as gaming, video conferencing and live streaming. /// -/// The scheduler is made of a BPF component (dispatcher) that implements the low level sched-ext -/// functionalities and a user-space counterpart (scheduler), written in Rust, that implements the -/// actual scheduling policy. +/// scx_rustland is also designed to be an "easy to read" template that can be used by any +/// developer to quickly experiment more complex scheduling policies fully implemented in Rust. +/// +/// The scheduler is made of a BPF component (scx_rustland_core) that implements the low level +/// sched-ext functionalities and a user-space counterpart (scheduler), written in Rust, that +/// implements the actual scheduling policy. /// /// The default scheduling policy implemented in the user-space scheduler is a based on virtual /// runtime (vruntime): @@ -65,10 +68,6 @@ const SCHEDULER_NAME: &'static str = "RustLand"; /// /// === Troubleshooting === /// -/// - Disable HyperThreading / SMT if you notice poor performance (add "nosmt" to the kernel boot -/// parameters): this scheduler is not NUMA-aware and it implements a simple policy of handling -/// SMT cores. -/// /// - Adjust the time slice boost parameter (option `-b`) to enhance the responsiveness of /// low-latency applications (i.e., online gaming, live streaming, video conferencing etc.). /// @@ -473,8 +472,7 @@ impl<'a> Scheduler<'a> { // Reset nr_queued and update nr_scheduled, to notify the dispatcher that // queued tasks are drained, but there is still some work left to do in the // scheduler. - *self.bpf.nr_queued_mut() = 0; - *self.bpf.nr_scheduled_mut() = self.task_pool.tasks.len() as u64; + self.bpf.update_tasks(Some(0), Some(self.task_pool.tasks.len() as u64)); break; } Err(err) => { @@ -533,10 +531,10 @@ impl<'a> Scheduler<'a> { None => break, } } - // Reset nr_scheduled to notify the dispatcher that all the tasks received by the scheduler - // has been dispatched, so there is no reason to re-activate the scheduler, unless more - // tasks are queued. - self.bpf.skel.bss_mut().nr_scheduled = self.task_pool.tasks.len() as u64; + // Update nr_scheduled to notify the dispatcher that all the tasks received by the + // scheduler has been dispatched, so there is no reason to re-activate the scheduler, + // unless more tasks are queued. + self.bpf.update_tasks(None, Some(self.task_pool.tasks.len() as u64)); } // Main scheduling function (called in a loop to periodically drain tasks from the queued list