From 7046b47b9cefe3dffdfe5e8f22e9e3a3a1b23552 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 12 Jun 2024 15:58:41 +0900 Subject: [PATCH 1/6] scx_lavd: properly calculate task's runtime after suspend/resume When the system is suspended and resumed, the suspended duration is added to a task's runtime if the task was running on the CPU. After the resume, the task's runtime becomes incorrectly long, so the scheduler starts to treat the system as being under heavy load. To avoid this problem, the suspended duration is measured and subtracted from the task's runtime. Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/intf.h | 7 ++- scheds/rust/scx_lavd/src/bpf/main.bpf.c | 71 +++++++++++++++++++------ 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h index 7286e0b75..84aea2a69 100644 --- a/scheds/rust/scx_lavd/src/bpf/intf.h +++ b/scheds/rust/scx_lavd/src/bpf/intf.h @@ -141,6 +141,12 @@ struct cpu_ctx { volatile u64 load_run_time_ns; /* total runtime of runnable tasks */ volatile u64 last_kick_clk; /* when the CPU was kicked */ + /* + * Information for cpu hotplug + */ + u64 online_clk; /* when a CPU becomes online */ + u64 offline_clk; /* when a CPU becomes offline */ + /* * Information used to keep track of latency criticality */ @@ -165,7 +171,6 @@ struct cpu_ctx { /* * Fields for core compaction * - * NOTE: The followings MUST be placed at the end of this struct. */ struct bpf_cpumask __kptr *tmp_a_mask; /* temporary cpu mask */ struct bpf_cpumask __kptr *tmp_o_mask; /* temporary cpu mask */ diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index c4e9263ee..af099a200 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -1460,6 +1460,33 @@ static u64 calc_time_slice(struct task_struct *p, struct task_ctx *taskc) return slice; } +static void reset_suspended_duration(struct cpu_ctx *cpuc) +{ + if (cpuc->online_clk > cpuc->offline_clk) + cpuc->offline_clk = cpuc->online_clk; +} + +static u64 get_suspended_duration_and_reset(struct cpu_ctx *cpuc) +{ + /* + * When a system is suspended, a task is also suspended in a running + * state on the CPU. Hence, we subtract the suspended duration when it + * resumes. + */ + + u64 duration = 0; + + if (cpuc->online_clk > cpuc->offline_clk) { + duration = cpuc->online_clk - cpuc->offline_clk; + /* + * Once calculated, reset the duration to zero. + */ + cpuc->offline_clk = cpuc->online_clk; + } + + return duration; +} + static void update_stat_for_runnable(struct task_struct *p, struct task_ctx *taskc, struct cpu_ctx *cpuc) @@ -1495,7 +1522,7 @@ static void update_stat_for_running(struct task_struct *p, /* * Update per-CPU latency criticality information for ever-scheduled - * tasks + * tasks. */ if (cpuc->max_lat_cri < taskc->lat_cri) cpuc->max_lat_cri = taskc->lat_cri; @@ -1504,6 +1531,12 @@ static void update_stat_for_running(struct task_struct *p, cpuc->sum_lat_cri += taskc->lat_cri; cpuc->sched_nr++; + /* + * It is clear there is no need to consider the suspended duration + * while running a task, so reset the suspended duration to zero. + */ + reset_suspended_duration(cpuc); + /* * Update task's performance criticality * @@ -1538,7 +1571,7 @@ static void update_stat_for_stopping(struct task_struct *p, struct cpu_ctx *cpuc) { u64 now = bpf_ktime_get_ns(); - u64 old_run_time_ns; + u64 old_run_time_ns, suspended_duration; /* * Update task's run_time.
When a task is scheduled consecutively @@ -1550,7 +1583,9 @@ static void update_stat_for_stopping(struct task_struct *p, * calculation of runtime statistics. */ old_run_time_ns = taskc->run_time_ns; - taskc->acc_run_time_ns += now - taskc->last_running_clk; + suspended_duration = get_suspended_duration_and_reset(cpuc); + taskc->acc_run_time_ns += now - taskc->last_running_clk - + suspended_duration; taskc->run_time_ns = calc_avg(taskc->run_time_ns, taskc->acc_run_time_ns); taskc->last_stopping_clk = now; @@ -2521,21 +2556,23 @@ void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags) taskc->last_quiescent_clk = now; } -static void cpu_ctx_init_online(struct cpu_ctx *cpuc, u32 cpu_id) +static void cpu_ctx_init_online(struct cpu_ctx *cpuc, u32 cpu_id, u64 now) { - memset(cpuc, 0, sizeof(*cpuc) - 2 * sizeof(struct bpf_cpumask *)); + cpuc->idle_start_clk = 0; cpuc->cpu_id = cpu_id; cpuc->lat_prio = LAVD_LAT_PRIO_IDLE; cpuc->stopping_tm_est_ns = LAVD_TIME_INFINITY_NS; + WRITE_ONCE(cpuc->online_clk, now); barrier(); cpuc->is_online = true; } -static void cpu_ctx_init_offline(struct cpu_ctx *cpuc, u32 cpu_id) +static void cpu_ctx_init_offline(struct cpu_ctx *cpuc, u32 cpu_id, u64 now) { - memset(cpuc, 0, sizeof(*cpuc) - 2 * sizeof(struct bpf_cpumask *)); + cpuc->idle_start_clk = 0; cpuc->cpu_id = cpu_id; + WRITE_ONCE(cpuc->offline_clk, now); cpuc->is_online = false; barrier(); @@ -2549,13 +2586,14 @@ void BPF_STRUCT_OPS(lavd_cpu_online, s32 cpu) * When a cpu becomes online, reset its cpu context and trigger the * recalculation of the global cpu load. */ + u64 now = bpf_ktime_get_ns(); struct cpu_ctx *cpuc; cpuc = get_cpu_ctx_id(cpu); if (!cpuc) return; - cpu_ctx_init_online(cpuc, cpu); + cpu_ctx_init_online(cpuc, cpu, now); __sync_fetch_and_add(&nr_cpus_onln, 1); update_sys_stat(); @@ -2567,13 +2605,14 @@ void BPF_STRUCT_OPS(lavd_cpu_offline, s32 cpu) * When a cpu becomes offline, trigger the recalculation of the global * cpu load. */ + u64 now = bpf_ktime_get_ns(); struct cpu_ctx *cpuc; cpuc = get_cpu_ctx_id(cpu); if (!cpuc) return; - cpu_ctx_init_offline(cpuc, cpu); + cpu_ctx_init_offline(cpuc, cpu, now); __sync_fetch_and_sub(&nr_cpus_onln, 1); update_sys_stat(); @@ -2716,7 +2755,7 @@ static int init_cpumasks(void) return err; } -static s32 init_per_cpu_ctx(void) +static s32 init_per_cpu_ctx(u64 now) { int cpu; int err; @@ -2736,21 +2775,20 @@ static s32 init_per_cpu_ctx(void) if (err) return err; - cpu_ctx_init_online(cpuc, cpu); + cpu_ctx_init_online(cpuc, cpu, now); + cpuc->offline_clk = now; } return 0; } -static s32 init_sys_stat(void) +static s32 init_sys_stat(u64 now) { struct bpf_timer *timer; - u64 now; u32 key = 0; int err; memset(__sys_stats, 0, sizeof(__sys_stats)); - now = bpf_ktime_get_ns(); __sys_stats[0].last_update_clk = now; __sys_stats[1].last_update_clk = now; __sys_stats[0].nr_active = nr_cpus_onln; @@ -2774,6 +2812,7 @@ static s32 init_sys_stat(void) s32 BPF_STRUCT_OPS_SLEEPABLE(lavd_init) { + u64 now = bpf_ktime_get_ns(); int err; /* @@ -2788,7 +2827,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(lavd_init) /* * Initialize per-CPU context. */ - err = init_per_cpu_ctx(); + err = init_per_cpu_ctx(now); if (err) return err; @@ -2796,7 +2835,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(lavd_init) * Initialize the last update clock and the update timer to track * system-wide CPU load. 
*/ - err = init_sys_stat(); + err = init_sys_stat(now); if (err) return err; From 9d129f0afab25cf840fe698f23e87d2523da53a9 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 12 Jun 2024 20:06:17 +0900 Subject: [PATCH 2/6] scx_lavd: rename LAVD_CPU_UTIL_INTERVAL_NS to LAVD_SYS_STAT_INTERVAL_NS The periodic CPU utilization routine does a lot of other work now. So we rename LAVD_CPU_UTIL_INTERVAL_NS to LAVD_SYS_STAT_INTERVAL_NS. Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/intf.h | 4 ++-- scheds/rust/scx_lavd/src/bpf/main.bpf.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h index 84aea2a69..10dc3253d 100644 --- a/scheds/rust/scx_lavd/src/bpf/intf.h +++ b/scheds/rust/scx_lavd/src/bpf/intf.h @@ -78,7 +78,6 @@ enum consts { LAVD_ELIGIBLE_TIME_MAX = (LAVD_SLICE_MIN_NS >> 8), LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */ - LAVD_CPU_UTIL_INTERVAL_NS = (25 * NSEC_PER_MSEC), LAVD_CPU_ID_HERE = ((u32)-2), LAVD_CPU_ID_NONE = ((u32)-1), LAVD_CPU_ID_MAX = 512, @@ -87,12 +86,13 @@ enum consts { LAVD_PREEMPT_KICK_MARGIN = (LAVD_SLICE_MIN_NS >> 3), LAVD_PREEMPT_TICK_MARGIN = (LAVD_SLICE_MIN_NS >> 8), + LAVD_SYS_STAT_INTERVAL_NS = (25 * NSEC_PER_MSEC), LAVD_TC_PER_CORE_MAX_CTUIL = 500, /* maximum per-core CPU utilization */ LAVD_TC_NR_ACTIVE_MIN = 1, /* num of mininum active cores */ LAVD_TC_NR_OVRFLW = 1, /* num of overflow cores */ LAVD_TC_CPU_PIN_INTERVAL = (100 * NSEC_PER_MSEC), LAVD_TC_CPU_PIN_INTERVAL_DIV = (LAVD_TC_CPU_PIN_INTERVAL / - LAVD_CPU_UTIL_INTERVAL_NS), + LAVD_SYS_STAT_INTERVAL_NS), LAVD_GLOBAL_DSQ = 0, }; diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index af099a200..d704f110f 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -1000,7 +1000,7 @@ static int update_timer_cb(void *map, int *key, struct bpf_timer *timer) update_sys_stat(); - err = bpf_timer_start(timer, LAVD_CPU_UTIL_INTERVAL_NS, 0); + err = bpf_timer_start(timer, LAVD_SYS_STAT_INTERVAL_NS, 0); if (err) scx_bpf_error("Failed to arm update timer"); @@ -1394,7 +1394,7 @@ static u64 calc_task_load_actual(struct task_ctx *taskc) * The actual load is the CPU time consumed in a time interval, which * can be calculated from task's average run time and frequency. */ - const s64 interval_adj = LAVD_TIME_ONE_SEC / LAVD_CPU_UTIL_INTERVAL_NS; + const s64 interval_adj = LAVD_TIME_ONE_SEC / LAVD_SYS_STAT_INTERVAL_NS; return (taskc->run_time_ns * taskc->run_freq) / interval_adj; } @@ -2801,7 +2801,7 @@ static s32 init_sys_stat(u64 now) } bpf_timer_init(timer, &update_timer, CLOCK_BOOTTIME); bpf_timer_set_callback(timer, update_timer_cb); - err = bpf_timer_start(timer, LAVD_CPU_UTIL_INTERVAL_NS, 0); + err = bpf_timer_start(timer, LAVD_SYS_STAT_INTERVAL_NS, 0); if (err) { scx_bpf_error("Failed to arm update timer"); return err; From 753f333c09ad715d4657cb5da3ab58f67afac960 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 12 Jun 2024 21:15:25 +0900 Subject: [PATCH 3/6] scx_lavd: refactoring do_update_sys_stat() Originally, do_update_sys_stat() simply calculated the system-wide CPU utilization. Over time, it has evolved to collect all kinds of system-wide, periodic statistics for decision-making, so it has become bulky. Now, it is time to refactor it for readability. This commit does not contain functional changes other than refactoring. 
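At a glance, the refactored routine decomposes into the phases below (condensed from the diff that follows, with annotations added here for orientation):

    static void do_update_sys_stat(void)
    {
        struct sys_stat_ctx c;

        init_sys_stat_ctx(&c);      /* snapshot clocks and the current/next stat */
        collect_sys_stat(&c);       /* accumulate per-CPU load, lat/perf criticality, idle time */
        calc_sys_stat(&c);          /* derive utilization, load factor, and averages */
        update_sys_stat_next(&c);   /* blend the results into the next stat version */
        calc_inc1k(&c);             /* increments for the lat-cri -> priority mapping */

        c.stat_next->last_update_clk = c.now;
        flip_sys_stat();            /* make the next version atomically visible */
    }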
Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/main.bpf.c | 186 ++++++++++++++++-------- 1 file changed, 123 insertions(+), 63 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index d704f110f..19f1e882a 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -692,36 +692,57 @@ static u64 calc_avg_freq(u64 old_freq, u64 interval) return ewma_freq; } -static void do_update_sys_stat(void) +struct sys_stat_ctx { + struct sys_stat *stat_cur; + struct sys_stat *stat_next; + u64 now; + u64 duration; + u64 duration_total; + u64 idle_total; + u64 compute_total; + u64 load_actual; + u64 load_ideal; + u64 load_run_time_ns; + s64 max_lat_cri; + s64 min_lat_cri; + s64 avg_lat_cri; + u64 sum_lat_cri; + u64 sched_nr; + u64 sum_perf_cri; + u64 avg_perf_cri; + u64 new_util; + u64 new_load_factor; + u64 nr_violation; +}; + +static void init_sys_stat_ctx(struct sys_stat_ctx *c) { - struct sys_stat *stat_cur = get_sys_stat_cur(); - struct sys_stat *stat_next = get_sys_stat_next(); - u64 now, duration, duration_total, compute; - u64 idle_total = 0, compute_total = 0; - u64 load_actual = 0, load_ideal = 0, load_run_time_ns = 0; - s64 max_lat_cri = 0, min_lat_cri = UINT_MAX, avg_lat_cri = 0; - u64 sum_lat_cri = 0, sched_nr = 0; - u64 sum_perf_cri = 0, avg_perf_cri = 0; - u64 new_util, new_load_factor; - u64 nr_violation = 0; - int cpu; + memset(c, 0, sizeof(*c)); - now = bpf_ktime_get_ns(); - duration = now - stat_cur->last_update_clk; + c->stat_cur = get_sys_stat_cur(); + c->stat_next = get_sys_stat_next(); + c->now = bpf_ktime_get_ns(); + c->duration = c->now - c->stat_cur->last_update_clk; + c->min_lat_cri = UINT_MAX; +} + +static void collect_sys_stat(struct sys_stat_ctx *c) +{ + int cpu; bpf_for(cpu, 0, nr_cpus_onln) { struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu); if (!cpuc) { - compute_total = 0; + c->compute_total = 0; break; } /* * Accumulate cpus' loads. */ - load_ideal += cpuc->load_ideal; - load_actual += cpuc->load_actual; - load_run_time_ns += cpuc->load_run_time_ns; + c->load_ideal += cpuc->load_ideal; + c->load_actual += cpuc->load_actual; + c->load_run_time_ns += cpuc->load_run_time_ns; /* * Accumulate task's latency criticlity information. @@ -730,24 +751,24 @@ static void do_update_sys_stat(void) * accuracy should be small and very rare and thus should be * fine. */ - sum_lat_cri += cpuc->sum_lat_cri; + c->sum_lat_cri += cpuc->sum_lat_cri; cpuc->sum_lat_cri = 0; - sched_nr += cpuc->sched_nr; + c->sched_nr += cpuc->sched_nr; cpuc->sched_nr = 0; - if (cpuc->max_lat_cri > max_lat_cri) - max_lat_cri = cpuc->max_lat_cri; + if (cpuc->max_lat_cri > c->max_lat_cri) + c->max_lat_cri = cpuc->max_lat_cri; cpuc->max_lat_cri = 0; - if (cpuc->min_lat_cri < min_lat_cri) - min_lat_cri = cpuc->min_lat_cri; + if (cpuc->min_lat_cri < c->min_lat_cri) + c->min_lat_cri = cpuc->min_lat_cri; cpuc->min_lat_cri = UINT_MAX; /* * Accumulate task's performance criticlity information. 
*/ - sum_perf_cri += cpuc->sum_perf_cri; + c->sum_perf_cri += cpuc->sum_perf_cri; cpuc->sum_perf_cri = 0; /* @@ -760,9 +781,9 @@ static void do_update_sys_stat(void) break; bool ret = __sync_bool_compare_and_swap( - &cpuc->idle_start_clk, old_clk, now); + &cpuc->idle_start_clk, old_clk, c->now); if (ret) { - idle_total += now - old_clk; + c->idle_total += c->now - old_clk; break; } } @@ -770,71 +791,96 @@ static void do_update_sys_stat(void) /* * Calculcate per-CPU utilization */ - compute = 0; - if (duration > cpuc->idle_total) - compute = duration - cpuc->idle_total; - new_util = (compute * LAVD_CPU_UTIL_MAX) / duration; - cpuc->util = calc_avg(cpuc->util, new_util); + u64 compute = 0; + if (c->duration > cpuc->idle_total) + compute = c->duration - cpuc->idle_total; + c->new_util = (compute * LAVD_CPU_UTIL_MAX) / c->duration; + cpuc->util = calc_avg(cpuc->util, c->new_util); if (cpuc->util > LAVD_TC_PER_CORE_MAX_CTUIL) - nr_violation += 1000; - + c->nr_violation += 1000; /* * Accmulate system-wide idle time */ - idle_total += cpuc->idle_total; + c->idle_total += cpuc->idle_total; cpuc->idle_total = 0; } +} - duration_total = duration * nr_cpus_onln; - if (duration_total > idle_total) - compute_total = duration_total - idle_total; +static void calc_sys_stat(struct sys_stat_ctx *c) +{ + c->duration_total = c->duration * nr_cpus_onln; + if (c->duration_total > c->idle_total) + c->compute_total = c->duration_total - c->idle_total; - new_util = (compute_total * LAVD_CPU_UTIL_MAX) / duration_total; + c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX) / + c->duration_total; - new_load_factor = (1000 * LAVD_LOAD_FACTOR_ADJ * load_run_time_ns) / - (LAVD_TARGETED_LATENCY_NS * nr_cpus_onln); - if (new_load_factor > LAVD_LOAD_FACTOR_MAX) - new_load_factor = LAVD_LOAD_FACTOR_MAX; + c->new_load_factor = (1000 * LAVD_LOAD_FACTOR_ADJ * + c->load_run_time_ns) / + (LAVD_TARGETED_LATENCY_NS * nr_cpus_onln); + if (c->new_load_factor > LAVD_LOAD_FACTOR_MAX) + c->new_load_factor = LAVD_LOAD_FACTOR_MAX; - if (sched_nr == 0) { + if (c->sched_nr == 0) { /* * When a system is completely idle, it is indeed possible * nothing scheduled for an interval. */ - min_lat_cri = stat_cur->min_lat_cri; - max_lat_cri = stat_cur->max_lat_cri; - avg_lat_cri = stat_cur->avg_lat_cri; - avg_perf_cri = stat_cur->avg_perf_cri; + c->min_lat_cri = c->stat_cur->min_lat_cri; + c->max_lat_cri = c->stat_cur->max_lat_cri; + c->avg_lat_cri = c->stat_cur->avg_lat_cri; + c->avg_perf_cri = c->stat_cur->avg_perf_cri; } else { - avg_lat_cri = sum_lat_cri / sched_nr; - avg_perf_cri = sum_perf_cri / sched_nr; + c->avg_lat_cri = c->sum_lat_cri / c->sched_nr; + c->avg_perf_cri = c->sum_perf_cri / c->sched_nr; } +} +static void update_sys_stat_next(struct sys_stat_ctx *c) +{ /* * Update the CPU utilization to the next version. 
*/ - stat_next->load_actual = calc_avg(stat_cur->load_actual, load_actual); - stat_next->load_ideal = calc_avg(stat_cur->load_ideal, load_ideal); - stat_next->util = calc_avg(stat_cur->util, new_util); - stat_next->load_factor = calc_avg(stat_cur->load_factor, new_load_factor); - - stat_next->min_lat_cri = calc_avg(stat_cur->min_lat_cri, min_lat_cri); - stat_next->max_lat_cri = calc_avg(stat_cur->max_lat_cri, max_lat_cri); - stat_next->avg_lat_cri = calc_avg(stat_cur->avg_lat_cri, avg_lat_cri); + struct sys_stat *stat_cur = c->stat_cur; + struct sys_stat *stat_next = c->stat_next; + + stat_next->load_actual = + calc_avg(stat_cur->load_actual, c->load_actual); + stat_next->load_ideal = + calc_avg(stat_cur->load_ideal, c->load_ideal); + stat_next->util = + calc_avg(stat_cur->util, c->new_util); + stat_next->load_factor = + calc_avg(stat_cur->load_factor, c->new_load_factor); + + stat_next->min_lat_cri = + calc_avg(stat_cur->min_lat_cri, c->min_lat_cri); + stat_next->max_lat_cri = + calc_avg(stat_cur->max_lat_cri, c->max_lat_cri); + stat_next->avg_lat_cri = + calc_avg(stat_cur->avg_lat_cri, c->avg_lat_cri); stat_next->thr_lat_cri = stat_next->max_lat_cri - - ((stat_next->max_lat_cri - stat_next->avg_lat_cri) >> 1); - stat_next->avg_perf_cri = calc_avg(stat_cur->avg_perf_cri, avg_perf_cri); + ((stat_next->max_lat_cri - stat_next->avg_lat_cri) >> 1); + stat_next->avg_perf_cri = + calc_avg(stat_cur->avg_perf_cri, c->avg_perf_cri); - stat_next->nr_violation = calc_avg(stat_cur->nr_violation, nr_violation); + stat_next->nr_violation = + calc_avg(stat_cur->nr_violation, c->nr_violation); +} +static void calc_inc1k(struct sys_stat_ctx *c) +{ /* - * Calculate the increment for latency criticality to priority mapping + * Calculate the increment for mapping from latency criticality to + * priority. * - Case 1. inc1k_low: [min_lc, avg_lc) -> [half_range, 0) * - Case 2. inc1k_high: [avg_lc, max_lc] -> [0, -half_range) */ + struct sys_stat *stat_next = c->stat_next; + if (stat_next->avg_lat_cri == stat_next->min_lat_cri) stat_next->inc1k_low = 0; else { @@ -850,11 +896,25 @@ static void do_update_sys_stat(void) (stat_next->max_lat_cri + 1 - stat_next->avg_lat_cri); } +} + +static void do_update_sys_stat(void) +{ + struct sys_stat_ctx c; + + /* + * Collect and prepare the next version of stat. + */ + init_sys_stat_ctx(&c); + collect_sys_stat(&c); + calc_sys_stat(&c); + update_sys_stat_next(&c); + calc_inc1k(&c); /* * Make the next version atomically visible. */ - stat_next->last_update_clk = now; + c.stat_next->last_update_clk = c.now; flip_sys_stat(); } From e6348a11e91ac27f0f3bb5668ca2ebebce7da73e Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 12 Jun 2024 23:40:40 +0900 Subject: [PATCH 4/6] scx_lavd: improve frequency scaling logic The old CPU frequency scaling logic sampled the task's CPU performance target (i.e., target CPU frequency) at every tick interval and applied it immediately. Because the target was resampled and applied on every tick, the CPU frequency fluctuated from tick to tick, resulting in less steady performance. Now, we take a different strategy. The key idea is to increase the frequency as soon as a task starts running, for quick adaptation to load spikes. When necessary, the frequency is then decreased gradually at every tick interval to avoid frequency fluctuations. In my testing, this shows more stable performance in many workloads (games, compilation).
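To illustrate the intended behavior, the following is a plain user-space sketch, not scheduler code: the constants and the event sequence are made up, and in the scheduler the per-task target comes from calc_cpuperf_target(). It only mirrors the increase/decrease policy (calc_avg32() EWMA, immediate raise at ops.running(), gradual settling at ops.tick()):

    #include <stdio.h>
    #include <stdint.h>

    #define SCX_CPUPERF_ONE 1024    /* assumed full-performance scale */

    static uint32_t cur, task_tgt, avg_tgt;    /* cpuperf_cur / _task / _avg */

    /* EWMA as in calc_avg32(): 0.75 * old + 0.25 * new */
    static uint32_t calc_avg32(uint32_t old_val, uint32_t new_val)
    {
        return (old_val - (old_val >> 2)) + (new_val >> 2);
    }

    static uint32_t max32(uint32_t a, uint32_t b)
    {
        return a > b ? a : b;
    }

    /* ops.running(): raise the CPU's target at once if the task demands more,
     * mirroring try_increase_cpuperf_target(). */
    static void on_running(uint32_t target)
    {
        task_tgt = target;
        avg_tgt = calc_avg32(avg_tgt, target);
        if (cur < max32(task_tgt, avg_tgt))
            cur = max32(task_tgt, avg_tgt);
    }

    /* ops.tick(): settle toward max(task target, EWMA), mirroring
     * try_decrease_cpuperf_target(); past spikes decay gradually. */
    static void on_tick(void)
    {
        cur = max32(task_tgt, avg_tgt);
    }

    int main(void)
    {
        cur = task_tgt = avg_tgt = 128;

        for (int i = 0; i < 4; i++) {    /* a demanding task runs: load spike */
            on_running(SCX_CPUPERF_ONE);
            on_tick();
            printf("spike %d: cur=%u\n", i, cur);    /* jumps to 1024 right away */
        }
        for (int i = 0; i < 6; i++) {    /* only light tasks run afterwards */
            on_running(128);
            on_tick();
            printf("decay %d: cur=%u\n", i, cur);    /* falls back step by step */
        }
        return 0;
    }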
Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/intf.h | 7 +++ scheds/rust/scx_lavd/src/bpf/main.bpf.c | 83 ++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 9 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h index 10dc3253d..374f63d70 100644 --- a/scheds/rust/scx_lavd/src/bpf/intf.h +++ b/scheds/rust/scx_lavd/src/bpf/intf.h @@ -168,6 +168,13 @@ struct cpu_ctx { volatile u8 is_online; /* is this CPU online? */ s32 cpu_id; /* cpu id */ + /* + * Information for CPU frequency scaling + */ + u32 cpuperf_cur; /* CPU's current performance target */ + u32 cpuperf_task; /* task's CPU performance target */ + u32 cpuperf_avg; /* EWMA of task's CPU performance target */ + /* * Fields for core compaction * diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index 19f1e882a..88cf64fb1 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -670,6 +670,15 @@ static void try_proc_introspec_cmd(struct task_struct *p, } } +static u32 calc_avg32(u32 old_val, u32 new_val) +{ + /* + * Calculate the exponential weighted moving average (EWMA). + * - EWMA = (0.75 * old) + (0.25 * new) + */ + return (old_val - (old_val >> 2)) + (new_val >> 2); +} + static u64 calc_avg(u64 old_val, u64 new_val) { /* @@ -2367,14 +2376,14 @@ void BPF_STRUCT_OPS(lavd_dispatch, s32 cpu, struct task_struct *prev) } -static u32 calc_cpuperf_target(struct sys_stat *stat_cur, +static int calc_cpuperf_target(struct sys_stat *stat_cur, struct task_ctx *taskc, struct cpu_ctx *cpuc) { u64 max_load, cpu_load; u32 cpuperf_target; if (!stat_cur || !taskc || !cpuc) - return 0; + return -EINVAL; /* * We determine the clock frequency of a CPU using two factors: 1) the @@ -2397,13 +2406,61 @@ static u32 calc_cpuperf_target(struct sys_stat *stat_cur, max_load = stat_cur->avg_perf_cri * 1000 /* max cpu util */; cpu_load = taskc->perf_cri * cpuc->util; cpuperf_target = (cpu_load * SCX_CPUPERF_ONE) / max_load; - return min(cpuperf_target, SCX_CPUPERF_ONE); + cpuperf_target = min(cpuperf_target, SCX_CPUPERF_ONE); + + cpuc->cpuperf_task = cpuperf_target; + cpuc->cpuperf_avg = calc_avg32(cpuc->cpuperf_avg, cpuperf_target); + return 0; +} + +static bool try_increase_cpuperf_target(struct cpu_ctx *cpuc) +{ + /* + * When a task becomes running, update CPU's performance target only + * when the current task's target performance is higher. This helps + * rapidly adopt workload changes by rapidly increasing CPU's + * performance target. + */ + u32 target; + + if (!cpuc) + return false; + + target = max(cpuc->cpuperf_task, cpuc->cpuperf_avg); + if (cpuc->cpuperf_cur < target) { + cpuc->cpuperf_cur = target; + scx_bpf_cpuperf_set(cpuc->cpu_id, target); + return true; + } + + return false; +} + +static bool try_decrease_cpuperf_target(struct cpu_ctx *cpuc) +{ + /* + * Upon every tick interval, we try to decrease the CPU's performance + * target if the current one is higher than both the current task's + * target and EWMA of past targets. This helps gradually adopt workload + * changes upon sudden down falls. 
+ */ + u32 target; + + if (!cpuc) + return false; + + target = max(cpuc->cpuperf_task, cpuc->cpuperf_avg); + if (cpuc->cpuperf_cur != target) { + cpuc->cpuperf_cur = target; + scx_bpf_cpuperf_set(cpuc->cpu_id, target); + return true; + } + + return false; } void BPF_STRUCT_OPS(lavd_tick, struct task_struct *p_run) { - struct sys_stat *stat_cur = get_sys_stat_cur(); - s32 cpu_id = scx_bpf_task_cpu(p_run); struct cpu_ctx *cpuc_run; struct task_ctx *taskc_run; bool preempted = false; @@ -2425,10 +2482,8 @@ void BPF_STRUCT_OPS(lavd_tick, struct task_struct *p_run) * task continues to run. */ freq_out: - if (!no_freq_scaling && !preempted) { - u32 tgt = calc_cpuperf_target(stat_cur, taskc_run, cpuc_run); - scx_bpf_cpuperf_set(cpu_id, tgt); - } + if (!no_freq_scaling && !preempted) + try_decrease_cpuperf_target(cpuc_run); } void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags) @@ -2497,6 +2552,7 @@ static bool need_to_calc_time_slice(struct task_struct *p) void BPF_STRUCT_OPS(lavd_running, struct task_struct *p) { + struct sys_stat *stat_cur = get_sys_stat_cur(); struct cpu_ctx *cpuc; struct task_ctx *taskc; @@ -2510,6 +2566,15 @@ void BPF_STRUCT_OPS(lavd_running, struct task_struct *p) update_stat_for_running(p, taskc, cpuc); + /* + * Calculate the task's CPU performance target and update if the new + * target is higher than the current one. The CPU's performance target + * urgently increases according to task's target but it decreases + * gradually according to EWMA of past performance targets. + */ + calc_cpuperf_target(stat_cur, taskc, cpuc); + try_increase_cpuperf_target(cpuc); + /* * Update running task's information for preemption */ From 2e74b86b4af2484d546260e53eb4ed61e7819415 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 13 Jun 2024 00:44:04 +0900 Subject: [PATCH 5/6] scx_lavd: logging cpu performance target Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/intf.h | 1 + scheds/rust/scx_lavd/src/bpf/main.bpf.c | 1 + scheds/rust/scx_lavd/src/main.rs | 10 ++++++---- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h index 374f63d70..0cc8d682d 100644 --- a/scheds/rust/scx_lavd/src/bpf/intf.h +++ b/scheds/rust/scx_lavd/src/bpf/intf.h @@ -231,6 +231,7 @@ struct task_ctx_x { u64 avg_lat_cri; /* average latency criticality */ u64 avg_perf_cri; /* average performance criticality */ u32 nr_active; /* number of active cores */ + u32 cpuperf_cur; /* CPU's current performance target */ }; diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index 88cf64fb1..8570ea8fd 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -564,6 +564,7 @@ int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc, u32 cpu_id) m->taskc_x.avg_lat_cri = stat_cur->avg_lat_cri; m->taskc_x.avg_perf_cri = stat_cur->avg_perf_cri; m->taskc_x.nr_active = stat_cur->nr_active; + m->taskc_x.cpuperf_cur = cpuc->cpuperf_cur; memcpy(&m->taskc, taskc, sizeof(m->taskc)); diff --git a/scheds/rust/scx_lavd/src/main.rs b/scheds/rust/scx_lavd/src/main.rs index 3ab872314..c4db8161a 100644 --- a/scheds/rust/scx_lavd/src/main.rs +++ b/scheds/rust/scx_lavd/src/main.rs @@ -188,14 +188,14 @@ impl<'a> Scheduler<'a> { if mseq % 32 == 1 { info!( - "| {:6} | {:8} | {:17} \ + "| {:6} | {:7} | {:17} \ | {:4} | {:4} | {:9} \ | {:6} | {:8} | {:7} \ | {:8} | {:7} | {:8} \ | {:7} | {:7} | {:9} \ | {:9} | {:9} | {:9} \ | {:8} | {:8} | 
{:8} \ - | {:6} | {:6} | ", + | {:8} | {:6} | {:6} |", "mseq", "pid", "comm", @@ -216,6 +216,7 @@ impl<'a> Scheduler<'a> { "wake_freq", "perf_cri", "avg_pc", + "cpufreq", "cpu_util", "sys_ld", "nr_act", @@ -227,14 +228,14 @@ impl<'a> Scheduler<'a> { let tx_comm: &str = c_tx_cm_str.to_str().unwrap(); info!( - "| {:6} | {:8} | {:17} \ + "| {:6} | {:7} | {:17} \ | {:4} | {:4} | {:9} \ | {:6} | {:8} | {:7} \ | {:8} | {:7} | {:8} \ | {:7} | {:7} | {:9} \ | {:9} | {:9} | {:9} \ | {:8} | {:8} | {:8} \ - | {:6} | {:6} |", + | {:8} | {:6} | {:6} |", mseq, tx.pid, tx_comm, @@ -255,6 +256,7 @@ impl<'a> Scheduler<'a> { tc.wake_freq, tc.perf_cri, tx.avg_perf_cri, + tx.cpuperf_cur, tx.cpu_util, tx.sys_load_factor, tx.nr_active, From 747bf2a7d7e27027a2bc809077badc417254a2c0 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 13 Jun 2024 01:42:19 +0900 Subject: [PATCH 6/6] scx_lavd: add the design of CPU frequency scaling Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/main.bpf.c | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index 8570ea8fd..e881fadc7 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -131,6 +131,33 @@ * highest priority tasks. * * + * 7. Performance criticality + * -------------------------- + * + * We define the performance criticality metric to express how sensitive a task + * is to CPU frequency. The more performance-critical a task is, the higher the + * CPU frequency will be assigned. A task is more performance-critical in the + * following conditions: 1) the task's runtime in a second is longer (i.e., + * task runtime x frequency), 2) the task's waiting or waken-up frequencies are + * higher (i.e., the task is in the middle of the task chain). + * + * + * 8. CPU frequency scaling + * ------------------------ + * + * Two factors determine the clock frequency of a CPU: 1) the current CPU + * utilization and 2) the current task's CPU criticality compared to the + * system-wide average performance criticality. This effectively boosts the CPU + * clock frequency of performance-critical tasks even when the CPU utilization + * is low. + * + * When actually changing the CPU's performance target, we should be able to + * quickly capture the demand for spiky workloads while providing steady clock + * frequency to avoid unexpected performance fluctuations. To this end, we + * quickly increase the clock frequency when a task gets running but gradually + * decrease it upon every tick interval. + * + * * Copyright (c) 2023, 2024 Valve Corporation. * Author: Changwoo Min */
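As a concrete illustration of the frequency-target formula described in section 8 above (implemented by calc_cpuperf_target() in patch 4), the standalone sketch below plugs in example numbers. SCX_CPUPERF_ONE = 1024 and the sample perf_cri/util/avg_perf_cri values are assumptions chosen for the example, not values from a real run:

    #include <stdio.h>
    #include <stdint.h>

    #define SCX_CPUPERF_ONE   1024  /* assumed full-performance scale */
    #define LAVD_CPU_UTIL_MAX 1000  /* 100.0%, as in intf.h */

    /* Mirrors the formula in calc_cpuperf_target(): scale the task's
     * (perf_cri * util) against (avg_perf_cri * max util), capped at 1.0. */
    static uint64_t cpuperf_target(uint64_t perf_cri, uint64_t util,
                                   uint64_t avg_perf_cri)
    {
        uint64_t max_load = avg_perf_cri * LAVD_CPU_UTIL_MAX;
        uint64_t cpu_load = perf_cri * util;
        uint64_t tgt = (cpu_load * SCX_CPUPERF_ONE) / max_load;

        return tgt < SCX_CPUPERF_ONE ? tgt : SCX_CPUPERF_ONE;
    }

    int main(void)
    {
        /* A performance-critical task (2x the average criticality) on a CPU
         * at 40% utilization is boosted well beyond what utilization alone
         * would suggest... */
        printf("%llu\n", (unsigned long long)cpuperf_target(2000, 400, 1000)); /* 819 */

        /* ...while an average task on the same CPU gets a proportional target. */
        printf("%llu\n", (unsigned long long)cpuperf_target(1000, 400, 1000)); /* 409 */
        return 0;
    }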