sched_ext: Add support for cgroup bandwidth control interface

From 077814f57f8acce13f91dc34bbd2b7e4911fbf25 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Fri, 13 Jun 2025 15:06:47 -1000 - Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both CONFIG_CFS_BANDWIDTH and EXT_GROUP_SCHED. - Put bandwidth control interface files for both cgroup v1 and v2 under CONFIG_GROUP_SCHED_BANDWIDTH. - Update tg_bandwidth() to fetch configuration parameters from fair if CONFIG_CFS_BANDWIDTH, SCX otherwise. - Update tg_set_bandwidth() to update the parameters for both fair and SCX. - Add bandwidth control parameters to struct scx_cgroup_init_args. - Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth control parameter updates. - Update scx_qmap and maximal selftest to test the new feature. Signed-off-by: Tejun Heo <tj@kernel.org>
author: Tejun Heo <tj@kernel.org> 2025-06-13 15:34:22 -1000
committer: Tejun Heo <tj@kernel.org> 2025-06-20 17:03:51 -1000
commit: ddceadce63d9cb752c2472e220ded05cabaf7971 (patch)
tree: a011e5b43cd6a369b2442009851a97dcc5e47e92 /kernel/sched/ext.c
parent: sched_ext, sched/core: Factor out struct scx_task_group (diff)
download: linux-ddceadce63d9cb752c2472e220ded05cabaf7971.tar.gz
linux-ddceadce63d9cb752c2472e220ded05cabaf7971.zip
1 files changed, 63 insertions, 3 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6732e50e0679..39cba11688a9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
 struct scx_cgroup_init_args {
 	/* the weight of the cgroup [1..10000] */
 	u32			weight;
+
+	/* bandwidth control parameters from cpu.max and cpu.max.burst */
+	u64			bw_period_us;
+	u64			bw_quota_us;
+	u64			bw_burst_us;
 };
 
 enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
 	 * @cgrp: cgroup whose weight is being updated
 	 * @weight: new weight [1..10000]
 	 *
-	 * Update @tg's weight to @weight.
+	 * Update @cgrp's weight to @weight.
 	 */
 	void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+	/**
+	 * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+	 * @cgrp: cgroup whose bandwidth is being updated
+	 * @period_us: bandwidth control period
+	 * @quota_us: bandwidth control quota
+	 * @burst_us: bandwidth control burst
+	 *
+	 * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+	 * cgroup interface.
+	 *
+	 * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+	 * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
+	 * interpreted in the same fashion and specifies how much @cgrp can
+	 * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is upto to the BPF
+	 * scheduler.
+	 */
+	void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+				     u64 period_us, u64 quota_us, u64 burst_us);
+
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
 	/*
@@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled;
 void scx_tg_init(struct task_group *tg)
 {
 	tg->scx.weight = CGROUP_WEIGHT_DFL;
+	tg->scx.bw_period_us = default_bw_period_us();
+	tg->scx.bw_quota_us = RUNTIME_INF;
 }
 
 int scx_tg_online(struct task_group *tg)
@@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg)
 	if (scx_cgroup_enabled) {
 		if (SCX_HAS_OP(sch, cgroup_init)) {
 			struct scx_cgroup_init_args args =
-				{ .weight = tg->scx.weight };
+				{ .weight = tg->scx.weight,
+				  .bw_period_us = tg->scx.bw_period_us,
+				  .bw_quota_us = tg->scx.bw_quota_us,
+				  .bw_burst_us = tg->scx.bw_burst_us };
 
 			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
 					      NULL, tg->css.cgroup, &args);
@@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
 	/* TODO: Implement ops->cgroup_set_idle() */
 }
 
+void scx_group_set_bandwidth(struct task_group *tg,
+			     u64 period_us, u64 quota_us, u64 burst_us)
+{
+	struct scx_sched *sch = scx_root;
+
+	percpu_down_read(&scx_cgroup_rwsem);
+
+	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+	    (tg->scx.bw_period_us != period_us ||
+	     tg->scx.bw_quota_us != quota_us ||
+	     tg->scx.bw_burst_us != burst_us))
+		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+			    tg_cgrp(tg), period_us, quota_us, burst_us);
+
+	tg->scx.bw_period_us = period_us;
+	tg->scx.bw_quota_us = quota_us;
+	tg->scx.bw_burst_us = burst_us;
+
+	percpu_up_read(&scx_cgroup_rwsem);
+}
+
 static void scx_cgroup_lock(void)
 {
 	percpu_down_write(&scx_cgroup_rwsem);
@@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch)
 	rcu_read_lock();
 	css_for_each_descendant_pre(css, &root_task_group.css) {
 		struct task_group *tg = css_tg(css);
-		struct scx_cgroup_init_args args = { .weight = tg->scx.weight };
+		struct scx_cgroup_init_args args = {
+			.weight = tg->scx.weight,
+			.bw_period_us = tg->scx.bw_period_us,
+			.bw_quota_us = tg->scx.bw_quota_us,
+			.bw_burst_us = tg->scx.bw_burst_us,
+		};
 
 		if ((tg->scx.flags &
 		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
@@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
 #endif
 static void sched_ext_ops__cpu_online(s32 cpu) {}
 static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.cgroup_move		= sched_ext_ops__cgroup_move,
 	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
 	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
+	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
 #endif
 	.cpu_online		= sched_ext_ops__cpu_online,
 	.cpu_offline		= sched_ext_ops__cpu_offline,
author	Tejun Heo <tj@kernel.org>	2025-06-13 15:34:22 -1000
committer	Tejun Heo <tj@kernel.org>	2025-06-20 17:03:51 -1000
commit	ddceadce63d9cb752c2472e220ded05cabaf7971 (patch)
tree	a011e5b43cd6a369b2442009851a97dcc5e47e92 /kernel/sched/ext.c
parent	sched_ext, sched/core: Factor out struct scx_task_group (diff)
download	linux-ddceadce63d9cb752c2472e220ded05cabaf7971.tar.gz linux-ddceadce63d9cb752c2472e220ded05cabaf7971.zip