summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/sched/ext.c 532
-rw-r--r-- kernel/sched/ext_internal.h 67
2 files changed, 565 insertions, 34 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 142845bcddaa..bb3e33b660da 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,6 +9,8 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
+static DEFINE_RAW_SPINLOCK(scx_sched_lock);
+
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
@@ -20,6 +22,12 @@
static struct scx_sched __rcu *scx_root;
/*
+ * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
+ * Readers can hold either or rcu_read_lock().
+ */
+static LIST_HEAD(scx_sched_all);
+
+/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
* guarantee system safety. Maintain a dedicated task list which contains every
@@ -197,6 +205,7 @@ static void process_ddsp_deferred_locals(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
+static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -245,6 +254,88 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_parent - Find the parent sched
+ * @sch: sched to find the parent of
+ *
+ * @sch->ancestors[] is indexed by level and holds @sch itself at
+ * @sch->level, so the entry right before it is the immediate parent. A
+ * level of zero means that @sch is root.
+ *
+ * Returns the parent scheduler or %NULL if @sch is root.
+ */
+static struct scx_sched *scx_parent(struct scx_sched *sch)
+{
+ if (sch->level)
+ return sch->ancestors[sch->level - 1];
+ else
+ return NULL;
+}
+
+/**
+ * scx_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sched whose descendants to walk
+ *
+ * To be used by scx_for_each_descendant_pre(). Find the next descendant to
+ * visit for pre-order traversal of @root's descendants. @root is included in
+ * the iteration and the first node to be visited.
+ *
+ * The caller must hold either scx_enable_mutex or scx_sched_lock; this is
+ * asserted through lockdep.
+ *
+ * Returns the next sched to visit or %NULL once the walk is complete.
+ */
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
+ struct scx_sched *root)
+{
+ struct scx_sched *next;
+
+ lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
+ lockdep_is_held(&scx_sched_lock));
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
+ return list_next_entry(pos, sibling);
+ pos = scx_parent(pos); /* climb one level; the loop stops at @root */
+ }
+
+ return NULL;
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } /* !CONFIG_EXT_SUB_SCHED: every sched is treated as root */
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } /* !CONFIG_EXT_SUB_SCHED: the walk visits @root and then ends */
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+/**
+ * scx_is_descendant - Test whether sched is a descendant
+ * @sch: sched to test
+ * @ancestor: ancestor sched to test against
+ *
+ * Test whether @sch is a descendant of @ancestor. The test compares the
+ * entry of @sch->ancestors[] at @ancestor's level against @ancestor. As
+ * ->ancestors[] includes the sched itself, a sched is considered its own
+ * descendant.
+ *
+ * Returns %true if @sch is @ancestor or below it, %false otherwise.
+ */
+static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
+{
+ if (sch->level < ancestor->level) /* shallower than @ancestor; also keeps the index below within @sch->ancestors[] */
+ return false;
+ return sch->ancestors[ancestor->level] == ancestor;
+}
+
+/**
+ * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
+ * @pos: iteration cursor
+ * @root: sched to walk the descendants of
+ *
+ * Walk @root's descendants. @root is included in the iteration and the first
+ * node to be visited. Must be called with either scx_enable_mutex or
+ * scx_sched_lock held.
+ *
+ * @pos is %NULL when the loop runs to completion. @root is evaluated on
+ * every iteration, so it should be free of side effects.
+ */
+#define scx_for_each_descendant_pre(pos, root) \
+ for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \
+ (pos) = scx_next_descendant_pre((pos), (root)))
+
static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
struct task_struct *p)
{
@@ -514,7 +605,7 @@ struct scx_task_iter {
struct rq_flags rf;
u32 cnt;
bool list_locked;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
struct cgroup *cgrp;
struct cgroup_subsys_state *css_pos;
struct css_task_iter css_iter;
@@ -553,7 +644,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
{
memset(iter, 0, sizeof(*iter));
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
if (cgrp) {
lockdep_assert_held(&cgroup_mutex);
iter->cgrp = cgrp;
@@ -614,7 +705,7 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
if (iter->cgrp) {
if (iter->css_pos)
css_task_iter_end(&iter->css_iter);
@@ -645,7 +736,7 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
cond_resched();
}
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
if (iter->cgrp) {
while (iter->css_pos) {
struct task_struct *p;
@@ -3032,7 +3123,10 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
scx_set_task_state(p, SCX_TASK_INIT);
if (p->scx.disallow) {
- if (unlikely(fork)) {
+ if (unlikely(scx_parent(sch))) {
+ scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
+ p->comm, p->pid);
+ } else if (unlikely(fork)) {
scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
p->comm, p->pid);
} else {
@@ -3555,25 +3649,51 @@ void scx_group_set_bandwidth(struct task_group *tg,
percpu_up_read(&scx_cgroup_ops_rwsem);
}
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+static struct cgroup *root_cgroup(void) /* cgroup that the root sched is created on, see scx_root_enable_workfn() */
+{
+ return &cgrp_dfl_root.cgrp; /* root cgroup embedded in cgrp_dfl_root */
+}
+
+static struct cgroup *sch_cgroup(struct scx_sched *sch) /* cgroup that @sch was created on */
+{
+ return sch->cgrp; /* NOTE(review): ->cgrp is declared under CONFIG_EXT_SUB_SCHED in struct scx_sched while this helper is also built for CONFIG_EXT_GROUP_SCHED alone - confirm the Kconfig dependency */
+}
+
+/*
+ * For each live descendant of @cgrp including self, set ->scx_sched to @sch.
+ * @sch may be %NULL to clear the association. The pointer is published
+ * with rcu_assign_pointer().
+ */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+ struct cgroup *pos;
+ struct cgroup_subsys_state *css;
+
+ cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+ rcu_assign_pointer(pos->scx_sched, sch);
+}
static void scx_cgroup_lock(void) /* paired with scx_cgroup_unlock() */
{
+#ifdef CONFIG_EXT_GROUP_SCHED
percpu_down_write(&scx_cgroup_ops_rwsem); /* only taken when CONFIG_EXT_GROUP_SCHED is enabled */
+#endif
cgroup_lock(); /* always taken, after scx_cgroup_ops_rwsem when that is used */
}
static void scx_cgroup_unlock(void) /* releases in the reverse order of scx_cgroup_lock() */
{
cgroup_unlock();
+#ifdef CONFIG_EXT_GROUP_SCHED
percpu_up_write(&scx_cgroup_ops_rwsem); /* only taken when CONFIG_EXT_GROUP_SCHED is enabled */
+#endif
}
-
-#else /* CONFIG_EXT_GROUP_SCHED */
-
+#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+static struct cgroup *root_cgroup(void) { return NULL; } /* neither cgroup option enabled: no cgroup to report */
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } /* ditto */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} /* nothing to associate */
static void scx_cgroup_lock(void) {} /* no cgroup locks to take */
static void scx_cgroup_unlock(void) {} /* no cgroup locks to release */
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
+#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
/*
* Omitted operations:
@@ -3622,13 +3742,15 @@ DEFINE_SCHED_CLASS(ext) = {
#endif
};
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
+ struct scx_sched *sch)
{
memset(dsq, 0, sizeof(*dsq)); /* start from all-zero state before setting the fields below */
raw_spin_lock_init(&dsq->lock);
INIT_LIST_HEAD(&dsq->list);
dsq->id = dsq_id;
+ dsq->sched = sch; /* sched that the DSQ belongs to; callers may pass NULL and fill it in later */
}
static void free_dsq_irq_workfn(struct irq_work *irq_work)
@@ -3826,6 +3948,12 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
irq_work_sync(&sch->error_irq_work);
kthread_destroy_worker(sch->helper);
+#ifdef CONFIG_EXT_SUB_SCHED
+ kfree(sch->cgrp_path);
+ if (sch_cgroup(sch))
+ cgroup_put(sch_cgroup(sch));
+#endif /* CONFIG_EXT_SUB_SCHED */
+
free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
@@ -4405,6 +4533,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
return "unregistered from the main kernel";
case SCX_EXIT_SYSRQ:
return "disabled by sysrq-S";
+ case SCX_EXIT_PARENT:
+ return "parent exiting";
case SCX_EXIT_ERROR:
return "runtime error";
case SCX_EXIT_ERROR_BPF:
@@ -4430,6 +4560,69 @@ static void free_kick_syncs(void)
}
}
+#ifdef CONFIG_EXT_SUB_SCHED
+static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+static void drain_descendants(struct scx_sched *sch)
+{
+ /*
+ * Child scheds that finished the critical part of disabling will take
+ * themselves off @sch->children. Wait for it to drain. As propagation
+ * is recursive, empty @sch->children means that all proper descendant
+ * scheds reached unlinking stage.
+ *
+ * The wait sleeps on scx_unlink_waitq, which scx_sub_disable() wakes
+ * after removing a sched from its parent's children list.
+ */
+ wait_event(scx_unlink_waitq, list_empty(&sch->children));
+}
+
+/*
+ * Disable sub-sched @sch. Wait for @sch's own children to unlink, hand the
+ * cgroup subtree of @sch back to the parent sched, unlink @sch from the
+ * parent's children list and scx_sched_all, and then call the parent's
+ * ops.sub_detach() followed by @sch's ops.exit().
+ */
+static void scx_sub_disable(struct scx_sched *sch)
+{
+ struct scx_sched *parent = scx_parent(sch);
+
+ drain_descendants(sch);
+
+ mutex_lock(&scx_enable_mutex);
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ /* cgroups covered by @sch fall back to @parent */
+ set_cgroup_sched(sch_cgroup(sch), parent);
+
+ /* TODO - perform actual disabling here */
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /* writers of the sched lists hold both scx_enable_mutex and scx_sched_lock */
+ raw_spin_lock_irq(&scx_sched_lock);
+ list_del_init(&sch->sibling);
+ list_del_rcu(&sch->all);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
+ mutex_unlock(&scx_enable_mutex);
+
+ /*
+ * @sch is now unlinked from the parent's children list. Notify and call
+ * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ * after unlinking and releasing all locks. See scx_claim_exit().
+ */
+ wake_up_all(&scx_unlink_waitq);
+
+ /*
+ * ops.sub_detach() is invoked on @parent, so it is @parent's op table
+ * that must be tested, mirroring how ops.sub_attach() is gated on
+ * @parent->ops.sub_attach. Only notify if the matching
+ * ops.sub_attach() succeeded.
+ */
+ if (parent->ops.sub_detach && sch->sub_attached) {
+ struct scx_sub_detach_args sub_detach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+ SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+ &sub_detach_args);
+ }
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+ kobject_del(&sch->kobj);
+}
+#else /* CONFIG_EXT_SUB_SCHED */
+static void drain_descendants(struct scx_sched *sch) { } /* !CONFIG_EXT_SUB_SCHED: no children to wait for */
+static void scx_sub_disable(struct scx_sched *sch) { } /* !CONFIG_EXT_SUB_SCHED: no sub-scheds to disable */
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static void scx_root_disable(struct scx_sched *sch)
{
struct scx_exit_info *ei = sch->exit_info;
@@ -4437,9 +4630,10 @@ static void scx_root_disable(struct scx_sched *sch)
struct task_struct *p;
int cpu;
- /* guarantee forward progress by bypassing scx_ops */
+ /* guarantee forward progress and wait for descendants to be disabled */
scx_bypass(true);
WRITE_ONCE(scx_aborting, false);
+ drain_descendants(sch);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING:
@@ -4498,6 +4692,11 @@ static void scx_root_disable(struct scx_sched *sch)
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
+
+ scx_cgroup_lock();
+ set_cgroup_sched(sch_cgroup(sch), NULL);
+ scx_cgroup_unlock();
+
percpu_up_write(&scx_fork_rwsem);
/*
@@ -4534,6 +4733,10 @@ static void scx_root_disable(struct scx_sched *sch)
cancel_delayed_work_sync(&scx_watchdog_work);
+ raw_spin_lock_irq(&scx_sched_lock);
+ list_del_rcu(&sch->all);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
/*
* scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug().
@@ -4591,6 +4794,24 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
* successfully reach scx_bypass().
*/
WRITE_ONCE(scx_aborting, true);
+
+ /*
+ * Propagate exits to descendants immediately. Each has a dedicated
+ * helper kthread and can run in parallel. While most of disabling is
+ * serialized, running them in separate threads allows parallelizing
+ * ops.exit(), which can take arbitrarily long prolonging bypass mode.
+ *
+ * This doesn't cause recursions as propagation only takes place for
+ * non-propagation exits.
+ */
+ if (kind != SCX_EXIT_PARENT) {
+ scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) {
+ struct scx_sched *pos;
+ scx_for_each_descendant_pre(pos, sch)
+ scx_disable(pos, SCX_EXIT_PARENT);
+ }
+ }
+
return true;
}
@@ -4611,7 +4832,10 @@ static void scx_disable_workfn(struct kthread_work *work)
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- scx_root_disable(sch);
+ if (scx_parent(sch))
+ scx_sub_disable(sch);
+ else
+ scx_root_disable(sch);
}
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
@@ -4987,12 +5211,15 @@ static int alloc_kick_syncs(void)
return 0;
}
-static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
+ struct cgroup *cgrp,
+ struct scx_sched *parent)
{
struct scx_sched *sch;
+ s32 level = parent ? parent->level + 1 : 0;
int node, ret;
- sch = kzalloc_obj(*sch);
+ /*
+ * ->ancestors[] holds every ancestor plus @sch itself, which is stored
+ * at index @level. It therefore needs @level + 1 slots; allocating
+ * only @level would make the ancestors[level] store write past the
+ * end of the allocation (root has level 0).
+ */
+ sch = kzalloc_flex(*sch, ancestors, level + 1);
if (!sch)
return ERR_PTR(-ENOMEM);
@@ -5021,7 +5248,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
goto err_free_gdsqs;
}
- init_dsq(dsq, SCX_DSQ_GLOBAL);
+ init_dsq(dsq, SCX_DSQ_GLOBAL, sch);
sch->global_dsqs[node] = dsq;
}
@@ -5039,6 +5266,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sched_set_fifo(sch->helper->task);
+ if (parent)
+ memcpy(sch->ancestors, parent->ancestors,
+ level * sizeof(parent->ancestors[0]));
+ sch->ancestors[level] = sch;
+ sch->level = level;
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
@@ -5046,10 +5279,46 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
ops->priv = sch;
sch->kobj.kset = scx_kset;
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto err_stop_helper;
+ cgroup_path(cgrp, buf, PATH_MAX);
+ sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
+ kfree(buf);
+ if (!sch->cgrp_path)
+ goto err_stop_helper;
+
+ sch->cgrp = cgrp;
+ INIT_LIST_HEAD(&sch->children);
+ INIT_LIST_HEAD(&sch->sibling);
+
+ if (parent)
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
+ &parent->sub_kset->kobj,
+ "sub-%llu", cgroup_id(cgrp));
+ else
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+
+ if (ret < 0) {
+ kfree(sch->cgrp_path);
+ goto err_stop_helper;
+ }
+
+ if (ops->sub_attach) {
+ sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
+ if (!sch->sub_kset) {
+ kobject_put(&sch->kobj);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+#else /* CONFIG_EXT_SUB_SCHED */
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
goto err_stop_helper;
-
+#endif /* CONFIG_EXT_SUB_SCHED */
return sch;
err_stop_helper:
@@ -5157,7 +5426,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret)
goto err_unlock;
- sch = scx_alloc_and_add_sched(ops);
+ sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
goto err_free_ksyncs;
@@ -5174,8 +5443,13 @@ static void scx_root_enable_workfn(struct kthread_work *work)
atomic_long_set(&scx_nr_rejected, 0);
- for_each_possible_cpu(cpu)
- cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->scx.local_dsq.sched = sch;
+ rq->scx.bypass_dsq.sched = sch;
+ rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
+ }
/*
* Keep CPUs stable during enable so that the BPF scheduler can track
@@ -5189,6 +5463,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
*/
rcu_assign_pointer(scx_root, sch);
+ raw_spin_lock_irq(&scx_sched_lock);
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
scx_idle_enable(ops);
if (sch->ops.init) {
@@ -5278,6 +5556,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
* never sees uninitialized tasks.
*/
scx_cgroup_lock();
+ set_cgroup_sched(sch_cgroup(sch), sch);
ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
@@ -5392,6 +5671,185 @@ err_disable:
cmd->ret = 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/*
+ * Verify that a scheduler can be attached to @cgrp and return the parent,
+ * which is the sched currently associated with @cgrp. Must be called with
+ * scx_sched_lock held.
+ *
+ * Returns the parent sched on success, ERR_PTR(-EBUSY) if @cgrp already has
+ * a sched attached or a child of the parent sits on @cgrp or below it, and
+ * ERR_PTR(-EOPNOTSUPP) if the parent doesn't implement ops.sub_attach().
+ *
+ * NOTE(review): @cgrp->scx_sched is dereferenced unconditionally - confirm
+ * that it can't be %NULL for callers that passed the scx_enabled() test.
+ */
+static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
+{
+ struct scx_sched *parent = cgrp->scx_sched;
+ struct scx_sched *pos;
+
+ lockdep_assert_held(&scx_sched_lock);
+
+ /* can't attach twice to the same cgroup */
+ if (parent->cgrp == cgrp)
+ return ERR_PTR(-EBUSY);
+
+ /* does $parent allow sub-scheds? */
+ if (!parent->ops.sub_attach)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /* can't insert between $parent and its exiting children */
+ list_for_each_entry(pos, &parent->children, sibling)
+ if (cgroup_is_descendant(pos->cgrp, cgrp))
+ return ERR_PTR(-EBUSY);
+
+ return parent;
+}
+
+static void scx_sub_enable_workfn(struct kthread_work *work) /* kthread work that attaches cmd->ops as a sub-sched on the cgroup named by ops->sub_cgroup_id */
+{
+ struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
+ struct cgroup *cgrp;
+ struct scx_sched *parent, *sch;
+ s32 ret;
+
+ mutex_lock(&scx_enable_mutex); /* also taken by scx_sub_disable() around unlinking */
+
+ if (!scx_enabled()) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ cgrp = cgroup_get_from_id(ops->sub_cgroup_id); /* on success holds a reference on @cgrp, dropped at out_put_cgrp */
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
+ }
+
+ raw_spin_lock_irq(&scx_sched_lock);
+ parent = find_parent_sched(cgrp);
+ if (IS_ERR(parent)) {
+ raw_spin_unlock_irq(&scx_sched_lock);
+ ret = PTR_ERR(parent);
+ goto out_put_cgrp;
+ }
+ kobject_get(&parent->kobj); /* pin @parent across the allocation below */
+ raw_spin_unlock_irq(&scx_sched_lock);
+
+ sch = scx_alloc_and_add_sched(ops, cgrp, parent);
+ kobject_put(&parent->kobj);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
+ goto out_put_cgrp;
+ }
+
+ raw_spin_lock_irq(&scx_sched_lock); /* list writers hold both scx_enable_mutex and scx_sched_lock */
+ list_add_tail(&sch->sibling, &parent->children);
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
+ if (sch->level >= SCX_SUB_MAX_DEPTH) { /* checked after linking; failure goes through the disable path */
+ scx_error(sch, "max nesting depth %d violated",
+ SCX_SUB_MAX_DEPTH);
+ goto err_disable;
+ }
+
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+ if (ret) {
+ ret = ops_sanitize_err(sch, "init", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
+ goto err_disable;
+ }
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+ }
+
+ if (validate_ops(sch, ops))
+ goto err_disable;
+
+ struct scx_sub_attach_args sub_attach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+
+ ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL,
+ &sub_attach_args); /* @parent decides; find_parent_sched() verified it implements ops.sub_attach() */
+ if (ret) {
+ ret = ops_sanitize_err(sch, "sub_attach", ret);
+ scx_error(sch, "parent rejected (%d)", ret);
+ goto err_disable;
+ }
+ sch->sub_attached = true; /* scx_sub_disable() calls ops.sub_detach() only when this is set */
+
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ /*
+ * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
+ * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down.
+ */
+ set_cgroup_sched(sch_cgroup(sch), sch);
+ if (!(cgrp->self.flags & CSS_ONLINE)) {
+ scx_error(sch, "cgroup is not online");
+ goto err_unlock_and_disable;
+ }
+
+ /* TODO - perform actual enabling here */
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ ret = 0;
+ goto out_unlock; /* success skips out_put_cgrp, so the @cgrp reference taken above is not dropped here */
+
+out_put_cgrp:
+ cgroup_put(cgrp);
+out_unlock:
+ mutex_unlock(&scx_enable_mutex);
+ cmd->ret = ret;
+ return;
+
+err_unlock_and_disable:
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+err_disable:
+ mutex_unlock(&scx_enable_mutex); /* dropped before flushing as scx_sub_disable() takes it */
+ kthread_flush_work(&sch->disable_work); /* presumably queued by the scx_error() calls above - TODO confirm */
+ cmd->ret = 0; /* NOTE(review): success is reported to the caller although enabling failed - presumably the error surfaces through exit_info; confirm */
+}
+
+static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data) /* notifier callback; @data is the cgroup going online/offline; always returns NOTIFY_OK */
+{
+ struct cgroup *cgrp = data;
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) /* cgroups failing this test are ignored */
+ return NOTIFY_OK;
+
+ switch (action) { /* actions other than ONLINE/OFFLINE fall through untouched */
+ case CGROUP_LIFETIME_ONLINE:
+ /* inherit ->scx_sched from $parent */
+ if (parent)
+ rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ /* if there is a sched attached, shoot it down */
+ if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) /* only the cgroup that the sched was created on, not ones that inherited it */
+ scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_RSN_CGROUP_OFFLINE,
+ "cgroup %llu going offline", cgroup_id(cgrp));
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block scx_cgroup_lifetime_nb = { /* registered by scx_cgroup_lifetime_notifier_init() */
+ .notifier_call = scx_cgroup_lifetime_notify,
+};
+
+static s32 __init scx_cgroup_lifetime_notifier_init(void) /* hooks scx_cgroup_lifetime_nb onto cgroup_lifetime_notifier; returns the registration result */
+{
+ return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &scx_cgroup_lifetime_nb);
+}
+core_initcall(scx_cgroup_lifetime_notifier_init);
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
static struct kthread_worker *helper;
@@ -5418,7 +5876,12 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_unlock(&helper_mutex);
}
- kthread_init_work(&cmd.work, scx_root_enable_workfn);
+#ifdef CONFIG_EXT_SUB_SCHED
+ if (ops->sub_cgroup_id > 1)
+ kthread_init_work(&cmd.work, scx_sub_enable_workfn);
+ else
+#endif /* CONFIG_EXT_SUB_SCHED */
+ kthread_init_work(&cmd.work, scx_root_enable_workfn);
cmd.ops = ops;
kthread_queue_work(READ_ONCE(helper), &cmd.work);
@@ -5520,6 +5983,11 @@ static int bpf_scx_init_member(const struct btf_type *t,
case offsetof(struct sched_ext_ops, hotplug_seq):
ops->hotplug_seq = *(u64 *)(udata + moff);
return 1;
+#ifdef CONFIG_EXT_SUB_SCHED
+ case offsetof(struct sched_ext_ops, sub_cgroup_id):
+ ops->sub_cgroup_id = *(u64 *)(udata + moff);
+ return 1;
+#endif /* CONFIG_EXT_SUB_SCHED */
}
return 0;
@@ -5542,6 +6010,8 @@ static int bpf_scx_check_member(const struct btf_type *t,
case offsetof(struct sched_ext_ops, cpu_offline):
case offsetof(struct sched_ext_ops, init):
case offsetof(struct sched_ext_ops, exit):
+ case offsetof(struct sched_ext_ops, sub_attach):
+ case offsetof(struct sched_ext_ops, sub_detach):
break;
default:
if (prog->sleepable)
@@ -5619,7 +6089,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
-#endif
+#endif /* CONFIG_EXT_GROUP_SCHED */
+static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } /* stub entry for the __bpf_ops_sched_ext_ops table */
+static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} /* stub entry for the __bpf_ops_sched_ext_ops table */
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
static s32 sched_ext_ops__init(void) { return -EINVAL; }
@@ -5659,6 +6131,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
.cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
#endif
+ .sub_attach = sched_ext_ops__sub_attach,
+ .sub_detach = sched_ext_ops__sub_detach,
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
.init = sched_ext_ops__init,
@@ -5941,8 +6415,10 @@ void __init init_sched_ext_class(void)
struct rq *rq = cpu_rq(cpu);
int n = cpu_to_node(cpu);
- init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
- init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
+ /* local/bypass dsq's sch will be set during scx_root_enable() */
+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
+ init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL);
+
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
@@ -6598,16 +7074,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
if (!dsq)
return -ENOMEM;
- init_dsq(dsq, dsq_id);
-
rcu_read_lock();
sch = rcu_dereference(scx_root);
- if (sch)
+ if (sch) {
+ init_dsq(dsq, dsq_id, sch);
ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
dsq_hash_params);
- else
+ } else {
ret = -ENODEV;
+ }
rcu_read_unlock();
if (ret)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 417d3c6f02fe..75b7f57e20ab 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -28,6 +28,8 @@ enum scx_consts {
SCX_BYPASS_LB_DONOR_PCT = 125,
SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
SCX_BYPASS_LB_BATCH = 256,
+
+ SCX_SUB_MAX_DEPTH = 4,
};
enum scx_exit_kind {
@@ -38,6 +40,7 @@ enum scx_exit_kind {
SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+ SCX_EXIT_PARENT, /* parent exiting */
SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
@@ -62,6 +65,7 @@ enum scx_exit_kind {
enum scx_exit_code {
/* Reasons */
SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+ SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32,
/* Actions */
SCX_ECODE_ACT_RESTART = 1LLU << 48,
@@ -213,7 +217,7 @@ struct scx_exit_task_args {
bool cancelled;
};
-/* argument container for ops->cgroup_init() */
+/* argument container for ops.cgroup_init() */
struct scx_cgroup_init_args {
/* the weight of the cgroup [1..10000] */
u32 weight;
@@ -236,12 +240,12 @@ enum scx_cpu_preempt_reason {
};
/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * Argument container for ops.cpu_acquire(). Currently empty, but may be
* expanded in the future.
*/
struct scx_cpu_acquire_args {};
-/* argument container for ops->cpu_release() */
+/* argument container for ops.cpu_release() */
struct scx_cpu_release_args {
/* the reason the CPU was preempted */
enum scx_cpu_preempt_reason reason;
@@ -250,9 +254,7 @@ struct scx_cpu_release_args {
struct task_struct *task;
};
-/*
- * Informational context provided to dump operations.
- */
+/* informational context provided to dump operations */
struct scx_dump_ctx {
enum scx_exit_kind kind;
s64 exit_code;
@@ -261,6 +263,18 @@ struct scx_dump_ctx {
u64 at_jiffies;
};
+/* argument container for ops.sub_attach() */
+struct scx_sub_attach_args {
+ struct sched_ext_ops *ops; /* ops table of the sub-scheduler being attached */
+ char *cgroup_path; /* path of the cgroup the sub-scheduler is attached on */
+};
+
+/* argument container for ops.sub_detach() */
+struct scx_sub_detach_args {
+ struct sched_ext_ops *ops; /* ops table of the sub-scheduler being detached */
+ char *cgroup_path; /* path of the cgroup the sub-scheduler was attached on */
+};
+
/**
* struct sched_ext_ops - Operation table for BPF scheduler implementation
*
@@ -721,6 +735,20 @@ struct sched_ext_ops {
#endif /* CONFIG_EXT_GROUP_SCHED */
+ /**
+ * @sub_attach: Attach a sub-scheduler
+ * @args: argument container, see the struct definition
+ *
+ * Return 0 to accept the sub-scheduler. -errno to reject.
+ */
+ s32 (*sub_attach)(struct scx_sub_attach_args *args);
+
+ /**
+ * @sub_detach: Detach a sub-scheduler
+ * @args: argument container, see the struct definition
+ */
+ void (*sub_detach)(struct scx_sub_detach_args *args);
+
/*
* All online ops must come before ops.cpu_online().
*/
@@ -762,6 +790,10 @@ struct sched_ext_ops {
*/
void (*exit)(struct scx_exit_info *info);
+ /*
+ * Data fields must come after all ops fields.
+ */
+
/**
* @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
*/
@@ -797,6 +829,12 @@ struct sched_ext_ops {
u64 hotplug_seq;
/**
+ * @sub_cgroup_id: When >1, attach the scheduler as a sub-scheduler on the
+ * specified cgroup.
+ */
+ u64 sub_cgroup_id;
+
+ /**
* @name: BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
@@ -900,6 +938,8 @@ struct scx_sched {
struct scx_dispatch_q **global_dsqs;
struct scx_sched_pcpu __percpu *pcpu;
+ s32 level;
+
/*
* Updates to the following warned bitfields can race causing RMW issues
* but it doesn't really matter.
@@ -907,6 +947,18 @@ struct scx_sched {
bool warned_zero_slice:1;
bool warned_deprecated_rq:1;
+ struct list_head all;
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ struct list_head children;
+ struct list_head sibling;
+ struct cgroup *cgrp;
+ char *cgrp_path;
+ struct kset *sub_kset;
+
+ bool sub_attached;
+#endif /* CONFIG_EXT_SUB_SCHED */
+
atomic_t exit_kind;
struct scx_exit_info *exit_info;
@@ -916,6 +968,9 @@ struct scx_sched {
struct irq_work error_irq_work;
struct kthread_work disable_work;
struct rcu_work rcu_work;
+
+ /* all ancestors including self */
+ struct scx_sched *ancestors[];
};
enum scx_wake_flags {