Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/ext.c           | 532
-rw-r--r--  kernel/sched/ext_internal.h  |  67
2 files changed, 565 insertions, 34 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 142845bcddaa..bb3e33b660da 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,8 @@ #include <linux/btf_ids.h> #include "ext_idle.h" +static DEFINE_RAW_SPINLOCK(scx_sched_lock); + /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -20,6 +22,12 @@ static struct scx_sched __rcu *scx_root; /* + * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. + * Readers can hold either or rcu_read_lock(). + */ +static LIST_HEAD(scx_sched_all); + +/* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to * guarantee system safety. Maintain a dedicated task list which contains every @@ -197,6 +205,7 @@ static void process_ddsp_deferred_locals(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -245,6 +254,88 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_parent - Find the parent sched + * @sch: sched to find the parent of + * + * Returns the parent scheduler or %NULL if @sch is root. + */ +static struct scx_sched *scx_parent(struct scx_sched *sch) +{ + if (sch->level) + return sch->ancestors[sch->level - 1]; + else + return NULL; +} + +/** + * scx_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sched whose descendants to walk + * + * To be used by scx_for_each_descendant_pre(). Find the next descendant to + * visit for pre-order traversal of @root's descendants. @root is included in + * the iteration and the first node to be visited. + */ +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, + struct scx_sched *root) +{ + struct scx_sched *next; + + lockdep_assert(lockdep_is_held(&scx_enable_mutex) || + lockdep_is_held(&scx_sched_lock)); + + /* if first iteration, visit @root */ + if (!pos) + return root; + + /* visit the first child if exists */ + next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + while (pos != root) { + if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) + return list_next_entry(pos, sibling); + pos = scx_parent(pos); + } + + return NULL; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +#endif /* CONFIG_EXT_SUB_SCHED */ + +/** + * scx_is_descendant - Test whether sched is a descendant + * @sch: sched to test + * @ancestor: ancestor sched to test against + * + * Test whether @sch is a descendant of @ancestor. 
+ */ +static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) +{ + if (sch->level < ancestor->level) + return false; + return sch->ancestors[ancestor->level] == ancestor; +} + +/** + * scx_for_each_descendant_pre - pre-order walk of a sched's descendants + * @pos: iteration cursor + * @root: sched to walk the descendants of + * + * Walk @root's descendants. @root is included in the iteration and the first + * node to be visited. Must be called with either scx_enable_mutex or + * scx_sched_lock held. + */ +#define scx_for_each_descendant_pre(pos, root) \ + for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ + (pos) = scx_next_descendant_pre((pos), (root))) + static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, struct task_struct *p) { @@ -514,7 +605,7 @@ struct scx_task_iter { struct rq_flags rf; u32 cnt; bool list_locked; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED struct cgroup *cgrp; struct cgroup_subsys_state *css_pos; struct css_task_iter css_iter; @@ -553,7 +644,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) { memset(iter, 0, sizeof(*iter)); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (cgrp) { lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; @@ -614,7 +705,7 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (iter->cgrp) { if (iter->css_pos) css_task_iter_end(&iter->css_iter); @@ -645,7 +736,7 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) cond_resched(); } -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (iter->cgrp) { while (iter->css_pos) { struct task_struct *p; @@ -3032,7 +3123,10 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork scx_set_task_state(p, SCX_TASK_INIT); if (p->scx.disallow) { - if (unlikely(fork)) { + if (unlikely(scx_parent(sch))) { + scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", + p->comm, p->pid); + } else if (unlikely(fork)) { scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", p->comm, p->pid); } else { @@ -3555,25 +3649,51 @@ void scx_group_set_bandwidth(struct task_group *tg, percpu_up_read(&scx_cgroup_ops_rwsem); } +#endif /* CONFIG_EXT_GROUP_SCHED */ + +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +static struct cgroup *root_cgroup(void) +{ + return &cgrp_dfl_root.cgrp; +} + +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} static void scx_cgroup_lock(void) { +#ifdef CONFIG_EXT_GROUP_SCHED percpu_down_write(&scx_cgroup_ops_rwsem); +#endif cgroup_lock(); } static void scx_cgroup_unlock(void) { cgroup_unlock(); +#ifdef CONFIG_EXT_GROUP_SCHED percpu_up_write(&scx_cgroup_ops_rwsem); +#endif } - -#else /* CONFIG_EXT_GROUP_SCHED */ - +#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +static struct cgroup *root_cgroup(void) { return NULL; } +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void 
scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} - -#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ /* * Omitted operations: @@ -3622,13 +3742,15 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; + dsq->sched = sch; } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -3826,6 +3948,12 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->error_irq_work); kthread_destroy_worker(sch->helper); +#ifdef CONFIG_EXT_SUB_SCHED + kfree(sch->cgrp_path); + if (sch_cgroup(sch)) + cgroup_put(sch_cgroup(sch)); +#endif /* CONFIG_EXT_SUB_SCHED */ + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) @@ -4405,6 +4533,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "unregistered from the main kernel"; case SCX_EXIT_SYSRQ: return "disabled by sysrq-S"; + case SCX_EXIT_PARENT: + return "parent exiting"; case SCX_EXIT_ERROR: return "runtime error"; case SCX_EXIT_ERROR_BPF: @@ -4430,6 +4560,69 @@ static void free_kick_syncs(void) } } +#ifdef CONFIG_EXT_SUB_SCHED +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); + +static void drain_descendants(struct scx_sched *sch) +{ + /* + * Child scheds that finished the critical part of disabling will take + * themselves off @sch->children. Wait for it to drain. As propagation + * is recursive, empty @sch->children means that all proper descendant + * scheds reached unlinking stage. + */ + wait_event(scx_unlink_waitq, list_empty(&sch->children)); +} + +static void scx_sub_disable(struct scx_sched *sch) +{ + struct scx_sched *parent = scx_parent(sch); + + drain_descendants(sch); + + mutex_lock(&scx_enable_mutex); + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + set_cgroup_sched(sch_cgroup(sch), parent); + + /* TODO - perform actual disabling here */ + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + raw_spin_lock_irq(&scx_sched_lock); + list_del_init(&sch->sibling); + list_del_rcu(&sch->all); + raw_spin_unlock_irq(&scx_sched_lock); + + mutex_unlock(&scx_enable_mutex); + + /* + * @sch is now unlinked from the parent's children list. Notify and call + * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called + * after unlinking and releasing all locks. See scx_claim_exit(). 
+ */ + wake_up_all(&scx_unlink_waitq); + + if (sch->ops.sub_detach && sch->sub_attached) { + struct scx_sub_detach_args sub_detach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL, + &sub_detach_args); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info); + kobject_del(&sch->kobj); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static void drain_descendants(struct scx_sched *sch) { } +static void scx_sub_disable(struct scx_sched *sch) { } +#endif /* CONFIG_EXT_SUB_SCHED */ + static void scx_root_disable(struct scx_sched *sch) { struct scx_exit_info *ei = sch->exit_info; @@ -4437,9 +4630,10 @@ static void scx_root_disable(struct scx_sched *sch) struct task_struct *p; int cpu; - /* guarantee forward progress by bypassing scx_ops */ + /* guarantee forward progress and wait for descendants to be disabled */ scx_bypass(true); WRITE_ONCE(scx_aborting, false); + drain_descendants(sch); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -4498,6 +4692,11 @@ static void scx_root_disable(struct scx_sched *sch) scx_exit_task(p); } scx_task_iter_stop(&sti); + + scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), NULL); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* @@ -4534,6 +4733,10 @@ static void scx_root_disable(struct scx_sched *sch) cancel_delayed_work_sync(&scx_watchdog_work); + raw_spin_lock_irq(&scx_sched_lock); + list_del_rcu(&sch->all); + raw_spin_unlock_irq(&scx_sched_lock); + /* * scx_root clearing must be inside cpus_read_lock(). See * handle_hotplug(). @@ -4591,6 +4794,24 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) * successfully reach scx_bypass(). */ WRITE_ONCE(scx_aborting, true); + + /* + * Propagate exits to descendants immediately. Each has a dedicated + * helper kthread and can run in parallel. While most of disabling is + * serialized, running them in separate threads allows parallelizing + * ops.exit(), which can take arbitrarily long prolonging bypass mode. + * + * This doesn't cause recursions as propagation only takes place for + * non-propagation exits. + */ + if (kind != SCX_EXIT_PARENT) { + scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { + struct scx_sched *pos; + scx_for_each_descendant_pre(pos, sch) + scx_disable(pos, SCX_EXIT_PARENT); + } + } + return true; } @@ -4611,7 +4832,10 @@ static void scx_disable_workfn(struct kthread_work *work) ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - scx_root_disable(sch); + if (scx_parent(sch)) + scx_sub_disable(sch); + else + scx_root_disable(sch); } static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) @@ -4987,12 +5211,15 @@ static int alloc_kick_syncs(void) return 0; } -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, + struct cgroup *cgrp, + struct scx_sched *parent) { struct scx_sched *sch; + s32 level = parent ? 
parent->level + 1 : 0; int node, ret; - sch = kzalloc_obj(*sch); + sch = kzalloc_flex(*sch, ancestors, level); if (!sch) return ERR_PTR(-ENOMEM); @@ -5021,7 +5248,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; } - init_dsq(dsq, SCX_DSQ_GLOBAL); + init_dsq(dsq, SCX_DSQ_GLOBAL, sch); sch->global_dsqs[node] = dsq; } @@ -5039,6 +5266,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sched_set_fifo(sch->helper->task); + if (parent) + memcpy(sch->ancestors, parent->ancestors, + level * sizeof(parent->ancestors[0])); + sch->ancestors[level] = sch; + sch->level = level; + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); @@ -5046,10 +5279,46 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) ops->priv = sch; sch->kobj.kset = scx_kset; + +#ifdef CONFIG_EXT_SUB_SCHED + char *buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto err_stop_helper; + cgroup_path(cgrp, buf, PATH_MAX); + sch->cgrp_path = kstrdup(buf, GFP_KERNEL); + kfree(buf); + if (!sch->cgrp_path) + goto err_stop_helper; + + sch->cgrp = cgrp; + INIT_LIST_HEAD(&sch->children); + INIT_LIST_HEAD(&sch->sibling); + + if (parent) + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, + &parent->sub_kset->kobj, + "sub-%llu", cgroup_id(cgrp)); + else + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + + if (ret < 0) { + kfree(sch->cgrp_path); + goto err_stop_helper; + } + + if (ops->sub_attach) { + sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); + if (!sch->sub_kset) { + kobject_put(&sch->kobj); + return ERR_PTR(-ENOMEM); + } + } + +#else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) goto err_stop_helper; - +#endif /* CONFIG_EXT_SUB_SCHED */ return sch; err_stop_helper: @@ -5157,7 +5426,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; - sch = scx_alloc_and_add_sched(ops); + sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); goto err_free_ksyncs; @@ -5174,8 +5443,13 @@ static void scx_root_enable_workfn(struct kthread_work *work) atomic_long_set(&scx_nr_rejected, 0); - for_each_possible_cpu(cpu) - cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.local_dsq.sched = sch; + rq->scx.bypass_dsq.sched = sch; + rq->scx.cpuperf_target = SCX_CPUPERF_ONE; + } /* * Keep CPUs stable during enable so that the BPF scheduler can track @@ -5189,6 +5463,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) */ rcu_assign_pointer(scx_root, sch); + raw_spin_lock_irq(&scx_sched_lock); + list_add_tail_rcu(&sch->all, &scx_sched_all); + raw_spin_unlock_irq(&scx_sched_lock); + scx_idle_enable(ops); if (sch->ops.init) { @@ -5278,6 +5556,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) * never sees uninitialized tasks. 
*/ scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), sch); ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; @@ -5392,6 +5671,185 @@ err_disable: cmd->ret = 0; } +#ifdef CONFIG_EXT_SUB_SCHED +/* verify that a scheduler can be attached to @cgrp and return the parent */ +static struct scx_sched *find_parent_sched(struct cgroup *cgrp) +{ + struct scx_sched *parent = cgrp->scx_sched; + struct scx_sched *pos; + + lockdep_assert_held(&scx_sched_lock); + + /* can't attach twice to the same cgroup */ + if (parent->cgrp == cgrp) + return ERR_PTR(-EBUSY); + + /* does $parent allow sub-scheds? */ + if (!parent->ops.sub_attach) + return ERR_PTR(-EOPNOTSUPP); + + /* can't insert between $parent and its exiting children */ + list_for_each_entry(pos, &parent->children, sibling) + if (cgroup_is_descendant(pos->cgrp, cgrp)) + return ERR_PTR(-EBUSY); + + return parent; +} + +static void scx_sub_enable_workfn(struct kthread_work *work) +{ + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp; + struct scx_sched *parent, *sch; + s32 ret; + + mutex_lock(&scx_enable_mutex); + + if (!scx_enabled()) { + ret = -ENODEV; + goto out_unlock; + } + + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + raw_spin_lock_irq(&scx_sched_lock); + parent = find_parent_sched(cgrp); + if (IS_ERR(parent)) { + raw_spin_unlock_irq(&scx_sched_lock); + ret = PTR_ERR(parent); + goto out_put_cgrp; + } + kobject_get(&parent->kobj); + raw_spin_unlock_irq(&scx_sched_lock); + + sch = scx_alloc_and_add_sched(ops, cgrp, parent); + kobject_put(&parent->kobj); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto out_put_cgrp; + } + + raw_spin_lock_irq(&scx_sched_lock); + list_add_tail(&sch->sibling, &parent->children); + list_add_tail_rcu(&sch->all, &scx_sched_all); + raw_spin_unlock_irq(&scx_sched_lock); + + if (sch->level >= SCX_SUB_MAX_DEPTH) { + scx_error(sch, "max nesting depth %d violated", + SCX_SUB_MAX_DEPTH); + goto err_disable; + } + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); + if (ret) { + ret = ops_sanitize_err(sch, "init", ret); + scx_error(sch, "ops.init() failed (%d)", ret); + goto err_disable; + } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; + } + + if (validate_ops(sch, ops)) + goto err_disable; + + struct scx_sub_attach_args sub_attach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + + ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL, + &sub_attach_args); + if (ret) { + ret = ops_sanitize_err(sch, "sub_attach", ret); + scx_error(sch, "parent rejected (%d)", ret); + goto err_disable; + } + sch->sub_attached = true; + + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + /* + * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see + * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 
+ */ + set_cgroup_sched(sch_cgroup(sch), sch); + if (!(cgrp->self.flags & CSS_ONLINE)) { + scx_error(sch, "cgroup is not online"); + goto err_unlock_and_disable; + } + + /* TODO - perform actual enabling here */ + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); + kobject_uevent(&sch->kobj, KOBJ_ADD); + ret = 0; + goto out_unlock; + +out_put_cgrp: + cgroup_put(cgrp); +out_unlock: + mutex_unlock(&scx_enable_mutex); + cmd->ret = ret; + return; + +err_unlock_and_disable: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable: + mutex_unlock(&scx_enable_mutex); + kthread_flush_work(&sch->disable_work); + cmd->ret = 0; +} + +static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + struct cgroup *parent = cgroup_parent(cgrp); + + if (!cgroup_on_dfl(cgrp)) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + /* inherit ->scx_sched from $parent */ + if (parent) + rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); + break; + case CGROUP_LIFETIME_OFFLINE: + /* if there is a sched attached, shoot it down */ + if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) + scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, + SCX_ECODE_RSN_CGROUP_OFFLINE, + "cgroup %llu going offline", cgroup_id(cgrp)); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_cgroup_lifetime_nb = { + .notifier_call = scx_cgroup_lifetime_notify, +}; + +static s32 __init scx_cgroup_lifetime_notifier_init(void) +{ + return blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &scx_cgroup_lifetime_nb); +} +core_initcall(scx_cgroup_lifetime_notifier_init); +#endif /* CONFIG_EXT_SUB_SCHED */ + static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { static struct kthread_worker *helper; @@ -5418,7 +5876,12 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_unlock(&helper_mutex); } - kthread_init_work(&cmd.work, scx_root_enable_workfn); +#ifdef CONFIG_EXT_SUB_SCHED + if (ops->sub_cgroup_id > 1) + kthread_init_work(&cmd.work, scx_sub_enable_workfn); + else +#endif /* CONFIG_EXT_SUB_SCHED */ + kthread_init_work(&cmd.work, scx_root_enable_workfn); cmd.ops = ops; kthread_queue_work(READ_ONCE(helper), &cmd.work); @@ -5520,6 +5983,11 @@ static int bpf_scx_init_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, hotplug_seq): ops->hotplug_seq = *(u64 *)(udata + moff); return 1; +#ifdef CONFIG_EXT_SUB_SCHED + case offsetof(struct sched_ext_ops, sub_cgroup_id): + ops->sub_cgroup_id = *(u64 *)(udata + moff); + return 1; +#endif /* CONFIG_EXT_SUB_SCHED */ } return 0; @@ -5542,6 +6010,8 @@ static int bpf_scx_check_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): + case offsetof(struct sched_ext_ops, sub_attach): + case offsetof(struct sched_ext_ops, sub_detach): break; default: if (prog->sleepable) @@ -5619,7 +6089,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} -#endif +#endif /* CONFIG_EXT_GROUP_SCHED */ +static s32 
sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } +static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} static s32 sched_ext_ops__init(void) { return -EINVAL; } @@ -5659,6 +6131,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, .init = sched_ext_ops__init, @@ -5941,8 +6415,10 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); - init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); + /* local/bypass dsq's sch will be set during scx_root_enable() */ + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL); + INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -6598,16 +7074,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) if (!dsq) return -ENOMEM; - init_dsq(dsq, dsq_id); - rcu_read_lock(); sch = rcu_dereference(scx_root); - if (sch) + if (sch) { + init_dsq(dsq, dsq_id, sch); ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); - else + } else { ret = -ENODEV; + } rcu_read_unlock(); if (ret) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 417d3c6f02fe..75b7f57e20ab 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -28,6 +28,8 @@ enum scx_consts { SCX_BYPASS_LB_DONOR_PCT = 125, SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + + SCX_SUB_MAX_DEPTH = 4, }; enum scx_exit_kind { @@ -38,6 +40,7 @@ enum scx_exit_kind { SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + SCX_EXIT_PARENT, /* parent exiting */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ @@ -62,6 +65,7 @@ enum scx_exit_kind { enum scx_exit_code { /* Reasons */ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32, /* Actions */ SCX_ECODE_ACT_RESTART = 1LLU << 48, @@ -213,7 +217,7 @@ struct scx_exit_task_args { bool cancelled; }; -/* argument container for ops->cgroup_init() */ +/* argument container for ops.cgroup_init() */ struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; @@ -236,12 +240,12 @@ enum scx_cpu_preempt_reason { }; /* - * Argument container for ops->cpu_acquire(). Currently empty, but may be + * Argument container for ops.cpu_acquire(). Currently empty, but may be * expanded in the future. */ struct scx_cpu_acquire_args {}; -/* argument container for ops->cpu_release() */ +/* argument container for ops.cpu_release() */ struct scx_cpu_release_args { /* the reason the CPU was preempted */ enum scx_cpu_preempt_reason reason; @@ -250,9 +254,7 @@ struct scx_cpu_release_args { struct task_struct *task; }; -/* - * Informational context provided to dump operations. 
- */ +/* informational context provided to dump operations */ struct scx_dump_ctx { enum scx_exit_kind kind; s64 exit_code; @@ -261,6 +263,18 @@ struct scx_dump_ctx { u64 at_jiffies; }; +/* argument container for ops.sub_attach() */ +struct scx_sub_attach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + +/* argument container for ops.sub_detach() */ +struct scx_sub_detach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -721,6 +735,20 @@ struct sched_ext_ops { #endif /* CONFIG_EXT_GROUP_SCHED */ + /** + * @sub_attach: Attach a sub-scheduler + * @args: argument container, see the struct definition + * + * Return 0 to accept the sub-scheduler. -errno to reject. + */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + + /** + * @sub_detach: Detach a sub-scheduler + * @args: argument container, see the struct definition + */ + void (*sub_detach)(struct scx_sub_detach_args *args); + /* * All online ops must come before ops.cpu_online(). */ @@ -762,6 +790,10 @@ struct sched_ext_ops { */ void (*exit)(struct scx_exit_info *info); + /* + * Data fields must comes after all ops fields. + */ + /** * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ @@ -797,6 +829,12 @@ struct sched_ext_ops { u64 hotplug_seq; /** + * @cgroup_id: When >1, attach the scheduler as a sub-scheduler on the + * specified cgroup. + */ + u64 sub_cgroup_id; + + /** * @name: BPF scheduler's name * * Must be a non-zero valid BPF object name including only isalnum(), @@ -900,6 +938,8 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; + s32 level; + /* * Updates to the following warned bitfields can race causing RMW issues * but it doesn't really matter. @@ -907,6 +947,18 @@ struct scx_sched { bool warned_zero_slice:1; bool warned_deprecated_rq:1; + struct list_head all; + +#ifdef CONFIG_EXT_SUB_SCHED + struct list_head children; + struct list_head sibling; + struct cgroup *cgrp; + char *cgrp_path; + struct kset *sub_kset; + + bool sub_attached; +#endif /* CONFIG_EXT_SUB_SCHED */ + atomic_t exit_kind; struct scx_exit_info *exit_info; @@ -916,6 +968,9 @@ struct scx_sched { struct irq_work error_irq_work; struct kthread_work disable_work; struct rcu_work rcu_work; + + /* all ancestors including self */ + struct scx_sched *ancestors[]; }; enum scx_wake_flags { |
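
The hierarchy bookkeeping in the ext.c hunks above is worth spelling out: every scx_sched records its depth in ->level and carries an ->ancestors[] flexible array holding all ancestors including itself, which is what makes scx_is_descendant() a single bounds check plus one array load, and scx_next_descendant_pre() walks a subtree iteratively through the children/sibling links. Below is a minimal user-space sketch of those two ideas using toy types; the struct and function names are illustrative, not the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* toy stand-in for the hierarchy fields of struct scx_sched */
struct node {
	int		level;			/* depth, root == 0 */
	struct node	*parent;
	struct node	**ancestors;		/* ancestors[0..level], self included */
	struct node	*children[8];		/* toy fixed-size child table */
	int		nr_children;
	const char	*name;
};

static struct node *node_new(const char *name, struct node *parent)
{
	struct node *n = calloc(1, sizeof(*n));

	n->name = name;
	n->parent = parent;
	n->level = parent ? parent->level + 1 : 0;
	n->ancestors = calloc(n->level + 1, sizeof(*n->ancestors));
	if (parent) {
		/* inherit the parent's ancestor chain, then append self */
		memcpy(n->ancestors, parent->ancestors,
		       n->level * sizeof(*n->ancestors));
		parent->children[parent->nr_children++] = n;
	}
	n->ancestors[n->level] = n;
	return n;
}

/* the scx_is_descendant() idea: O(1) regardless of tree depth */
static int is_descendant(struct node *n, struct node *ancestor)
{
	return n->level >= ancestor->level &&
	       n->ancestors[ancestor->level] == ancestor;
}

/* the scx_next_descendant_pre() idea: iterative pre-order, @root visited first */
static struct node *next_pre(struct node *pos, struct node *root)
{
	if (!pos)
		return root;
	if (pos->nr_children)
		return pos->children[0];
	while (pos != root) {
		struct node *parent = pos->parent;
		int i;

		for (i = 0; i < parent->nr_children; i++)
			if (parent->children[i] == pos)
				break;
		if (i + 1 < parent->nr_children)
			return parent->children[i + 1];	/* next sibling */
		pos = parent;				/* go up and retry */
	}
	return NULL;
}

int main(void)
{
	struct node *root = node_new("root", NULL);
	struct node *a = node_new("a", root);
	struct node *b = node_new("b", root);
	struct node *a1 = node_new("a1", a);
	struct node *pos;

	for (pos = next_pre(NULL, root); pos; pos = next_pre(pos, root))
		printf("%s (level %d)\n", pos->name, pos->level);

	printf("a1 under a: %d, b under a: %d\n",
	       is_descendant(a1, a), is_descendant(b, a));
	return 0;
}

Against the tree built in main() the walk prints root, a, a1, b, and the two descendant checks print 1 and 0. The same pre-order pass is what lets scx_claim_exit() fan an SCX_EXIT_PARENT out to an entire subtree while each descendant runs its own disable kthread in parallel.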
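On the BPF side the surface added by this patch is small: a parent scheduler opts in to hosting children by implementing ops.sub_attach()/ops.sub_detach() (return 0 from sub_attach to accept, -errno to reject), and a child asks to be attached by setting ops.sub_cgroup_id to a cgroup ID greater than 1. The actual per-task enable/disable work is still marked TODO in the hunks above, so the following is only a rough sketch of the intended shape, assuming the usual tools/sched_ext conventions (vmlinux.h, common.bpf.h, the BPF_STRUCT_OPS*() helpers and SEC(".struct_ops.link")); in practice the two sides are separate BPF schedulers and would each carry their normal enqueue/dispatch callbacks:

#include <scx/common.bpf.h>	/* assumed scx tooling header; pulls in vmlinux.h */

char _license[] SEC("license") = "GPL";

/* parent side: accept or reject child schedulers attaching below this one */
s32 BPF_STRUCT_OPS_SLEEPABLE(parent_sub_attach, struct scx_sub_attach_args *args)
{
	/* a real scheduler could base the decision on args->cgroup_path */
	return 0;
}

void BPF_STRUCT_OPS_SLEEPABLE(parent_sub_detach, struct scx_sub_detach_args *args)
{
	/* the child rooted at args->cgroup_path has finished detaching */
}

SEC(".struct_ops.link")
struct sched_ext_ops parent_ops = {
	.sub_attach	= (void *)parent_sub_attach,
	.sub_detach	= (void *)parent_sub_detach,
	/* ... normal scheduling callbacks ... */
	.name		= "parent_demo",
};

/* child side: request attachment as a sub-scheduler of a specific cgroup */
SEC(".struct_ops.link")
struct sched_ext_ops child_ops = {
	/* >1 selects sub-scheduler mode; a loader patches in the real cgroup ID */
	.sub_cgroup_id	= 0,
	/* ... normal scheduling callbacks ... */
	.name		= "child_demo",
};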
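sub_cgroup_id is resolved in the kernel with cgroup_get_from_id(), so it is the 64-bit cgroup ID rather than a file descriptor or a path. One way a loader might obtain it, sketched under the assumption of cgroup v2 on a 64-bit kernel, where the cgroup directory's inode number equals its cgroup ID (name_to_handle_at() is the more portable route); the cgroup path and skeleton field names below are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>

static uint64_t cgroup_id_from_path(const char *path)
{
	struct stat st;

	/* on 64-bit kernels the cgroup2 directory inode is the cgroup ID */
	if (stat(path, &st))
		return 0;
	return (uint64_t)st.st_ino;
}

int main(void)
{
	/* hypothetical cgroup; any ID > 1 selects sub-scheduler attachment */
	uint64_t cgid = cgroup_id_from_path("/sys/fs/cgroup/workload.slice");

	if (!cgid)
		return 1;
	printf("attaching sub-scheduler to cgroup id %llu\n",
	       (unsigned long long)cgid);
	/*
	 * With a libbpf skeleton the ID is written into the struct_ops map
	 * before load, roughly (names hypothetical):
	 *
	 *	skel->struct_ops.child_ops->sub_cgroup_id = cgid;
	 *
	 * after which the scheduler is loaded and attached as usual.
	 */
	return 0;
}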
