From 39be350127ec60a078edffe5b4915dafba4ba514 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2012 12:44:34 +0100 Subject: sched, block: Unify cache detection The block layer has some code trying to determine if two CPUs share a cache, the scheduler has a similar function. Expose the function used by the scheduler and make the block layer use it, thereby removing the block layers usage of CONFIG_SCHED* and topology bits. Signed-off-by: Peter Zijlstra Acked-by: Jens Axboe Link: http://lkml.kernel.org/r/1327579450.2446.95.camel@twins --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..d7c43227311d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1507,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -static inline int ttwu_share_cache(int this_cpu, int that_cpu) +bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1518,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -5754,7 +5754,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). + * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_id); -- cgit v1.2.3 From 4ec4412e1e91f44a3dcb97b6c9172a13fc78bac9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 12 Dec 2011 20:21:08 +0100 Subject: sched: Ensure cpu_power periodic update With a lot of small tasks, the softirq sched is nearly never called when no_hz is enabled. In this case load_balance() is mainly called with the newly_idle mode which doesn't update the cpu_power. Add a next_update field which ensure a maximum update period when there is short activity. Having stale cpu_power information can skew the load-balancing decisions, this is cured by the guaranteed update. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1323717668-2143-1-git-send-email-vincent.guittot@linaro.org --- include/linux/sched.h | 1 + kernel/sched/fair.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e1959568836..92313a3f6f77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -905,6 +905,7 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; + unsigned long next_update; /* * Number of busy cpus in this group. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..8e77a6bd597b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -215,6 +215,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, const struct sched_class fair_sched_class; +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -3776,6 +3778,11 @@ void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -3883,12 +3890,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); + if (local_group) { + if (idle != CPU_NEWLY_IDLE) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } else if (time_after_eq(jiffies, group->sgp->next_update)) + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -4945,8 +4955,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. -- cgit v1.2.3 From 30fd049afcfed50e022704036e8629d6bdfe84e6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 24 Jan 2012 22:33:56 +0600 Subject: sched: Remove sched_switch Currently we don't utilize the sched_switch field anymore. But, simply removing sched_switch field from the middle of the sched_stat output will break tools. So, to stay compatible we hardcode it to zero and remove the field from the scheduler data structures. Update the schedstat documentation accordingly. Signed-off-by: Rakib Mullick Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327422836.27181.5.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-stats.txt | 3 ++- kernel/sched/debug.c | 1 - kernel/sched/sched.h | 1 - kernel/sched/stats.c | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.txt index 1cd5d51bc761..8259b34a66ae 100644 --- a/Documentation/scheduler/sched-stats.txt +++ b/Documentation/scheduler/sched-stats.txt @@ -38,7 +38,8 @@ First field is a sched_yield() statistic: 1) # of times sched_yield() was called Next three are schedule() statistics: - 2) # of times we switched to the expired queue and reused it + 2) This field is a legacy array expiration count field used in the O(1) + scheduler. We kept it for ABI compatibility, but it is always set to zero. 3) # of times schedule() was called 4) # of times schedule() left the processor idle diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004b..09acaa15161d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); - P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..8a2c768d2f98 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct rq { unsigned int yld_count; /* schedule() stats */ - unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e190..903ffa9e8872 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", + "cpu%d %u 0 %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); -- cgit v1.2.3 From ed387b781ea6e14b78f449aa2ee4f270b60b01ac Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 31 Jan 2012 11:40:32 +0900 Subject: sched: Move SMP-only variable into the SMP section This also fixes the following compilation warning on !SMP: CC kernel/sched/fair.o kernel/sched/fair.c:218:36: warning: 'max_load_balance_interval' defined but not used [-Wunused-variable] Signed-off-by: Hiroshi Shimamoto Reviewed-by: Vincent Guittot Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2754A0.9090306@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e77a6bd597b..4ab60a24ea49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -215,8 +215,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, const struct sched_class fair_sched_class; -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /************************************************************** * CFS operations on generic schedulable entities: */ @@ -3086,6 +3084,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * Fair scheduling class load-balancing methods: */ +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. -- cgit v1.2.3 From 761b3ef50e1c2649cffbfa67a4dcb2dcdb7982ed Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Jan 2012 13:47:36 +0800 Subject: cgroup: remove cgroup_subsys argument from callbacks The argument is not used at all, and it's not necessary, because a specific callback handler of course knows which subsys it belongs to. Now only ->pupulate() takes this argument, because the handlers of this callback always call cgroup_add_file()/cgroup_add_files(). So we reduce a few lines of code, though the shrinking of object size is minimal. 16 files changed, 113 insertions(+), 162 deletions(-) text data bss dec hex filename 5486240 656987 7039960 13183187 c928d3 vmlinux.o.orig 5486170 656987 7039960 13183117 c9288d vmlinux.o Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- Documentation/cgroups/cgroups.txt | 26 +++++++++------------ block/blk-cgroup.c | 22 +++++++----------- include/linux/cgroup.h | 29 ++++++++++------------- include/net/sock.h | 7 +++--- include/net/tcp_memcontrol.h | 2 +- kernel/cgroup.c | 43 +++++++++++++++++------------------ kernel/cgroup_freezer.c | 11 ++++----- kernel/cpuset.c | 16 ++++--------- kernel/events/core.c | 13 ++++------- kernel/sched/core.c | 20 +++++++--------- mm/memcontrol.c | 48 ++++++++++++++++----------------------- net/core/netprio_cgroup.c | 10 ++++---- net/core/sock.c | 6 ++--- net/ipv4/tcp_memcontrol.c | 2 +- net/sched/cls_cgroup.c | 10 ++++---- security/device_cgroup.c | 10 ++++---- 16 files changed, 113 insertions(+), 162 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index a7c96ae5557c..8e74980ab385 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -558,8 +558,7 @@ Each subsystem may export the following methods. The only mandatory methods are create/destroy. Any others that are null are presumed to be successful no-ops. -struct cgroup_subsys_state *create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +struct cgroup_subsys_state *create(struct cgroup *cgrp) (cgroup_mutex held by caller) Called to create a subsystem state object for a cgroup. The @@ -574,7 +573,7 @@ identified by the passed cgroup object having a NULL parent (since it's the root of the hierarchy) and may be an appropriate place for initialization code. -void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +void destroy(struct cgroup *cgrp) (cgroup_mutex held by caller) The cgroup system is about to destroy the passed cgroup; the subsystem @@ -585,7 +584,7 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if @@ -593,8 +592,7 @@ there are not tasks in the cgroup. If pre_destroy() returns error code, rmdir() will fail with it. From this behavior, pre_destroy() can be called multiple times against a cgroup. -int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called prior to moving one or more tasks into a cgroup; if the @@ -615,8 +613,7 @@ fork. If this method returns 0 (success) then this should remain valid while the caller holds cgroup_mutex and it is ensured that either attach() or cancel_attach() will be called in future. -void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called when a task attach operation has failed after can_attach() has succeeded. @@ -625,23 +622,22 @@ function, so that the subsystem can implement a rollback. If not, not necessary. This will be called only about subsystems whose can_attach() operation have succeeded. The parameters are identical to can_attach(). -void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called after the task has been attached to the cgroup, to allow any post-attachment activity that requires memory allocations or blocking. The parameters are identical to can_attach(). -void fork(struct cgroup_subsy *ss, struct task_struct *task) +void fork(struct task_struct *task) Called when a task is forked into a cgroup. -void exit(struct cgroup_subsys *ss, struct task_struct *task) +void exit(struct task_struct *task) Called during task exit. -int populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +int populate(struct cgroup *cgrp) (cgroup_mutex held by caller) Called after creation of a cgroup to allow a subsystem to populate @@ -651,7 +647,7 @@ include/linux/cgroup.h for details). Note that although this method can return an error code, the error code is currently not always handled well. -void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) +void post_clone(struct cgroup *cgrp) (cgroup_mutex held by caller) Called during cgroup_create() to do any parameter @@ -659,7 +655,7 @@ initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up. -void bind(struct cgroup_subsys *ss, struct cgroup *root) +void bind(struct cgroup *root) (cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..1359d637831f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -28,13 +28,10 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); -static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, - struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup_taskset *); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup_taskset *); -static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *); +static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *); +static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *); +static void blkiocg_destroy(struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); /* for encoding cft->private value on file */ @@ -1548,7 +1545,7 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) ARRAY_SIZE(blkio_files)); } -static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +static void blkiocg_destroy(struct cgroup *cgroup) { struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); unsigned long flags; @@ -1598,8 +1595,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) kfree(blkcg); } -static struct cgroup_subsys_state * -blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) { struct blkio_cgroup *blkcg; struct cgroup *parent = cgroup->parent; @@ -1628,8 +1624,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; @@ -1648,8 +1643,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return ret; } -static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7da3e745b74c..501adb1b2f43 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -452,23 +452,18 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); */ struct cgroup_subsys { - struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, - struct cgroup *cgrp); - int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); - void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task); - int (*populate)(struct cgroup_subsys *ss, - struct cgroup *cgrp); - void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); + struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup *cgrp); + void (*destroy)(struct cgroup *cgrp); + int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*fork)(struct task_struct *task); + void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task); + int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp); + void (*post_clone)(struct cgroup *cgrp); + void (*bind)(struct cgroup *root); int subsys_id; int active; diff --git a/include/net/sock.h b/include/net/sock.h index bb972d254dff..705d1add19a1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -68,7 +68,7 @@ struct cgroup; struct cgroup_subsys; #ifdef CONFIG_NET int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss); -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss); +void mem_cgroup_sockets_destroy(struct cgroup *cgrp); #else static inline int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) @@ -76,7 +76,7 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; } static inline -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) { } #endif @@ -869,8 +869,7 @@ struct proto { */ int (*init_cgroup)(struct cgroup *cgrp, struct cgroup_subsys *ss); - void (*destroy_cgroup)(struct cgroup *cgrp, - struct cgroup_subsys *ss); + void (*destroy_cgroup)(struct cgroup *cgrp); struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif }; diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h index 3512082fa909..48410ff25c9e 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -13,7 +13,7 @@ struct tcp_memcontrol { struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg); int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); -void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); +void tcp_destroy_cgroup(struct cgroup *cgrp); unsigned long long tcp_max_memory(const struct mem_cgroup *memcg); void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx); #endif /* _TCP_MEMCG_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 43a224f167b5..865d89a580c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); + ret = ss->pre_destroy(cgrp); if (ret) break; } @@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1908,7 +1908,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1932,7 +1932,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1954,7 +1954,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -2067,7 +2067,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2104,7 +2104,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2128,7 +2128,7 @@ out_cancel_attach: if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -3756,7 +3756,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3770,7 +3770,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3804,7 +3804,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4028,7 +4028,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4117,7 +4117,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). */ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4135,7 +4135,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4233,7 +4233,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. */ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4509,7 +4509,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4611,7 +4611,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -5066,8 +5066,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5077,7 +5076,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, return &freezer->css; } -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) * a write to that file racing against an attach, and hence the * can_attach() result will remain valid until the attach completes. */ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup, struct cgroup_taskset *tset) { struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..5d575836dba6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1399,8 +1399,7 @@ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; @@ -1436,8 +1435,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; struct task_struct *task; @@ -1833,8 +1831,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * (and likewise for mems) to the new cgroup. Called with cgroup_mutex * held. */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup) { struct cgroup *parent, *child; struct cpuset *cs, *parent_cs; @@ -1857,13 +1854,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1902,7 +1896,7 @@ static struct cgroup_subsys_state *cpuset_create( * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a00..a5d1ee92b0d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6906,8 +6906,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) { struct perf_cgroup *jc; @@ -6924,8 +6923,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( return &jc->css; } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -6941,8 +6939,7 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -6950,8 +6947,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263e..ff12f7216062 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7530,8 +7530,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7548,15 +7547,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7574,7 +7572,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7584,8 +7582,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -7935,8 +7933,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { */ /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7966,8 +7963,7 @@ out: } /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dbff4dcde35..ae2f0a8ab761 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4580,10 +4580,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(cont, ss); }; -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { - mem_cgroup_sockets_destroy(cont, ss); + mem_cgroup_sockets_destroy(cont); } #else static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) @@ -4591,8 +4590,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { } #endif @@ -4884,7 +4882,7 @@ err_cleanup: } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +mem_cgroup_create(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -4946,20 +4944,18 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); return mem_cgroup_force_empty(memcg, false); } -static void mem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void mem_cgroup_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(ss, cont); + kmem_cgroup_destroy(cont); mem_cgroup_put(memcg); } @@ -5296,9 +5292,8 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; @@ -5336,9 +5331,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); } @@ -5453,9 +5447,8 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); struct mm_struct *mm = get_task_mm(p); @@ -5470,20 +5463,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd4826b75..22036ab732cf 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -23,9 +23,8 @@ #include #include -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); struct cgroup_subsys net_prio_subsys = { @@ -120,8 +119,7 @@ static void update_netdev_tables(void) rtnl_unlock(); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; int ret; @@ -145,7 +143,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; struct net_device *dev; diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f94..688037cb3b6e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -160,19 +160,19 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) out: list_for_each_entry_continue_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) - proto->destroy_cgroup(cgrp, ss); + proto->destroy_cgroup(cgrp); mutex_unlock(&proto_list_mutex); return ret; } -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) { struct proto *proto; mutex_lock(&proto_list_mutex); list_for_each_entry_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) - proto->destroy_cgroup(cgrp, ss); + proto->destroy_cgroup(cgrp); mutex_unlock(&proto_list_mutex); } #endif diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..e714c6834c90 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -94,7 +94,7 @@ create_files: } EXPORT_SYMBOL(tcp_init_cgroup); -void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) +void tcp_destroy_cgroup(struct cgroup *cgrp) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct cg_proto *cg_proto; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3a7f27..1afaa284fcd7 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -22,9 +22,8 @@ #include #include -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); struct cgroup_subsys net_cls_subsys = { @@ -51,8 +50,7 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) struct cgroup_cls_state, css); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_cls_state *cs; @@ -66,7 +64,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { kfree(cgrp_cls_state(cgrp)); } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 8b5b5d8612c6..c43a3323feea 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -61,8 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgrp, struct cgroup_taskset *set) +static int devcgroup_can_attach(struct cgroup *new_cgrp, + struct cgroup_taskset *set) { struct task_struct *task = cgroup_taskset_first(set); @@ -156,8 +156,7 @@ remove: /* * called from kernel/cgroup.c with cgroup_lock() held. */ -static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup) { struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; struct cgroup *parent_cgroup; @@ -195,8 +194,7 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, return &dev_cgroup->css; } -static void devcgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void devcgroup_destroy(struct cgroup *cgroup) { struct dev_cgroup *dev_cgroup; struct dev_whitelist_item *wh, *tmp; -- cgit v1.2.3 From 62f6536a630affe3176deb48554d27ee58b65077 Mon Sep 17 00:00:00 2001 From: "Nikunj A. Dadhania" Date: Fri, 17 Feb 2012 08:34:30 +0530 Subject: sched: Remove rcu_read_lock/unlock() from select_idle_sibling() select_idle_sibling() is called from select_task_rq_fair(), which already has the RCU read lock held. Signed-off-by: Nikunj A. Dadhania Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120217030409.11748.12491.stgit@abhimanyu Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ab60a24ea49..6ce9992926d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2670,8 +2670,6 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2693,8 +2691,6 @@ next: } while (sg != sd->groups); } done: - rcu_read_unlock(); - return target; } -- cgit v1.2.3 From de5bdff7a72acc281219be2b8edeeca1fd81c542 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 16 Feb 2012 14:52:21 +0900 Subject: sched: Make initial SCHED_RR timeslace DEF_TIMESLICE Current the initial SCHED_RR timeslice of init_task is HZ, which means 1s, and is not same as the default SCHED_RR timeslice DEF_TIMESLICE. Change that initial timeslice to the DEF_TIMESLICE. Signed-off-by: Hiroshi Shimamoto [ s/DEF_TIMESLICE/RR_TIMESLICE/g ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F3C9995.3010800@ct.jp.nec.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 +- include/linux/sched.h | 6 ++++++ kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 4 ---- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9c66b1ada9d7..f994d51f70f2 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,7 +149,7 @@ extern struct cred init_cred; }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ - .time_slice = HZ, \ + .time_slice = RR_TIMESLICE, \ .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5dba2ad52431..eb5de466f099 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1234,6 +1234,12 @@ struct sched_rt_entity { #endif }; +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define RR_TIMESLICE (100 * HZ / 1000) + struct rcu_node; enum perf_event_task_context { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f42ae7fb5ec5..f70206c2c802 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1972,7 +1972,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = DEF_TIMESLICE; + p->rt.time_slice = RR_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -2000,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return DEF_TIMESLICE; + return RR_TIMESLICE; else return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8a2c768d2f98..c0660a1a0cd1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running; /* * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. */ -#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. -- cgit v1.2.3 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- arch/Kconfig | 29 ++++--- arch/ia64/include/asm/paravirt.h | 6 +- arch/ia64/kernel/paravirt.c | 4 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +- arch/x86/include/asm/paravirt.h | 6 +- arch/x86/kernel/kvm.c | 4 +- arch/x86/kernel/paravirt.c | 4 +- arch/x86/kvm/mmu_audit.c | 8 +- include/linux/jump_label.h | 139 ++++++++++++++++++++++++---------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 +-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +- include/net/sock.h | 6 +- kernel/events/core.c | 16 ++-- kernel/jump_label.c | 128 ++++++++++++++++++------------- kernel/sched/core.c | 18 ++--- kernel/sched/fair.c | 8 +- kernel/sched/sched.h | 14 ++-- kernel/tracepoint.c | 20 ++--- net/core/dev.c | 24 +++--- net/core/net-sysfs.c | 4 +- net/core/sock.c | 4 +- net/core/sysctl_net_core.c | 4 +- net/ipv4/tcp_memcontrol.c | 6 +- net/netfilter/core.c | 6 +- 31 files changed, 298 insertions(+), 205 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'kernel/sched') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..5b448a74d0f7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -47,18 +47,29 @@ config KPROBES If in doubt, say "N". config JUMP_LABEL - bool "Optimize trace point call sites" + bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL help + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. + + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. + If it is detected that the compiler has support for "asm goto", - the kernel will compile trace point locations with just a - nop instruction. When trace points are enabled, the nop will - be converted to a jump to the trace function. This technique - lowers overhead and stress on the branch prediction of the - processor. - - On i386, options added to the compiler flags may increase - the size of the kernel slightly. + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. + + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. The update + of the condition is slower, but those are always very rare. + + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config OPTPROBES def_bool y diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 32551d304cd7..b149b88ea795 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu) pv_time_ops.init_missing_ticks_accounting(cpu); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline int paravirt_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 100868216c55..1b22f6de2932 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = { * pv_time_ops * time operations */ -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static int ia64_native_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 1881b316ca45..4d6d77ed9b9d 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,7 +20,7 @@ #define WORD_INSN ".word" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\tnop\n\t" "nop\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 938986e412f1..ae098c438f00 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 95a6cf2b5b67..6c32190dc73e 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -13,7 +13,7 @@ #define ASM_ALIGN ".balign 4" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index fc73a82366f8..5080d16a832f 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,7 +7,7 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a32b18ce6ead..3a16c1483b45 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -9,12 +9,12 @@ #define JUMP_LABEL_NOP_SIZE 5 -#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - JUMP_LABEL_INITIAL_NOP + STATIC_KEY_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74fb..c0180fd372d2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline u64 paravirt_steal_clock(int cpu) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void) static __init int activate_jump_labels(void) { if (has_steal_clock) { - jump_label_inc(¶virt_steal_enabled); + static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) - jump_label_inc(¶virt_steal_rq_enabled); + static_key_slow_inc(¶virt_steal_rq_enabled); } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..ada2f99388dd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..ea7b4fd34676 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea7..2172da2d9bb4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). + * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if (likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..7dfaae7846ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d0..29734be334c1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..0d21e6f1cf53 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 000000000000..27bd3f8a0857 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7e..bd96ecd0e05c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..dcde2d9268cd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #endif /* SOCK_REFCNT_DEBUG */ #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) -extern struct jump_label_key memcg_socket_limit_enabled; +extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) { return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); } -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled) +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 static inline struct cg_proto *parent_cg_proto(struct proto *proto, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c3b9de55f6b..5e0f8bb89b2b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -2769,7 +2769,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); } } @@ -4982,7 +4982,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5843,7 +5843,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 543782e7cdd2..bf9dcadbb53a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifdef HAVE_JUMP_LABEL @@ -29,10 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) +bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } +EXPORT_SYMBOL_GPL(static_key_enabled); static int jump_label_cmp(const void *a, const void *b) { @@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { @@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key, if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -200,8 +221,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES struct module *mod = __module_address((unsigned long)key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..112c6824476b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..423547ada38a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..b4cd6d8ea150 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include +# include # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..da7ce7f0e566 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,7 +134,7 @@ #include #include #include -#include +#include #include #include "net-sysfs.h" @@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct jump_label_key netstamp_needed __read_mostly; +static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call jump_label_dec() from irq context +/* We are not allowed to call static_key_slow_dec() from irq context * If net_disable_timestamp() is called from irq context, defer the - * jump_label_dec() calls. + * static_key_slow_dec() calls. */ static atomic_t netstamp_needed_deferred; #endif @@ -1457,12 +1457,12 @@ void net_enable_timestamp(void) if (deferred) { while (--deferred) - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); return; } #endif WARN_ON(in_interrupt()); - jump_label_inc(&netstamp_needed); + static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1474,19 +1474,19 @@ void net_disable_timestamp(void) return; } #endif - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp.tv64 = 0; - if (static_branch(&netstamp_needed)) + if (static_key_false(&netstamp_needed)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ - if (static_branch(&netstamp_needed)) { \ + if (static_key_false(&netstamp_needed)) { \ if ((COND) && !(SKB)->tstamp.tv64) \ __net_timestamp(SKB); \ } \ @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); -struct jump_label_key rps_needed __read_mostly; +struct static_key rps_needed __read_mostly; static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a1727cda03d7..495586232aa1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, spin_unlock(&rps_map_lock); if (map) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (old_map) { kfree_rcu(old_map, rcu); - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); } free_cpumask_var(mask); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..3a4e5817a2a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,7 +111,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -struct jump_label_key memcg_socket_limit_enabled; +struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d05559d4d9cd..0c2850874254 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (orig_sock_table) { - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..602fb305365f 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) net->ipv4.sysctl_tcp_mem[i]); if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - jump_label_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_socket_limit_enabled); return 0; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b301..e1b7e051332e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } -- cgit v1.2.3 From 42c62a589f1ccbf38a02cb732231f9c2fccc5ab0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2011 22:03:48 +0200 Subject: sched/rt: Keep period timer ticking when rt throttling is active When a runqueue is throttled we cannot disable the period timer because that timer is the only way to undo the throttling. We got stale throttling entries when a rq was throttled and then the global sysctl was disabled, which stopped the timer. Signed-off-by: Peter Zijlstra [ Added changelog ] Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-nuj34q52p6ro7szapuz84i0v@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f70206c2c802..6d1eb0be1870 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1; + int i, idle = 1, throttled = 0; const struct cpumask *span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } + if (rt_rq->rt_throttled) + throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + return idle; } @@ -884,7 +886,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit v1.2.3 From 7abc63b1bd412f7655b62ef3e35c3c11c5134636 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2011 22:03:48 +0200 Subject: sched/rt: Do not throttle when PI boosting When a runqueue has rt_runtime_us = 0 then the only way it can accumulate rt_time is via PI boosting. That causes the runqueue to be throttled and replenishing does not change anything due to rt_runtime_us = 0. So avoid that situation by clearing rt_time and skip the throttling alltogether. Signed-off-by: Peter Zijlstra [ Changelog ] Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-7x70cypsotjb4jvcor3edctk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6d1eb0be1870..7f7e7cdcb472 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -857,8 +857,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + if (likely(rt_b->rt_runtime)) { + rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; -- cgit v1.2.3 From c5491ea779793f977d282754db478157cc409d82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:09:35 +0100 Subject: sched/rt: Add schedule_preempt_disabled() Add helper to get rid of the ever repeating: preempt_enable_no_resched(); schedule(); preempt_disable(); patterns. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wxx7btox7coby6ifv5vzhzgp@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1a6398424fab..03dd224d0667 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -361,6 +361,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 91fe6a0e9098..73022395c00e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3246,6 +3246,18 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -- cgit v1.2.3 From ba74c1448f127649046615ec017bded7b2a76f29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 13:32:17 +0100 Subject: sched/rt: Document scheduler related skip-resched-check sites Create a distinction between scheduler related preempt_enable_no_resched() calls and the nearly one hundred other places in the kernel that do not want to reschedule, for one reason or another. This distinction matters for -rt, where the scheduler and the non-scheduler preempt models (and checks) are different. For upstream it's purely documentational. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-gs88fvx2mdv5psnzxnv575ke@git.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/idle.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- include/linux/preempt.h | 5 ++++- kernel/sched/core.c | 6 +++--- kernel/softirq.c | 4 ++-- 5 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 65035141552b..c97fc60c790c 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -102,7 +102,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); if (cpu_should_die()) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cpu_die(); } schedule_preempt_disabled(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index ab9a29268213..06b5b5fc20c7 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -106,7 +106,7 @@ void cpu_idle(void) #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(cpu)) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cpu_play_dead(); } #endif diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 58969b2a8a82..5a710b9c578e 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -48,12 +48,14 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define sched_preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) +#define preempt_enable_no_resched() sched_preempt_enable_no_resched() + #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -92,6 +94,7 @@ do { \ #else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) +#define sched_preempt_enable_no_resched() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 73022395c00e..643cc37fcb23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3220,7 +3220,7 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); if (need_resched()) goto need_resched; } @@ -3253,7 +3253,7 @@ EXPORT_SYMBOL(schedule); */ void __sched schedule_preempt_disabled(void) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); preempt_disable(); } @@ -4486,7 +4486,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 79b524767a24..f268369ebe1f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -353,7 +353,7 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); } /* @@ -759,7 +759,7 @@ static int run_ksoftirqd(void * __bind_cpu) if (local_softirq_pending()) __do_softirq(); local_irq_enable(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu); -- cgit v1.2.3 From 63b2001169e75cd71e917ec953fdab572e3f944a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2011 00:04:00 +0100 Subject: sched/wait: Add __wake_up_all_locked() API For code which protects the waitqueue itself with another lock it makes no sense to acquire the waitqueue lock for wakeup all. Provide __wake_up_all_locked(). This is an optimization on the vanilla kernel (to be used by the PCI code) and an important semantic distinction on -rt. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ux6m4b8jonb9inx8xafh77ds@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 5 +++-- kernel/sched/core.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/wait.h b/include/linux/wait.h index a9ce45e8501c..7d9a9e990ce6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -157,7 +157,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); @@ -170,7 +170,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) -#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) +#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) +#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 643cc37fcb23..820f7453fda9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3418,9 +3418,9 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) { - __wake_up_common(q, mode, 1, 0, NULL); + __wake_up_common(q, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -- cgit v1.2.3 From 1c4dd99bed5f6f70932bf8dacdd54d04a2619778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 20:07:38 +0200 Subject: sched/rt: Prevent idle task boosting Idle task boosting is a nono in general. There is one exception, when PREEMPT_RT and NOHZ is active: The idle task calls get_next_timer_interrupt() and holds the timer wheel base->lock on the CPU and another CPU wants to access the timer (probably to cancel it). We can safely ignore the boosting request, as the idle CPU runs this code with interrupts disabled and will complete the lock protected section without being interrupted. So there is no real need to boost. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-755rvsosz7sdzot12a3gbha6@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 820f7453fda9..c2bbdecaa27d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3779,6 +3779,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -3802,11 +3820,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; -- cgit v1.2.3 From 3c7d51843b03a6839e9ec7cda724e54d2319a63a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 20:46:52 +0200 Subject: sched/rt: Do not submit new work when PI-blocked When we are PI-blocked then we want to get things done ASAP. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-vw8et3445km5b8mpihf4trae@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ kernel/sched/core.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 03dd224d0667..c628a9151437 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,12 +2082,20 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} #endif extern bool yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c2bbdecaa27d..25e06a5e048b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3227,7 +3227,7 @@ need_resched: static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + if (!tsk->state || tsk_is_pi_blocked(tsk)) return; /* * If we are going to sleep and we have plugged IO queued, -- cgit v1.2.3 From 8e45cb545d98bc58e75b7de89ec8d3e5c8459ee6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Feb 2012 12:47:19 +0100 Subject: sched: Move load-balancing arguments into helper struct Passing large sets of similar arguments all around the load-balancer gets tiresom when you want to modify something. Stick them all in a helper structure and pass the structure around. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-5slqz0vhsdzewrfk9eza1aon@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 177 +++++++++++++++++++++++++++------------------------- 1 file changed, 93 insertions(+), 84 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79e9e13c31ab..55b1f117419a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3135,13 +3135,25 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) #define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ #define LBF_ABORT 0x10 +struct lb_env { + struct sched_domain *sd; + + int this_cpu; + struct rq *this_rq; + + struct rq *busiest_rq; + struct cfs_rq *busiest_cfs_rq; + + enum cpu_idle_type idle; + unsigned long max_load_move; + unsigned int flags; +}; + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; /* @@ -3150,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *lb_flags &= ~LBF_ALL_PINNED; + env->flags &= ~LBF_ALL_PINNED; - if (task_running(rq, p)) { + if (task_running(env->busiest_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3167,12 +3179,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock_task, sd); + tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd); if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif @@ -3193,31 +3205,27 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * * Called with both runqueues locked. */ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; struct cfs_rq *cfs_rq; - int pinned = 0; - for_each_leaf_cfs_rq(busiest, cfs_rq) { + for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (throttled_lb_pair(task_group(p), - busiest->cpu, this_cpu)) + env->busiest_rq->cpu, env->this_cpu)) break; - if (!can_migrate_task(p, busiest, this_cpu, - sd, idle, &pinned)) + if (!can_migrate_task(p, env)) continue; - pull_task(busiest, p, this_rq, this_cpu); + pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); /* * Right now, this is only the second place pull_task() * is called, so we can safely collect pull_task() * stats here rather than inside pull_task(). */ - schedstat_inc(sd, lb_gained[idle]); + schedstat_inc(env->sd, lb_gained[env->idle]); return 1; } } @@ -3225,31 +3233,26 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, - struct cfs_rq *busiest_cfs_rq) +static unsigned long balance_tasks(struct lb_env *env) { int loops = 0, pulled = 0; - long rem_load_move = max_load_move; + long rem_load_move = env->max_load_move; struct task_struct *p, *n; - if (max_load_move == 0) + if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; + env->flags |= LBF_NEED_BREAK; break; } if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) + !can_migrate_task(p, env)) continue; - pull_task(busiest, p, this_rq, this_cpu); + pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); pulled++; rem_load_move -= p->se.load.weight; @@ -3259,8 +3262,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) { + env->flags |= LBF_ABORT; break; } #endif @@ -3278,9 +3281,9 @@ out: * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ - schedstat_add(sd, lb_gained[idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], pulled); - return max_load_move - rem_load_move; + return env->max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3363,40 +3366,39 @@ static void update_h_load(long cpu) walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long load_balance_fair(struct lb_env *env) { - long rem_load_move = max_load_move; - struct cfs_rq *busiest_cfs_rq; + unsigned long max_load_move = env->max_load_move; + long rem_load_move = env->max_load_move; rcu_read_lock(); - update_h_load(cpu_of(busiest)); + update_h_load(cpu_of(env->busiest_rq)); - for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; + for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) { + unsigned long busiest_h_load = env->busiest_cfs_rq->h_load; + unsigned long busiest_weight = env->busiest_cfs_rq->load.weight; u64 rem_load, moved_load; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) break; /* * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight || - throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) + if (!env->busiest_cfs_rq->task_weight) + continue; + + if (throttled_lb_pair(env->busiest_cfs_rq->tg, + cpu_of(env->busiest_rq), + env->this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; rem_load = div_u64(rem_load, busiest_h_load + 1); - moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, - busiest_cfs_rq); + env->max_load_move = rem_load; + moved_load = balance_tasks(env); if (!moved_load) continue; @@ -3416,15 +3418,10 @@ static inline void update_shares(int cpu) { } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long load_balance_fair(struct lb_env *env) { - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, lb_flags, - &busiest->cfs); + env->busiest_cfs_rq = &env->busiest_rq->cfs; + return balance_tasks(env); } #endif @@ -3435,21 +3432,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * * Called with both runqueues locked. */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static int move_tasks(struct lb_env *env) { + unsigned long max_load_move = env->max_load_move; unsigned long total_load_moved = 0, load_moved; do { - load_moved = load_balance_fair(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, lb_flags); - + env->max_load_move = max_load_move - total_load_moved; + load_moved = load_balance_fair(env); total_load_moved += load_moved; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) break; #ifdef CONFIG_PREEMPT @@ -3458,8 +3451,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) { + env->flags |= LBF_ABORT; break; } #endif @@ -4459,13 +4452,20 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, lb_flags = 0, active_balance = 0; + int ld_moved, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct lb_env env = { + .sd = sd, + .this_cpu = this_cpu, + .this_rq = this_rq, + .idle = idle, + }; + cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); @@ -4500,11 +4500,13 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; + env.max_load_move = imbalance; + env.busiest_rq = busiest; + local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); + ld_moved = move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4514,18 +4516,18 @@ redo: if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (lb_flags & LBF_ABORT) + if (env.flags & LBF_ABORT) goto out_balanced; - if (lb_flags & LBF_NEED_BREAK) { - lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (lb_flags & LBF_ABORT) + if (env.flags & LBF_NEED_BREAK) { + env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (env.flags & LBF_ABORT) goto out_balanced; goto redo; } /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { + if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4555,7 +4557,7 @@ redo: tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4608,7 +4610,7 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && + if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4718,10 +4720,17 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .this_cpu = target_cpu, + .this_rq = target_rq, + .busiest_rq = busiest_rq, + .idle = CPU_IDLE, + }; + schedstat_inc(sd, alb_count); - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) + if (move_one_task(&env)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); -- cgit v1.2.3 From ddcdf6e7d9919d139031fa2a6addd9544a9a833e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Feb 2012 19:27:40 +0100 Subject: sched: Rename load-balancing fields s/env->this_/env->dst_/g s/env->busiest_/env->src_/g s/pull_task/move_task/g Makes everything clearer. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-0yvgms8t8x962drpvl0fu0kk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 58 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55b1f117419a..233d05171bf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2918,7 +2918,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as pull_task(), in which we + * This is possible from callers such as move_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -3084,17 +3084,37 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 + +struct lb_env { + struct sched_domain *sd; + + int src_cpu; + struct rq *src_rq; + struct cfs_rq *src_cfs_rq; + + int dst_cpu; + struct rq *dst_rq; + + enum cpu_idle_type idle; + unsigned long max_load_move; + unsigned int flags; +}; + /* - * pull_task - move a task from a remote runqueue to the local runqueue. + * move_task - move a task from one runqueue to another runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void move_task(struct task_struct *p, struct lb_env *env) { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); } /* @@ -3129,26 +3149,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 - -struct lb_env { - struct sched_domain *sd; - - int this_cpu; - struct rq *this_rq; - - struct rq *busiest_rq; - struct cfs_rq *busiest_cfs_rq; - - enum cpu_idle_type idle; - unsigned long max_load_move; - unsigned int flags; -}; - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -3162,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->busiest_rq, p)) { + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3179,7 +3179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS @@ -3210,20 +3210,20 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) { + for_each_leaf_cfs_rq(env->src_rq, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (throttled_lb_pair(task_group(p), - env->busiest_rq->cpu, env->this_cpu)) + env->src_cpu, env->dst_cpu)) break; if (!can_migrate_task(p, env)) continue; - pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); + move_task(p, env); /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return 1; @@ -3242,7 +3242,7 @@ static unsigned long balance_tasks(struct lb_env *env) if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) { + list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) { env->flags |= LBF_NEED_BREAK; break; @@ -3252,7 +3252,7 @@ static unsigned long balance_tasks(struct lb_env *env) !can_migrate_task(p, env)) continue; - pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); + move_task(p, env); pulled++; rem_load_move -= p->se.load.weight; @@ -3277,9 +3277,9 @@ static unsigned long balance_tasks(struct lb_env *env) } out: /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). */ schedstat_add(env->sd, lb_gained[env->idle], pulled); @@ -3372,11 +3372,11 @@ static unsigned long load_balance_fair(struct lb_env *env) long rem_load_move = env->max_load_move; rcu_read_lock(); - update_h_load(cpu_of(env->busiest_rq)); + update_h_load(cpu_of(env->src_rq)); - for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) { - unsigned long busiest_h_load = env->busiest_cfs_rq->h_load; - unsigned long busiest_weight = env->busiest_cfs_rq->load.weight; + for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) { + unsigned long busiest_h_load = env->src_cfs_rq->h_load; + unsigned long busiest_weight = env->src_cfs_rq->load.weight; u64 rem_load, moved_load; if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) @@ -3385,12 +3385,12 @@ static unsigned long load_balance_fair(struct lb_env *env) /* * empty group or part of a throttled hierarchy */ - if (!env->busiest_cfs_rq->task_weight) + if (!env->src_cfs_rq->task_weight) continue; - if (throttled_lb_pair(env->busiest_cfs_rq->tg, - cpu_of(env->busiest_rq), - env->this_cpu)) + if (throttled_lb_pair(env->src_cfs_rq->tg, + cpu_of(env->src_rq), + env->dst_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; @@ -3420,7 +3420,7 @@ static inline void update_shares(int cpu) static unsigned long load_balance_fair(struct lb_env *env) { - env->busiest_cfs_rq = &env->busiest_rq->cfs; + env->src_cfs_rq = &env->src_rq->cfs; return balance_tasks(env); } #endif @@ -3451,7 +3451,7 @@ static int move_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) { + if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) { env->flags |= LBF_ABORT; break; } @@ -4461,8 +4461,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct lb_env env = { .sd = sd, - .this_cpu = this_cpu, - .this_rq = this_rq, + .dst_cpu = this_cpu, + .dst_rq = this_rq, .idle = idle, }; @@ -4502,7 +4502,8 @@ redo: */ env.flags |= LBF_ALL_PINNED; env.max_load_move = imbalance; - env.busiest_rq = busiest; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; local_irq_save(flags); double_rq_lock(this_rq, busiest); @@ -4722,9 +4723,10 @@ static int active_load_balance_cpu_stop(void *data) if (likely(sd)) { struct lb_env env = { .sd = sd, - .this_cpu = target_cpu, - .this_rq = target_rq, - .busiest_rq = busiest_rq, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, .idle = CPU_IDLE, }; -- cgit v1.2.3 From 367456c756a6b84f493ca9cc5b17b1f5d38ef466 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Feb 2012 21:49:09 +0100 Subject: sched: Ditch per cgroup task lists for load-balancing Per cgroup load-balance has numerous problems, chief amongst them that there is no real sane order in them. So stop pretending it makes sense and enqueue all tasks on a single list. This also allows us to more easily fix the fwd progress issue uncovered by the lock-break stuff. Rotate the list on failure to migreate and limit the total iterations to nr_running (which with releasing the lock isn't strictly accurate but close enough). Also add a filter that skips very light tasks on the first attempt around the list, this attempts to avoid shooting whole cgroups around without affecting over balance. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-tx8yqydc7eimgq7i4rkc3a4g@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 + kernel/sched/fair.c | 176 ++++++++++++++++++++++----------------------------- kernel/sched/sched.h | 10 +-- 3 files changed, 80 insertions(+), 109 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25e06a5e048b..95545126be1c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6959,6 +6959,9 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + + INIT_LIST_HEAD(&rq->cfs_tasks); + rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_flags = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 233d05171bf5..a0424fc4cc54 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } +#ifdef CONFIG_SMP + if (entity_is_task(se)) + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +#endif cfs_rq->nr_running++; } @@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); + if (entity_is_task(se)) list_del_init(&se->group_node); - } cfs_rq->nr_running--; } @@ -3085,17 +3070,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 +#define LBF_NEED_BREAK 0x02 +#define LBF_ABORT 0x04 struct lb_env { struct sched_domain *sd; int src_cpu; struct rq *src_rq; - struct cfs_rq *src_cfs_rq; int dst_cpu; struct rq *dst_rq; @@ -3103,6 +3085,10 @@ struct lb_env { enum cpu_idle_type idle; unsigned long max_load_move; unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; }; /* @@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; - struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(env->src_rq, cfs_rq) { - list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), - env->src_cpu, env->dst_cpu)) - break; + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) + continue; - if (!can_migrate_task(p, env)) - continue; + if (!can_migrate_task(p, env)) + continue; - move_task(p, env); - /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). - */ - schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; - } + move_task(p, env); + /* + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; } - return 0; } +static unsigned long task_h_load(struct task_struct *p); + static unsigned long balance_tasks(struct lb_env *env) { - int loops = 0, pulled = 0; long rem_load_move = env->max_load_move; struct task_struct *p, *n; + unsigned long load; + int pulled = 0; if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) { + env->flags |= LBF_ABORT; + break; + } + /* take a beather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sysctl_sched_nr_migrate; env->flags |= LBF_NEED_BREAK; break; } - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, env)) - continue; + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + goto next; + + load = task_h_load(p); + if (load < 16 && !env->sd->nr_balance_failed) + goto next; + + if ((load * 2) > rem_load_move) + goto next; + + if (!can_migrate_task(p, env)) + goto next; move_task(p, env); pulled++; - rem_load_move -= p->se.load.weight; + rem_load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env) */ if (rem_load_move <= 0) break; + + continue; +next: + list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); } out: /* @@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + rcu_read_unlock(); } -static unsigned long load_balance_fair(struct lb_env *env) +static unsigned long task_h_load(struct task_struct *p) { - unsigned long max_load_move = env->max_load_move; - long rem_load_move = env->max_load_move; - - rcu_read_lock(); - update_h_load(cpu_of(env->src_rq)); - - for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) { - unsigned long busiest_h_load = env->src_cfs_rq->h_load; - unsigned long busiest_weight = env->src_cfs_rq->load.weight; - u64 rem_load, moved_load; - - if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - - /* - * empty group or part of a throttled hierarchy - */ - if (!env->src_cfs_rq->task_weight) - continue; - - if (throttled_lb_pair(env->src_cfs_rq->tg, - cpu_of(env->src_rq), - env->dst_cpu)) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - env->max_load_move = rem_load; - - moved_load = balance_tasks(env); - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, busiest_weight + 1); + struct cfs_rq *cfs_rq = task_cfs_rq(p); + unsigned long load; - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); + load = p->se.load.weight; + load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - return max_load_move - rem_load_move; + return load; } #else static inline void update_shares(int cpu) { } -static unsigned long load_balance_fair(struct lb_env *env) +static inline void update_h_load(long cpu) { - env->src_cfs_rq = &env->src_rq->cfs; - return balance_tasks(env); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.load.weight; } #endif @@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env) unsigned long max_load_move = env->max_load_move; unsigned long total_load_moved = 0, load_moved; + update_h_load(cpu_of(env->src_rq)); do { env->max_load_move = max_load_move - total_load_moved; - load_moved = load_balance_fair(env); + load_moved = balance_tasks(env); total_load_moved += load_moved; if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) @@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, + .loop_break = sysctl_sched_nr_migrate, }; cpumask_copy(cpus, cpu_active_mask); @@ -4504,6 +4480,7 @@ redo: env.max_load_move = imbalance; env.src_cpu = busiest->cpu; env.src_rq = busiest; + env.loop_max = busiest->nr_running; local_irq_save(flags); double_rq_lock(this_rq, busiest); @@ -4521,9 +4498,7 @@ redo: goto out_balanced; if (env.flags & LBF_NEED_BREAK) { - env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (env.flags & LBF_ABORT) - goto out_balanced; + env.flags &= ~LBF_NEED_BREAK; goto redo; } @@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c0660a1a0cd1..753bdd567416 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -212,9 +212,6 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct list_head tasks; - struct list_head *balance_iterator; - /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). @@ -241,11 +238,6 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - /* * h_load = weight * f(tg) * @@ -420,6 +412,8 @@ struct rq { int cpu; int online; + struct list_head cfs_tasks; + u64 rt_avg; u64 age_stamp; u64 idle_stamp; -- cgit v1.2.3 From 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Feb 2012 17:41:27 +0900 Subject: sched: Clean up parameter passing of proc_sched_autogroup_set_nice() Pass nice as a value to proc_sched_autogroup_set_nice(). No side effect is expected, and the variable err will be overwritten with the return value. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F45FBB7.5090607@ct.jp.nec.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 3 +-- include/linux/sched.h | 2 +- kernel/sched/auto_group.c | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel/sched') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd49b02..965d4bde3a3b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1310,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = nice; - err = proc_sched_autogroup_set_nice(p, &err); + err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; diff --git a/include/linux/sched.h b/include/linux/sched.h index c628a9151437..c298fb9cf5ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2065,7 +2065,7 @@ extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); #endif #else static inline void sched_autogroup_create_attach(struct task_struct *p) { } diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e7..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (*nice < -20 || *nice > 19) + if (nice < -20 || nice > 19) return -EINVAL; - err = security_task_setnice(current, *nice); + err = security_task_setnice(current, nice); if (err) return err; - if (*nice < 0 && !can_nice(current, *nice)) + if (nice < 0 && !can_nice(current, nice)) return -EPERM; /* this is a heavy operation taking global locks.. */ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); if (!err) - ag->nice = *nice; + ag->nice = nice; up_write(&ag->lock); autogroup_kref_put(ag); -- cgit v1.2.3 From 5d6523ebd2f67de9d23285aad7f3910e7b0aee83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 10 Mar 2012 00:07:36 +0100 Subject: sched: Fix load-balance wreckage Commit 367456c ("sched: Ditch per cgroup task lists for load-balancing") completely wrecked load-balancing due to a few silly mistakes. Correct those and remove more pointless code. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 110 +++++++++++++++++++--------------------------------- 1 file changed, 39 insertions(+), 71 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a0424fc4cc54..def17aa302d5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 struct lb_env { struct sched_domain *sd; @@ -3083,7 +3082,7 @@ struct lb_env { struct rq *dst_rq; enum cpu_idle_type idle; - unsigned long max_load_move; + long load_move; unsigned int flags; unsigned int loop; @@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); -static unsigned long balance_tasks(struct lb_env *env) +/* + * move_tasks tries to move up to load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct lb_env *env) { - long rem_load_move = env->max_load_move; - struct task_struct *p, *n; + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; unsigned long load; int pulled = 0; - if (env->max_load_move == 0) - goto out; + if (env->load_move <= 0) + return 0; + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { env->loop++; /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) { - env->flags |= LBF_ABORT; + if (env->loop > env->loop_max) break; - } - /* take a beather every nr_migrate tasks */ + + /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { env->loop_break += sysctl_sched_nr_migrate; env->flags |= LBF_NEED_BREAK; break; } - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, - env->dst_cpu)) + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) goto next; load = task_h_load(p); + if (load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load * 2) > rem_load_move) + if ((load / 2) > env->load_move) goto next; if (!can_migrate_task(p, env)) @@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env) move_task(p, env); pulled++; - rem_load_move -= load; + env->load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE) { - env->flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) break; - } #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ - if (rem_load_move <= 0) + if (env->load_move <= 0) break; continue; next: - list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); + list_move_tail(&p->se.group_node, tasks); } -out: + /* * Right now, this is one of only two places move_task() is called, * so we can safely collect move_task() stats here rather than @@ -3289,7 +3294,7 @@ out: */ schedstat_add(env->sd, lb_gained[env->idle], pulled); - return env->max_load_move - rem_load_move; + return pulled; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct lb_env *env) -{ - unsigned long max_load_move = env->max_load_move; - unsigned long total_load_moved = 0, load_moved; - - update_h_load(cpu_of(env->src_rq)); - do { - env->max_load_move = max_load_move - total_load_moved; - load_moved = balance_tasks(env); - total_load_moved += load_moved; - - if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. - */ - if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) { - env->flags |= LBF_ABORT; - break; - } -#endif - } while (load_moved && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - /********** Helpers for find_busiest_group ************************/ /* * sd_lb_stats - Structure to store the statistics of a sched_domain @@ -4477,31 +4445,31 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.max_load_move = imbalance; + env.load_move = imbalance; env.src_cpu = busiest->cpu; env.src_rq = busiest; env.loop_max = busiest->nr_running; +more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(&env); + if (!env.loop) + update_h_load(env.src_cpu); + ld_moved += move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * some other cpu did the load balance for us. */ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (env.flags & LBF_ABORT) - goto out_balanced; - - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto redo; - } - /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); -- cgit v1.2.3 From 5fbd036b552f633abb394a319f7c62a5c86a9cd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2011 17:09:22 +0100 Subject: sched: Cleanup cpu_active madness Stepan found: CPU0 CPUn _cpu_up() __cpu_up() boostrap() notify_cpu_starting() set_cpu_online() while (!cpu_active()) cpu_relax() smp_call_function(.wait=1) /* we find cpu_online() is true */ arch_send_call_function_ipi_mask() /* wait-forever-more */ local_irq_enable() cpu_notify(CPU_ONLINE) sched_cpu_active() set_cpu_active() Now the purpose of cpu_active is mostly with bringing down a cpu, where we mark it !active to avoid the load-balancer from moving tasks to it while we tear down the cpu. This is required because we only update the sched_domain tree after we brought the cpu-down. And this is needed so that some tasks can still run while we bring it down, we just don't want new tasks to appear. On cpu-up however the sched_domain tree doesn't yet include the new cpu, so its invisible to the load-balancer, regardless of the active state. So instead of setting the active state after we boot the new cpu (and consequently having to wait for it before enabling interrupts) set the cpu active before we set it online and avoid the whole mess. Reported-by: Stepan Moskovchenko Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1323965362.18942.71.camel@twins Signed-off-by: Ingo Molnar --- arch/arm/kernel/smp.c | 7 ------- arch/hexagon/kernel/smp.c | 2 -- arch/s390/kernel/smp.c | 6 ------ arch/x86/kernel/smpboot.c | 13 ------------- kernel/sched/core.c | 2 +- 5 files changed, 1 insertion(+), 29 deletions(-) (limited to 'kernel/sched') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index cdeb727527d3..d616ed51e7a7 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -295,13 +295,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); - while (!cpu_active(cpu)) - cpu_relax(); - - /* - * cpu_active bit is set, so it's safe to enalbe interrupts - * now. - */ local_irq_enable(); local_fiq_enable(); diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index c871a2cffaef..0123c63e9a3a 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -179,8 +179,6 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); - while (!cpumask_test_cpu(cpu, cpu_active_mask)) - cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2398ce6b15ae..b0e28c47ab83 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -550,12 +550,6 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ - /* - * Wait until the cpu which brought this one up marked it - * active before enabling interrupts. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..58f78165d308 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); - /* - * Wait until the cpu which brought this one up marked it - * online before enabling interrupts. If we don't do that then - * we can end up waking up the softirq thread before this cpu - * reached the active state, which makes the scheduler unhappy - * and schedule the softirq thread on the wrong cpu. This is - * only observable with forced threaded interrupts, but in - * theory it could also happen w/o them. It's just way harder - * to achieve. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); - /* enable local interrupts */ local_irq_enable(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95545126be1c..b1ccce819ce2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5410,7 +5410,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3 From 554cecaf733623b327eef9652b65965eb1081b81 Mon Sep 17 00:00:00 2001 From: Diwakar Tundlam Date: Wed, 7 Mar 2012 14:44:26 -0800 Subject: sched/nohz: Correctly initialize 'next_balance' in 'nohz' idle balancer The 'next_balance' field of 'nohz' idle balancer must be initialized to jiffies. Since jiffies is initialized to negative 300 seconds the 'nohz' idle balancer does not run for the first 300s (5mins) after bootup. If no new processes are spawed or no idle cycles happen, the load on the cpus will remain unbalanced for that duration. Signed-off-by: Diwakar Tundlam Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1DD7BFEDD3147247B1355BEFEFE4665237994F30EF@HQMAIL04.nvidia.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index def17aa302d5..11f3979bad2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5571,6 +5571,7 @@ __init void init_sched_fair_class(void) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ + nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif -- cgit v1.2.3 From 3ccf3e8306156a28213adc720aba807e9a901ad5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Feb 2012 10:47:00 +0100 Subject: printk/sched: Introduce special printk_sched() for those awkward moments There's a few awkward printk()s inside of scheduler guts that people prefer to keep but really are rather deadlock prone. Fudge around it by storing the text in a per-cpu buffer and poll it using the existing printk_tick() handler. This will drop output when its more frequent than once a tick, however only the affinity thing could possible go that fast and for that just one should suffice to notify the admin he's done something silly.. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-wua3lmkt3dg8nfts66o6brne@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/printk.h | 10 ++++++++++ kernel/printk.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched/core.c | 2 +- kernel/sched/rt.c | 8 +++++++- 4 files changed, 55 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/printk.h b/include/linux/printk.h index f0e22f75143f..1f77a4174ee0 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -100,6 +100,11 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); +/* + * Special printk facility for scheduler use only, _DO_NOT_USE_ ! + */ +__printf(1, 2) __cold int printk_sched(const char *fmt, ...); + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -127,6 +132,11 @@ int printk(const char *s, ...) { return 0; } +static inline __printf(1, 2) __cold +int printk_sched(const char *s, ...) +{ + return 0; +} static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..7ca7ba591e21 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1208,13 +1208,47 @@ int is_console_locked(void) return console_locked; } +/* + * Delayed printk facility, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +int printk_sched(const char *fmt, ...) +{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + local_irq_restore(flags); + + return r; +} void printk_tick(void) { if (__this_cpu_read(printk_pending)) { - __this_cpu_write(printk_pending, 0); - wake_up_interruptible(&log_wait); + int pending = __this_cpu_xchg(printk_pending, 0); + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); } } @@ -1228,7 +1262,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - this_cpu_write(printk_pending, 1); + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b1ccce819ce2..8781cec7c3e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1284,7 +1284,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + printk_sched("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7f7e7cdcb472..b60dad720173 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -864,8 +864,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + + if (!once) { + once = true; + printk_sched("sched: RT throttling activated\n"); + } } else { /* * In case we did anyway, make it go away, -- cgit v1.2.3 From 8e3fabfde445a872c8aec2296846badf24d7c8b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Mar 2012 18:54:26 +0100 Subject: sched: Update yield() docs Suggested-by: Joe Perches Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1331056466.11248.327.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8781cec7c3e6..47614a5cdd47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4577,8 +4577,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! */ void __sched yield(void) { -- cgit v1.2.3 From c308b56b5398779cd3da0f62ab26b0453494c3d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Mar 2012 15:04:46 +0100 Subject: sched: Fix nohz load accounting -- again! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various people reported nohz load tracking still being wrecked, but Doug spotted the actual problem. We fold the nohz remainder in too soon, causing us to loose samples and under-account. So instead of playing catch-up up-front, always do a single load-fold with whatever state we encounter and only then fold the nohz remainder and play catch-up. Reported-by: Doug Smythies Reported-by: LesÃ…=82aw Kope=C4=87 Reported-by: Aman Gupta Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-4v31etnhgg9kwd6ocgx3rxl8@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 47614a5cdd47..e3ccc13c4caa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2266,13 +2266,10 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { long delta, active, n; - if (time_before(jiffies, calc_load_update)) - return; - /* * If we crossed a calc_load_update boundary, make sure to fold * any pending idle changes, the respective CPUs might have @@ -2284,31 +2281,25 @@ static void calc_global_nohz(unsigned long ticks) atomic_long_add(delta, &calc_load_tasks); /* - * If we were idle for multiple load cycles, apply them. + * It could be the one fold was all it took, we done! */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; + if (time_before(jiffies, calc_load_update + 10)) + return; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - calc_load_update += n * LOAD_FREQ; - } + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. - * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. - */ + calc_load_update += n * LOAD_FREQ; } #else void calc_load_account_idle(struct rq *this_rq) @@ -2320,7 +2311,7 @@ static inline long calc_load_fold_idle(void) return 0; } -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { } #endif @@ -2348,8 +2339,6 @@ void calc_global_load(unsigned long ticks) { long active; - calc_global_nohz(ticks); - if (time_before(jiffies, calc_load_update + 10)) return; @@ -2361,6 +2350,16 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; + + /* + * Account one period with whatever state we found before + * folding in the nohz state and ageing the entire idle period. + * + * This avoids loosing a sample when we go idle between + * calc_load_account_active() (10 ticks ago) and now and thus + * under-accounting. + */ + calc_global_nohz(); } /* -- cgit v1.2.3