From e2b245f89ee3f5b03fb42d843a79a58cf4773181 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Mon, 1 Aug 2011 11:03:28 +0200 Subject: sched: Remove rq->avg_load_per_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a2d47777 ("sched: fix stale value in average load per task") the variable rq->avg_load_per_task is no longer required. Remove it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1312189408-17172-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..b1e8f0eca1f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -520,8 +520,6 @@ struct rq { int cpu; int online; - unsigned long avg_load_per_task; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -1569,11 +1567,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) unsigned long nr_running = ACCESS_ONCE(rq->nr_running); if (nr_running) - rq->avg_load_per_task = rq->load.weight / nr_running; - else - rq->avg_load_per_task = 0; + return rq->load.weight / nr_running; - return rq->avg_load_per_task; + return 0; } #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From c350a04efd1c89cd256b2abc8f07a21d0d53ff24 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 27 Jul 2011 17:14:55 +0200 Subject: sched: fix broken SCHED_RESET_ON_FORK handling Setting child->prio = current->normal_prio _after_ SCHED_RESET_ON_FORK has been handled for an RT parent gives birth to a deranged mutant child with non-RT policy, but RT prio and sched_class. Move PI leakage protection up, always set priorities and weight, and if the child is leaving RT class, reset rt_priority to the proper value. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1311779695.8691.2.camel@marge.simson.net Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b1e8f0eca1f9..cf427bb2b65e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,20 +2843,24 @@ void sched_fork(struct task_struct *p) */ p->state = TASK_RUNNING; + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + if (task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; - p->normal_prio = p->static_prio; - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - set_load_weight(p); - } + p->rt_priority = 0; + } else if (PRIO_TO_NICE(p->static_prio) < 0) + p->static_prio = NICE_TO_PRIO(0); + + p->prio = p->normal_prio = __normal_prio(p); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -2865,11 +2869,6 @@ void sched_fork(struct task_struct *p) p->sched_reset_on_fork = 0; } - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; -- cgit v1.2.3 From 953bfcd10e6f3697233e8e5128c611d275da39c1 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:27 -0700 Subject: sched: Implement hierarchical task accounting for SCHED_OTHER Introduce hierarchical task accounting for the group scheduling case in CFS, as well as promoting the responsibility for maintaining rq->nr_running to the scheduling classes. The primary motivation for this is that with scheduling classes supporting bandwidth throttling it is possible for entities participating in throttled sub-trees to not have root visible changes in rq->nr_running across activate and de-activate operations. This in turn leads to incorrect idle and weight-per-task load balance decisions. This also allows us to make a small fixlet to the fastpath in pick_next_task() under group scheduling. Note: this issue also exists with the existing sched_rt throttling mechanism. This patch does not address that. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184756.878333391@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- kernel/sched_fair.c | 6 ++++++ kernel/sched_rt.c | 5 ++++- kernel/sched_stoptask.c | 2 ++ 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cf427bb2b65e..cd1a531ca8ff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -311,7 +311,7 @@ struct task_group root_task_group; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long nr_running; + unsigned long nr_running, h_nr_running; u64 exec_clock; u64 min_vruntime; @@ -1802,7 +1802,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); - inc_nr_running(rq); } /* @@ -1814,7 +1813,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible++; dequeue_task(rq, p, flags); - dec_nr_running(rq); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -4258,7 +4256,7 @@ pick_next_task(struct rq *rq) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4b732a3552b..f86b0cb5eb29 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1310,16 +1310,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running++; flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running++; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } + inc_nr_running(rq); hrtick_update(rq); } @@ -1339,6 +1342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running--; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -1358,11 +1362,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running--; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } + dec_nr_running(rq); hrtick_update(rq); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a8c207ff3492..a9d3c6bc684a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -936,6 +936,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -946,6 +948,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); + + dec_nr_running(rq); } /* @@ -1841,4 +1845,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ - diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 6f437632afab..8b44e7fa7fb3 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { + inc_nr_running(rq); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { + dec_nr_running(rq); } static void yield_task_stop(struct rq *rq) -- cgit v1.2.3 From ab84d31e15502fb626169ba2663381e34bf965b2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:28 -0700 Subject: sched: Introduce primitives to account for CFS bandwidth tracking In this patch we introduce the notion of CFS bandwidth, partitioned into globally unassigned bandwidth, and locally claimed bandwidth. - The global bandwidth is per task_group, it represents a pool of unclaimed bandwidth that cfs_rqs can allocate from. - The local bandwidth is tracked per-cfs_rq, this represents allotments from the global pool bandwidth assigned to a specific cpu. Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem: - cpu.cfs_period_us : the bandwidth period in usecs - cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed to consume over period above. Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com Signed-off-by: Ingo Molnar --- init/Kconfig | 12 ++++ kernel/sched.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 16 +++++ 3 files changed, 220 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/init/Kconfig b/init/Kconfig index d62778390e55..d19b3a77ab44 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED depends on CGROUP_SCHED default CGROUP_SCHED +config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on EXPERIMENTAL + depends on FAIR_GROUP_SCHED + default n + help + This option allows users to define CPU bandwidth rates (limits) for + tasks running within the fair group scheduler. Groups with no limit + set are considered to be unconstrained and will run with no + restriction. + See tip/Documentation/scheduler/sched-bwc.txt for more information. + config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on EXPERIMENTAL diff --git a/kernel/sched.c b/kernel/sched.c index cd1a531ca8ff..f08cb23be96c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -247,6 +247,14 @@ struct cfs_rq; static LIST_HEAD(task_groups); +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota; +#endif +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -278,6 +286,8 @@ struct task_group { #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif + + struct cfs_bandwidth cfs_bandwidth; }; /* task_group_lock serializes the addition/removal of task groups */ @@ -377,9 +387,48 @@ struct cfs_rq { unsigned long load_contribution; #endif +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; +#endif #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_BANDWIDTH +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + +static inline u64 default_cfs_period(void); + +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{} +#else +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -7971,6 +8020,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, /* allow initial update_cfs_load() to truncate */ cfs_rq->load_stamp = 1; #endif + init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; @@ -8110,6 +8160,7 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8351,6 +8402,8 @@ static void free_fair_sched_group(struct task_group *tg) { int i; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -8378,6 +8431,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -8753,7 +8808,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) return walk_tg_tree(tg_schedulable, tg_nop, &data); } -static int tg_set_bandwidth(struct task_group *tg, +static int tg_set_rt_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) { int i, err = 0; @@ -8792,7 +8847,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_runtime(struct task_group *tg) @@ -8817,7 +8872,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) if (rt_period == 0) return -EINVAL; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_period(struct task_group *tg) @@ -9007,6 +9062,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) scale_load_down(tg->shares); } + +#ifdef CONFIG_CFS_BANDWIDTH +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ + int i; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + static DEFINE_MUTEX(mutex); + + if (tg == &root_task_group) + return -EINVAL; + + /* + * Ensure we have at some amount of bandwidth every period. This is + * to prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota < min_cfs_quota_period || period < min_cfs_quota_period) + return -EINVAL; + + /* + * Likewise, bound things on the otherside by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period > max_cfs_quota_period) + return -EINVAL; + + mutex_lock(&mutex); + raw_spin_lock_irq(&cfs_b->lock); + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + raw_spin_unlock_irq(&cfs_b->lock); + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock_irq(&rq->lock); + cfs_rq->runtime_enabled = quota != RUNTIME_INF; + cfs_rq->runtime_remaining = 0; + raw_spin_unlock_irq(&rq->lock); + } + mutex_unlock(&mutex); + + return 0; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ + u64 quota, period; + + period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ + u64 quota_us; + + if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) + return -1; + + quota_us = tg_cfs_bandwidth(tg)->quota; + do_div(quota_us, NSEC_PER_USEC); + + return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 quota, period; + + period = (u64)cfs_period_us * NSEC_PER_USEC; + quota = tg_cfs_bandwidth(tg)->quota; + + if (period <= 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + do_div(cfs_period_us, NSEC_PER_USEC); + + return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_quota_us) +{ + return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9041,6 +9218,18 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_shares_write_u64, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, +#endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", @@ -9350,4 +9539,3 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ - diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f86b0cb5eb29..f24f4171019d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1234,6 +1234,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) check_preempt_tick(cfs_rq, curr); } + +/************************************************** + * CFS bandwidth control machinery + */ + +#ifdef CONFIG_CFS_BANDWIDTH +/* + * default period for cfs group bandwidth. + * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ + return 100000000ULL; +} +#endif + /************************************************** * CFS operations on tasks: */ -- cgit v1.2.3 From a790de99599a29ad3f18667530cf4b9f4b7e3234 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:29 -0700 Subject: sched: Validate CFS quota hierarchies Add constraints validation for CFS bandwidth hierarchies. Validate that: max(child bandwidth) <= parent_bandwidth In a quota limited hierarchy, an unconstrained entity (e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent. This constraint is chosen over sum(child_bandwidth) as notion of over-commit is valuable within SCHED_OTHER. Some basic code from the RT case is re-factored for reuse. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.083774572@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f08cb23be96c..ea6850d93b2a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -252,6 +252,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota; + s64 hierarchal_quota; #endif }; @@ -1518,7 +1519,8 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ + (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -8708,12 +8710,7 @@ unsigned long sched_group_shares(struct task_group *tg) } #endif -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -8721,6 +8718,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) return div64_u64(runtime << 20, period); } +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) @@ -8741,7 +8745,7 @@ struct rt_schedulable_data { u64 rt_runtime; }; -static int tg_schedulable(struct task_group *tg, void *data) +static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; @@ -8805,7 +8809,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) .rt_runtime = runtime, }; - return walk_tg_tree(tg_schedulable, tg_nop, &data); + return walk_tg_tree(tg_rt_schedulable, tg_nop, &data); } static int tg_set_rt_bandwidth(struct task_group *tg, @@ -9064,14 +9068,17 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) } #ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i; + int i, ret = 0; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - static DEFINE_MUTEX(mutex); if (tg == &root_task_group) return -EINVAL; @@ -9092,7 +9099,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; - mutex_lock(&mutex); + mutex_lock(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); + if (ret) + goto out_unlock; + raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -9107,9 +9118,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } - mutex_unlock(&mutex); +out_unlock: + mutex_unlock(&cfs_constraints_mutex); - return 0; + return ret; } int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) @@ -9183,6 +9195,78 @@ static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); } +struct cfs_schedulable_data { + struct task_group *tg; + u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, + struct cfs_schedulable_data *d) +{ + u64 quota, period; + + if (tg == d->tg) { + period = d->period; + quota = d->quota; + } else { + period = tg_get_cfs_period(tg); + quota = tg_get_cfs_quota(tg); + } + + /* note: these should typically be equivalent */ + if (quota == RUNTIME_INF || quota == -1) + return RUNTIME_INF; + + return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ + struct cfs_schedulable_data *d = data; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + s64 quota = 0, parent_quota = -1; + + if (!tg->parent) { + quota = RUNTIME_INF; + } else { + struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); + + quota = normalize_cfs_quota(tg, d); + parent_quota = parent_b->hierarchal_quota; + + /* + * ensure max(child_quota) <= parent_quota, inherit when no + * limit is set + */ + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } + cfs_b->hierarchal_quota = quota; + + return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ + struct cfs_schedulable_data data = { + .tg = tg, + .period = period, + .quota = quota, + }; + + if (quota != RUNTIME_INF) { + do_div(data.period, NSEC_PER_USEC); + do_div(data.quota, NSEC_PER_USEC); + } + + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From ec12cb7f31e28854efae7dd6f9544e0a66379040 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:30 -0700 Subject: sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth Account bandwidth usage on the cfs_rq level versus the task_groups to which they belong. Whether we are tracking bandwidth on a given cfs_rq is maintained under cfs_rq->runtime_enabled. cfs_rq's which belong to a bandwidth constrained task_group have their runtime accounted via the update_curr() path, which withdraws bandwidth from the global pool as desired. Updates involving the global pool are currently protected under cfs_bandwidth->lock, local runtime is protected by rq->lock. This patch only assigns and tracks quota, no action is taken in the case that cfs_rq->runtime_used exceeds cfs_rq->runtime_assigned. Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +++ kernel/sched.c | 4 ++- kernel/sched_fair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 10 +++++++ 4 files changed, 94 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ac2c0578e0f..bc6f5f2e24fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2040,6 +2040,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif +#ifdef CONFIG_CFS_BANDWIDTH +extern unsigned int sysctl_sched_cfs_bandwidth_slice; +#endif + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/kernel/sched.c b/kernel/sched.c index ea6850d93b2a..35561c63a490 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -251,7 +251,7 @@ struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; ktime_t period; - u64 quota; + u64 quota, runtime; s64 hierarchal_quota; #endif }; @@ -407,6 +407,7 @@ static inline u64 default_cfs_period(void); static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); } @@ -9107,6 +9108,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; + cfs_b->runtime = quota; raw_spin_unlock_irq(&cfs_b->lock); for_each_possible_cpu(i) { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f24f4171019d..9502aa899f73 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; */ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; +#ifdef CONFIG_CFS_BANDWIDTH +/* + * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool + * each time a cfs_rq requests quota. + * + * Note: in the case that the slice exceeds the runtime remaining (either due + * to consumption or the quota being specified to be smaller than the slice) + * we will always only issue the remaining available time. + * + * default: 5 msec, units: microseconds + */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + static const struct sched_class fair_sched_class; /************************************************************** @@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq) cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } + + account_cfs_rq_runtime(cfs_rq, delta_exec); } static inline void @@ -1248,6 +1266,58 @@ static inline u64 default_cfs_period(void) { return 100000000ULL; } + +static inline u64 sched_cfs_bandwidth_slice(void) +{ + return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; +} + +static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + u64 amount = 0, min_amount; + + /* note: this is a positive sum as runtime_remaining <= 0 */ + min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota == RUNTIME_INF) + amount = min_amount; + else if (cfs_b->runtime > 0) { + amount = min(cfs_b->runtime, min_amount); + cfs_b->runtime -= amount; + } + raw_spin_unlock(&cfs_b->lock); + + cfs_rq->runtime_remaining += amount; +} + +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + if (!cfs_rq->runtime_enabled) + return; + + cfs_rq->runtime_remaining -= delta_exec; + if (cfs_rq->runtime_remaining > 0) + return; + + assign_cfs_rq_runtime(cfs_rq); +} + +static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + if (!cfs_rq->runtime_enabled) + return; + + __account_cfs_rq_runtime(cfs_rq, delta_exec); +} + +#else +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} #endif /************************************************** @@ -4266,8 +4336,13 @@ static void set_curr_task_fair(struct rq *rq) { struct sched_entity *se = &rq->curr->se; - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + set_next_entity(cfs_rq, se); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..2d2ecdcc8cdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- cgit v1.2.3 From 58088ad0152ba4b7997388c93d0ca208ec1ece75 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:31 -0700 Subject: sched: Add a timer to handle CFS bandwidth refresh This patch adds a per-task_group timer which handles the refresh of the global CFS bandwidth pool. Since the RT pool is using a similar timer there's some small refactoring to share this support. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 107 +++++++++++++++++++++++++++++++++++++++++----------- kernel/sched_fair.c | 40 ++++++++++++++++++-- 2 files changed, 123 insertions(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 35561c63a490..34bf8e6db9af 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { - ktime_t now; + unsigned long delta; + ktime_t soft, hard, now; + for (;;) { + if (hrtimer_active(period_timer)) + break; + + now = hrtimer_cb_get_time(period_timer); + hrtimer_forward(period_timer, now, period); + + soft = hrtimer_get_softexpires(period_timer); + hard = hrtimer_get_expires(period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); + } +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; @@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) return; raw_spin_lock(&rt_b->rt_runtime_lock); - for (;;) { - unsigned long delta; - ktime_t soft, hard; - - if (hrtimer_active(&rt_b->rt_period_timer)) - break; - - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - - soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); - hard = hrtimer_get_expires(&rt_b->rt_period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -253,6 +256,9 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchal_quota; + + int idle, timer_active; + struct hrtimer period_timer; #endif }; @@ -403,6 +409,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { @@ -410,6 +438,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -417,8 +448,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; } +/* requires cfs_b->lock, may release to reprogram timer */ +static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + raw_spin_unlock(&cfs_b->lock); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{} +{ + hrtimer_cancel(&cfs_b->period_timer); +} #else static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} @@ -9078,7 +9135,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0; + int i, ret = 0, runtime_enabled; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) @@ -9105,10 +9162,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (ret) goto out_unlock; + runtime_enabled = quota != RUNTIME_INF; raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; cfs_b->runtime = quota; + + /* restart the period timer (if active) to handle new period expiry */ + if (runtime_enabled && cfs_b->timer_active) { + /* force a reprogram */ + cfs_b->timer_active = 0; + __start_cfs_bandwidth(cfs_b); + } raw_spin_unlock_irq(&cfs_b->lock); for_each_possible_cpu(i) { @@ -9116,7 +9181,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = quota != RUNTIME_INF; + cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9502aa899f73..af73a8a85eef 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1284,9 +1284,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; - else if (cfs_b->runtime > 0) { - amount = min(cfs_b->runtime, min_amount); - cfs_b->runtime -= amount; + else { + /* ensure bandwidth timer remains active under consumption */ + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b); + + if (cfs_b->runtime > 0) { + amount = min(cfs_b->runtime, min_amount); + cfs_b->runtime -= amount; + cfs_b->idle = 0; + } } raw_spin_unlock(&cfs_b->lock); @@ -1315,6 +1322,33 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, __account_cfs_rq_runtime(cfs_rq, delta_exec); } +/* + * Responsible for refilling a task_group's bandwidth and unthrottling its + * cfs_rqs as appropriate. If there has been no activity within the last + * period the timer is deactivated until scheduling resumes; cfs_b->idle is + * used to track this state. + */ +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +{ + int idle = 1; + + raw_spin_lock(&cfs_b->lock); + /* no need to continue the timer with no bandwidth constraint */ + if (cfs_b->quota == RUNTIME_INF) + goto out_unlock; + + idle = cfs_b->idle; + cfs_b->runtime = cfs_b->quota; + + /* mark as potentially idle for the upcoming period */ + cfs_b->idle = 1; +out_unlock: + if (idle) + cfs_b->timer_active = 0; + raw_spin_unlock(&cfs_b->lock); + + return idle; +} #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} -- cgit v1.2.3 From a9cf55b2861057a213e610da2fec52125439a11d Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:32 -0700 Subject: sched: Expire invalid runtime Since quota is managed using a global state but consumed on a per-cpu basis we need to ensure that our per-cpu state is appropriately synchronized. Most importantly, runtime that is state (from a previous period) should not be locally consumable. We take advantage of existing sched_clock synchronization about the jiffy to efficiently detect whether we have (globally) crossed a quota boundary above. One catch is that the direction of spread on sched_clock is undefined, specifically, we don't know whether our local clock is behind or ahead of the one responsible for the current expiration time. Fortunately we can differentiate these by considering whether the global deadline has advanced. If it has not, then we assume our clock to be "fast" and advance our local expiration; otherwise, we know the deadline has truly passed and we expire our local runtime. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++- kernel/sched_fair.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 84 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 34bf8e6db9af..a2d55144bd9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -256,6 +256,7 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchal_quota; + u64 runtime_expires; int idle, timer_active; struct hrtimer period_timer; @@ -396,6 +397,7 @@ struct cfs_rq { #endif #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + u64 runtime_expires; s64 runtime_remaining; #endif #endif @@ -9166,8 +9168,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; - cfs_b->runtime = quota; + __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index af73a8a85eef..9d1adbd0b615 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1272,11 +1272,30 @@ static inline u64 sched_cfs_bandwidth_slice(void) return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; } +/* + * Replenish runtime according to assigned quota and update expiration time. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding + * additional synchronization around rq->lock. + * + * requires cfs_b->lock + */ +static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +{ + u64 now; + + if (cfs_b->quota == RUNTIME_INF) + return; + + now = sched_clock_cpu(smp_processor_id()); + cfs_b->runtime = cfs_b->quota; + cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +} + static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 amount = 0, min_amount, expires; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -1285,9 +1304,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { - /* ensure bandwidth timer remains active under consumption */ - if (!cfs_b->timer_active) + /* + * If the bandwidth pool has become inactive, then at least one + * period must have elapsed since the last consumption. + * Refresh the global state and ensure bandwidth timer becomes + * active. + */ + if (!cfs_b->timer_active) { + __refill_cfs_bandwidth_runtime(cfs_b); __start_cfs_bandwidth(cfs_b); + } if (cfs_b->runtime > 0) { amount = min(cfs_b->runtime, min_amount); @@ -1295,19 +1321,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; + /* + * we may have advanced our local expiration to account for allowed + * spread between our sched_clock and the one on which runtime was + * issued. + */ + if ((s64)(expires - cfs_rq->runtime_expires) > 0) + cfs_rq->runtime_expires = expires; } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +/* + * Note: This depends on the synchronization provided by sched_clock and the + * fact that rq->clock snapshots this value. + */ +static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_rq->runtime_enabled) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct rq *rq = rq_of(cfs_rq); + + /* if the deadline is ahead of our clock, nothing to do */ + if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + return; + + if (cfs_rq->runtime_remaining < 0) return; + /* + * If the local deadline has passed we have to consider the + * possibility that our sched_clock is 'fast' and the global deadline + * has not truly expired. + * + * Fortunately we can check determine whether this the case by checking + * whether the global deadline has advanced. + */ + + if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + /* extend local deadline, drift is bounded above by 2 ticks */ + cfs_rq->runtime_expires += TICK_NSEC; + } else { + /* global deadline is ahead, expiration has passed */ + cfs_rq->runtime_remaining = 0; + } +} + +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - if (cfs_rq->runtime_remaining > 0) + expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) return; assign_cfs_rq_runtime(cfs_rq); @@ -1338,7 +1406,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) goto out_unlock; idle = cfs_b->idle; - cfs_b->runtime = cfs_b->quota; + /* if we're going inactive then everything else can be deferred */ + if (idle) + goto out_unlock; + + __refill_cfs_bandwidth_runtime(cfs_b); + /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -1557,7 +1630,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; } - #else static inline unsigned long effective_load(struct task_group *tg, int cpu, -- cgit v1.2.3 From 85dac906bec3bb41bfaa7ccaa65c4706de5cfdf8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:33 -0700 Subject: sched: Add support for throttling group entities Now that consumption is tracked (via update_curr()) we add support to throttle group entities (and their corresponding cfs_rqs) in the case where this is no run-time remaining. Throttled entities are dequeued to prevent scheduling, additionally we mark them as throttled (using cfs_rq->throttled) to prevent them from becoming re-enqueued until they are unthrottled. A list of a task_group's throttled entities are maintained on the cfs_bandwidth structure. Note: While the machinery for throttling is added in this patch the act of throttling an entity exceeding its bandwidth is deferred until later within the series. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++ kernel/sched_fair.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a2d55144bd9c..044260a9418d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -260,6 +260,8 @@ struct cfs_bandwidth { int idle, timer_active; struct hrtimer period_timer; + struct list_head throttled_cfs_rq; + #endif }; @@ -399,6 +401,9 @@ struct cfs_rq { int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; + + int throttled; + struct list_head throttled_list; #endif #endif }; @@ -441,6 +446,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->period_timer.function = sched_cfs_period_timer; } @@ -448,6 +454,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); } /* requires cfs_b->lock, may release to reprogram timer */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9d1adbd0b615..72c9d4ed5991 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1291,7 +1291,8 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); } -static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); @@ -1332,6 +1333,8 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if ((s64)(expires - cfs_rq->runtime_expires) > 0) cfs_rq->runtime_expires = expires; + + return cfs_rq->runtime_remaining > 0; } /* @@ -1378,7 +1381,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, if (likely(cfs_rq->runtime_remaining > 0)) return; - assign_cfs_rq_runtime(cfs_rq); + /* + * if we're unable to extend our runtime we resched so that the active + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) + resched_task(rq_of(cfs_rq)->curr); } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, @@ -1390,6 +1398,47 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, __account_cfs_rq_runtime(cfs_rq, delta_exec); } +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->throttled; +} + +static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* account load preceding throttle */ + update_cfs_load(cfs_rq, 0); + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + rq->nr_running -= task_delta; + + cfs_rq->throttled = 1; + raw_spin_lock(&cfs_b->lock); + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + raw_spin_unlock(&cfs_b->lock); +} + /* * Responsible for refilling a task_group's bandwidth and unthrottling its * cfs_rqs as appropriate. If there has been no activity within the last @@ -1425,6 +1474,11 @@ out_unlock: #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} #endif /************************************************** @@ -1503,7 +1557,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); + + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running increment below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; cfs_rq->h_nr_running++; + flags = ENQUEUE_WAKEUP; } @@ -1511,11 +1575,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + if (cfs_rq_throttled(cfs_rq)) + break; + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } - inc_nr_running(rq); + if (!se) + inc_nr_running(rq); hrtick_update(rq); } @@ -1535,6 +1603,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running decrement below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; cfs_rq->h_nr_running--; /* Don't dequeue parent if it has other entities besides us */ @@ -1557,11 +1634,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + if (cfs_rq_throttled(cfs_rq)) + break; + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } - dec_nr_running(rq); + if (!se) + dec_nr_running(rq); hrtick_update(rq); } -- cgit v1.2.3 From 671fd9dabe5239ad218c7eb48b2b9edee50250e6 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:34 -0700 Subject: sched: Add support for unthrottling group entities At the start of each period we refresh the global bandwidth pool. At this time we must also unthrottle any cfs_rq entities who are now within bandwidth once more (as quota permits). Unthrottled entities have their corresponding cfs_rq->throttled flag cleared and their entities re-enqueued. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.574628950@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++ kernel/sched_fair.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 044260a9418d..4bbabc2c4a77 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9192,6 +9192,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } out_unlock: diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 72c9d4ed5991..76411950ff3b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1439,6 +1439,84 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_b->lock); } +static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + cfs_rq->throttled = 0; + raw_spin_lock(&cfs_b->lock); + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + if (!se) + rq->nr_running += task_delta; + + /* determine whether we need to wake up potentially idle cpu */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + u64 remaining, u64 expires) +{ + struct cfs_rq *cfs_rq; + u64 runtime = remaining; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock(&rq->lock); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > remaining) + runtime = remaining; + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; + cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) + unthrottle_cfs_rq(cfs_rq); + +next: + raw_spin_unlock(&rq->lock); + + if (!remaining) + break; + } + rcu_read_unlock(); + + return remaining; +} + /* * Responsible for refilling a task_group's bandwidth and unthrottling its * cfs_rqs as appropriate. If there has been no activity within the last @@ -1447,23 +1525,64 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - int idle = 1; + u64 runtime, runtime_expires; + int idle = 1, throttled; raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) goto out_unlock; - idle = cfs_b->idle; + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + /* idle depends on !throttled (for the case of a large deficit) */ + idle = cfs_b->idle && !throttled; + /* if we're going inactive then everything else can be deferred */ if (idle) goto out_unlock; __refill_cfs_bandwidth_runtime(cfs_b); + if (!throttled) { + /* mark as potentially idle for the upcoming period */ + cfs_b->idle = 1; + goto out_unlock; + } + + /* + * There are throttled entities so we must first use the new bandwidth + * to unthrottle them before making it generally available. This + * ensures that all existing debts will be paid before a new cfs_rq is + * allowed to run. + */ + runtime = cfs_b->runtime; + runtime_expires = cfs_b->runtime_expires; + cfs_b->runtime = 0; + + /* + * This check is repeated as we are holding onto the new bandwidth + * while we unthrottle. This can potentially race with an unthrottled + * group trying to acquire new bandwidth from the global pool. + */ + while (throttled && runtime > 0) { + raw_spin_unlock(&cfs_b->lock); + /* we can't nest cfs_b->lock while distributing bandwidth */ + runtime = distribute_cfs_runtime(cfs_b, runtime, + runtime_expires); + raw_spin_lock(&cfs_b->lock); + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + } - /* mark as potentially idle for the upcoming period */ - cfs_b->idle = 1; + /* return (any) remaining runtime */ + cfs_b->runtime = runtime; + /* + * While we are ensured activity in the period following an + * unthrottle, this also covers the case in which the new bandwidth is + * insufficient to cover the existing bandwidth deficit. (Forcing the + * timer to remain active while there are any throttled entities.) + */ + cfs_b->idle = 0; out_unlock: if (idle) cfs_b->timer_active = 0; -- cgit v1.2.3 From 8277434ef1202ce30315f8edb3fc760aa6e74493 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:35 -0700 Subject: sched: Allow for positional tg_tree walks Extend walk_tg_tree to accept a positional argument static int walk_tg_tree_from(struct task_group *from, tg_visitor down, tg_visitor up, void *data) Existing semantics are preserved, caller must hold rcu_lock() or sufficient analogue. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.677889157@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4bbabc2c4a77..8ec1e7ac2894 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1591,20 +1591,23 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) typedef int (*tg_visitor)(struct task_group *, void *); /* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. */ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +static int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; int ret; - rcu_read_lock(); - parent = &root_task_group; + parent = from; + down: ret = (*down)(parent, data); if (ret) - goto out_unlock; + goto out; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1613,19 +1616,29 @@ up: continue; } ret = (*up)(parent, data); - if (ret) - goto out_unlock; + if (ret || parent == from) + goto out; child = parent; parent = parent->parent; if (parent) goto up; -out_unlock: - rcu_read_unlock(); - +out: return ret; } +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ + +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + static int tg_nop(struct task_group *tg, void *data) { return 0; @@ -8870,13 +8883,19 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { + int ret; + struct rt_schedulable_data data = { .tg = tg, .rt_period = period, .rt_runtime = runtime, }; - return walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; } static int tg_set_rt_bandwidth(struct task_group *tg, @@ -9333,6 +9352,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { + int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -9344,7 +9364,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_lock(); + ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_unlock(); + + return ret; } #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 64660c864f46202b932b911a69deb09805bdbaf8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:36 -0700 Subject: sched: Prevent interactions with throttled entities From the perspective of load-balance and shares distribution, throttled entities should be invisible. However, both of these operations work on 'active' lists and are not inherently aware of what group hierarchies may be present. In some cases this may be side-stepped (e.g. we could sideload via tg_load_down in load balance) while in others (e.g. update_shares()) it is more difficult to compute without incurring some O(n^2) costs. Instead, track hierarchicaal throttled state at time of transition. This allows us to easily identify whether an entity belongs to a throttled hierarchy and avoid incorrect interactions with it. Also, when an entity leaves a throttled hierarchy we need to advance its time averaging for shares averaging so that the elapsed throttled time is not considered as part of the cfs_rq's operation. We also use this information to prevent buddy interactions in the wakeup and yield_to() paths. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.777916795@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8ec1e7ac2894..5db05f6fb470 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -402,7 +402,7 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; - int throttled; + int throttled, throttle_count; struct list_head throttled_list; #endif #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 76411950ff3b..5a2089492a98 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -706,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED +/* we need this in update_cfs_load and load-balance functions below */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) @@ -728,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) u64 now, delta; unsigned long load = cfs_rq->load.weight; - if (cfs_rq->tg == &root_task_group) + if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) return; now = rq_of(cfs_rq)->clock_task; @@ -837,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) tg = cfs_rq->tg; se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se) + if (!se || throttled_hierarchy(cfs_rq)) return; #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) @@ -1403,6 +1405,65 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_rq->throttled; } +/* check whether cfs_rq, or any parent, is throttled */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return cfs_rq->throttle_count; +} + +/* + * Ensure that neither of the group entities corresponding to src_cpu or + * dest_cpu are members of a throttled hierarchy when performing group + * load-balance operations. + */ +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + + src_cfs_rq = tg->cfs_rq[src_cpu]; + dest_cfs_rq = tg->cfs_rq[dest_cpu]; + + return throttled_hierarchy(src_cfs_rq) || + throttled_hierarchy(dest_cfs_rq); +} + +/* updated child weight may affect parent so we have to do this bottom up */ +static int tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; +#ifdef CONFIG_SMP + if (!cfs_rq->throttle_count) { + u64 delta = rq->clock_task - cfs_rq->load_stamp; + + /* leaving throttled state, advance shares averaging windows */ + cfs_rq->load_stamp += delta; + cfs_rq->load_last += delta; + + /* update entity weight now that we are on_rq again */ + update_cfs_shares(cfs_rq); + } +#endif + + return 0; +} + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + /* group is entering throttled state, record last load */ + if (!cfs_rq->throttle_count) + update_cfs_load(cfs_rq, 0); + cfs_rq->throttle_count++; + + return 0; +} + static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -1413,7 +1474,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; /* account load preceding throttle */ - update_cfs_load(cfs_rq, 0); + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { @@ -1454,6 +1517,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); + update_rq_clock(rq); + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + if (!cfs_rq->load.weight) return; @@ -1598,6 +1665,17 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + return 0; +} #endif /************************************************** @@ -2493,6 +2571,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, for_each_leaf_cfs_rq(busiest, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + busiest->cpu, this_cpu)) + break; if (!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) @@ -2608,8 +2689,13 @@ static void update_shares(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) + for_each_leaf_cfs_rq(rq, cfs_rq) { + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + update_shares_cpu(cfs_rq->tg, cpu); + } rcu_read_unlock(); } @@ -2659,9 +2745,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, u64 rem_load, moved_load; /* - * empty group + * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || + throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; -- cgit v1.2.3 From 8cb120d3e41a0464a559d639d519cef563717a4e Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:38 -0700 Subject: sched: Migrate throttled tasks on HOTPLUG Throttled tasks are invisisble to cpu-offline since they are not eligible for selection by pick_next_task(). The regular 'escape' path for a thread that is blocked at offline is via ttwu->select_task_rq, however this will not handle a throttled group since there are no individual thread wakeups on an unthrottle. Resolve this by unthrottling offline cpus so that threads can be migrated. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.989000590@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5db05f6fb470..397317248ddd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6335,6 +6335,30 @@ static void calc_global_load_remove(struct rq *rq) rq->calc_load_active = 0; } +#ifdef CONFIG_CFS_BANDWIDTH +static void unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} +#else +static void unthrottle_offline_cfs_rqs(struct rq *rq) {} +#endif + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -6360,6 +6384,9 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only -- cgit v1.2.3 From e8da1b18b32064c43881bceef0f051c2110c9ab9 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Thu, 21 Jul 2011 09:43:40 -0700 Subject: sched: Add exports tracking cfs bandwidth control statistics This change introduces statistics exports for the cpu sub-system, these are added through the use of a stat file similar to that exported by other subsystems. The following exports are included: nr_periods: number of periods in which execution occurred nr_throttled: the number of periods above in which execution was throttle throttled_time: cumulative wall-time that any cpus have been throttled for this group Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184758.198901931@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++++++++++ kernel/sched_fair.c | 7 +++++++ 2 files changed, 28 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 397317248ddd..35c91859f8a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,6 +262,9 @@ struct cfs_bandwidth { struct hrtimer period_timer; struct list_head throttled_cfs_rq; + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; #endif }; @@ -402,6 +405,7 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; + u64 throttled_timestamp; int throttled, throttle_count; struct list_head throttled_list; #endif @@ -9397,6 +9401,19 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + + cb->fill(cb, "nr_periods", cfs_b->nr_periods); + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); + cb->fill(cb, "throttled_time", cfs_b->throttled_time); + + return 0; +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -9443,6 +9460,10 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, }, + { + .name = "stat", + .read_map = cpu_stats_show, + }, #endif #ifdef CONFIG_RT_GROUP_SCHED { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f9f671a7d0af..d201f28c1de7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1506,6 +1506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; + cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -1523,8 +1524,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -1612,6 +1615,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) throttled = !list_empty(&cfs_b->throttled_cfs_rq); /* idle depends on !throttled (for the case of a large deficit) */ idle = cfs_b->idle && !throttled; + cfs_b->nr_periods += overrun; /* if we're going inactive then everything else can be deferred */ if (idle) @@ -1625,6 +1629,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) goto out_unlock; } + /* account preceding periods in which throttling occurred */ + cfs_b->nr_throttled += overrun; + /* * There are throttled entities so we must first use the new bandwidth * to unthrottle them before making it generally available. This -- cgit v1.2.3 From d8b4986d3dbc4fabc2054d63f1d31d6ed2fb1ca8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:41 -0700 Subject: sched: Return unused runtime on group dequeue When a local cfs_rq blocks we return the majority of its remaining quota to the global bandwidth pool for use by other runqueues. We do this only when the quota is current and there is more than min_cfs_rq_quota [1ms by default] of runtime remaining on the rq. In the case where there are throttled runqueues and we have sufficient bandwidth to meter out a slice, a second timer is kicked off to handle this delivery, unthrottling where appropriate. Using a 'worst case' antagonist which executes on each cpu for 1ms before moving onto the next on a fairly large machine: no quota generations: 197.47 ms /cgroup/a/cpuacct.usage 199.46 ms /cgroup/a/cpuacct.usage 205.46 ms /cgroup/a/cpuacct.usage 198.46 ms /cgroup/a/cpuacct.usage 208.39 ms /cgroup/a/cpuacct.usage Since we are allowed to use "stale" quota our usage is effectively bounded by the rate of input into the global pool and performance is relatively stable. with quota generations [1s increments]: 119.58 ms /cgroup/a/cpuacct.usage 119.65 ms /cgroup/a/cpuacct.usage 119.64 ms /cgroup/a/cpuacct.usage 119.63 ms /cgroup/a/cpuacct.usage 119.60 ms /cgroup/a/cpuacct.usage The large deficit here is due to quota generations (/intentionally/) preventing us from now using previously stranded slack quota. The cost is that this quota becomes unavailable. with quota generations and quota return: 200.09 ms /cgroup/a/cpuacct.usage 200.09 ms /cgroup/a/cpuacct.usage 198.09 ms /cgroup/a/cpuacct.usage 200.09 ms /cgroup/a/cpuacct.usage 200.06 ms /cgroup/a/cpuacct.usage By returning unused quota we're able to both stably consume our desired quota and prevent unintentional overages due to the abuse of slack quota from previous quota periods (especially on a large machine). Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++++- kernel/sched_fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 35c91859f8a6..6baade0d7649 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -259,7 +259,7 @@ struct cfs_bandwidth { u64 runtime_expires; int idle, timer_active; - struct hrtimer period_timer; + struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; /* statistics */ @@ -421,6 +421,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 default_cfs_period(void); static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { @@ -453,6 +463,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -488,6 +500,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); } #else static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d201f28c1de7..1ca2cd44d64a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1052,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -1090,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); } @@ -1674,6 +1679,108 @@ out_unlock: return idle; } +/* a cfs_rq won't donate quota below this amount */ +static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; +/* minimum remaining period time to redistribute slack quota */ +static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; +/* how long we wait to gather additional slack before distributing */ +static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; + +/* are we near the end of the current quota period? */ +static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) +{ + struct hrtimer *refresh_timer = &cfs_b->period_timer; + u64 remaining; + + /* if the call-back is running a quota refresh is already occurring */ + if (hrtimer_callback_running(refresh_timer)) + return 1; + + /* is a quota refresh about to occur? */ + remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); + if (remaining < min_expire) + return 1; + + return 0; +} + +static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) +{ + u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; + + /* if there's a quota refresh soon don't bother with slack */ + if (runtime_refresh_within(cfs_b, min_left)) + return; + + start_bandwidth_timer(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period)); +} + +/* we know any runtime found here is valid as update_curr() precedes return */ +static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; + + if (slack_runtime <= 0) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && + cfs_rq->runtime_expires == cfs_b->runtime_expires) { + cfs_b->runtime += slack_runtime; + + /* we are under rq->lock, defer unthrottling using a timer */ + if (cfs_b->runtime > sched_cfs_bandwidth_slice() && + !list_empty(&cfs_b->throttled_cfs_rq)) + start_cfs_slack_bandwidth(cfs_b); + } + raw_spin_unlock(&cfs_b->lock); + + /* even if it's not valid for return we don't want to try again */ + cfs_rq->runtime_remaining -= slack_runtime; +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + return; + + __return_cfs_rq_runtime(cfs_rq); +} + +/* + * This is done with a timer (instead of inline with bandwidth return) since + * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. + */ +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) +{ + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + u64 expires; + + /* confirm we're still not at a refresh boundary */ + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + runtime = cfs_b->runtime; + cfs_b->runtime = 0; + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + if (!runtime) + return; + + runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + + raw_spin_lock(&cfs_b->lock); + if (expires == cfs_b->runtime_expires) + cfs_b->runtime = runtime; + raw_spin_unlock(&cfs_b->lock); +} + /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of @@ -1715,6 +1822,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { -- cgit v1.2.3 From fa14ff4accfb24e59d2473f3d864d6648d80563b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Sep 2011 13:06:17 +0200 Subject: sched: Convert to struct llist Use the generic llist primitives. We had a private lockless list implementation in the scheduler in the wake-list code, now that we have a generic llist implementation that provides all required operations, switch to it. This patch is not expected to change any behavior. Signed-off-by: Peter Zijlstra Cc: Huang Ying Cc: Andrew Morton Link: http://lkml.kernel.org/r/1315836353.26517.42.camel@twins Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- kernel/sched.c | 48 ++++++++++-------------------------------------- 2 files changed, 12 insertions(+), 39 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fda2888a6ab..fc3e8911818a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -90,6 +90,7 @@ struct sched_param { #include #include #include +#include #include @@ -1225,7 +1226,7 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct task_struct *wake_entry; + struct llist_node wake_entry; int on_cpu; #endif int on_rq; diff --git a/kernel/sched.c b/kernel/sched.c index c5cf15e1eb57..1874c7418319 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -702,7 +702,7 @@ struct rq { #endif #ifdef CONFIG_SMP - struct task_struct *wake_list; + struct llist_head wake_list; #endif }; @@ -2698,42 +2698,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_do_pending(struct task_struct *list) +static void sched_ttwu_pending(void) { struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; raw_spin_lock(&rq->lock); - while (list) { - struct task_struct *p = list; - list = list->wake_entry; + while (llist) { + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); ttwu_do_activate(rq, p, 0); } raw_spin_unlock(&rq->lock); } -#ifdef CONFIG_HOTPLUG_CPU - -static void sched_ttwu_pending(void) -{ - struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; - - sched_ttwu_do_pending(list); -} - -#endif /* CONFIG_HOTPLUG_CPU */ - void scheduler_ipi(void) { - struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) + if (llist_empty(&this_rq()->wake_list)) return; /* @@ -2750,25 +2734,13 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); - sched_ttwu_do_pending(list); + sched_ttwu_pending(); irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *next = rq->wake_list; - - for (;;) { - struct task_struct *old = next; - - p->wake_entry = next; - next = cmpxchg(&rq->wake_list, old, p); - if (next == old) - break; - } - - if (!next) + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) smp_send_reschedule(cpu); } -- cgit v1.2.3 From 908a3283728d92df36e0c7cd63304fd35e93a8a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2011 15:32:06 +0200 Subject: sched: Fix idle_cpu() On -rt we observed hackbench waking all 400 tasks to a single cpu. This is because of select_idle_sibling()'s interaction with the new ipi based wakeup scheme. The existing idle_cpu() test only checks to see if the current task on that cpu is the idle task, it does not take already queued tasks into account, nor does it take queued to be woken tasks into account. If the remote wakeup IPIs come hard enough, there won't be time to schedule away from the idle task, and would thus keep thinking the cpu was in fact idle, regardless of the fact that there were already several hundred tasks runnable. We couldn't reproduce on mainline, but there's no reason it couldn't happen. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3o30p18b2paswpc9ohy2gltp@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1874c7418319..4cdc91cf48f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5138,7 +5138,20 @@ EXPORT_SYMBOL(task_nice); */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; } /** -- cgit v1.2.3 From ca38062e57e97791c2f62e3dbd06caf3ebb5721c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 3 Oct 2011 15:09:00 -0700 Subject: sched: Use resched IPI to kick off the nohz idle balance Current use of smp call function to kick the nohz idle balance can deadlock in this scenario. 1. cpu-A did a generic_exec_single() to cpu-B and after queuing its call single data (csd) to the call single queue, cpu-A took a timer interrupt. Actual IPI to cpu-B to process the call single queue is not yet sent. 2. As part of the timer interrupt handler, cpu-A decided to kick cpu-B for the idle load balancing (sets cpu-B's rq->nohz_balance_kick to 1) and __smp_call_function_single() with nowait will queue the csd to the cpu-B's queue. But the generic_exec_single() won't send an IPI to cpu-B as the call single queue was not empty. 3. cpu-A is busy with lot of interrupts 4. Meanwhile cpu-B is entering and exiting idle and noticed that it has it's rq->nohz_balance_kick set to '1'. So it will go ahead and do the idle load balancer and clear its rq->nohz_balance_kick. 5. At this point, csd queued as part of the step-2 above is still locked and waiting to be serviced on cpu-B. 6. cpu-A is still busy with interrupt load and now it got another timer interrupt and as part of it decided to kick cpu-B for another idle load balancing (as it finds cpu-B's rq->nohz_balance_kick cleared in step-4 above) and does __smp_call_function_single() with the same csd that is still locked. 7. And we get a deadlock waiting for the csd_lock() in the __smp_call_function_single(). Main issue here is that cpu-B can service the idle load balancer kick request from cpu-A even with out receiving the IPI and this lead to doing multiple __smp_call_function_single() on the same csd leading to deadlock. To kick a cpu, scheduler already has the reschedule vector reserved. Use that mechanism (kick_process()) instead of using the generic smp call function mechanism to kick off the nohz idle load balancing and avoid the deadlock. [ This issue is present from 2.6.35+ kernels, but marking it -stable only from v3.0+ as the proposed fix depends on the scheduler_ipi() that is introduced recently. ] Reported-by: Prarit Bhargava Signed-off-by: Suresh Siddha Cc: stable@kernel.org # v3.0+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111003220934.834943260@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++++++++-- kernel/sched_fair.c | 29 +++++++++-------------------- 2 files changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4cdc91cf48f6..9e49af00ae3e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1404,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static inline bool got_nohz_idle_kick(void) +{ + return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ + return false; +} + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -2717,7 +2729,7 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list)) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -2735,6 +2747,12 @@ void scheduler_ipi(void) */ irq_enter(); sched_ttwu_pending(); + + /* + * Check if someone kicked us for doing the nohz idle load balance. + */ + if (unlikely(got_nohz_idle_kick() && !need_resched())) + raise_softirq_irqoff(SCHED_SOFTIRQ); irq_exit(); } @@ -8288,7 +8306,6 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_balance_kick = 0; - init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); #endif #endif init_rq_hrtick(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fef0bfde7c8c..6c5fa1099229 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -4269,22 +4269,6 @@ out_unlock: } #ifdef CONFIG_NO_HZ - -static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); - -static void trigger_sched_softirq(void *data) -{ - raise_softirq_irqoff(SCHED_SOFTIRQ); -} - -static inline void init_sched_softirq_csd(struct call_single_data *csd) -{ - csd->func = trigger_sched_softirq; - csd->info = NULL; - csd->flags = 0; - csd->priv = 0; -} - /* * idle load balancing details * - One of the idle CPUs nominates itself as idle load_balancer, while @@ -4450,11 +4434,16 @@ static void nohz_balancer_kick(int cpu) } if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { - struct call_single_data *cp; - cpu_rq(ilb_cpu)->nohz_balance_kick = 1; - cp = &per_cpu(remote_sched_softirq_cb, cpu); - __smp_call_function_single(ilb_cpu, cp, 0); + + smp_mb(); + /* + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target cpu which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_send_reschedule(ilb_cpu); } return; } -- cgit v1.2.3 From 6eb57e0d65ebd99a71d435dc96d83e725752eef8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 3 Oct 2011 15:09:01 -0700 Subject: sched: Request for idle balance during nohz idle load balance rq's idle_at_tick is set to idle/busy during the timer tick depending on the cpu was idle or not. This will be used later in the load balance that will be done in the softirq context (which is a process context in -RT kernels). For nohz kernels, for the cpu doing nohz idle load balance on behalf of all the idle cpu's, its rq->idle_at_tick might have a stale value (which is recorded when it got the timer tick presumably when it is busy). As the nohz idle load balancing is also being done at the same place as the regular load balancing, nohz idle load balancing was bailing out when it sees rq's idle_at_tick not set. Thus leading to poor system utilization. Rename rq's idle_at_tick to idle_balance and set it when someone requests for nohz idle balance on an idle cpu. Reported-by: Srivatsa Vaddagiri Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111003220934.892350549@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++--- kernel/sched_fair.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9e49af00ae3e..7bc9b0e84eb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -644,7 +644,7 @@ struct rq { unsigned long cpu_power; - unsigned char idle_at_tick; + unsigned char idle_balance; /* For active balancing */ int post_schedule; int active_balance; @@ -2751,8 +2751,10 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) + if (unlikely(got_nohz_idle_kick() && !need_resched())) { + this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); + } irq_exit(); } @@ -4247,7 +4249,7 @@ void scheduler_tick(void) perf_event_task_tick(); #ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); + rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c5fa1099229..506db0966eb8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -4676,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) if (time_before(now, nohz.next_balance)) return 0; - if (rq->idle_at_tick) + if (idle_cpu(cpu)) return 0; first_pick_cpu = atomic_read(&nohz.first_pick_cpu); @@ -4712,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? + enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; rebalance_domains(this_cpu, idle); -- cgit v1.2.3 From fa17b507f142d37aeac322a95f6f7c6375f25601 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2011 12:23:22 +0200 Subject: sched: Wrap scheduler p->cpus_allowed access This task is preparatory for the migrate_disable() implementation, but stands on its own and provides a cleanup. It currently only converts those sites required for task-placement. Kosaki-san once mentioned replacing cpus_allowed with a proper cpumask_t instead of the NR_CPUS sized array it currently is, that would also require something like this. Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: KOSAKI Motohiro Link: http://lkml.kernel.org/n/tip-e42skvaddos99psip0vce41o@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- kernel/sched_fair.c | 12 ++++++------ kernel/sched_rt.c | 4 ++-- lib/smp_processor_id.c | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7bc9b0e84eb3..45174ca5c8ea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2544,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); if (dest_cpu < nr_cpu_ids) return dest_cpu; @@ -2585,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -6262,7 +6262,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 506db0966eb8..5c9e67923b7c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2183,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) + tsk_cpus_allowed(p))) continue; local_group = cpumask_test_cpu(this_cpu, @@ -2229,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int i; /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2273,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target) if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { target = i; break; @@ -2316,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) want_affine = 1; new_cpu = prev_cpu; } @@ -2697,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } @@ -4087,7 +4087,7 @@ redo: * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, - &busiest->curr->cpus_allowed)) { + tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0cc188cf7664..57a10842afa1 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1179,7 +1179,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -1324,7 +1324,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !task->on_rq)) { diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4689cb073da4..503f087382a4 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -22,7 +22,7 @@ notrace unsigned int debug_smp_processor_id(void) * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ - if (cpumask_equal(¤t->cpus_allowed, cpumask_of(this_cpu))) + if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu))) goto out; /* -- cgit v1.2.3 From 4939602a2441306008c6dca38216b741d4e09a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Jun 2011 15:45:46 +0200 Subject: sched: Unify the ->cpus_allowed mask copy Currently every sched_class::set_cpus_allowed() implementation has to copy the cpumask into task_struct::cpus_allowed, this is pointless, put this copy in the generic code. Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-jhl5s9fckd9ptw1fzbqqlrd3@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- kernel/sched_rt.c | 3 --- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 45174ca5c8ea..ce9a9e7db116 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6161,10 +6161,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } + + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 57a10842afa1..3023fd1ef364 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1608,9 +1608,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(&rq->rt); } - - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = weight; } /* Assumes rq->lock is held */ -- cgit v1.2.3 From 510f5acc4f4fb07f3f075900dc468d6e380beff6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 20:47:54 +0200 Subject: sched: Don't use tasklist_lock for debug prints Avoid taking locks from debug prints, this avoids latencies on -rt, and improves reliability of the debug code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ce9a9e7db116..24637c782002 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6021,7 +6021,7 @@ void show_state_filter(unsigned long state_filter) printk(KERN_INFO " task PC stack pid father\n"); #endif - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -6037,7 +6037,7 @@ void show_state_filter(unsigned long state_filter) #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ -- cgit v1.2.3