From 32bf08a6257b9c7380dcd040af3c0858eee3ef05 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Oct 2014 14:54:57 -0700 Subject: bpf: fix bug in eBPF verifier while comparing for verifier state equivalency the comparison was missing a check for uninitialized register. Make sure it does so and add a testcase. Fixes: f1bca824dabb ("bpf: add search pruning optimization to verifier") Cc: Hannes Frederic Sowa Signed-off-by: Alexei Starovoitov Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 801f5f3b9307..9f81818f2941 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) if (memcmp(&old->regs[i], &cur->regs[i], sizeof(old->regs[0])) != 0) { if (old->regs[i].type == NOT_INIT || - old->regs[i].type == UNKNOWN_VALUE) + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) continue; return false; } -- cgit v1.2.3 From b2c4623dcd07af4b8ae3b56ae5f879e281c7b4f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Oct 2014 10:00:05 -0700 Subject: rcu: More on deadlock between CPU hotplug and expedited grace periods Commit dd56af42bd82 (rcu: Eliminate deadlock between CPU hotplug and expedited grace periods) was incomplete. Although it did eliminate deadlocks involving synchronize_sched_expedited()'s acquisition of cpu_hotplug.lock via get_online_cpus(), it did nothing about the similar deadlock involving acquisition of this same lock via put_online_cpus(). This deadlock became apparent with testing involving hibernation. This commit therefore changes put_online_cpus() acquisition of this lock to be conditional, and increments a new cpu_hotplug.puts_pending field in case of acquisition failure. 
Then cpu_hotplug_begin() checks for this new field being non-zero, and applies any changes to cpu_hotplug.refcount. Reported-by: Jiri Kosina Signed-off-by: Paul E. McKenney Tested-by: Jiri Kosina Tested-by: Borislav Petkov --- kernel/cpu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 356450f09c1f..90a3d017b90c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + /* And allows lockless put_online_cpus(). */ + atomic_t puts_pending; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -113,7 +115,11 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (!mutex_trylock(&cpu_hotplug.lock)) { + atomic_inc(&cpu_hotplug.puts_pending); + cpuhp_lock_release(); + return; + } if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ @@ -155,6 +161,12 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); + if (atomic_read(&cpu_hotplug.puts_pending)) { + int delta; + + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From 8252ecf346474cfe46315bd0a7ca655c293c34a9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Oct 2014 14:56:01 -0400 Subject: ftrace: Set ops->old_hash on modifying what an ops hooks to The code that checks for trampolines when modifying function hooks tests against a modified ops "old_hash". But the ops old_hash pointer is not being updated before the changes are made, making it possible to not find the right hash to the callback and possibly causing ftrace to break in accounting and disable itself. Have the ops set its old_hash before the modifying takes place. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb186b9ddf51..483b8c1b1de0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2293,10 +2293,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } -static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash; ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3340,7 +3343,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) { int ret; int i; @@ -3348,7 +3351,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3477,13 +3481,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + __enable_ftrace_function_probe(old_hash); + if (!ret) free_ftrace_hash_rcu(old_hash); else count = ret; - __enable_ftrace_function_probe(); - out_unlock: mutex_unlock(&ftrace_lock); out: @@ -3764,10 +3769,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + 
struct ftrace_hash *old_hash) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); } static int @@ -3813,7 +3819,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret) { - ftrace_ops_update_code(ops); + ftrace_ops_update_code(ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4058,7 +4064,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops); + ftrace_ops_update_code(iter->ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); -- cgit v1.2.3 From 4fc409048d5afb1ad853f294b4262ecf2c980a49 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Oct 2014 14:48:35 -0400 Subject: ftrace: Fix checking of trampoline ftrace_ops in finding trampoline When modifying code, ftrace has several checks to make sure things are being done correctly. One of them is to make sure any code it modifies is exactly what it expects it to be before it modifies it. In order to do so with the new trampoline logic, it must be able to find out what trampoline a function is hooked to in order to see if the code that hooks to it is what's expected. The logic to find the trampoline from a record (accounting descriptor for a function that is hooked) needs to only look at the "old_hash" of an ops that is being modified. The old_hash is the list of function an ops is hooked to before its update. Since a record would only be pointing to an ops that is being modified if it was already hooked before. Currently, it can pick a modified ops based on its new functions it will be hooked to, and this picks the wrong trampoline and causes the check to fail, disabling ftrace. 
Signed-off-by: Steven Rostedt ftrace: squash into ordering of ops for modification --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 483b8c1b1de0..31c90fec4158 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) * when we are adding another op to the rec or removing the * current one. Thus, if the op is being added, we can * ignore it because it hasn't attached itself to the rec - * yet. That means we just need to find the op that has a - * trampoline and is not beeing added. + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. */ do_for_each_ftrace_op(op, ftrace_ops_list) { @@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) if (op->flags & FTRACE_OPS_FL_ADDING) continue; + /* - * If the ops is not being added and has a trampoline, - * then it must be the one that we want! + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. */ - if (hash_contains_ip(ip, op->func_hash)) - return op; - - /* If the ops is being modified, it may be in the old hash. */ if ((op->flags & FTRACE_OPS_FL_MODIFYING) && hash_contains_ip(ip, &op->old_hash)) return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! 
+ */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; } while_for_each_ftrace_op(op); -- cgit v1.2.3 From 6891c4509c792209c44ced55a60f13954cb50ef4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 4 Oct 2014 23:06:39 +0200 Subject: posix-timers: Fix stack info leak in timer_create() If userland creates a timer without specifying a sigevent info, we'll create one ourself, using a stack local variable. Particularly will we use the timer ID as sival_int. But as sigev_value is a union containing a pointer and an int, that assignment will only partially initialize sigev_value on systems where the size of a pointer is bigger than the size of an int. On such systems we'll copy the uninitialized stack bytes from the timer_create() call to userland when the timer actually fires and we're going to deliver the signal. Initialize sigev_value with 0 to plug the stack info leak. Found in the PaX patch, written by the PaX Team. Fixes: 5a9fa7307285 ("posix-timers: kill ->it_sigev_signo and...") Signed-off-by: Mathias Krause Cc: Oleg Nesterov Cc: Brad Spengler Cc: PaX Team Cc: # v2.6.28+ Link: http://lkml.kernel.org/r/1412456799-32339-1-git-send-email-minipli@googlemail.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; -- cgit v1.2.3 From 10632008b9e18b76cbff0ffc69c15e948aa548e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2014 15:07:50 +0400 Subject: clockevents: Prevent shift out of bounds 
Andrey reported that on a kernel with UBSan enabled he found: UBSan: Undefined behaviour in ../kernel/time/clockevents.c:75:34 I guess it should be 1ULL here instead of 1U: (!ismax || evt->mult <= (1U << evt->shift))) That's indeed the correct solution because shift might be 32. Reported-by: Andrey Ryabinin Cc: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); -- cgit v1.2.3 From 993b2ff221999066fcff231590593d0b98f45d32 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Oct 2014 20:27:00 -0700 Subject: futex: Mention key referencing differences between shared and private futexes Update our documentation as of fix 76835b0ebf8 (futex: Ensure get_futex_key_refs() always implies a barrier). Explicitly state that we don't do key referencing for private futexes. Signed-off-by: Davidlohr Bueso Cc: Matteo Franchin Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Darren Hart Cc: Peter Zijlstra Cc: Paul E. 
McKenney Acked-by: Catalin Marinas Link: http://lkml.kernel.org/r/1414121220.817.0.camel@linux-t7sj.site Signed-off-by: Thomas Gleixner --- kernel/futex.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f3a3a071283c..bbf071f325b8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. + * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key) futex_get_mm(key); /* implies MB (B) */ break; default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { -- cgit v1.2.3 From 30a6b8031fe14031ab27c1fa3483cb9780e7f63c Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Sat, 25 Oct 2014 20:20:37 -0400 Subject: futex: Fix a race condition between REQUEUE_PI and task death free_pi_state and exit_pi_state_list both clean up futex_pi_state's. exit_pi_state_list takes the hb lock first, and most callers of free_pi_state do too. 
requeue_pi doesn't, which means free_pi_state can free the pi_state out from under exit_pi_state_list. For example: task A | task B exit_pi_state_list | pi_state = | curr->pi_state_list->next | | futex_requeue(requeue_pi=1) | // pi_state is the same as | // the one in task A | free_pi_state(pi_state) | list_del_init(&pi_state->list) | kfree(pi_state) list_del_init(&pi_state->list) | Move the free_pi_state calls in requeue_pi to before it drops the hb locks which it's already holding. [ tglx: Removed a pointless free_pi_state() call and the hb->lock held debugging. The latter comes via a separate patch ] Signed-off-by: Brian Silverman Cc: austin.linux@gmail.com Cc: darren@dvhart.com Cc: peterz@infradead.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1414282837-23092-1-git-send-email-bsilver16384@gmail.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bbf071f325b8..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -647,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. + */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -1527,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. 
- */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1625,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1640,6 +1639,8 @@ retry_private: * exit to complete. * - The user space value changed. */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1716,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1733,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } -- cgit v1.2.3 From 94fb823fcb4892614f57e59601bb9d4920f24711 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 Oct 2014 20:29:10 +0300 Subject: PM / Sleep: fix recovery during resuming from hibernation If a device's dev_pm_ops::freeze callback fails during the QUIESCE phase, we don't rollback things correctly calling the thaw and complete callbacks. This could leave some devices in a suspended state in case of an error during resuming from hibernation. Signed-off-by: Imre Deak Cc: All applicable Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..1f35a3478f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode) error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. 
Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); resume_console(); pm_restore_console(); -- cgit v1.2.3 From f89b7755f517cdbb755d7543eef986ee9d54e654 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 23 Oct 2014 18:41:08 -0700 Subject: bpf: split eBPF out of NET introduce two configs: - hidden CONFIG_BPF to select eBPF interpreter that classic socket filters depend on - visible CONFIG_BPF_SYSCALL (default off) that tracing and sockets can use that solves several problems: - tracing and others that wish to use eBPF don't need to depend on NET. They can use BPF_SYSCALL to allow loading from userspace or select BPF to use it directly from kernel in NET-less configs. - in 3.18 programs cannot be attached to events yet, so don't force it on - when the rest of eBPF infra is there in 3.19+, it's still useful to switch it off to minimize kernel size bloat-o-meter on x64 shows: add/remove: 0/60 grow/shrink: 0/2 up/down: 0/-15601 (-15601) tested with many different config combinations. Hopefully didn't miss anything. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- init/Kconfig | 14 ++++++++++++++ kernel/Makefile | 2 +- kernel/bpf/Makefile | 6 +++--- kernel/bpf/core.c | 9 +++++++++ net/Kconfig | 2 +- 5 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 3ee28ae02cc8..2081a4d3d917 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,10 @@ config SYSCTL_ARCH_UNALIGN_ALLOW config HAVE_PCSPKR_PLATFORM bool +# interpreter that classic socket filters depend on +config BPF + bool + menuconfig EXPERT bool "Configure standard kernel features (expert users)" # Unhide debug options, to make the on-by-default options visible @@ -1521,6 +1525,16 @@ config EVENTFD If unsure, say Y. 
+# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" if EXPERT + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + config SHMEM bool "Use full shmem filesystem" if EXPERT default y diff --git a/kernel/Makefile b/kernel/Makefile index dc5c77544fd6..17ea6d4a9a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o -obj-$(CONFIG_NET) += bpf/ +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 45427239f375..0daf7f6ae7df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ -obj-y := core.o syscall.o verifier.o - +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o ifdef CONFIG_TEST_BPF -obj-y += test_stub.o +obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0c30c59b317..d6594e457a25 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp) schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/net/Kconfig b/net/Kconfig index 6272420a721b..99815b5454bf 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -6,7 +6,7 @@ menuconfig NET bool "Networking support" select NLATTR select GENERIC_NET_UTILS - select ANON_INODES + select BPF ---help--- Unless you really know what you are doing, you should say Y here. 
The reason is that some programs need kernel networking support even -- cgit v1.2.3 From c719f56092add9b3d4192f57c64ce7af11105130 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Oct 2014 11:10:21 +0200 Subject: perf: Fix and clean up initialization of pmu::event_idx Andy reported that the current state of event_idx is rather confused. So remove all but the x86_pmu implementation and change the default to return 0 (the safe option). Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Christoph Lameter Cc: Cody P Schafer Cc: Cody P Schafer Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Himangi Saraogi Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: sukadev@linux.vnet.ibm.com Cc: Thomas Huth Cc: Vince Weaver Cc: linux390@de.ibm.com Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/perf/hv-24x7.c | 6 ------ arch/powerpc/perf/hv-gpci.c | 6 ------ arch/s390/kernel/perf_cpum_sf.c | 6 ------ kernel/events/core.c | 15 +-------------- kernel/events/hw_breakpoint.c | 7 ------- 5 files changed, 1 insertion(+), 39 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 6c8710dd90c9..dba34088da28 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_event *event, int flags) return 0; } -static int h_24x7_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu h_24x7_pmu = { .task_ctx_nr = perf_invalid_context, @@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = { .start = h_24x7_event_start, .stop = h_24x7_event_stop, .read = h_24x7_event_update, - .event_idx = h_24x7_event_idx, }; static int hv_24x7_init(void) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 15fc76c93022..a051fe946c63 100644 --- 
a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf_event *event) return 0; } -static int h_gpci_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu h_gpci_pmu = { .task_ctx_nr = perf_invalid_context, @@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = { .start = h_gpci_event_start, .stop = h_gpci_event_stop, .read = h_gpci_event_update, - .event_idx = h_gpci_event_idx, }; static int hv_gpci_init(void) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 08e761318c17..b878f12a9597 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_event *event, int flags) perf_pmu_enable(event->pmu); } -static int cpumsf_pmu_event_idx(struct perf_event *event) -{ - return event->hw.idx; -} - CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF); CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG); @@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = { .stop = cpumsf_pmu_stop, .read = cpumsf_pmu_read, - .event_idx = cpumsf_pmu_event_idx, .attr_groups = cpumsf_pmu_attr_groups, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1425d07018de..2b02c9fda790 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = 
perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; /* @@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) -- cgit v1.2.3 From d7e29933969e5ca7c112ce1368a07911f4485dc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 09:15:54 -0700 Subject: rcu: Make rcu_barrier() understand about missing rcuo kthreads Commit 35ce7f29a44a (rcu: Create rcuo kthreads only for onlined CPUs) avoids creating rcuo kthreads for CPUs that never come online. This fixes a bug in many instances of firmware: Instead of lying about their age, these systems instead lie about the number of CPUs that they have. 
Before commit 35ce7f29a44a, this could result in huge numbers of useless rcuo kthreads being created. It appears that experience indicates that I should have told the people suffering from this problem to fix their broken firmware, but I instead produced what turned out to be a partial fix. The missing piece supplied by this commit makes sure that rcu_barrier() knows not to post callbacks for no-CBs CPUs that have not yet come online, because otherwise rcu_barrier() will hang on systems having firmware that lies about the number of CPUs. It is tempting to simply have rcu_barrier() refuse to post a callback on any no-CBs CPU that does not have an rcuo kthread. This unfortunately does not work because rcu_barrier() is required to wait for all pending callbacks. It is therefore required to wait even for those callbacks that cannot possibly be invoked, even if doing so hangs the system. Given that posting a callback to a no-CBs CPU that does not yet have an rcuo kthread can hang rcu_barrier(), it is tempting to report an error in this case. Unfortunately, this will result in false positives at boot time, when it is perfectly legal to post callbacks to the boot CPU before the scheduler has started, in other words, before it is legal to invoke rcu_barrier(). So this commit instead has rcu_barrier() avoid posting callbacks to CPUs having neither rcuo kthread nor pending callbacks, and has it complain bitterly if it finds CPUs having no rcuo kthread but some pending callbacks. And when rcu_barrier() does find CPUs having no rcuo kthread but pending callbacks, as noted earlier, it has no choice but to hang indefinitely. Reported-by: Yanko Kaneti Reported-by: Jay Vosburgh Reported-by: Meelis Roos Reported-by: Eric B Munson Signed-off-by: Paul E. 
McKenney Tested-by: Eric B Munson Tested-by: Jay Vosburgh Tested-by: Yanko Kaneti Tested-by: Kevin Fenzi Tested-by: Meelis Roos --- include/trace/events/rcu.h | 18 +++++++++--------- kernel/rcu/tree.c | 15 ++++++++++----- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9b56f37148cf..e335e7d8c6c2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -660,18 +660,18 @@ TRACE_EVENT(rcu_torture_read, /* * Tracepoint for _rcu_barrier() execution. The string "s" describes * the _rcu_barrier phase: - * "Begin": rcu_barrier_callback() started. - * "Check": rcu_barrier_callback() checking for piggybacking. - * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. - * "Inc1": rcu_barrier_callback() piggyback check counter incremented. - * "Offline": rcu_barrier_callback() found offline CPU - * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU. - * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. - * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. + * "Begin": _rcu_barrier() started. + * "Check": _rcu_barrier() checking for piggybacking. + * "EarlyExit": _rcu_barrier() piggybacked, thus early exit. + * "Inc1": _rcu_barrier() piggyback check counter incremented. + * "OfflineNoCB": _rcu_barrier() found callback on never-online CPU + * "OnlineNoCB": _rcu_barrier() found online no-CBs CPU. + * "OnlineQ": _rcu_barrier() found online CPU with callbacks. + * "OnlineNQ": _rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. - * "Inc2": rcu_barrier_callback() piggyback check counter incremented. 
+ * "Inc2": _rcu_barrier() piggyback check counter incremented. * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument * is the count of remaining callbacks, and "done" is the piggybacking count. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 133e47223095..9815447d22e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d03764652d91..bbdc45d8d74f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -587,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 387dd4599344..c1d7f27bd38f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2049,6 +2049,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } } +/* + * Does the 
specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? + */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + /* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the @@ -2642,6 +2669,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } -- cgit v1.2.3 From f601de204465048bdf0d5537f630729622ebc3a6 Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Wed, 29 Oct 2014 14:50:24 -0700 Subject: gcov: add ARM64 to GCOV_PROFILE_ALL Following up the arm testing of gcov, turns out gcov on ARM64 works fine as well. Only change needed is adding ARM64 to Kconfig depends. 
Tested with qemu and mach-virt Signed-off-by: Riku Voipio Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index cf66c5c8458e..3b7408759bdf 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3 From 0baf2a4dbf75abb7c186fd6c8d55d27aaa354a29 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 29 Oct 2014 14:50:35 -0700 Subject: kernel/kmod: fix use-after-free of the sub_info structure Found this in the message log on a s390 system: BUG kmalloc-192 (Not tainted): Poison overwritten Disabling lock debugging due to kernel taint INFO: 0x00000000684761f4-0x00000000684761f7. First byte 0xff instead of 0x6b INFO: Allocated in call_usermodehelper_setup+0x70/0x128 age=71 cpu=2 pid=648 __slab_alloc.isra.47.constprop.56+0x5f6/0x658 kmem_cache_alloc_trace+0x106/0x408 call_usermodehelper_setup+0x70/0x128 call_usermodehelper+0x62/0x90 cgroup_release_agent+0x178/0x1c0 process_one_work+0x36e/0x680 worker_thread+0x2f0/0x4f8 kthread+0x10a/0x120 kernel_thread_starter+0x6/0xc kernel_thread_starter+0x0/0xc INFO: Freed in call_usermodehelper_exec+0x110/0x1b8 age=71 cpu=2 pid=648 __slab_free+0x94/0x560 kfree+0x364/0x3e0 call_usermodehelper_exec+0x110/0x1b8 cgroup_release_agent+0x178/0x1c0 process_one_work+0x36e/0x680 worker_thread+0x2f0/0x4f8 kthread+0x10a/0x120 kernel_thread_starter+0x6/0xc kernel_thread_starter+0x0/0xc There is a use-after-free bug on the subprocess_info structure allocated by the user mode helper. 
In case do_execve() returns with an error ____call_usermodehelper() stores the error code to sub_info->retval, but sub_info can already have been freed. Regarding UMH_NO_WAIT, the sub_info structure can be freed by __call_usermodehelper() before the worker thread returns from do_execve(), allowing memory corruption when do_execve() failed after exec_mmap() is called. Regarding UMH_WAIT_EXEC, the call to umh_complete() allows call_usermodehelper_exec() to continue which then frees sub_info. To fix this race the code needs to make sure that the call to call_usermodehelper_freeinfo() is always done after the last store to sub_info->retval. Signed-off-by: Martin Schwidefsky Reviewed-by: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 76 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..80f7a6d00519 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. 
+ */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + int wait = sub_info->wait & ~UMH_KILLABLE; struct cred *new; int retval; @@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (wait != UMH_WAIT_PROC) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } @@ -258,26 +281,6 @@ static int call_helper(void *data) return ____call_usermodehelper(data); } -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. 
*/ static int wait_for_helper(void *data) { @@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work) kmod_thread_locker = NULL; } - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } - sub_info->complete = &done; + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); -- cgit v1.2.3 From 086ba77a6db00ed858ff07451bedee197df868c9 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 29 Oct 2014 23:06:58 +0100 Subject: tracing/syscalls: Ignore numbers outside NR_syscalls' range ARM has some private syscalls (for example, set_tls(2)) which lie outside the range of NR_syscalls. If any of these are called while syscall tracing is being performed, out-of-bounds array access will occur in the ftrace and perf sys_{enter,exit} handlers. # trace-cmd record -e raw_syscalls:* true && trace-cmd report ... true-653 [000] 384.675777: sys_enter: NR 192 (0, 1000, 3, 4000022, ffffffff, 0) true-653 [000] 384.675812: sys_exit: NR 192 = 1995915264 true-653 [000] 384.675971: sys_enter: NR 983045 (76f74480, 76f74000, 76f74b28, 76f74480, 76f76f74, 1) true-653 [000] 384.675988: sys_exit: NR 983045 = 0 ... 
# trace-cmd record -e syscalls:* true [ 17.289329] Unable to handle kernel paging request at virtual address aaaaaace [ 17.289590] pgd = 9e71c000 [ 17.289696] [aaaaaace] *pgd=00000000 [ 17.289985] Internal error: Oops: 5 [#1] PREEMPT SMP ARM [ 17.290169] Modules linked in: [ 17.290391] CPU: 0 PID: 704 Comm: true Not tainted 3.18.0-rc2+ #21 [ 17.290585] task: 9f4dab00 ti: 9e710000 task.ti: 9e710000 [ 17.290747] PC is at ftrace_syscall_enter+0x48/0x1f8 [ 17.290866] LR is at syscall_trace_enter+0x124/0x184 Fix this by ignoring out-of-NR_syscalls-bounds syscall numbers. Commit cd0980fc8add "tracing: Check invalid syscall nr while tracing syscalls" added the check for less than zero, but it should have also checked for greater than NR_syscalls. Link: http://lkml.kernel.org/p/1414620418-29472-1-git-send-email-rabin@rab.in Fixes: cd0980fc8add "tracing: Check invalid syscall nr while tracing syscalls" Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4dc8b79c5f75..29228c4d5696 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, 
struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; -- cgit v1.2.3 From f7b8a47da17c9ee4998f2ca2018fcc424e953c0e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 28 Oct 2014 08:24:34 +0300 Subject: sched: Remove lockdep check in sched_move_task() sched_move_task() is the only interface to change sched_task_group: cpu_cgrp_subsys methods and autogroup_move_group() use it. Everything is synchronized by task_rq_lock(), so cpu_cgroup_attach() is ordered with other users of sched_move_task(). This means we do not need RCU here: if we've dereferenced a tg here, the .attach method hasn't been called for it yet. Thus, we should pass "true" to task_css_check() to silence lockdep warnings.
Fixes: eeb61e53ea19 ("sched: Fix race between task_group and sched_task_group") Reported-by: Oleg Nesterov Reported-by: Fengguang Wu Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414473874.8574.2.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 240157c13ddc..6841fb46eb07 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7444,8 +7444,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgrp_id, - lockdep_is_held(&tsk->sighand->siglock)), + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); tsk->sched_task_group = tg; -- cgit v1.2.3 From c123588b3b193d06588dfb51f475407f835ebfb2 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 7 Nov 2014 17:53:40 +0300 Subject: sched/numa: Fix out of bounds read in sched_init_numa() On latest mm + KASan patchset I've got this: ================================================================== BUG: AddressSanitizer: out of bounds access in sched_init_smp+0x3ba/0x62c at addr ffff88006d4bee6c ============================================================================= BUG kmalloc-8 (Not tainted): kasan error ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in alloc_vfsmnt+0xb0/0x2c0 age=75 cpu=0 pid=0 __slab_alloc+0x4b4/0x4f0 __kmalloc_track_caller+0x15f/0x1e0 kstrdup+0x44/0x90 alloc_vfsmnt+0xb0/0x2c0 vfs_kern_mount+0x35/0x190 kern_mount_data+0x25/0x50 
pid_ns_prepare_proc+0x19/0x50 alloc_pid+0x5e2/0x630 copy_process.part.41+0xdf5/0x2aa0 do_fork+0xf5/0x460 kernel_thread+0x21/0x30 rest_init+0x1e/0x90 start_kernel+0x522/0x531 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x15b/0x16a INFO: Slab 0xffffea0001b52f80 objects=24 used=22 fp=0xffff88006d4befc0 flags=0x100000000004080 INFO: Object 0xffff88006d4bed20 @offset=3360 fp=0xffff88006d4bee70 Bytes b4 ffff88006d4bed10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ Object ffff88006d4bed20: 70 72 6f 63 00 6b 6b a5 proc.kk. Redzone ffff88006d4bed28: cc cc cc cc cc cc cc cc ........ Padding ffff88006d4bee68: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ CPU: 0 PID: 1 Comm: swapper/0 Tainted: G B 3.18.0-rc3-mm1+ #108 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 ffff88006d4be000 0000000000000000 ffff88006d4bed20 ffff88006c86fd18 ffffffff81cd0a59 0000000000000058 ffff88006d404240 ffff88006c86fd48 ffffffff811fa3a8 ffff88006d404240 ffffea0001b52f80 ffff88006d4bed20 Call Trace: dump_stack (lib/dump_stack.c:52) print_trailer (mm/slub.c:645) object_err (mm/slub.c:652) ? sched_init_smp (kernel/sched/core.c:6552 kernel/sched/core.c:7063) kasan_report_error (mm/kasan/report.c:102 mm/kasan/report.c:178) ? kasan_poison_shadow (mm/kasan/kasan.c:48) ? kasan_unpoison_shadow (mm/kasan/kasan.c:54) ? kasan_poison_shadow (mm/kasan/kasan.c:48) ? kasan_kmalloc (mm/kasan/kasan.c:311) __asan_load4 (mm/kasan/kasan.c:371) ? sched_init_smp (kernel/sched/core.c:6552 kernel/sched/core.c:7063) sched_init_smp (kernel/sched/core.c:6552 kernel/sched/core.c:7063) kernel_init_freeable (init/main.c:869 init/main.c:997) ? finish_task_switch (kernel/sched/sched.h:1036 kernel/sched/core.c:2248) ? rest_init (init/main.c:924) kernel_init (init/main.c:929) ? rest_init (init/main.c:924) ret_from_fork (arch/x86/kernel/entry_64.S:348) ? 
rest_init (init/main.c:924) Read of size 4 by task swapper/0: Memory state around the buggy address: ffff88006d4beb80: fc fc fc fc fc fc fc fc fc fc 00 fc fc fc fc fc ffff88006d4bec00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88006d4bec80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88006d4bed00: fc fc fc fc 00 fc fc fc fc fc fc fc fc fc fc fc ffff88006d4bed80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88006d4bee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc 04 fc ^ ffff88006d4bee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88006d4bef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88006d4bef80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff88006d4bf000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88006d4bf080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Zero 'level' (e.g. on non-NUMA system) causing out of bounds access in this line: sched_max_numa_distance = sched_domains_numa_distance[level - 1]; Fix this by exiting from sched_init_numa() earlier. Signed-off-by: Andrey Ryabinin Reviewed-by: Rik van Riel Fixes: 9942f79ba ("sched/numa: Export info needed for NUMA balancing on complex topologies") Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1415372020-1871-1-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6841fb46eb07..5f12ca65c9a7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6368,6 +6368,10 @@ static void sched_init_numa(void) if (!sched_debug()) break; } + + if (!level) + return; + /* * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). 
-- cgit v1.2.3 From 7af683350cb0ddd0e9d3819b4eb7abe9e2d3e709 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2014 10:54:35 +0100 Subject: sched/numa: Avoid selecting oneself as swap target Because the whole numa task selection stuff runs with preemption enabled (it's long and expensive) we can end up migrating and selecting oneself as a swap target. This doesn't really work out well -- we end up trying to acquire the same lock twice for the swap migrate -- so avoid this. Reported-and-Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141110100328.GF29390@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 34baa60f8a7b..3af3d1e7df9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1179,6 +1179,13 @@ static void task_numa_compare(struct task_numa_env *env, cur = NULL; raw_spin_unlock_irq(&dst_rq->lock); + /* + * Because we have preemption enabled we can get migrated around and + * end try selecting ourselves (current == env->p) as a swap candidate. + */ + if (cur == env->p) + goto unlock; + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for -- cgit v1.2.3 From 23cfa361f3e54a3e184a5e126bbbdd95f984881a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Nov 2014 12:37:37 +0100 Subject: sched/cputime: Fix cpu_timer_sample_group() double accounting While looking over the cpu-timer code I found that we appear to add the delta for the calling task twice, through: cpu_timer_sample_group() thread_group_cputimer() thread_group_cputime() times->sum_exec_runtime += task_sched_runtime(); *sample = cputime.sum_exec_runtime + task_delta_exec(); Which would make the sample run ahead, making the sleep short.
Signed-off-by: Peter Zijlstra (Intel) Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Stanislaw Gruszka Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Rik van Riel Cc: Tejun Heo Link: http://lkml.kernel.org/r/20141112113737.GI10476@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 5 ----- kernel/sched/core.c | 13 ------------- kernel/time/posix-cpu-timers.c | 2 +- 3 files changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 8422b4ed6882..b9376cd5a187 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -77,11 +77,6 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } -/* - * Lock/unlock the current runqueue - to extract task statistics: - */ -extern unsigned long long task_delta_exec(struct task_struct *); - extern void account_user_time(struct task_struct *, cputime_t, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); extern void account_steal_time(cputime_t); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5f12ca65c9a7..797a6c84c48d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2499,19 +2499,6 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) return ns; } -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - /* * Return accounted runtime for the task. 
* In case the task is currently running, return the runtime plus current's diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 492b986195d5..a16b67859e2a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -553,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime; break; } return 0; -- cgit v1.2.3 From 6e998916dfe327e785e7c2447959b2c1a3ea4930 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 12 Nov 2014 16:58:44 +0100 Subject: sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency Commit d670ec13178d0 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc test case at the cost of breaking another one. After that commit, calling clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result in Y time being smaller than X time. Reproducer/tester can be found further below, it can be compiled and run with: gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread while ./tst-cpuclock2 ; do : ; done This reproducer, when running on a buggy kernel, will complain about "clock_gettime difference too small". Issue happens because on start in thread_group_cputimer() we initialize sum_exec_runtime of cputimer with threads runtime not yet accounted and then add the threads runtime to running cputimer again on scheduler tick, making its sum_exec_runtime bigger than actual threads runtime. KOSAKI Motohiro posted a fix for this problem, but that patch was never applied: https://lkml.org/lkml/2013/5/26/191 . This patch takes a different approach to cure the problem. It calls update_curr() when cputimer starts, which assures we will have updated stats of running threads and on the next schedule tick we will account only the runtime that elapsed from cputimer start.
That also assure we have consistent state between cpu times of individual threads and cpu time of the process consisted by those threads. Full reproducer (tst-cpuclock2.c): #define _GNU_SOURCE #include #include #include #include #include #include #include /* Parameters for the Linux kernel ABI for CPU clocks. */ #define CPUCLOCK_SCHED 2 #define MAKE_PROCESS_CPUCLOCK(pid, clock) \ ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)) static pthread_barrier_t barrier; /* Help advance the clock. */ static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) ; return NULL; } /* Don't use the glibc wrapper. */ static int do_nanosleep(int flags, const struct timespec *req) { clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED); return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL); } static int64_t tsdiff(const struct timespec *before, const struct timespec *after) { int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec; int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec; return after_i - before_i; } int main(void) { int result = 0; pthread_t th; pthread_barrier_init(&barrier, NULL, 2); if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) { perror("pthread_create"); return 1; } pthread_barrier_wait(&barrier); /* The test. */ struct timespec before, after, sleeptimeabs; int64_t sleepdiff, diffabs; const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 }; /* The relative nanosleep. Not sure why this is needed, but its presence seems to make it easier to reproduce the problem. */ if (do_nanosleep(0, &sleeptime) != 0) { perror("clock_nanosleep"); return 1; } /* Get the current time. */ if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) { perror("clock_gettime[2]"); return 1; } /* Compute the absolute sleep time based on the current time. 
*/ uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec; sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000; sleeptimeabs.tv_nsec = nsec % 1000000000; /* Sleep for the computed time. */ if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) { perror("absolute clock_nanosleep"); return 1; } /* Get the time after the sleep. */ if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) { perror("clock_gettime[3]"); return 1; } /* The time after sleep should always be equal to or after the absolute sleep time passed to clock_nanosleep. */ sleepdiff = tsdiff(&sleeptimeabs, &after); if (sleepdiff < 0) { printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff); result = 1; printf("Before %llu.%09llu\n", before.tv_sec, before.tv_nsec); printf("After %llu.%09llu\n", after.tv_sec, after.tv_nsec); printf("Sleep %llu.%09llu\n", sleeptimeabs.tv_sec, sleeptimeabs.tv_nsec); } /* The difference between the timestamps taken before and after the clock_nanosleep call should be equal to or more than the duration of the sleep. 
*/ diffabs = tsdiff(&before, &after); if (diffabs < sleeptime.tv_nsec) { printf("clock_gettime difference too small: %" PRId64 "\n", diffabs); result = 1; } pthread_cancel(th); return result; } Signed-off-by: Stanislaw Gruszka Signed-off-by: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Frederic Weisbecker Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141112155843.GA24803@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 38 +++++++++++--------------------------- kernel/sched/deadline.c | 2 ++ kernel/sched/fair.c | 7 +++++++ kernel/sched/rt.c | 2 ++ kernel/sched/sched.h | 2 ++ 5 files changed, 24 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 797a6c84c48d..24beb9bb4c3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2474,31 +2474,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -/* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. - */ -static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (task_current(rq, p) && task_on_rq_queued(p)) { - update_rq_clock(rq); - ns = rq_clock_task(rq) - p->se.exec_start; - if ((s64)ns < 0) - ns = 0; - } - - return ns; -} - /* * Return accounted runtime for the task. 
* In case the task is currently running, return the runtime plus current's @@ -2508,7 +2483,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; struct rq *rq; - u64 ns = 0; + u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* @@ -2527,7 +2502,16 @@ unsigned long long task_sched_runtime(struct task_struct *p) #endif rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && task_on_rq_queued(p)) { + update_rq_clock(rq); + p->sched_class->update_curr(rq); + } + ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &flags); return ns; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5285332392d5..28fa9d9e9201 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1701,4 +1701,6 @@ const struct sched_class dl_sched_class = { .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, + + .update_curr = update_curr_dl, }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3af3d1e7df9b..ef2b104b254c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); } +static void update_curr_fair(struct rq *rq) +{ + update_curr(cfs_rq_of(&rq->curr->se)); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -7956,6 +7961,8 @@ const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, + .update_curr = update_curr_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d024e6ce30ba..20bca398084a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2128,6 +2128,8 @@ 
const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + + .update_curr = update_curr_rt, }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 24156c8434d1..2df8ef067cc5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1135,6 +1135,8 @@ struct sched_class { unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); + void (*update_curr) (struct rq *rq); + #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p, int on_rq); #endif -- cgit v1.2.3