From 3a150df945b7408c27cad2c01a1638e8b14ac562 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Wed, 22 Feb 2017 08:29:26 +0800 Subject: tracing: Fix code comment for ftrace_ops_get_func() There is no function 'ftrace_ops_recurs_func' existing in the current code, it was renamed to ftrace_ops_assist_func() in commit c68c0fa29341 ("ftrace: Have ftrace_ops_get_func() handle RCU and PER_CPU flags too"). Update the comment to the correct function name. Link: http://lkml.kernel.org/r/1487723366-14463-1-git-send-email-chuhu@redhat.com Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0c0609326391..fd84f2e30b6d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5487,7 +5487,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, * Normally the mcount trampoline will call the ops->func, but there * are times that it should not. For example, if the ops does not * have its own recursion protection, then it should call the - * ftrace_ops_recurs_func() instead. + * ftrace_ops_assist_func() instead. * * Returns the function that the trampoline should call for @ops. */ -- cgit v1.2.3 From 4c77b18cf8b7ab37c7d5737b4609010d2ceec5f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Mar 2017 11:24:35 +0100 Subject: sched/fair: Make select_idle_cpu() more aggressive Kitsunyan reported desktop latency issues on his Celeron 887 because of commit: 1b568f0aabf2 ("sched/core: Optimize SCHED_SMT") ... even though his CPU doesn't do SMT. The effect of running the SMT code on a !SMT part is basically a more aggressive select_idle_cpu(). Removing the avg condition fixed things for him. I also know FB likes this test gone, even though other workloads like having it. For now, take it out by default, until we get a better idea. Reported-by: kitsunyan Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- kernel/sched/features.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 274c747a01ce..b3ee10dd3e85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5797,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. */ - if ((avg_idle / 512) < avg_cost) + if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) return -1; time = local_clock(); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..1b3c8189b286 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -51,6 +51,11 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_AVG_CPU, false) + #ifdef HAVE_RT_PUSH_IPI /* * In order to avoid a thundering herd attack of CPUs that are -- cgit v1.2.3 From 0ba87bb27d66b78e278167ac7e20c66520b8a612 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Mar 2017 10:51:47 +0100 Subject: sched/core: Fix pick_next_task() for RT,DL Pavan noticed that the following commit: 49ee576809d8 ("sched/core: Optimize pick_next_task() for idle_sched_class") ... broke RT,DL balancing by robbing them of the opportinty to do new-'idle' balancing when their last runnable task (on that runqueue) goes away. Reported-by: Pavan Kondeti Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 49ee576809d8 ("sched/core: Optimize pick_next_task() for idle_sched_class") Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bbfb917a9b49..6699d43a8843 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3273,10 +3273,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a + * higher scheduling class, because otherwise those loose the + * opportunity to pull in more work from other CPUs. */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + if (likely((prev->sched_class == &idle_sched_class || + prev->sched_class == &fair_sched_class) && + rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; -- cgit v1.2.3 From 2b232e0c3b3a09f3e33750aa20e314f1b80e5361 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 28 Feb 2017 09:40:11 +0000 Subject: locking/ww_mutex: Replace cpu_relax() with cond_resched() for tests When busy-spinning on a ww_mutex_trylock(), we depend upon the other thread advancing and releasing the lock. This can not happen on a single CPU unless we relinquish it: [ ] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [kworker/0:1:18] ... [ ] Call Trace: [ ] mutex_trylock() [ ] test_mutex_work+0x31/0x56 [ ] process_one_work+0x1b4/0x2f9 [ ] worker_thread+0x1b0/0x27c [ ] kthread+0xd1/0xd3 [ ] ret_from_fork+0x19/0x30 Reported-by: Fengguang Wu Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f2a5fec17395 ("locking/ww_mutex: Begin kselftests for ww_mutex") Link: http://lkml.kernel.org/r/20170228094011.2595-1-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index da6c9a34f62f..3eb39c588397 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -50,7 +50,7 @@ static void test_mutex_work(struct work_struct *work) if (mtx->flags & TEST_MTX_TRY) { while (!ww_mutex_trylock(&mtx->mutex)) - cpu_relax(); + cond_resched(); } else { ww_mutex_lock(&mtx->mutex, NULL); } @@ -88,7 +88,7 @@ static int __test_mutex(unsigned int flags) ret = -EINVAL; break; } - cpu_relax(); + cond_resched(); } while (time_before(jiffies, timeout)); } else { ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); -- cgit v1.2.3 From 7fb4a2cea6b18dab56d609530d077f168169ed6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Mar 2017 16:23:30 +0100 Subject: locking/lockdep: Add nest_lock integrity test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boqun reported that hlock->references can overflow. Add a debug test for that to generate a clear error when this happens. Without this, lockdep is likely to report a mysterious failure on unlock. Reported-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Linus Torvalds Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9812e5dd409e..c0ee8607c11e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3260,10 +3260,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) + if (hlock->references) { + /* + * Check: unsigned int references:12, overflow. + */ + if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) + return 0; + hlock->references++; - else + } else { hlock->references = 2; + } return 1; } -- cgit v1.2.3 From 857811a37129f5d2ba162d7be3986eff44724014 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 1 Mar 2017 23:01:38 +0800 Subject: locking/ww_mutex: Adjust the lock number for stress test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because there are only 12 bits in held_lock::references, so we only support 4095 nested lock held in the same time, adjust the lock number for ww_mutex stress test to kill one lockdep splat: [ ] [ BUG: bad unlock balance detected! ] [ ] kworker/u2:0/5 is trying to release lock (ww_class_mutex) at: [ ] ww_mutex_unlock() [ ] but there are no more locks to release! ... Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Fengguang Wu Cc: Linus Torvalds Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170301150138.hdixnmafzfsox7nn@tardis.cn.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 3eb39c588397..6b7abb334ca6 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -627,7 +627,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); if (ret) return ret; -- cgit v1.2.3 From 92ad18ec26611e8a47639e600bd9dc42fbe2edcf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Mar 2017 12:53:26 -0500 Subject: ftrace/graph: Do not modify the EMPTY_HASH for the function_graph filter On boot up, if the kernel command line sets a graph funtion with the kernel command line options "ftrace_graph_filter" or "ftrace_graph_notrace" then it updates the corresponding function graph hash, ftrace_graph_hash or ftrace_graph_notrace_hash respectively. Unfortunately, at boot up, these variables are pointers to the "EMPTY_HASH" which is a constant used as a placeholder when a hash has no entities. The problem was that the comand line version to set the hashes updated the actual EMPTY_HASH instead of creating a new hash for the function graph. This broke the EMPTY_HASH because not only did it modify a constant (not sure how that was allowed to happen, except maybe because it was done at early boot, const variables were still mutable), but it made the filters have functions listed in them when they were actually empty. The kernel command line function needs to allocate a new hash for the function graph filters and assign the necessary variables to that new hash instead. Link: http://lkml.kernel.org/r/1488420091.7212.17.camel@linux.intel.com Cc: Namhyung Kim Fixes: b9b0c831bed2 ("ftrace: Convert graph filter to use hash tables") Reported-by: Todd Brandt Tested-by: Todd Brandt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fd84f2e30b6d..44122e7a6418 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4421,10 +4421,9 @@ static void __init set_ftrace_early_graph(char *buf, int enable) char *func; struct ftrace_hash *hash; - if (enable) - hash = ftrace_graph_hash; - else - hash = ftrace_graph_notrace_hash; + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (WARN_ON(!hash)) + return; while (buf) { func = strsep(&buf, ","); @@ -4434,6 +4433,11 @@ static void __init set_ftrace_early_graph(char *buf, int enable) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); } + + if (enable) + ftrace_graph_hash = hash; + else + ftrace_graph_notrace_hash = hash; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 65a50c656276b0846bea09dd011c0a3d35b77f3e Mon Sep 17 00:00:00 2001 From: Todd Brandt Date: Thu, 2 Mar 2017 16:12:15 -0800 Subject: ftrace/graph: Add ftrace_graph_max_depth kernel parameter Early trace callgraphs can be extremely large on systems with several seconds of boot time. The max_depth parameter limits how deep the graph trace goes and reduces the output size. This parameter is the same as the max_graph_depth file in tracefs. Link: http://lkml.kernel.org/r/1488499935-23216-1-git-send-email-todd.e.brandt@linux.intel.com Signed-off-by: Todd Brandt [ changed comments about debugfs to tracefs ] Signed-off-by: Steven Rostedt (VMware) --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ kernel/trace/ftrace.c | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 21e2d8863705..1c9016b27ee9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1173,6 +1173,12 @@ functions that can be changed at run time by the set_graph_notrace file in the debugfs tracing directory. + ftrace_graph_max_depth= + [FTRACE] Used with the function graph tracer. This is + the max depth it will trace into a function. This value + can be changed at run time by the max_graph_depth file + in the tracefs tracing directory. default: 0 (no limit) + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 44122e7a6418..d129ae51329a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4415,6 +4415,15 @@ static int __init set_graph_notrace_function(char *str) } __setup("ftrace_graph_notrace=", set_graph_notrace_function); +static int __init set_graph_max_depth_function(char *str) +{ + if (!str) + return 0; + fgraph_max_depth = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("ftrace_graph_max_depth=", set_graph_max_depth_function); + static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; -- cgit v1.2.3 From 6c4f0fa643cb9e775dcc976e3db00d649468ff1d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 2 Mar 2017 14:03:20 +0530 Subject: cpufreq: schedutil: move cached_raw_freq to struct sugov_policy cached_raw_freq applies to the entire cpufreq policy and not individual CPUs. Apart from wasting per-cpu memory, it is actually wrong to keep it in struct sugov_cpu as we may end up comparing next_freq with a stale cached_raw_freq of a random CPU. Move cached_raw_freq to struct sugov_policy. Fixes: 5cbea46984d6 (cpufreq: schedutil: map raw required frequency to driver frequency) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8f8de3d4d6b7..3ab2aeaec6ec 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -36,6 +36,7 @@ struct sugov_policy { u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; + unsigned int cached_raw_freq; /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; @@ -52,7 +53,6 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; - unsigned int cached_raw_freq; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -146,9 +146,9 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) return sg_policy->next_freq; - sg_cpu->cached_raw_freq = freq; + sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -580,6 +580,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -590,7 +591,6 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->max = 0; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->last_update = 0; - sg_cpu->cached_raw_freq = 0; sg_cpu->iowait_boost = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, -- cgit v1.2.3 From 655cb1ebff4b7918fc560502c3297af2d3c7d114 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 2 Mar 2017 14:03:21 +0530 Subject: cpufreq: schedutil: Pass sg_policy to get_next_freq() get_next_freq() uses sg_cpu only to get sg_policy, which the callers of get_next_freq() already have. Pass sg_policy instead of sg_cpu to get_next_freq(), to make it more efficient. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3ab2aeaec6ec..cd7cd489f739 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -116,7 +116,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -136,10 +136,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq (as calculated above) is returned, subject to policy min/max and * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, - unsigned long max) +static unsigned int get_next_freq(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; @@ -213,7 +212,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } else { sugov_get_util(&util, &max); sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_cpu, util, max); + next_f = get_next_freq(sg_policy, util, max); } sugov_update_commit(sg_policy, time, next_f); } @@ -267,7 +266,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, sugov_iowait_boost(j_sg_cpu, &util, &max); } - return get_next_freq(sg_cpu, util, max); + return get_next_freq(sg_policy, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, -- cgit v1.2.3 From f38837b08d23e66de17d46d030e0d9ac5172ad1f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 5 Mar 2017 09:41:08 -0800 Subject: bpf: add get_next_key callback to LPM map map_get_next_key callback is mandatory. Supply dummy handler. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Dmitry Vyukov Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8bfe0afaee10..b37bd9ab7f57 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -500,9 +500,15 @@ unlock: raw_spin_unlock(&trie->lock); } +static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + static const struct bpf_map_ops trie_ops = { .map_alloc = trie_alloc, .map_free = trie_free, + .map_get_next_key = trie_get_next_key, .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, -- cgit v1.2.3 From 1d18c2747f937f1b5ec65ce6bf4ccb9ca1aea9e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 1 Mar 2017 15:39:07 -0500 Subject: cgroup/pids: remove spurious suspicious RCU usage warning pids_can_fork() is special in that the css association is guaranteed to be stable throughout the function and thus doesn't need RCU protection around task_css access. When determining the css to charge the pid, task_css_check() is used to override the RCU sanity check. While adding a warning message on fork rejection from pids limit, 135b8b37bd91 ("cgroup: Add pids controller event when fork fails because of pid limit") incorrectly added a task_css access which is neither RCU protected or explicitly annotated. This triggers the following suspicious RCU usage warning when RCU debugging is enabled. cgroup: fork rejected by pids controller in =============================== [ ERR: suspicious RCU usage. ] 4.10.0-work+ #1 Not tainted ------------------------------- ./include/linux/cgroup.h:435 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 0 1 lock held by bash/1748: #0: (&cgroup_threadgroup_rwsem){+++++.}, at: [] _do_fork+0xe6/0x6e0 stack backtrace: CPU: 3 PID: 1748 Comm: bash Not tainted 4.10.0-work+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.fc25 04/01/2014 Call Trace: dump_stack+0x68/0x93 lockdep_rcu_suspicious+0xd7/0x110 pids_can_fork+0x1c7/0x1d0 cgroup_can_fork+0x67/0xc0 copy_process.part.58+0x1709/0x1e90 _do_fork+0xe6/0x6e0 SyS_clone+0x19/0x20 do_syscall_64+0x5c/0x140 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7f7853fab93a RSP: 002b:00007ffc12d05c90 EFLAGS: 00000246 ORIG_RAX: 0000000000000038 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7853fab93a RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011 RBP: 00007ffc12d05cc0 R08: 0000000000000000 R09: 00007f78548db700 R10: 00007f78548db9d0 R11: 0000000000000246 R12: 00000000000006d4 R13: 0000000000000001 R14: 0000000000000000 R15: 000055e3ebe2c04d /asdf There's no reason to dereference task_css again here when the associated css is already available. Fix it by replacing the task_cgroup() call with css->cgroup. Signed-off-by: Tejun Heo Reported-by: Mike Galbraith Fixes: 135b8b37bd91 ("cgroup: Add pids controller event when fork fails because of pid limit") Cc: Kenny Yu Cc: stable@vger.kernel.org # v4.8+ Signed-off-by: Tejun Heo --- kernel/cgroup/pids.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index e756dae49300..2237201d66d5 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task) /* Only log the first time events_limit is incremented. */ if (atomic64_inc_return(&pids->events_limit) == 1) { pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont_cgroup_path(css->cgroup); pr_cont("\n"); } cgroup_file_notify(&pids->events_file); -- cgit v1.2.3 From b6a6759daf55dade2b65089957832759d502acfb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 25 Feb 2017 01:56:48 -0800 Subject: cgroups: censor kernel pointer in debug files As found in grsecurity, this avoids exposing a kernel pointer through the cgroup debug entries. Signed-off-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 56eba9caa632..1dc22f6b49f5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1329,7 +1329,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct task_struct *task; int count = 0; - seq_printf(seq, "css_set %p\n", cset); + seq_printf(seq, "css_set %pK\n", cset); list_for_each_entry(task, &cset->tasks, cg_list) { if (count++ > MAX_TASKS_SHOWN_PER_CSS) -- cgit v1.2.3 From 637fdbae60d6cb9f6e963c1079d7e0445c86ff7d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Mar 2017 15:33:42 -0500 Subject: workqueue: trigger WARN if queue_delayed_work() is called with NULL @wq If queue_delayed_work() gets called with NULL @wq, the kernel will oops asynchronuosly on timer expiration which isn't too helpful in tracking down the offender. This actually happened with smc. __queue_delayed_work() already does several input sanity checks synchronously. Add NULL @wq check. Reported-by: Dave Jones Link: http://lkml.kernel.org/r/20170227171439.jshx3qplflyrgcv7@codemonkey.org.uk Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 072cbc9b175d..c0168b7da1ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1507,6 +1507,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); WARN_ON_ONCE(timer_pending(timer)); -- cgit v1.2.3 From 040757f738e13caaa9c5078bca79aa97e11dde88 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 5 Mar 2017 15:03:22 -0600 Subject: ucount: Remove the atomicity from ucount->count Always increment/decrement ucount->count under the ucounts_lock. The increments are there already and moving the decrements there means the locking logic of the code is simpler. This simplification in the locking logic fixes a race between put_ucounts and get_ucounts that could result in a use-after-free because the count could go zero then be found by get_ucounts and then be freed by put_ucounts. A bug presumably this one was found by a combination of syzkaller and KASAN. JongWhan Kim reported the syzkaller failure and Dmitry Vyukov spotted the race in the code. Cc: stable@vger.kernel.org Fixes: f6b2db1a3e8d ("userns: Make the count of user namespaces per user") Reported-by: JongHwan Kim Reported-by: Dmitry Vyukov Reviewed-by: Andrei Vagin Signed-off-by: "Eric W. Biederman" --- include/linux/user_namespace.h | 2 +- kernel/ucount.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index be765234c0a2..32354b4b4b2b 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -72,7 +72,7 @@ struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; - atomic_t count; + int count; atomic_t ucount[UCOUNT_COUNTS]; }; diff --git a/kernel/ucount.c b/kernel/ucount.c index 62630a40ab3a..b4eeee03934f 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -144,7 +144,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - atomic_set(&new->count, 0); + new->count = 0; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -155,8 +155,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) ucounts = new; } } - if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) + if (ucounts->count == INT_MAX) ucounts = NULL; + else + ucounts->count += 1; spin_unlock_irq(&ucounts_lock); return ucounts; } @@ -165,13 +167,15 @@ static void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_test(&ucounts->count)) { - spin_lock_irqsave(&ucounts_lock, flags); + spin_lock_irqsave(&ucounts_lock, flags); + ucounts->count -= 1; + if (!ucounts->count) hlist_del_init(&ucounts->node); - spin_unlock_irqrestore(&ucounts_lock, flags); + else + ucounts = NULL; + spin_unlock_irqrestore(&ucounts_lock, flags); - kfree(ucounts); - } + kfree(ucounts); } static inline bool atomic_inc_below(atomic_t *v, int u) -- cgit v1.2.3 From fa3aa7a54fe6d3abf128f13cd4bbd40eaa48fed2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Mar 2017 10:55:34 +0100 Subject: jiffies: Revert bogus conversion of NSEC_PER_SEC to TICK_NSEC commit 93825f2ec736 converted NSEC_PER_SEC to TICK_NSEC because the author confused NSEC_PER_JIFFY with NSEC_PER_SEC. As a result, the calculation of refined jiffies got broken, triggering lockups. Fixes: 93825f2ec736 ("jiffies: Reuse TICK_NSEC instead of NSEC_PER_JIFFY") Reported-and-tested-by: Meelis Roos Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1488880534-3777-1-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7906b3f0c41a..497719127bf9 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second) shift_hz += cycles_per_tick/2; do_div(shift_hz, cycles_per_tick); /* Calculate nsec_per_tick using shift_hz */ - nsec_per_tick = (u64)TICK_NSEC << 8; + nsec_per_tick = (u64)NSEC_PER_SEC << 8; nsec_per_tick += (u32)shift_hz/2; do_div(nsec_per_tick, (u32)shift_hz); -- cgit v1.2.3 From bd0f9b356d00aa241ced36fb075a07041c28d3b8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Mar 2017 15:33:14 -0800 Subject: sched/headers: fix up header file dependency on The scheduler header file split and cleanups ended up exposing a few nasty header file dependencies, and in particular it showed how we in ended up depending on "signal_pending()", which now comes from . That's a very subtle and annoying dependency, which already caused a semantic merge conflict (see commit e58bc927835a "Pull overlayfs updates from Miklos Szeredi", which added that fixup in the merge commit). It turns out that we can avoid this dependency _and_ improve code generation by moving the guts of the fairly nasty helper #define __wait_event_interruptible_locked() to out-of-line code. The code that includes the signal_pending() check is all in the slow-path where we actually go to sleep waiting for the event anyway, so using a helper function is the right thing to do. Using a helper function is also what we already did for the non-locked versions, see the "__wait_event*()" macros and the "prepare_to_wait*()" set of helper functions. We might want to try to unify all these macro games, we have a _lot_ of subtly different wait-event loops. But this is the minimal patch to fix the annoying header dependency. Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/wait.h | 31 ++++++++++--------------------- kernel/sched/wait.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index aacb1282d19a..db076ca7f11d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -620,30 +620,19 @@ do { \ __ret; \ }) +extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *); +extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); -#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \ +#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ - int __ret = 0; \ + int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ - if (likely(list_empty(&__wait.task_list))) \ - __add_wait_queue_tail(&(wq), &__wait); \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (signal_pending(current)) { \ - __ret = -ERESTARTSYS; \ + __ret = fn(&(wq), &__wait); \ + if (__ret) \ break; \ - } \ - if (irq) \ - spin_unlock_irq(&(wq).lock); \ - else \ - spin_unlock(&(wq).lock); \ - schedule(); \ - if (irq) \ - spin_lock_irq(&(wq).lock); \ - else \ - spin_lock(&(wq).lock); \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ @@ -676,7 +665,7 @@ do { \ */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 0)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true @@ -703,7 +692,7 @@ do { \ */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 1)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true @@ -734,7 +723,7 @@ do { \ */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 0)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true @@ -765,7 +754,7 @@ do { \ */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4d2ea6f25568..b8c84c6dee64 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -242,6 +242,45 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_event); +/* + * Note! These two wait functions are entered with the + * wait-queue lock held (and interrupts off in the _irq + * case), so there is no race with testing the wakeup + * condition in the caller before they add the wait + * entry to the wake queue. + */ +int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +{ + if (likely(list_empty(&wait->task_list))) + __add_wait_queue_tail(wq, wait); + + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + return -ERESTARTSYS; + + spin_unlock(&wq->lock); + schedule(); + spin_lock(&wq->lock); + return 0; +} +EXPORT_SYMBOL(do_wait_intr); + +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +{ + if (likely(list_empty(&wait->task_list))) + __add_wait_queue_tail(wq, wait); + + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + return -ERESTARTSYS; + + spin_unlock_irq(&wq->lock); + schedule(); + spin_lock_irq(&wq->lock); + return 0; +} +EXPORT_SYMBOL(do_wait_intr_irq); + /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on -- cgit v1.2.3 From 9f691549f76d488a0c74397b3e51e943865ea01f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 7 Mar 2017 20:00:12 -0800 Subject: bpf: fix struct htab_elem layout when htab_elem is removed from the bucket list the htab_elem.hash_node.next field should not be overridden too early otherwise we have a tiny race window between lookup and delete. The bug was discovered by manual code analysis and reproducible only with explicit udelay() in lookup_elem_raw(). Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Reported-by: Jonathan Perry Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ea87fb19a94..63c86a7be2a1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -45,8 +45,13 @@ enum extra_elem_state { struct htab_elem { union { struct hlist_node hash_node; - struct bpf_htab *htab; - struct pcpu_freelist_node fnode; + struct { + void *padding; + union { + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; + }; }; union { struct rcu_head rcu; @@ -162,7 +167,8 @@ skip_percpu_elems: offsetof(struct htab_elem, lru_node), htab->elem_size, htab->map.max_entries); else - pcpu_freelist_populate(&htab->freelist, htab->elems, + pcpu_freelist_populate(&htab->freelist, + htab->elems + offsetof(struct htab_elem, fnode), htab->elem_size, htab->map.max_entries); return 0; @@ -217,6 +223,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + BUILD_BUG_ON(offsetof(struct htab_elem, htab) != + offsetof(struct htab_elem, hash_node.pprev)); + BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != + offsetof(struct htab_elem, hash_node.pprev)); + if (lru && !capable(CAP_SYS_ADMIN)) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. @@ -582,9 +593,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, int err = 0; if (prealloc) { - l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); - if (!l_new) + struct pcpu_freelist_node *l; + + l = pcpu_freelist_pop(&htab->freelist); + if (!l) err = -E2BIG; + else + l_new = container_of(l, struct htab_elem, fnode); } else { if (atomic_inc_return(&htab->count) > htab->map.max_entries) { atomic_dec(&htab->count); -- cgit v1.2.3 From 4fe8435909fddc97b81472026aa954e06dd192a5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 7 Mar 2017 20:00:13 -0800 Subject: bpf: convert htab map to hlist_nulls when all map elements are pre-allocated one cpu can delete and reuse htab_elem while another cpu is still walking the hlist. In such case the lookup may miss the element. Convert hlist to hlist_nulls to avoid such scenario. When bucket lock is taken there is no need to take such precautions, so only convert map_lookup and map_get_next to nulls. The race window is extremely small and only reproducible with explicit udelay() inside lookup_nulls_elem_raw() Similar to hlist add hlist_nulls_for_each_entry_safe() and hlist_nulls_entry_safe() helpers. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Reported-by: Jonathan Perry Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/list_nulls.h | 5 +++ include/linux/rculist_nulls.h | 14 +++++++ kernel/bpf/hashtab.c | 94 +++++++++++++++++++++++++++---------------- 3 files changed, 79 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index b01fe1009084..87ff4f58a2f0 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -29,6 +29,11 @@ struct hlist_nulls_node { ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_nulls_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ + }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 4ae95f7e8597..a23a33153180 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -156,5 +156,19 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) +/** + * hlist_nulls_for_each_entry_safe - + * iterate over list of given type safe against removal of list entry + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_nulls_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_nulls_node within the struct. + */ +#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ + for (({barrier();}), \ + pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ + pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 63c86a7be2a1..afe5bab376c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,11 +13,12 @@ #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" struct bucket { - struct hlist_head head; + struct hlist_nulls_head head; raw_spinlock_t lock; }; @@ -44,7 +45,7 @@ enum extra_elem_state { /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { - struct hlist_node hash_node; + struct hlist_nulls_node hash_node; struct { void *padding; union { @@ -337,7 +338,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); } @@ -377,28 +378,52 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) return &htab->buckets[hash & (htab->n_buckets - 1)]; } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) { return &__select_bucket(htab, hash)->head; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, +/* this lookup function can only be called with bucket lock taken */ +static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size) { + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; return NULL; } +/* can be called without bucket lock. it will repeat the loop in + * the unlikely event when elements moved from one bucket into another + * while link list is being walked + */ +static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, + u32 hash, void *key, + u32 key_size, u32 n_buckets) +{ + struct hlist_nulls_node *n; + struct htab_elem *l; + +again: + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) + goto again; + + return NULL; +} + /* Called from syscall or from eBPF program */ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; @@ -411,7 +436,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) head = select_bucket(htab, hash); - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); return l; } @@ -444,8 +469,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { struct bpf_htab *htab = (struct bpf_htab *)arg; - struct htab_elem *l, *tgt_l; - struct hlist_head *head; + struct htab_elem *l = NULL, *tgt_l; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; @@ -455,9 +481,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) raw_spin_lock_irqsave(&b->lock, flags); - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); break; } @@ -470,7 +496,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; int i; @@ -484,7 +510,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) head = select_bucket(htab, hash); /* lookup the key */ - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); if (!l) { i = 0; @@ -492,7 +518,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) } /* key was found, get next key in the same bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node); if (next_l) { @@ -511,7 +537,7 @@ find_first_elem: head = select_bucket(htab, i); /* pick first element in the bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ @@ -676,7 +702,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -715,9 +741,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { - hlist_del_rcu(&l_old->hash_node); + hlist_nulls_del_rcu(&l_old->hash_node); free_htab_elem(htab, l_old); } ret = 0; @@ -731,7 +757,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -772,10 +798,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { bpf_lru_node_set_ref(&l_new->lru_node); - hlist_del_rcu(&l_old->hash_node); + hlist_nulls_del_rcu(&l_old->hash_node); } ret = 0; @@ -796,7 +822,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -835,7 +861,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, ret = PTR_ERR(l_new); goto err; } - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); } ret = 0; err: @@ -849,7 +875,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -897,7 +923,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } else { pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; @@ -925,7 +951,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -945,7 +971,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); ret = 0; } @@ -957,7 +983,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -977,7 +1003,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); ret = 0; } @@ -992,12 +1018,12 @@ static void delete_all_elements(struct bpf_htab *htab) int i; for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); - struct hlist_node *n; + struct hlist_nulls_head *head = select_bucket(htab, i); + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_safe(l, n, head, hash_node) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + hlist_nulls_del_rcu(&l->hash_node); if (l->state != HTAB_EXTRA_ELEM_USED) htab_elem_free(htab, l); } -- cgit v1.2.3 From 8a1115ff6b6d90cf1066ec3a0c4e51276553eebe Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 9 Mar 2017 16:16:31 -0800 Subject: scripts/spelling.txt: add "disble(d)" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: disble||disable disbled||disabled I kept the TSL2563_INT_DISBLED in /drivers/iio/light/tsl2563.c untouched. The macro is not referenced at all, but this commit is touching only comment blocks just in case. Link: http://lkml.kernel.org/r/1481573103-11329-20-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 2 +- arch/cris/arch-v32/drivers/cryptocop.c | 2 +- arch/x86/kernel/ftrace.c | 2 +- drivers/crypto/ux500/cryp/cryp.c | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 2 +- drivers/hv/channel.c | 2 +- drivers/isdn/hisax/st5481_b.c | 2 +- drivers/mtd/spi-nor/spi-nor.c | 2 +- drivers/net/ethernet/qlogic/qlge/qlge.h | 2 +- drivers/scsi/aic7xxx/aic79xx_core.c | 2 +- drivers/usb/gadget/legacy/inode.c | 3 +-- drivers/usb/host/xhci.c | 4 ++-- include/linux/regulator/machine.h | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/events/core.c | 2 +- scripts/spelling.txt | 2 ++ sound/soc/amd/acp-pcm-dma.c | 2 +- 17 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 2c41b713841f..44886c91e112 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -10,7 +10,7 @@ Note that kcov does not aim to collect as much coverage as possible. It aims to collect more or less stable coverage that is function of syscall inputs. To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic parts of kernel is -disbled (e.g. scheduler, locking). +disabled (e.g. scheduler, locking). Usage ----- diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index ae6903d7fdbe..14970f11bbf2 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -2086,7 +2086,7 @@ static void cryptocop_job_queue_close(void) dma_in_cfg.en = regk_dma_no; REG_WR(dma, IN_DMA_INST, rw_cfg, dma_in_cfg); - /* Disble the cryptocop. */ + /* Disable the cryptocop. */ rw_cfg = REG_RD(strcop, regi_strcop, rw_cfg); rw_cfg.en = 0; REG_WR(strcop, regi_strcop, rw_cfg, rw_cfg); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8639bb2ae058..8f3d9cf26ff9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -535,7 +535,7 @@ static void run_sync(void) { int enable_irqs = irqs_disabled(); - /* We may be called with interrupts disbled (on bootup). */ + /* We may be called with interrupts disabled (on bootup). */ if (enable_irqs) local_irq_enable(); on_each_cpu(do_sync_core, NULL, 1); diff --git a/drivers/crypto/ux500/cryp/cryp.c b/drivers/crypto/ux500/cryp/cryp.c index 43a0c8a26ab0..00a16ab601cb 100644 --- a/drivers/crypto/ux500/cryp/cryp.c +++ b/drivers/crypto/ux500/cryp/cryp.c @@ -82,7 +82,7 @@ void cryp_activity(struct cryp_device_data *device_data, void cryp_flush_inoutfifo(struct cryp_device_data *device_data) { /* - * We always need to disble the hardware before trying to flush the + * We always need to disable the hardware before trying to flush the * FIFO. This is something that isn't written in the design * specification, but we have been informed by the hardware designers * that this must be done. diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 31375bdde6f1..011800f621c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -788,7 +788,7 @@ static int sdma_v3_0_start(struct amdgpu_device *adev) } } - /* disble sdma engine before programing it */ + /* disable sdma engine before programing it */ sdma_v3_0_ctx_switch_enable(adev, false); sdma_v3_0_enable(adev, false); diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 81a80c82f1bd..bd0d1988feb2 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -543,7 +543,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) /* * In case a device driver's probe() fails (e.g., * util_probe() -> vmbus_open() returns -ENOMEM) and the device is - * rescinded later (e.g., we dynamically disble an Integrated Service + * rescinded later (e.g., we dynamically disable an Integrated Service * in Hyper-V Manager), the driver's remove() invokes vmbus_close(): * here we should skip most of the below cleanup work. */ diff --git a/drivers/isdn/hisax/st5481_b.c b/drivers/isdn/hisax/st5481_b.c index 409849165838..f64a36007800 100644 --- a/drivers/isdn/hisax/st5481_b.c +++ b/drivers/isdn/hisax/st5481_b.c @@ -239,7 +239,7 @@ static void st5481B_mode(struct st5481_bcs *bcs, int mode) } } } else { - // Disble B channel interrupts + // Disable B channel interrupts st5481_usb_device_ctrl_msg(adapter, FFMSK_B1+(bcs->channel * 2), 0, NULL, NULL); // Disable B channel FIFOs diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 1ae872bfc3ba..747645c74134 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -186,7 +186,7 @@ static inline int write_enable(struct spi_nor *nor) } /* - * Send write disble instruction to the chip. + * Send write disable instruction to the chip. */ static inline int write_disable(struct spi_nor *nor) { diff --git a/drivers/net/ethernet/qlogic/qlge/qlge.h b/drivers/net/ethernet/qlogic/qlge/qlge.h index 6d31f92ef2b6..90b3b46f85cc 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge.h +++ b/drivers/net/ethernet/qlogic/qlge/qlge.h @@ -1163,7 +1163,7 @@ struct ib_mac_iocb_rsp { u8 opcode; /* 0x20 */ u8 flags1; #define IB_MAC_IOCB_RSP_OI 0x01 /* Overide intr delay */ -#define IB_MAC_IOCB_RSP_I 0x02 /* Disble Intr Generation */ +#define IB_MAC_IOCB_RSP_I 0x02 /* Disable Intr Generation */ #define IB_MAC_CSUM_ERR_MASK 0x1c /* A mask to use for csum errs */ #define IB_MAC_IOCB_RSP_TE 0x04 /* Checksum error */ #define IB_MAC_IOCB_RSP_NU 0x08 /* No checksum rcvd */ diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c index 109e2c99e6c1..95d8f25cbcca 100644 --- a/drivers/scsi/aic7xxx/aic79xx_core.c +++ b/drivers/scsi/aic7xxx/aic79xx_core.c @@ -6278,7 +6278,7 @@ ahd_reset(struct ahd_softc *ahd, int reinit) * does not disable its parity logic prior to * the start of the reset. This may cause a * parity error to be detected and thus a - * spurious SERR or PERR assertion. Disble + * spurious SERR or PERR assertion. Disable * PERR and SERR responses during the CHIPRST. */ mod_cmd = cmd & ~(PCIM_CMD_PERRESPEN|PCIM_CMD_SERRESPEN); diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index a2615d64d07c..79a2d8fba6b6 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -84,8 +84,7 @@ static int ep_open(struct inode *, struct file *); /* /dev/gadget/$CHIP represents ep0 and the whole device */ enum ep0_state { - /* DISBLED is the initial state. - */ + /* DISABLED is the initial state. */ STATE_DEV_DISABLED = 0, /* Only one open() of /dev/gadget/$CHIP; only one file tracks diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 6d6c46000e56..50aee8b7718b 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -868,7 +868,7 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci) spin_lock_irqsave(&xhci->lock, flags); - /* disble usb3 ports Wake bits*/ + /* disable usb3 ports Wake bits */ port_index = xhci->num_usb3_ports; port_array = xhci->usb3_ports; while (port_index--) { @@ -879,7 +879,7 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci) writel(t2, port_array[port_index]); } - /* disble usb2 ports Wake bits*/ + /* disable usb2 ports Wake bits */ port_index = xhci->num_usb2_ports; port_array = xhci->usb2_ports; while (port_index--) { diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index ad3e5158e586..c9f795e9a2ee 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -65,7 +65,7 @@ struct regulator_state { int uV; /* suspend voltage */ unsigned int mode; /* suspend regulator operating mode */ int enabled; /* is regulator enabled in this suspend state */ - int disabled; /* is the regulator disbled in this suspend state */ + int disabled; /* is the regulator disabled in this suspend state */ }; /** diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0125589c7428..48851327a15e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2669,7 +2669,7 @@ static bool css_visible(struct cgroup_subsys_state *css) * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for - * cleaning up with cgroup_apply_control_disble(). + * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f41548f2e32..a17ed56c8ce1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -998,7 +998,7 @@ list_update_cgroup_event(struct perf_event *event, */ #define PERF_CPU_HRTIMER (1000 / HZ) /* - * function must be called with interrupts disbled + * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 0458b037c8a1..6dae4df472f6 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -372,6 +372,8 @@ disassocation||disassociation disapear||disappear disapeared||disappeared disappared||disappeared +disble||disable +disbled||disabled disconnet||disconnect discontinous||discontinuous dispertion||dispersion diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c index ec1067a679da..08b1399d1da2 100644 --- a/sound/soc/amd/acp-pcm-dma.c +++ b/sound/soc/amd/acp-pcm-dma.c @@ -89,7 +89,7 @@ static void acp_reg_write(u32 val, void __iomem *acp_mmio, u32 reg) writel(val, acp_mmio + (reg * 4)); } -/* Configure a given dma channel parameters - enable/disble, +/* Configure a given dma channel parameters - enable/disable, * number of descriptors, priority */ static void config_acp_dma_channel(void __iomem *acp_mmio, u8 ch_num, -- cgit v1.2.3 From 505d3085d7120a9f4cd0d6ffaa876968854b3baa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 9 Mar 2017 16:16:33 -0800 Subject: scripts/spelling.txt: add "overide" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: overide||override While we are here, fix the doubled "address" in the touched line Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt. Also, fix the comment block style in the touched hunks in drivers/media/dvb-frontends/drx39xyj/drx_driver.h. Link: http://lkml.kernel.org/r/1481573103-11329-21-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt | 2 +- drivers/block/paride/pcd.c | 2 +- drivers/block/paride/pd.c | 2 +- drivers/block/paride/pf.c | 2 +- drivers/block/paride/pg.c | 2 +- drivers/block/paride/pt.c | 2 +- drivers/media/dvb-frontends/drx39xyj/drx_driver.h | 8 +++----- drivers/net/ethernet/qlogic/qlge/qlge.h | 2 +- include/dt-bindings/sound/cs42l42.h | 2 +- include/net/irda/timer.h | 2 +- kernel/trace/trace_stack.c | 2 +- scripts/spelling.txt | 1 + tools/lguest/lguest.c | 2 +- tools/lib/bpf/Makefile | 2 +- tools/lib/traceevent/Makefile | 2 +- tools/lib/traceevent/event-parse.h | 2 +- 16 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt b/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt index c3f6546ebac7..6a23ad9ac53a 100644 --- a/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt @@ -45,7 +45,7 @@ Required Properties: Optional Properties: - reg-names: In addition to the required properties, the following are optional - "efuse-address" - Contains efuse base address used to pick up ABB info. - - "ldo-address" - Contains address of ABB LDO overide register address. + - "ldo-address" - Contains address of ABB LDO override register. "efuse-address" is required for this. - ti,ldovbb-vset-mask - Required if ldo-address is set, mask for LDO override register to provide override vset value. diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 10aed84244f5..939641d6e262 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -50,7 +50,7 @@ the slower the port i/o. In some cases, setting this to zero will speed up the device. (default -1) - major You may use this parameter to overide the + major You may use this parameter to override the default major number (46) that this driver will use. Be sure to change the device name as well. diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 644ba0888bd4..9cfd2e06a649 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -61,7 +61,7 @@ first drive found. - major You may use this parameter to overide the + major You may use this parameter to override the default major number (45) that this driver will use. Be sure to change the device name as well. diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index ed93e8badf56..14c5d32f5d8b 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -59,7 +59,7 @@ the slower the port i/o. In some cases, setting this to zero will speed up the device. (default -1) - major You may use this parameter to overide the + major You may use this parameter to override the default major number (47) that this driver will use. Be sure to change the device name as well. diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 5db955fe3a94..3b5882bfb736 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -84,7 +84,7 @@ the slower the port i/o. In some cases, setting this to zero will speed up the device. (default -1) - major You may use this parameter to overide the + major You may use this parameter to override the default major number (97) that this driver will use. Be sure to change the device name as well. diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 61fc6824299a..e815312a00ad 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -61,7 +61,7 @@ the slower the port i/o. In some cases, setting this to zero will speed up the device. (default -1) - major You may use this parameter to overide the + major You may use this parameter to override the default major number (96) that this driver will use. Be sure to change the device name as well. diff --git a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h index 7a681d8202c7..4442e478db72 100644 --- a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h +++ b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h @@ -256,8 +256,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner, * * The actual DAP implementation may be restricted to only one of the modes. * A compiler warning or error will be generated if the DAP implementation -* overides or cannot handle the mode defined below. -* +* overrides or cannot handle the mode defined below. */ #ifndef DRXDAP_SINGLE_MASTER #define DRXDAP_SINGLE_MASTER 1 @@ -272,7 +271,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner, * * This maximum size may be restricted by the actual DAP implementation. * A compiler warning or error will be generated if the DAP implementation -* overides or cannot handle the chunksize defined below. +* overrides or cannot handle the chunksize defined below. * * Beware that the DAP uses DRXDAP_MAX_WCHUNKSIZE to create a temporary data * buffer. Do not undefine or choose too large, unless your system is able to @@ -292,8 +291,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner, * * This maximum size may be restricted by the actual DAP implementation. * A compiler warning or error will be generated if the DAP implementation -* overides or cannot handle the chunksize defined below. -* +* overrides or cannot handle the chunksize defined below. */ #ifndef DRXDAP_MAX_RCHUNKSIZE #define DRXDAP_MAX_RCHUNKSIZE 60 diff --git a/drivers/net/ethernet/qlogic/qlge/qlge.h b/drivers/net/ethernet/qlogic/qlge/qlge.h index 90b3b46f85cc..84ac50f92c9c 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge.h +++ b/drivers/net/ethernet/qlogic/qlge/qlge.h @@ -1162,7 +1162,7 @@ struct ob_mac_tso_iocb_rsp { struct ib_mac_iocb_rsp { u8 opcode; /* 0x20 */ u8 flags1; -#define IB_MAC_IOCB_RSP_OI 0x01 /* Overide intr delay */ +#define IB_MAC_IOCB_RSP_OI 0x01 /* Override intr delay */ #define IB_MAC_IOCB_RSP_I 0x02 /* Disable Intr Generation */ #define IB_MAC_CSUM_ERR_MASK 0x1c /* A mask to use for csum errs */ #define IB_MAC_IOCB_RSP_TE 0x04 /* Checksum error */ diff --git a/include/dt-bindings/sound/cs42l42.h b/include/dt-bindings/sound/cs42l42.h index 399a123aed58..db69d84ed7d1 100644 --- a/include/dt-bindings/sound/cs42l42.h +++ b/include/dt-bindings/sound/cs42l42.h @@ -20,7 +20,7 @@ #define CS42L42_HPOUT_LOAD_1NF 0 #define CS42L42_HPOUT_LOAD_10NF 1 -/* HPOUT Clamp to GND Overide */ +/* HPOUT Clamp to GND Override */ #define CS42L42_HPOUT_CLAMP_EN 0 #define CS42L42_HPOUT_CLAMP_DIS 1 diff --git a/include/net/irda/timer.h b/include/net/irda/timer.h index cb2615ccf761..d784f242cf7b 100644 --- a/include/net/irda/timer.h +++ b/include/net/irda/timer.h @@ -59,7 +59,7 @@ struct lap_cb; * Slot timer must never exceed 85 ms, and must always be at least 25 ms, * suggested to 75-85 msec by IrDA lite. This doesn't work with a lot of * devices, and other stackes uses a lot more, so it's best we do it as well - * (Note : this is the default value and sysctl overides it - Jean II) + * (Note : this is the default value and sysctl overrides it - Jean II) */ #define SLOT_TIMEOUT (90*HZ/1000) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1d68b5b7ad41..5fb1f2c87e6b 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -65,7 +65,7 @@ void stack_trace_print(void) } /* - * When arch-specific code overides this function, the following + * When arch-specific code overrides this function, the following * data should be filled up, assuming stack_trace_max_lock is held to * prevent concurrent updates. * stack_trace_index[] diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 6dae4df472f6..0545f5a8cabe 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -734,6 +734,7 @@ oustanding||outstanding overaall||overall overhread||overhead overlaping||overlapping +overide||override overrided||overridden overriden||overridden overun||overrun diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 11c8d9bc762e..5d19fdf80292 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -1387,7 +1387,7 @@ static bool pci_data_iowrite(u16 port, u32 mask, u32 val) /* Allow writing to any other BAR, or expansion ROM */ iowrite(portoff, val, mask, &d->config_words[reg]); return true; - /* We let them overide latency timer and cacheline size */ + /* We let them override latency timer and cacheline size */ } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) { /* Only let them change the first two fields. */ if (mask == 0xFFFFFFFF) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index e2efddf10231..1f5300e56b44 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -132,7 +132,7 @@ else Q = @ endif -# Disable command line variables (CFLAGS) overide from top +# Disable command line variables (CFLAGS) override from top # level Makefile (perf), otherwise build Makefile will get # the same command line setup. MAKEOVERRIDES= diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index 47076b15eebe..9b8555ea3459 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -135,7 +135,7 @@ else Q = @ endif -# Disable command line variables (CFLAGS) overide from top +# Disable command line variables (CFLAGS) override from top # level Makefile (perf), otherwise build Makefile will get # the same command line setup. MAKEOVERRIDES= diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 66342804161c..0c03538df74c 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -140,7 +140,7 @@ struct pevent_plugin_option { * struct pevent_plugin_option PEVENT_PLUGIN_OPTIONS[] = { * { * .name = "option-name", - * .plugin_alias = "overide-file-name", (optional) + * .plugin_alias = "override-file-name", (optional) * .description = "description of option to show users", * }, * { -- cgit v1.2.3 From dd0db88d8094a6d9d4d1fc5fcd56ab619f54ccf8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:16:49 -0800 Subject: userfaultfd: non-cooperative: rollback userfaultfd_exit Patch series "userfaultfd non-cooperative further update for 4.11 merge window". Unfortunately I noticed one relevant bug in userfaultfd_exit while doing more testing. I've been doing testing before and this was also tested by kbuild bot and exercised by the selftest, but this bug never reproduced before. I dropped userfaultfd_exit as result. I dropped it because of implementation difficulty in receiving signals in __mmput and because I think -ENOSPC as result from the background UFFDIO_COPY should be enough already. Before I decided to remove userfaultfd_exit, I noticed userfaultfd_exit wasn't exercised by the selftest and when I tried to exercise it, after moving it to a more correct place in __mmput where it would make more sense and where the vma list is stable, it resulted in the event_wait_completion in D state. So then I added the second patch to be sure even if we call userfaultfd_event_wait_completion too late during task exit(), we won't risk to generate tasks in D state. The same check exists in handle_userfault() for the same reason, except it makes a difference there, while here is just a robustness check and it's run under WARN_ON_ONCE. While looking at the userfaultfd_event_wait_completion() function I looked back at its callers too while at it and I think it's not ok to stop executing dup_fctx on the fcs list because we relay on userfaultfd_event_wait_completion to execute userfaultfd_ctx_put(fctx->orig) which is paired against userfaultfd_ctx_get(fctx->orig) in dup_userfault just before list_add(fcs). This change only takes care of fctx->orig but this area also needs further review looking for similar problems in fctx->new. The only patch that is urgent is the first because it's an use after free during a SMP race condition that affects all processes if CONFIG_USERFAULTFD=y. Very hard to reproduce though and probably impossible without SLUB poisoning enabled. This patch (of 3): I once reproduced this oops with the userfaultfd selftest, it's not easily reproducible and it requires SLUB poisoning to reproduce. general protection fault: 0000 [#1] SMP Modules linked in: CPU: 2 PID: 18421 Comm: userfaultfd Tainted: G ------------ T 3.10.0+ #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.1-0-g8891697-prebuilt.qemu-project.org 04/01/2014 task: ffff8801f83b9440 ti: ffff8801f833c000 task.ti: ffff8801f833c000 RIP: 0010:[] [] userfaultfd_exit+0x29/0xa0 RSP: 0018:ffff8801f833fe80 EFLAGS: 00010202 RAX: ffff8801f833ffd8 RBX: 6b6b6b6b6b6b6b6b RCX: ffff8801f83b9440 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800baf18600 RBP: ffff8801f833fee8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: ffffffff8127ceb3 R12: 0000000000000000 R13: ffff8800baf186b0 R14: ffff8801f83b99f8 R15: 00007faed746c700 FS: 0000000000000000(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007faf0966f028 CR3: 0000000001bc6000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: do_exit+0x297/0xd10 SyS_exit+0x17/0x20 tracesys+0xdd/0xe2 Code: 00 00 66 66 66 66 90 55 48 89 e5 41 54 53 48 83 ec 58 48 8b 1f 48 85 db 75 11 eb 73 66 0f 1f 44 00 00 48 8b 5b 10 48 85 db 74 64 <4c> 8b a3 b8 00 00 00 4d 85 e4 74 eb 41 f6 84 24 2c 01 00 00 80 RIP [] userfaultfd_exit+0x29/0xa0 RSP ---[ end trace 9fecd6dcb442846a ]--- In the debugger I located the "mm" pointer in the stack and walking mm->mmap->vm_next through the end shows the vma->vm_next list is fully consistent and it is null terminated list as expected. So this has to be an SMP race condition where userfaultfd_exit was running while the vma list was being modified by another CPU. When userfaultfd_exit() run one of the ->vm_next pointers pointed to SLAB_POISON (RBX is the vma pointer and is 0x6b6b..). The reason is that it's not running in __mmput but while there are still other threads running and it's not holding the mmap_sem (it can't as it has to wait the even to be received by the manager). So this is an use after free that was happening for all processes. One more implementation problem aside from the race condition: userfaultfd_exit has really to check a flag in mm->flags before walking the vma or it's going to slowdown the exit() path for regular tasks. One more implementation problem: at that point signals can't be delivered so it would also create a task in D state if the manager doesn't read the event. The major design issue: it overall looks superfluous as the manager can check for -ENOSPC in the background transfer: if (mmget_not_zero(ctx->mm)) { [..] } else { return -ENOSPC; } It's safer to roll it back and re-introduce it later if at all. [rppt@linux.vnet.ibm.com: documentation fixup after removal of UFFD_EVENT_EXIT] Link: http://lkml.kernel.org/r/1488345437-4364-1-git-send-email-rppt@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20170224181957.19736-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: Mike Rapoport Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/userfaultfd.txt | 4 ---- fs/userfaultfd.c | 28 ---------------------------- include/linux/userfaultfd_k.h | 6 ------ include/uapi/linux/userfaultfd.h | 5 +---- kernel/exit.c | 1 - 5 files changed, 1 insertion(+), 43 deletions(-) (limited to 'kernel') diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt index 0e5543a920e5..bb2f945f87ab 100644 --- a/Documentation/vm/userfaultfd.txt +++ b/Documentation/vm/userfaultfd.txt @@ -172,10 +172,6 @@ the same read(2) protocol as for the page fault notifications. The manager has to explicitly enable these events by setting appropriate bits in uffdio_api.features passed to UFFDIO_API ioctl: -UFFD_FEATURE_EVENT_EXIT - enable notification about exit() of the -non-cooperative process. When the monitored process exits, the uffd -manager will get UFFD_EVENT_EXIT. - UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When this feature is enabled, the userfaultfd context of the parent process is duplicated into the newly created process. The manager receives diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f62199b90fd0..16d0cc600fa9 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -775,34 +775,6 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) } } -void userfaultfd_exit(struct mm_struct *mm) -{ - struct vm_area_struct *vma = mm->mmap; - - /* - * We can do the vma walk without locking because the caller - * (exit_mm) knows it now has exclusive access - */ - while (vma) { - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - - if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) { - struct userfaultfd_wait_queue ewq; - - userfaultfd_ctx_get(ctx); - - msg_init(&ewq.msg); - ewq.msg.event = UFFD_EVENT_EXIT; - - userfaultfd_event_wait_completion(ctx, &ewq); - - ctx->features &= ~UFFD_FEATURE_EVENT_EXIT; - } - - vma = vma->vm_next; - } -} - static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0468548acebf..f2b79bf4c895 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,8 +72,6 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); -extern void userfaultfd_exit(struct mm_struct *mm); - #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -139,10 +137,6 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -static inline void userfaultfd_exit(struct mm_struct *mm) -{ -} - #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index c055947c5c98..3b059530dac9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,8 +18,7 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \ - UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -113,7 +112,6 @@ struct uffd_msg { #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 -#define UFFD_EVENT_EXIT 0x17 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -163,7 +161,6 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) -#define UFFD_FEATURE_EVENT_EXIT (1<<7) __u64 features; __u64 ioctls; diff --git a/kernel/exit.c b/kernel/exit.c index e126ebf2400c..516acdb0e0ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -554,7 +554,6 @@ static void exit_mm(void) enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); - userfaultfd_exit(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); -- cgit v1.2.3 From 40c50c1fecdf012a3bf055ec813f0ef2eda2749c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Mar 2017 13:17:18 +0100 Subject: kexec, x86/purgatory: Unbreak it and clean it up The purgatory code defines global variables which are referenced via a symbol lookup in the kexec code (core and arch). A recent commit addressing sparse warnings made these static and thereby broke kexec_file. Why did this happen? Simply because the whole machinery is undocumented and lacks any form of forward declarations. The variable names are unspecific and lack a prefix, so adding forward declarations creates shadow variables in the core code. Aside of that the code relies on magic constants and duplicate struct definitions with no way to ensure that these things stay in sync. The section placement of the purgatory variables happened by chance and not by design. Unbreak kexec and cleanup the mess: - Add proper forward declarations and document the usage - Use common struct definition - Use the proper common defines instead of magic constants - Add a purgatory_ prefix to have a proper name space - Use ARRAY_SIZE() instead of a homebrewn reimplementation - Add proper sections to the purgatory variables [ From Mike ] Fixes: 72042a8c7b01 ("x86/purgatory: Make functions and variables static") Reported-by: Mike Galbraith < Signed-off-by: Thomas Gleixner Cc: Nicholas Mc Guire Cc: Borislav Petkov Cc: Vivek Goyal Cc: "Tobin C. Harding" Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1703101315140.3681@nanos Signed-off-by: Thomas Gleixner --- arch/powerpc/purgatory/trampoline.S | 12 ++++++------ arch/x86/include/asm/purgatory.h | 20 ++++++++++++++++++++ arch/x86/kernel/machine_kexec_64.c | 9 ++++++--- arch/x86/purgatory/purgatory.c | 35 +++++++++++++++++------------------ arch/x86/purgatory/purgatory.h | 8 -------- arch/x86/purgatory/setup-x86_64.S | 2 +- arch/x86/purgatory/sha256.h | 1 - include/linux/purgatory.h | 23 +++++++++++++++++++++++ kernel/kexec_file.c | 8 ++++---- kernel/kexec_internal.h | 6 +----- 10 files changed, 78 insertions(+), 46 deletions(-) create mode 100644 arch/x86/include/asm/purgatory.h delete mode 100644 arch/x86/purgatory/purgatory.h create mode 100644 include/linux/purgatory.h (limited to 'kernel') diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S index f9760ccf4032..3696ea6c4826 100644 --- a/arch/powerpc/purgatory/trampoline.S +++ b/arch/powerpc/purgatory/trampoline.S @@ -116,13 +116,13 @@ dt_offset: .data .balign 8 -.globl sha256_digest -sha256_digest: +.globl purgatory_sha256_digest +purgatory_sha256_digest: .skip 32 - .size sha256_digest, . - sha256_digest + .size purgatory_sha256_digest, . - purgatory_sha256_digest .balign 8 -.globl sha_regions -sha_regions: +.globl purgatory_sha_regions +purgatory_sha_regions: .skip 8 * 2 * 16 - .size sha_regions, . - sha_regions + .size purgatory_sha_regions, . - purgatory_sha_regions diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h new file mode 100644 index 000000000000..d7da2729903d --- /dev/null +++ b/arch/x86/include/asm/purgatory.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_PURGATORY_H +#define _ASM_X86_PURGATORY_H + +#ifndef __ASSEMBLY__ +#include + +extern void purgatory(void); +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern unsigned long purgatory_backup_dest; +extern unsigned long purgatory_backup_src; +extern unsigned long purgatory_backup_sz; +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..857cdbd02867 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -194,19 +194,22 @@ static int arch_update_purgatory(struct kimage *image) /* Setup copying of backup region */ if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_dest", &image->arch.backup_load_addr, sizeof(image->arch.backup_load_addr), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_src", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_src", &image->arch.backup_src_start, sizeof(image->arch.backup_src_start), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_sz", &image->arch.backup_src_sz, sizeof(image->arch.backup_src_sz), 0); if (ret) diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index b6d5c8946e66..470edad96bb9 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -10,22 +10,19 @@ * Version 2. See the file COPYING for more details. */ +#include +#include + #include "sha256.h" -#include "purgatory.h" #include "../boot/string.h" -struct sha_region { - unsigned long start; - unsigned long len; -}; - -static unsigned long backup_dest; -static unsigned long backup_src; -static unsigned long backup_sz; +unsigned long purgatory_backup_dest __section(.kexec-purgatory); +unsigned long purgatory_backup_src __section(.kexec-purgatory); +unsigned long purgatory_backup_sz __section(.kexec-purgatory); -static u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; +u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); -struct sha_region sha_regions[16] = {}; +struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); /* * On x86, second kernel requries first 640K of memory to boot. Copy @@ -34,26 +31,28 @@ struct sha_region sha_regions[16] = {}; */ static int copy_backup_region(void) { - if (backup_dest) - memcpy((void *)backup_dest, (void *)backup_src, backup_sz); - + if (purgatory_backup_dest) { + memcpy((void *)purgatory_backup_dest, + (void *)purgatory_backup_src, purgatory_backup_sz); + } return 0; } static int verify_sha256_digest(void) { - struct sha_region *ptr, *end; + struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; struct sha256_state sctx; sha256_init(&sctx); - end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; - for (ptr = sha_regions; ptr < end; ptr++) + end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); + + for (ptr = purgatory_sha_regions; ptr < end; ptr++) sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); sha256_final(&sctx, digest); - if (memcmp(digest, sha256_digest, sizeof(digest))) + if (memcmp(digest, purgatory_sha256_digest, sizeof(digest))) return 1; return 0; diff --git a/arch/x86/purgatory/purgatory.h b/arch/x86/purgatory/purgatory.h deleted file mode 100644 index e2e365a6c192..000000000000 --- a/arch/x86/purgatory/purgatory.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef PURGATORY_H -#define PURGATORY_H - -#ifndef __ASSEMBLY__ -extern void purgatory(void); -#endif /* __ASSEMBLY__ */ - -#endif /* PURGATORY_H */ diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S index f90e9dfa90bb..dfae9b9e60b5 100644 --- a/arch/x86/purgatory/setup-x86_64.S +++ b/arch/x86/purgatory/setup-x86_64.S @@ -9,7 +9,7 @@ * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ -#include "purgatory.h" +#include .text .globl purgatory_start diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h index bd15a4127735..2867d9825a57 100644 --- a/arch/x86/purgatory/sha256.h +++ b/arch/x86/purgatory/sha256.h @@ -10,7 +10,6 @@ #ifndef SHA256_H #define SHA256_H - #include #include diff --git a/include/linux/purgatory.h b/include/linux/purgatory.h new file mode 100644 index 000000000000..d60d4e278609 --- /dev/null +++ b/include/linux/purgatory.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_PURGATORY_H +#define _LINUX_PURGATORY_H + +#include +#include +#include + +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX]; +extern u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE]; + +#endif diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b56a558e406d..b118735fea9d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -614,13 +614,13 @@ static int kexec_calculate_store_digests(struct kimage *image) ret = crypto_shash_final(desc, digest); if (ret) goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha_regions", - sha_regions, sha_region_sz, 0); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", + sha_regions, sha_region_sz, 0); if (ret) goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); if (ret) goto out_free_digest; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 4cef7e4706b0..799a8a452187 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,11 +15,7 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#include void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -- cgit v1.2.3