From d0b6e0a8ef24b1b07078ababe5d91bcdf4f4264a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Sep 2017 21:36:55 +0200 Subject: watchdog/hardlockup: Provide interface to stop/restart perf events Provide an interface to stop and restart perf NMI watchdog events on all CPUs. This is only usable during init and especially for handling the perf HT bug on Intel machines. It's safe to use it this way as nothing can start/stop the NMI watchdog in parallel. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.167649596@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..c9586ebc2e98 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -261,3 +261,44 @@ void watchdog_nmi_disable(unsigned int cpu) firstcpu_err = 0; } } + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_disable(event); + } +} + +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_enable(event); + } +} -- cgit v1.2.3 From 6554fd8cf06db86f861bb24d7487b2873ca444c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:57 +0200 Subject: watchdog/core: Provide interface to stop from poweroff() PARISC has a a busy looping power off routine. If the watchdog is enabled the watchdog timer will still fire, but the thread is not running, which causes the softlockup watchdog to trigger. Provide a interface which allows to turn the watchdog off. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Helge Deller Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/20170912194146.327343752@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 6 +++--- kernel/watchdog.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b24d4a58674a..85bb268be39c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -12,10 +12,10 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); +void lockup_detector_soft_poweroff(void); #else -static inline void lockup_detector_init(void) -{ -} +static inline void lockup_detector_init(void) { } +static inline void lockup_detector_soft_poweroff(void) { } #endif #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..f23e373aa3bf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -333,7 +333,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled || + atomic_read(&watchdog_park_in_progress) != 0) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -660,6 +661,17 @@ static void set_sample_period(void) } #endif /* SOFTLOCKUP */ +/** + * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) + * + * Special interface for parisc. It prevents lockup detector warnings from + * the default pm_poweroff() function which busy loops forever. + */ +void lockup_detector_soft_poweroff(void) +{ + watchdog_enabled = 0; +} + /* * Suspend the hard and soft lockup detector by parking the watchdog threads. */ -- cgit v1.2.3 From 5490125d77a43016b26f629d4b485e2c62172551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:59 +0200 Subject: watchdog/core: Remove broken suspend/resume interfaces This interface has several issues: - It's causing recursive locking of the hotplug lock. - It's complete overkill to teardown all threads and then recreate them The same can be achieved with the simple hardlockup_detector_perf_stop / restart() interfaces. The abuse from the busy looping poweroff() loop of PARISC has been solved as well. Remove the cruft. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.487537732@linutronix.de Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/watchdog.c | 3 -- include/linux/nmi.h | 12 ------ kernel/watchdog.c | 89 +----------------------------------------- 3 files changed, 1 insertion(+), 103 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2f6eadd9408d..5ded171f02d6 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) return 0; - if (watchdog_suspended) - return 0; - if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) return 0; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 85bb268be39c..7eefe7abf44b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -164,7 +164,6 @@ extern int watchdog_thresh; extern unsigned long watchdog_enabled; extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; @@ -192,17 +191,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int lockup_detector_suspend(void); -extern void lockup_detector_resume(void); -#else -static inline int lockup_detector_suspend(void) -{ - return 0; -} - -static inline void lockup_detector_resume(void) -{ -} #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f23e373aa3bf..b2d46757917e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -97,19 +97,6 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); * unregistered/stopped, so it is an indicator whether the threads exist. */ static int __read_mostly watchdog_running; -/* - * If a subsystem has a need to deactivate the watchdog temporarily, it - * can use the suspend/resume interface to achieve this. The content of - * the 'watchdog_suspended' variable reflects this state. Existing threads - * are parked/unparked by the lockup_detector_{suspend|resume} functions - * (see comment blocks pertaining to those functions for further details). - * - * 'watchdog_suspended' also prevents threads from being registered/started - * or unregistered/stopped via parameters in /proc/sys/kernel, so the state - * of 'watchdog_running' cannot change while the watchdog is deactivated - * temporarily (see related code in 'proc' handlers). - */ -int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its @@ -136,7 +123,6 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * - watchdog_cpumask * - sysctl_hardlockup_all_cpu_backtrace * - hardlockup_panic - * - watchdog_suspended */ void __weak watchdog_nmi_reconfigure(void) { @@ -672,61 +658,6 @@ void lockup_detector_soft_poweroff(void) watchdog_enabled = 0; } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - #ifdef CONFIG_SYSCTL /* @@ -775,12 +706,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -872,12 +797,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -917,12 +836,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -941,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, watchdog_nmi_reconfigure(); } -out: + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); return err; -- cgit v1.2.3 From b7a349819d4b9b5db64e523351e66a79a758eaa5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:00 +0200 Subject: watchdog/core: Rework CPU hotplug locking The watchdog proc interface causes extensive recursive locking of the CPU hotplug percpu rwsem, which is deadlock prone. Replace the get/put_online_cpus() pairs with cpu_hotplug_disable()/enable() calls for now. Later patches will remove that requirement completely. Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.568079057@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2d46757917e..cd79f644ea34 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -703,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); /* @@ -752,7 +752,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } @@ -794,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old, new; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); old = ACCESS_ONCE(watchdog_thresh); @@ -818,7 +818,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } @@ -833,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); @@ -856,7 +856,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } -- cgit v1.2.3 From 946d197794b23202b8b46c43016747c72fe23393 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:01 +0200 Subject: watchdog/core: Rename watchdog_proc_mutex Following patches will use the mutex for other purposes as well. Rename it as it is not longer a proc specific thing. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.647714850@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cd79f644ea34..7c3a0a76b41b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,8 +29,7 @@ #include #include -/* Watchdog configuration */ -static DEFINE_MUTEX(watchdog_proc_mutex); +static DEFINE_MUTEX(watchdog_mutex); int __read_mostly nmi_watchdog_enabled; @@ -704,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int *watchdog_param = (int *)table->data; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); /* * If the parameter is being read return the state of the corresponding @@ -751,7 +750,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, err = proc_watchdog_update(); } out: - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } @@ -795,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, int err, old, new; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -817,7 +816,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, set_sample_period(); } out: - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } @@ -834,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { @@ -855,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, watchdog_nmi_reconfigure(); } - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } -- cgit v1.2.3 From 7a3558200739e1378800a7a6d7f63c031115f7a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:02 +0200 Subject: watchdog/core: Mark hardlockup_detector_disable() __init The function is only used by the KVM init code. Mark it __init to prevent creative abuse. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.727134632@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7c3a0a76b41b..1c185d9dd468 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -55,7 +55,7 @@ unsigned int __read_mostly hardlockup_panic = * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void hardlockup_detector_disable(void) +void __init hardlockup_detector_disable(void) { watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; } -- cgit v1.2.3 From 20d853fd0703b1d73c35a22024c0d4fcbcc57c8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:03 +0200 Subject: watchdog/hardlockup/perf: Remove broken self disable on failure The self disabling feature is broken vs. CPU hotplug locking: CPU 0 CPU 1 cpus_write_lock(); cpu_up(1) wait_for_completion() .... unpark_watchdog() ->unpark() perf_event_create() <- fails watchdog_enable &= ~NMI_WATCHDOG; .... cpus_write_unlock(); CPU 2 cpus_write_lock() cpu_down(2) wait_for_completion() wakeup(watchdog); watchdog() if (!(watchdog_enable & NMI_WATCHDOG)) watchdog_nmi_disable() perf_event_disable() .... cpus_read_lock(); stop_smpboot_threads() park_watchdog(); wait_for_completion(watchdog->parked); Result: End of hotplug and instantaneous full lockup of the machine. There is a similar problem with disabling the watchdog via the user space interface as the sysctl function fiddles with watchdog_enable directly. It's very debatable whether this is required at all. If the watchdog works nicely on N CPUs and it fails to enable on the N + 1 CPU either during hotplug or because the user space interface disabled it via sysctl cpumask and then some perf user grabbed the counter which is then unavailable for the watchdog when the sysctl cpumask gets changed back. There is no real justification for this. One of the reasons WHY this is done is the utter stupidity of the init code of the perf NMI watchdog. Instead of checking upfront at boot whether PERF is available and functional at all, it just does this check at run time over and over when user space fiddles with the sysctl. That's broken beyond repair along with the idiotic error code dependent warn level printks and the even more silly printk rate limiting. If the init code checks whether perf works at boot time, then this mess can be more or less avoided completely. Perf does not come magically into life at runtime. Brain usage while coding is overrated. Remove the cruft and add a temporary safe guard which gets removed later. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.806708429@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 --------------- kernel/watchdog_hld.c | 20 +++++++------------- 2 files changed, 7 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1c185d9dd468..af000956286c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -485,21 +485,6 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); - - /* - * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the - * failure path. Check for failures that can occur asynchronously - - * for example, when CPUs are on-lined - and shut down the hardware - * perf event on each CPU accordingly. - * - * The only non-obvious place this bit can be cleared is through - * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a - * pr_info here would be too noisy as it would result in a message - * every few seconds if the hardlockup was disabled but the softlockup - * enabled. - */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - watchdog_nmi_disable(cpu); } static struct smp_hotplug_thread watchdog_threads = { diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index c9586ebc2e98..7b602714ea53 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -23,6 +23,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); static unsigned long hardlockup_allcpu_dumped; +static bool hardlockup_detector_disabled; void arch_touch_nmi_watchdog(void) { @@ -178,6 +179,10 @@ int watchdog_nmi_enable(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) goto out; + /* A failure disabled the hardlockup detector permanently */ + if (hardlockup_detector_disabled) + return -ENODEV; + /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -206,18 +211,6 @@ int watchdog_nmi_enable(unsigned int cpu) goto out_save; } - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - /* skip displaying the same error again */ if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); @@ -232,7 +225,8 @@ int watchdog_nmi_enable(unsigned int cpu) pr_err("disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); - pr_info("Shutting down hard lockup detector on all cpus\n"); + pr_info("Disabling hard lockup detector permanently\n"); + hardlockup_detector_disabled = true; return PTR_ERR(event); -- cgit v1.2.3 From 941154bd6937a710ae9193a3c733c0029e5ae7b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:04 +0200 Subject: watchdog/hardlockup/perf: Prevent CPU hotplug deadlock The following deadlock is possible in the watchdog hotplug code: cpus_write_lock() ... takedown_cpu() smpboot_park_threads() smpboot_park_thread() kthread_park() ->park() := watchdog_disable() watchdog_nmi_disable() perf_event_release_kernel(); put_event() _free_event() ->destroy() := hw_perf_event_destroy() x86_release_hardware() release_ds_buffers() get_online_cpus() when a per cpu watchdog perf event is destroyed which drops the last reference to the PMU hardware. The cleanup code there invokes get_online_cpus() which instantly deadlocks because the hotplug percpu rwsem is write locked. To solve this add a deferring mechanism: cpus_write_lock() kthread_park() watchdog_nmi_disable(deferred) perf_event_disable(event); move_event_to_deferred(event); .... cpus_write_unlock() cleaup_deferred_events() perf_event_release_kernel() This is still properly serialized against concurrent hotplug via the cpu_add_remove_lock, which is held by the task which initiated the hotplug event. This is also used to handle event destruction when the watchdog threads are parked via other mechanisms than CPU hotplug. Analyzed-by: Peter Zijlstra Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.884469246@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 6 ++++++ kernel/cpu.c | 6 ++++++ kernel/watchdog.c | 25 +++++++++++++++++++++++++ kernel/watchdog_hld.c | 34 ++++++++++++++++++++++++++++------ 4 files changed, 65 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 7eefe7abf44b..80354e6fa86d 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -13,9 +13,11 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); void lockup_detector_soft_poweroff(void); +void lockup_detector_cleanup(void); #else static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } +static inline void lockup_detector_cleanup(void) { } #endif #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -77,9 +79,13 @@ static inline void hardlockup_detector_disable(void) {} extern void arch_touch_nmi_watchdog(void); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); +extern void hardlockup_detector_perf_disable(void); +extern void hardlockup_detector_perf_cleanup(void); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } +static inline void hardlockup_detector_perf_disable(void) { } +static inline void hardlockup_detector_perf_cleanup(void) { } #if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..a96b348591df 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -734,6 +735,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); + /* + * Do post unplug cleanup. This is still protected against + * concurrent CPU hotplug via cpu_add_remove_lock. + */ + lockup_detector_cleanup(); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index af000956286c..dd1fd59683c5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -109,8 +109,10 @@ int __weak watchdog_nmi_enable(unsigned int cpu) { return 0; } + void __weak watchdog_nmi_disable(unsigned int cpu) { + hardlockup_detector_perf_disable(); } /* @@ -193,6 +195,8 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif #endif +static void __lockup_detector_cleanup(void); + /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally @@ -631,6 +635,24 @@ static void set_sample_period(void) } #endif /* SOFTLOCKUP */ +static void __lockup_detector_cleanup(void) +{ + lockdep_assert_held(&watchdog_mutex); + hardlockup_detector_perf_cleanup(); +} + +/** + * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes + * + * Caller must not hold the cpu hotplug rwsem. + */ +void lockup_detector_cleanup(void) +{ + mutex_lock(&watchdog_mutex); + __lockup_detector_cleanup(); + mutex_unlock(&watchdog_mutex); +} + /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * @@ -665,6 +687,8 @@ static int proc_watchdog_update(void) watchdog_nmi_reconfigure(); + __lockup_detector_cleanup(); + return err; } @@ -837,6 +861,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } watchdog_nmi_reconfigure(); + __lockup_detector_cleanup(); } mutex_unlock(&watchdog_mutex); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 7b602714ea53..94111ccb09b5 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); +static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; static bool hardlockup_detector_disabled; @@ -239,16 +241,18 @@ out: return 0; } -void watchdog_nmi_disable(unsigned int cpu) +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = this_cpu_read(watchdog_ev); if (event) { perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); + cpumask_set_cpu(smp_processor_id(), &dead_events_mask); /* watchdog_nmi_enable() expects this to be zero initially. */ if (atomic_dec_and_test(&watchdog_cpus)) @@ -256,6 +260,24 @@ void watchdog_nmi_disable(unsigned int cpu) } } +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). Serialized by the caller. + */ +void hardlockup_detector_perf_cleanup(void) +{ + int cpu; + + for_each_cpu(cpu, &dead_events_mask) { + struct perf_event *event = per_cpu(dead_event, cpu); + + per_cpu(dead_event, cpu) = NULL; + perf_event_release_kernel(event); + } + cpumask_clear(&dead_events_mask); +} + /** * hardlockup_detector_perf_stop - Globally stop watchdog events * -- cgit v1.2.3 From 01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:05 +0200 Subject: watchdog/core: Remove the park_in_progress obfuscation Commit: b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system") tries to fix the following issue: proc_write() set_sample_period() <--- New sample period becoms visible <----- Broken starts proc_watchdog_update() watchdog_enable_all_cpus() watchdog_hrtimer_fn() update_watchdog_all_cpus() restart_timer(sample_period) watchdog_park_threads() thread->park() disable_nmi() <----- Broken ends The reason why this is broken is that the update of the watchdog threshold becomes immediately effective and visible for the hrtimer function which uses that value to rearm the timer. But the NMI/perf side still uses the old value up to the point where it is disabled. If the rate has been lowered then the NMI can run fast enough to 'detect' a hard lockup because the timer has not fired due to the longer period. The patch 'fixed' this by adding a variable: proc_write() set_sample_period() <----- Broken starts proc_watchdog_update() watchdog_enable_all_cpus() watchdog_hrtimer_fn() update_watchdog_all_cpus() restart_timer(sample_period) watchdog_park_threads() park_in_progress = 1 <----- Broken ends nmi_watchdog() if (park_in_progress) return; The only effect of this variable was to make the window where the breakage can hit small enough that it was not longer observable in testing. From a correctness point of view it is a pointless bandaid which merily papers over the root cause: the unsychronized update of the variable. Looking deeper into the related code pathes unearthed similar problems in the watchdog_start()/stop() functions. watchdog_start() perf_nmi_event_start() hrtimer_start() watchdog_stop() hrtimer_cancel() perf_nmi_event_stop() In both cases the call order is wrong because if the tasks gets preempted or the VM gets scheduled out long enough after the first call, then there is a chance that the next NMI will see a stale hrtimer interrupt count and trigger a false positive hard lockup splat. Get rid of park_in_progress so the code can be gradually deobfuscated and pruned from several layers of duct tape papering over the root cause, which has been either ignored or not understood at all. Once this is removed the underlying problem will be fixed by rewriting the proc interface to do a proper synchronized update. Address the start/stop() ordering problem as well by reverting the call order, so this part is at least correct now. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 1 - kernel/watchdog.c | 37 +++++++++++++++++-------------------- kernel/watchdog_hld.c | 7 ++----- 3 files changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 80354e6fa86d..91a3a4a4c8ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; extern int soft_watchdog_enabled; -extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dd1fd59683c5..c290135fb415 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void) #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (!watchdog_enabled || - atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - /* kick off the timer for the hardlockup detector */ + /* + * Start the timer first to prevent the NMI watchdog triggering + * before the timer has a chance to fire. + */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); + /* Initialize timestamp */ + __touch_watchdog(); /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); - - /* initialize timestamp */ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); - __touch_watchdog(); } static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); - hrtimer_cancel(hrtimer); - /* disable the perf event */ + /* + * Disable the perf event first. That prevents that a large delay + * between disabling the timer and disabling the perf event causes + * the perf NMI to detect a false positive. + */ watchdog_nmi_disable(cpu); + hrtimer_cancel(hrtimer); } static void watchdog_cleanup(unsigned int cpu, bool online) @@ -518,16 +520,11 @@ static int watchdog_park_threads(void) { int cpu, ret = 0; - atomic_set(&watchdog_park_in_progress, 1); - for_each_watchdog_cpu(cpu) { ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); if (ret) break; } - - atomic_set(&watchdog_park_in_progress, 0); - return ret; } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 94111ccb09b5..0aa191ee3d51 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = { /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (atomic_read(&watchdog_park_in_progress) != 0) - return; - if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; -- cgit v1.2.3 From 2b9d7f233b835663cbc7b6b3f88dd20f61118d1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:06 +0200 Subject: watchdog/core: Clean up stub functions Having stub functions which take a full page is not helping the readablility of code. Condense them and move the doubled #ifdef variant into the SYSFS section. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.045545271@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 68 ++++++++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c290135fb415..af37c040436c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -125,10 +125,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * - sysctl_hardlockup_all_cpu_backtrace * - hardlockup_panic */ -void __weak watchdog_nmi_reconfigure(void) -{ -} - +void __weak watchdog_nmi_reconfigure(void) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -136,6 +133,11 @@ void __weak watchdog_nmi_reconfigure(void) #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* Global variables, exported for sysctl */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +int __read_mostly soft_watchdog_enabled; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -149,13 +151,9 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; } __setup("softlockup_panic=", softlockup_panic_setup); @@ -593,44 +591,13 @@ static void watchdog_disable_all_cpus(void) } } -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask); -} -#endif - -#else /* SOFTLOCKUP */ -static int watchdog_park_threads(void) -{ - return 0; -} - -static void watchdog_unpark_threads(void) -{ -} - -static int watchdog_enable_all_cpus(void) -{ - return 0; -} - -static void watchdog_disable_all_cpus(void) -{ -} - -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return 0; -} -#endif - -static void set_sample_period(void) -{ -} -#endif /* SOFTLOCKUP */ +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ +static inline int watchdog_park_threads(void) { return 0; } +static inline void watchdog_unpark_threads(void) { } +static inline int watchdog_enable_all_cpus(void) { return 0; } +static inline void watchdog_disable_all_cpus(void) { } +static inline void set_sample_period(void) { } +#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) { @@ -827,6 +794,15 @@ out: return err; } +static int watchdog_update_cpus(void) +{ + if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { + return smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_cpumask); + } + return 0; +} + /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the -- cgit v1.2.3 From 368a7e2ce8ff0ddcdcb37eadb76530b033f6eb2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:07 +0200 Subject: watchdog/core: Clean up the #ifdef maze The #ifdef maze in this file is horrible, group stuff at least a bit so one can figure out what belongs to what. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.139629546@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index af37c040436c..a9bdfde73e4b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,7 +41,6 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* boot commands */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -74,19 +73,21 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -#endif +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#ifdef CONFIG_SOFTLOCKUP_DETECTOR -int __read_mostly soft_watchdog_enabled; -#endif +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); +# endif /* CONFIG_SMP */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#endif struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -173,22 +174,14 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); #ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; + static int __init softlockup_all_cpu_backtrace_setup(char *str) { - sysctl_softlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); + sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -#endif #endif static void __lockup_detector_cleanup(void); -- cgit v1.2.3 From 05ba3de74a3f499dcaa37b186220aaf174c95a4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:08 +0200 Subject: watchdog/core: Split out cpumask write function Split the write part of the cpumask proc handler out into a separate helper to avoid deep indentation. This also reduces the patch complexity in the following cleanups. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.218075991@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a9bdfde73e4b..cedf45ab4d81 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -792,10 +792,29 @@ static int watchdog_update_cpus(void) if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { return smpboot_update_cpumask_percpu_thread(&watchdog_threads, &watchdog_cpumask); + __lockup_detector_cleanup(); } return 0; } +static void proc_watchdog_cpumask_update(void) +{ + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate a + * temporary cpumask, so we are likely not in a position to + * do much else to make things better. + */ + if (watchdog_update_cpus() != 0) + pr_err("cpumask update failed\n"); + } + + watchdog_nmi_reconfigure(); +} + /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the @@ -811,30 +830,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); - if (!err && write) { - /* Remove impossible cpus to keep sysctl output cleaner. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, - cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate - * a temporary cpumask, so we are likely not in a - * position to do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } - - watchdog_nmi_reconfigure(); - __lockup_detector_cleanup(); - } + if (!err && write) + proc_watchdog_cpumask_update(); mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } - #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) -- cgit v1.2.3 From 0d85923c7a81719567311ba0eae8ecb2efd4c8a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:09 +0200 Subject: smpboot/threads, watchdog/core: Avoid runtime allocation smpboot_update_cpumask_threads_percpu() allocates a temporary cpumask at runtime. This is suboptimal because the call site needs more code size for proper error handling than a statically allocated temporary mask requires data size. Add static temporary cpumask. The function is globaly serialized, so no further protection required. Remove the half baken error handling in the watchdog code and get rid of the export as there are no in tree modular users of that function. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.297288838@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/smpboot.h | 4 ++-- kernel/smpboot.c | 22 +++++++--------------- kernel/watchdog.c | 21 +++++---------------- 3 files changed, 14 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index 12910cf19869..c149aa7bedf3 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) } void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *); +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..ed7507b69b48 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -344,39 +344,31 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); * by the client, but only by calling this function. * This function can only be called on a registered smp_hotplug_thread. */ -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) { struct cpumask *old = plug_thread->cpumask; - cpumask_var_t tmp; + static struct cpumask tmp; unsigned int cpu; - if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) - return -ENOMEM; - get_online_cpus(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(tmp, old, new); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, old, new); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_thread(plug_thread, cpu); /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(tmp, new, old); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, new, old); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_unpark_thread(plug_thread, cpu); cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - - free_cpumask_var(tmp); - - return 0; } -EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cedf45ab4d81..8935a3a4c2fb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -787,31 +787,20 @@ out: return err; } -static int watchdog_update_cpus(void) +static void watchdog_update_cpus(void) { - if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { - return smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_cpumask); + if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) { + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_cpumask); __lockup_detector_cleanup(); } - return 0; } static void proc_watchdog_cpumask_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate a - * temporary cpumask, so we are likely not in a position to - * do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } - + watchdog_update_cpus(); watchdog_nmi_reconfigure(); } -- cgit v1.2.3 From 2eb2527f847d1bd8d8fb9db1e8139db5d6eddb36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:10 +0200 Subject: watchdog/core: Create new thread handling infrastructure The lockup detector reconfiguration tears down all watchdog threads when the watchdog is disabled and sets them up again when its enabled. That's a pointless exercise. The watchdog threads are not consuming an insane amount of resources, so it's enough to set them up at init time and keep them in parked position when the watchdog is disabled and unpark them when it is reenabled. The smpboot thread infrastructure takes care of keeping the force parked threads in place even across cpu hotplug. Another horrible mechanism are the open coded park/unpark loops which are used for reconfiguration of the watchdog. The smpboot infrastructure allows exactly the same via smpboot_update_cpumask_thread_percpu(), which is cpu hotplug safe. Using that instead of the open coded loops allows to get rid of the hotplug locking mess in the watchdog code. Implement a clean infrastructure which allows to replace the open coded nonsense. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.377182587@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8935a3a4c2fb..b35518375fb7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -139,6 +139,9 @@ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; int __read_mostly soft_watchdog_enabled; +struct cpumask watchdog_allowed_mask __read_mostly; +static bool softlockup_threads_initialized __read_mostly; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -584,12 +587,84 @@ static void watchdog_disable_all_cpus(void) } } +static void softlockup_update_smpboot_threads(void) +{ + lockdep_assert_held(&watchdog_mutex); + + if (!softlockup_threads_initialized) + return; + + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_allowed_mask); + __lockup_detector_cleanup(); +} + +/* Temporarily park all watchdog threads */ +static void softlockup_park_all_threads(void) +{ + cpumask_clear(&watchdog_allowed_mask); + softlockup_update_smpboot_threads(); +} + +/* + * Park threads which are not longer enabled and unpark threads which have + * been newly enabled. + */ +static void softlockup_update_threads(void) +{ + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + softlockup_update_smpboot_threads(); +} + +static void softlockup_reconfigure_threads(bool enabled) +{ + softlockup_park_all_threads(); + set_sample_period(); + if (enabled) + softlockup_update_threads(); +} + +/* + * Create the watchdog thread infrastructure. + * + * The threads are not unparked as watchdog_allowed_mask is empty. When + * the threads are sucessfully initialized, take the proper locks and + * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + */ +static __init void softlockup_init_threads(void) +{ + int ret; + + /* + * If sysctl is off and watchdog got disabled on the command line, + * nothing to do here. + */ + if (!IS_ENABLED(CONFIG_SYSCTL) && + !(watchdog_enabled && watchdog_thresh)) + return; + + ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_allowed_mask); + if (ret) { + pr_err("Failed to initialize soft lockup detector threads\n"); + return; + } + + mutex_lock(&watchdog_mutex); + softlockup_threads_initialized = true; + softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + mutex_unlock(&watchdog_mutex); +} + #else /* CONFIG_SOFTLOCKUP_DETECTOR */ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void set_sample_period(void) { } +static inline void softlockup_init_threads(void) { } +static inline void softlockup_update_threads(void) { } +static inline void softlockup_reconfigure_threads(bool enabled) { } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) -- cgit v1.2.3 From d57108d4f6791291e89d980e7f7a3566c32ab188 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:11 +0200 Subject: watchdog/core: Get rid of the thread teardown/setup dance The lockup detector reconfiguration tears down all watchdog threads when the watchdog is disabled and sets them up again when its enabled. That's a pointless exercise. The watchdog threads are not consuming an insane amount of resources, so it's enough to set them up at init time and keep them in parked position when the watchdog is disabled and unpark them when it is reenabled. The smpboot thread infrastructure takes care of keeping the force parked threads in place even across cpu hotplug. Aside of that the code implements the park/unpark facility of smp hotplug threads on its own, which is even more pointless. We have functionality in the smpboot thread code to do so. Use the new thread management functions and get rid of the unholy mess. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.470370113@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 190 ++++++------------------------------------------------ 1 file changed, 19 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b35518375fb7..762d3ed82a08 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -91,13 +91,6 @@ int __read_mostly watchdog_thresh = 10; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* - * The 'watchdog_running' variable is set to 1 when the watchdog threads - * are registered/started and is set to 0 when the watchdog threads are - * unregistered/stopped, so it is an indicator whether the threads exist. - */ -static int __read_mostly watchdog_running; - /* * These functions can be overridden if an architecture implements its * own hardlockup detector. @@ -130,10 +123,6 @@ void __weak watchdog_nmi_reconfigure(void) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -259,11 +248,15 @@ void touch_all_softlockup_watchdogs(void) int cpu; /* - * this is done lockless - * do we care if a 0 races with a timestamp? - * all it means is the softlock check starts one cycle later + * watchdog_mutex cannpt be taken here, as this might be called + * from (soft)interrupt context, so the access to + * watchdog_allowed_cpumask might race with a concurrent update. + * + * The watchdog time stamp can race against a concurrent real + * update as well, the only side effect might be a cycle delay for + * the softlockup check. */ - for_each_watchdog_cpu(cpu) + for_each_cpu(cpu, &watchdog_allowed_mask) per_cpu(watchdog_touch_ts, cpu) = 0; wq_watchdog_touch(-1); } @@ -303,9 +296,6 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_enable_all_cpus(void); -static void watchdog_disable_all_cpus(void); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -498,95 +488,6 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -/* - * park all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function returns an error if kthread_park() of a watchdog thread - * fails. In this situation, the watchdog threads of some CPUs can already - * be parked and the watchdog threads of other CPUs can still be runnable. - * Callers are expected to handle this special condition as appropriate in - * their context. - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static int watchdog_park_threads(void) -{ - int cpu, ret = 0; - - for_each_watchdog_cpu(cpu) { - ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); - if (ret) - break; - } - return ret; -} - -/* - * unpark all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static void watchdog_unpark_threads(void) -{ - int cpu; - - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); -} - -static int update_watchdog_all_cpus(void) -{ - int ret; - - ret = watchdog_park_threads(); - if (ret) - return ret; - - watchdog_unpark_threads(); - - return 0; -} - -static int watchdog_enable_all_cpus(void) -{ - int err = 0; - - if (!watchdog_running) { - err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_cpumask); - if (err) - pr_err("Failed to create watchdog threads, disabled\n"); - else - watchdog_running = 1; - } else { - /* - * Enable/disable the lockup detectors or - * change the sample period 'on the fly'. - */ - err = update_watchdog_all_cpus(); - - if (err) { - watchdog_disable_all_cpus(); - pr_err("Failed to update lockup detectors, disabled\n"); - } - } - - if (err) - watchdog_enabled = 0; - - return err; -} - -static void watchdog_disable_all_cpus(void) -{ - if (watchdog_running) { - watchdog_running = 0; - smpboot_unregister_percpu_thread(&watchdog_threads); - } -} - static void softlockup_update_smpboot_threads(void) { lockdep_assert_held(&watchdog_mutex); @@ -661,7 +562,6 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static inline void set_sample_period(void) { } static inline void softlockup_init_threads(void) { } static inline void softlockup_update_threads(void) { } static inline void softlockup_reconfigure_threads(bool enabled) { } @@ -701,28 +601,10 @@ void lockup_detector_soft_poweroff(void) /* * Update the run state of the lockup detectors. */ -static int proc_watchdog_update(void) +static void proc_watchdog_update(void) { - int err = 0; - - /* - * Watchdog threads won't be started if they are already active. - * The 'watchdog_running' variable in watchdog_*_all_cpus() takes - * care of this. If those threads are already active, the sample - * period will be updated and the lockup detectors will be enabled - * or disabled 'on the fly'. - */ - if (watchdog_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - + softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); watchdog_nmi_reconfigure(); - - __lockup_detector_cleanup(); - - return err; - } /* @@ -778,17 +660,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, new = old & ~which; } while (cmpxchg(&watchdog_enabled, old, new) != old); - /* - * Update the run state of the lockup detectors. There is _no_ - * need to check the value returned by proc_watchdog_update() - * and to restore the previous value of 'watchdog_enabled' as - * both lockup detectors are disabled if proc_watchdog_update() - * returns an error. - */ - if (old == new) - goto out; - - err = proc_watchdog_update(); + if (old != new) + proc_watchdog_update(); } out: mutex_unlock(&watchdog_mutex); @@ -832,50 +705,28 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; + int err, old; cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); - old = ACCESS_ONCE(watchdog_thresh); + old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; + if (!err && write && old != READ_ONCE(watchdog_thresh)) + proc_watchdog_update(); - /* - * Update the sample period. Restore on failure. - */ - new = ACCESS_ONCE(watchdog_thresh); - if (old == new) - goto out; - - set_sample_period(); - err = proc_watchdog_update(); - if (err) { - watchdog_thresh = old; - set_sample_period(); - } -out: mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } -static void watchdog_update_cpus(void) -{ - if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) { - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_cpumask); - __lockup_detector_cleanup(); - } -} - static void proc_watchdog_cpumask_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - watchdog_update_cpus(); + + softlockup_update_threads(); watchdog_nmi_reconfigure(); } @@ -905,8 +756,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, void __init lockup_detector_init(void) { - set_sample_period(); - #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { pr_info("Disabling watchdog on nohz_full cores by default\n"); @@ -917,6 +766,5 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif - if (watchdog_enabled) - watchdog_enable_all_cpus(); + softlockup_init_threads(); } -- cgit v1.2.3 From e8b62b2dd14f8f2427856ba24cb7db922bda9bfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:12 +0200 Subject: watchdog/core: Further simplify sysctl handling Use a single function to update sysctl changes. This is not a high frequency user space interface and it's root only. Preparatory patch to cleanup the sysctl variable handling. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.549114957@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 762d3ed82a08..ca8747221e87 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -507,11 +507,8 @@ static void softlockup_park_all_threads(void) softlockup_update_smpboot_threads(); } -/* - * Park threads which are not longer enabled and unpark threads which have - * been newly enabled. - */ -static void softlockup_update_threads(void) +/* Unpark enabled threads */ +static void softlockup_unpark_threads(void) { cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); softlockup_update_smpboot_threads(); @@ -522,7 +519,7 @@ static void softlockup_reconfigure_threads(bool enabled) softlockup_park_all_threads(); set_sample_period(); if (enabled) - softlockup_update_threads(); + softlockup_unpark_threads(); } /* @@ -563,7 +560,6 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static inline void softlockup_update_threads(void) { } static inline void softlockup_reconfigure_threads(bool enabled) { } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -598,11 +594,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL -/* - * Update the run state of the lockup detectors. - */ +/* Propagate any changes to the watchdog threads */ static void proc_watchdog_update(void) { + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); watchdog_nmi_reconfigure(); } @@ -721,15 +717,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, return err; } -static void proc_watchdog_cpumask_update(void) -{ - /* Remove impossible cpus to keep sysctl output clean. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - - softlockup_update_threads(); - watchdog_nmi_reconfigure(); -} - /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the @@ -746,7 +733,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_cpumask_update(); + proc_watchdog_update(); mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); -- cgit v1.2.3 From 51d4052b01ca555e0d1d5fe297b309beb6c64aa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:14 +0200 Subject: watchdog/sysctl: Get rid of the #ifdeffery The sysctl of the nmi_watchdog file prevents writes by setting: min = max = 0 if none of the users is enabled. That involves ifdeffery and is competely non obvious. If none of the facilities is enabeld, then the file can simply be made read only. Move the ifdeffery into the header and use a constant for file permissions. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.706073616@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 6 ++++++ kernel/sysctl.c | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index cfebb3bc4eed..5774b443dba1 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -81,6 +81,12 @@ extern unsigned int hardlockup_panic; static inline void hardlockup_detector_disable(void) {} #endif +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +# define NMI_WATCHDOG_SYSCTL_PERM 0644 +#else +# define NMI_WATCHDOG_SYSCTL_PERM 0444 +#endif + #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); extern void hardlockup_detector_perf_stop(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..539cb4e97bb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -891,14 +891,10 @@ static struct ctl_table kern_table[] = { .procname = "nmi_watchdog", .data = &nmi_watchdog_enabled, .maxlen = sizeof (int), - .mode = 0644, + .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) .extra2 = &one, -#else - .extra2 = &zero, -#endif }, { .procname = "watchdog_cpumask", -- cgit v1.2.3 From 7feeb9cd4f5b34476ffb9e6d58d58c5416375b19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:15 +0200 Subject: watchdog/sysctl: Clean up sysctl variable name space Reflect that these variables are user interface related and remove the whitespace damage in the sysctl table while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.783210221@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 16 ++++++++-------- kernel/sysctl.c | 16 ++++++++-------- kernel/watchdog.c | 41 ++++++++++++++++++++--------------------- 3 files changed, 36 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5774b443dba1..4a8d1037364e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,8 +17,8 @@ void lockup_detector_cleanup(void); bool is_hardlockup(void); extern int watchdog_user_enabled; -extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; +extern int nmi_watchdog_user_enabled; +extern int soft_watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; @@ -62,12 +62,12 @@ static inline void reset_hung_task_detector(void) { } * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. + * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and + * 'soft_watchdog_user_enabled' are variables that are only used as an + * 'interface' between the parameters in /proc/sys/kernel and the internal + * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is + * handled differently because its value is not boolean, and the lockup + * detectors are 'suspended' while 'watchdog_thresh' is equal zero. */ #define NMI_WATCHDOG_ENABLED_BIT 0 #define SOFT_WATCHDOG_ENABLED_BIT 1 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 539cb4e97bb8..4c08ed4a379e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -871,9 +871,9 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, @@ -889,8 +889,8 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, - .maxlen = sizeof (int), + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, @@ -906,9 +906,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = &zero, .extra2 = &one, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ca8747221e87..baae9fc95031 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,8 +31,6 @@ static DEFINE_MUTEX(watchdog_mutex); -int __read_mostly nmi_watchdog_enabled; - #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED; @@ -40,6 +38,17 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif +int __read_mostly nmi_watchdog_user_enabled; +int __read_mostly soft_watchdog_user_enabled; +int __read_mostly watchdog_user_enabled; +int __read_mostly watchdog_thresh = 10; + +struct cpumask watchdog_allowed_mask __read_mostly; +static bool softlockup_threads_initialized __read_mostly; + +struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + #ifdef CONFIG_HARDLOCKUP_DETECTOR /* * Should we panic when a soft-lockup or hard-lockup occurs: @@ -85,12 +94,6 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); # endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -int __read_mostly watchdog_user_enabled; -int __read_mostly watchdog_thresh = 10; - -struct cpumask watchdog_cpumask __read_mostly; -unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); - /* * These functions can be overridden if an architecture implements its * own hardlockup detector. @@ -113,7 +116,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * watchdog_nmi_reconfigure can be implemented to be notified after any * watchdog configuration change. The arch hardlockup watchdog should * respond to the following variables: - * - nmi_watchdog_enabled + * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask * - sysctl_hardlockup_all_cpu_backtrace @@ -126,10 +129,6 @@ void __weak watchdog_nmi_reconfigure(void) { } /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -int __read_mostly soft_watchdog_enabled; - -struct cpumask watchdog_allowed_mask __read_mostly; -static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -606,14 +605,14 @@ static void proc_watchdog_update(void) /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' contains the flag(s) - * -------------------|-----------------------|----------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed - * | | with SOFT_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------|-------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | + * | | SOFT_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From 6592ad2fcc8f15b4f99b36c1db7d9f65510c203b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:16 +0200 Subject: watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage Both the perf reconfiguration and the powerpc watchdog_nmi_reconfigure() need to be done in two steps. 1) Stop all NMIs 2) Read the new parameters and start NMIs Right now watchdog_nmi_reconfigure() is a combination of both. To allow a clean reconfiguration add a 'run' argument and split the functionality in powerpc. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170912194147.862865570@linutronix.de Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/watchdog.c | 17 +++++++++-------- include/linux/nmi.h | 2 ++ kernel/watchdog.c | 31 ++++++++++++++++++++++--------- 3 files changed, 33 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 5ded171f02d6..291af79a9826 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -355,17 +355,18 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_reconfigure(void) +void watchdog_nmi_reconfigure(bool run) { int cpu; - watchdog_calc_timeouts(); - - for_each_cpu(cpu, &wd_cpus_enabled) - stop_wd_on_cpu(cpu); - - for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) - start_wd_on_cpu(cpu); + if (!run) { + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + } else { + watchdog_calc_timeouts(); + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); + } } /* diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 4a8d1037364e..eee255bc0fd6 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -103,6 +103,8 @@ static inline void arch_touch_nmi_watchdog(void) {} #endif #endif +void watchdog_nmi_reconfigure(bool run); + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * diff --git a/kernel/watchdog.c b/kernel/watchdog.c index baae9fc95031..5693afd2b8ea 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,17 +112,25 @@ void __weak watchdog_nmi_disable(unsigned int cpu) hardlockup_detector_perf_disable(); } -/* - * watchdog_nmi_reconfigure can be implemented to be notified after any - * watchdog configuration change. The arch hardlockup watchdog should - * respond to the following variables: +/** + * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs + * @run: If false stop the watchdogs on all enabled CPUs + * If true start the watchdogs on all enabled CPUs + * + * The core call order is: + * watchdog_nmi_reconfigure(false); + * update_variables(); + * watchdog_nmi_reconfigure(true); + * + * The second call which starts the watchdogs again guarantees that the + * following variables are stable across the call. * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - sysctl_hardlockup_all_cpu_backtrace - * - hardlockup_panic + * + * After the call the variables can be changed again. */ -void __weak watchdog_nmi_reconfigure(void) { } +void __weak watchdog_nmi_reconfigure(bool run) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -515,10 +523,12 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(bool enabled) { + watchdog_nmi_reconfigure(false); softlockup_park_all_threads(); set_sample_period(); if (enabled) softlockup_unpark_threads(); + watchdog_nmi_reconfigure(true); } /* @@ -559,7 +569,11 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static inline void softlockup_reconfigure_threads(bool enabled) { } +static void softlockup_reconfigure_threads(bool enabled) +{ + watchdog_nmi_reconfigure(false); + watchdog_nmi_reconfigure(true); +} #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) @@ -599,7 +613,6 @@ static void proc_watchdog_update(void) /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); - watchdog_nmi_reconfigure(); } /* -- cgit v1.2.3 From 091549858ed881e5f3054374af4f5b1cac681d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:17 +0200 Subject: watchdog/core: Get rid of the racy update loop Letting user space poke directly at variables which are used at run time is stupid and causes a lot of race conditions and other issues. Seperate the user variables and on change invoke the reconfiguration, which then stops the watchdogs, reevaluates the new user value and restarts the watchdogs with the new parameters. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.939985640@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 95 +++++++++++++++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5693afd2b8ea..84886319d7b0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -32,15 +32,17 @@ static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | - NMI_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 1 #else -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 0 #endif -int __read_mostly nmi_watchdog_user_enabled; -int __read_mostly soft_watchdog_user_enabled; -int __read_mostly watchdog_user_enabled; +unsigned long __read_mostly watchdog_enabled; +int __read_mostly watchdog_user_enabled = 1; +int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; +int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; struct cpumask watchdog_allowed_mask __read_mostly; @@ -65,7 +67,7 @@ unsigned int __read_mostly hardlockup_panic = */ void __init hardlockup_detector_disable(void) { - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -75,9 +77,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -132,6 +134,23 @@ void __weak watchdog_nmi_disable(unsigned int cpu) */ void __weak watchdog_nmi_reconfigure(bool run) { } +/** + * lockup_detector_update_enable - Update the sysctl enable bit + * + * Caller needs to make sure that the NMI/perf watchdogs are off, so this + * can't race with watchdog_nmi_disable(). + */ +static void lockup_detector_update_enable(void) +{ + watchdog_enabled = 0; + if (!watchdog_user_enabled) + return; + if (nmi_watchdog_user_enabled) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + if (soft_watchdog_user_enabled) + watchdog_enabled |= SOFT_WATCHDOG_ENABLED; +} + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Global variables, exported for sysctl */ @@ -160,14 +179,14 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; + soft_watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -521,12 +540,13 @@ static void softlockup_unpark_threads(void) softlockup_update_smpboot_threads(); } -static void softlockup_reconfigure_threads(bool enabled) +static void softlockup_reconfigure_threads(void) { watchdog_nmi_reconfigure(false); softlockup_park_all_threads(); set_sample_period(); - if (enabled) + lockup_detector_update_enable(); + if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); watchdog_nmi_reconfigure(true); } @@ -546,6 +566,8 @@ static __init void softlockup_init_threads(void) * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. */ + lockup_detector_update_enable(); + if (!IS_ENABLED(CONFIG_SYSCTL) && !(watchdog_enabled && watchdog_thresh)) return; @@ -559,7 +581,7 @@ static __init void softlockup_init_threads(void) mutex_lock(&watchdog_mutex); softlockup_threads_initialized = true; - softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + softlockup_reconfigure_threads(); mutex_unlock(&watchdog_mutex); } @@ -569,9 +591,10 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static void softlockup_reconfigure_threads(bool enabled) +static void softlockup_reconfigure_threads(void) { watchdog_nmi_reconfigure(false); + lockup_detector_update_enable(); watchdog_nmi_reconfigure(true); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -612,7 +635,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + softlockup_reconfigure_threads(); } /* @@ -630,48 +653,24 @@ static void proc_watchdog_update(void) static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - int *watchdog_param = (int *)table->data; + int err, old, *param = table->data; cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); - /* - * If the parameter is being read return the state of the corresponding - * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the - * run state of the lockup detectors. - */ if (!write) { - *watchdog_param = (watchdog_enabled & which) != 0; + /* + * On read synchronize the userspace interface. This is a + * racy snapshot. + */ + *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } else { + old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err) - goto out; - - /* - * There is a race window between fetching the current value - * from 'watchdog_enabled' and storing the new value. During - * this race window, watchdog_nmi_enable() can sneak in and - * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. - * The 'cmpxchg' detects this race and the loop retries. - */ - do { - old = watchdog_enabled; - /* - * If the parameter value is not zero set the - * corresponding bit(s), else clear it(them). - */ - if (*watchdog_param) - new = old | which; - else - new = old & ~which; - } while (cmpxchg(&watchdog_enabled, old, new) != old); - - if (old != new) + if (!err && old != READ_ONCE(*param)) proc_watchdog_update(); } -out: mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; -- cgit v1.2.3 From 178b9f7a36d2c74a38274b66dd89f53611298a19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:18 +0200 Subject: watchdog/hardlockup/perf: Implement init time perf validation The watchdog tries to create perf events even after it figured out that perf is not functional or the requested event is not supported. That's braindead as this can be done once at init time and if not supported the NMI watchdog can be turned off unconditonally. Implement the perf hardlockup detector functionality for that. This creates a new event create function, which will replace the unholy mess of the existing one in later patches. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.019090547@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 8 ++++++-- kernel/watchdog_hld.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index eee255bc0fd6..72c62a809e92 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -93,14 +93,18 @@ extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_disable(void); extern void hardlockup_detector_perf_cleanup(void); +extern int hardlockup_detector_perf_init(void); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_disable(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } -#if !defined(CONFIG_HAVE_NMI_WATCHDOG) +# if !defined(CONFIG_HAVE_NMI_WATCHDOG) +static inline int hardlockup_detector_perf_init(void) { return -ENODEV; } static inline void arch_touch_nmi_watchdog(void) {} -#endif +# else +static inline int hardlockup_detector_perf_init(void) { return 0; } +# endif #endif void watchdog_nmi_reconfigure(bool run); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 0aa191ee3d51..f7e752e6e9b4 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -238,6 +238,27 @@ out: return 0; } +static int hardlockup_detector_event_create(void) +{ + unsigned int cpu = smp_processor_id(); + struct perf_event_attr *wd_attr; + struct perf_event *evt; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + pr_info("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); + return PTR_ERR(evt); + } + this_cpu_write(watchdog_ev, evt); + return 0; +} + /** * hardlockup_detector_perf_disable - Disable the local event */ @@ -315,3 +336,19 @@ void __init hardlockup_detector_perf_restart(void) perf_event_enable(event); } } + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + int ret = hardlockup_detector_event_create(); + + if (ret) { + pr_info("Perf NMI watchdog permanetely disabled\n"); + } else { + perf_event_release_kernel(this_cpu_read(watchdog_ev)); + this_cpu_write(watchdog_ev, NULL); + } + return ret; +} -- cgit v1.2.3 From a994a3147e4c0c9c50a46e6cace7586254975e20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:19 +0200 Subject: watchdog/hardlockup/perf: Implement init time detection of perf Use the init time detection of the perf NMI watchdog to determine whether the perf NMI watchdog is functional. If not disable it permanentely. It won't come back magically at runtime. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.099799541@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84886319d7b0..fd8a998eb197 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,6 +44,7 @@ int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +int __read_mostly nmi_watchdog_available; struct cpumask watchdog_allowed_mask __read_mostly; static bool softlockup_threads_initialized __read_mostly; @@ -114,6 +115,12 @@ void __weak watchdog_nmi_disable(unsigned int cpu) hardlockup_detector_perf_disable(); } +/* Return 0, if a NMI watchdog is available. Error code otherwise */ +int __weak __init watchdog_nmi_probe(void) +{ + return hardlockup_detector_perf_init(); +} + /** * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs * @run: If false stop the watchdogs on all enabled CPUs @@ -145,7 +152,7 @@ static void lockup_detector_update_enable(void) watchdog_enabled = 0; if (!watchdog_user_enabled) return; - if (nmi_watchdog_user_enabled) + if (nmi_watchdog_available && nmi_watchdog_user_enabled) watchdog_enabled |= NMI_WATCHDOG_ENABLED; if (soft_watchdog_user_enabled) watchdog_enabled |= SOFT_WATCHDOG_ENABLED; @@ -692,6 +699,8 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (!nmi_watchdog_available && write) + return -ENOTSUPP; return proc_watchdog_common(NMI_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); } @@ -764,5 +773,7 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif + if (!watchdog_nmi_probe()) + nmi_watchdog_available = true; softlockup_init_threads(); } -- cgit v1.2.3 From 2a1b8ee4f5665b4291e43e4a25d964c3eb2f4c32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:20 +0200 Subject: watchdog/hardlockup/perf: Implement CPU enable replacement watchdog_nmi_enable() is an unparseable mess, Provide a clean perf specific implementation, which will be used when the existing setup/teardown mess is replaced. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.180215498@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 2 ++ kernel/watchdog_hld.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 72c62a809e92..89ba8b23c6fe 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -92,12 +92,14 @@ extern void arch_touch_nmi_watchdog(void); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_disable(void); +extern void hardlockup_detector_perf_enable(void); extern void hardlockup_detector_perf_cleanup(void); extern int hardlockup_detector_perf_init(void); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_disable(void) { } +static inline void hardlockup_detector_perf_enable(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } # if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline int hardlockup_detector_perf_init(void) { return -ENODEV; } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index f7e752e6e9b4..99a3f22e48cc 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -259,6 +259,17 @@ static int hardlockup_detector_event_create(void) return 0; } +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ + if (hardlockup_detector_event_create()) + return; + + perf_event_enable(this_cpu_read(watchdog_ev)); +} + /** * hardlockup_detector_perf_disable - Disable the local event */ -- cgit v1.2.3 From 146c9d0e9dfdb62ed6afd43cc263efafbbfd1dcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:21 +0200 Subject: watchdog/hardlockup/perf: Use new perf CPU enable mechanism Get rid of the hodgepodge which tries to be smart about perf being unavailable and error printout rate limiting. That's all not required simply because this is never invoked when the perf NMI watchdog is not functional. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.259651788@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 4 ++- kernel/watchdog_hld.c | 88 +++------------------------------------------------ 2 files changed, 8 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fd8a998eb197..5eb11960e4a2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -107,6 +107,7 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); */ int __weak watchdog_nmi_enable(unsigned int cpu) { + hardlockup_detector_perf_enable(); return 0; } @@ -465,7 +466,8 @@ static void watchdog_enable(unsigned int cpu) /* Initialize timestamp */ __touch_watchdog(); /* Enable the perf event */ - watchdog_nmi_enable(cpu); + if (watchdog_enabled & NMI_WATCHDOG_ENABLED) + watchdog_nmi_enable(cpu); watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 99a3f22e48cc..509bb6b59c41 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -25,7 +25,7 @@ static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; -static bool hardlockup_detector_disabled; +static unsigned int watchdog_cpus; void arch_touch_nmi_watchdog(void) { @@ -160,84 +160,6 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long firstcpu_err; -static atomic_t watchdog_cpus; - -int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - int firstcpu = 0; - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* A failure disabled the hardlockup detector permanently */ - if (hardlockup_detector_disabled) - return -ENODEV; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - if (atomic_inc_return(&watchdog_cpus) == 1) - firstcpu = 1; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save the first cpu's error for future comparision */ - if (firstcpu && IS_ERR(event)) - firstcpu_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for the first cpu initialized */ - if (firstcpu || firstcpu_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* skip displaying the same error again */ - if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Disabling hard lockup detector permanently\n"); - hardlockup_detector_disabled = true; - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - static int hardlockup_detector_event_create(void) { unsigned int cpu = smp_processor_id(); @@ -267,6 +189,9 @@ void hardlockup_detector_perf_enable(void) if (hardlockup_detector_event_create()) return; + if (!watchdog_cpus++) + pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + perf_event_enable(this_cpu_read(watchdog_ev)); } @@ -282,10 +207,7 @@ void hardlockup_detector_perf_disable(void) this_cpu_write(watchdog_ev, NULL); this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); - - /* watchdog_nmi_enable() expects this to be zero initially. */ - if (atomic_dec_and_test(&watchdog_cpus)) - firstcpu_err = 0; + watchdog_cpus--; } } -- cgit v1.2.3 From a33d44843d4574ec05bec39527d8a87b7af2072c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:22 +0200 Subject: watchdog/hardlockup/perf: Simplify deferred event destroy Now that all functionality is properly serialized against CPU hotplug, remove the extra per cpu storage which holds the disabled events for cleanup. The core makes sure that cleanup happens before new events are created. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.340708074@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 509bb6b59c41..b2931154b5f2 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,7 +21,6 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -204,8 +203,6 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); - this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); watchdog_cpus--; } @@ -221,9 +218,9 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); + struct perf_event *event = per_cpu(watchdog_ev, cpu); - per_cpu(dead_event, cpu) = NULL; + per_cpu(watchdog_ev, cpu) = NULL; perf_event_release_kernel(event); } cpumask_clear(&dead_events_mask); -- cgit v1.2.3 From ab5fe3ff38ff9653490910cc71dbbedc95a86e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:23 +0200 Subject: watchdog/hardlockup: Clean up hotplug locking mess All watchdog thread related functions are delegated to the smpboot thread infrastructure, which handles serialization against CPU hotplug correctly. The sysctl interface is completely decoupled from anything which requires CPU hotplug protection. No need to protect the sysctl writes against cpu hotplug anymore. Remove it and add the now required protection to the powerpc arch_nmi_watchdog implementation. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170912194148.418497420@linutronix.de Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/watchdog.c | 2 ++ kernel/watchdog.c | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 291af79a9826..dfb067764480 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -359,6 +359,7 @@ void watchdog_nmi_reconfigure(bool run) { int cpu; + cpus_read_lock(); if (!run) { for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); @@ -367,6 +368,7 @@ void watchdog_nmi_reconfigure(bool run) for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); } + cpus_read_unlock(); } /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5eb11960e4a2..f6ef163b72cd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -664,7 +664,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, { int err, old, *param = table->data; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); if (!write) { @@ -681,7 +680,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, proc_watchdog_update(); } mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } @@ -725,7 +723,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); old = READ_ONCE(watchdog_thresh); @@ -735,7 +732,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, proc_watchdog_update(); mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } @@ -750,7 +746,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); @@ -758,7 +753,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, proc_watchdog_update(); mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From c4fa6c43ce4b427350cfbb659436bfe3d9e09a1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 21 Sep 2017 09:54:13 -0400 Subject: cgroup: Reinit cgroup_taskset structure before cgroup_migrate_execute() returns The cgroup_taskset structure within the larger cgroup_mgctx structure is supposed to be used once and then discarded. That is not really the case in the hotplug code path: cpuset_hotplug_workfn() - cgroup_transfer_tasks() - cgroup_migrate() - cgroup_migrate_add_task() - cgroup_migrate_execute() In this case, the cgroup_migrate() function is called multiple time with the same cgroup_mgctx structure to transfer the tasks from one cgroup to another one-by-one. The second time cgroup_migrate() is called, the cgroup_taskset will be in an incorrect state and so may cause the system to panic. For example, [ 150.888410] Faulting instruction address: 0xc0000000001db648 [ 150.888414] Oops: Kernel access of bad area, sig: 11 [#1] [ 150.888417] SMP NR_CPUS=2048 [ 150.888417] NUMA [ 150.888419] pSeries : [ 150.888545] NIP [c0000000001db648] cpuset_can_attach+0x58/0x1b0 [ 150.888548] LR [c0000000001db638] cpuset_can_attach+0x48/0x1b0 [ 150.888551] Call Trace: [ 150.888554] [c0000005f65cb940] [c0000000001db638] cpuset_can_attach+0x48/0x1b 0 (unreliable) [ 150.888559] [c0000005f65cb9a0] [c0000000001cff04] cgroup_migrate_execute+0xc4/0x4b0 [ 150.888563] [c0000005f65cba20] [c0000000001d7d14] cgroup_transfer_tasks+0x1d4/0x370 [ 150.888568] [c0000005f65cbb70] [c0000000001ddcb0] cpuset_hotplug_workfn+0x710/0x8f0 [ 150.888572] [c0000005f65cbc80] [c00000000012032c] process_one_work+0x1ac/0x4d0 [ 150.888576] [c0000005f65cbd20] [c0000000001206f8] worker_thread+0xa8/0x5b0 [ 150.888580] [c0000005f65cbdc0] [c0000000001293f8] kthread+0x168/0x1b0 [ 150.888584] [c0000005f65cbe30] [c00000000000b368] ret_from_kernel_thread+0x5c/0x74 To allow reuse of the cgroup_mgctx structure, some fields in that structure are now re-initialized at the end of cgroup_migrate_execute() function call so that the structure can be reused again in a later iteration without causing problem. This bug was introduced in the commit e595cd706982 ("group: track migration context in cgroup_mgctx") in 4.11. This commit moves the cgroup_taskset initialization out of cgroup_migrate(). The commit 10467270fb3 ("cgroup: don't call migration methods if there are no tasks to migrate") helped, but did not completely resolve the problem. Fixes: e595cd706982bff0211e6fafe5a108421e747fbc ("group: track migration context in cgroup_mgctx") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.11+ --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..44857278eb8a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2311,6 +2311,14 @@ out_release_tset: list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); + + /* + * Re-initialize the cgroup_taskset structure in case it is reused + * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() + * iteration. + */ + tset->nr_tasks = 0; + tset->csets = &tset->src_csets; return ret; } -- cgit v1.2.3 From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 17:48:06 +0200 Subject: futex: Fix pi_state->owner serialization There was a reported suspicion about a race between exit_pi_state_list() and put_pi_state(). The same report mentioned the comment with put_pi_state() said it should be called with hb->lock held, and it no longer is in all places. As it turns out, the pi_state->owner serialization is indeed broken. As per the new rules: 734009e96d19 ("futex: Change locking rules") pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock. For the sites setting pi_state->owner we already hold wait_lock (where required) but exit_pi_state_list() and put_pi_state() were not and raced on clearing it. Fixes: 734009e96d19 ("futex: Change locking rules") Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net --- kernel/futex.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..0518a0bfc746 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state) /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. - * - * Must be called with the hb lock held. */ static void put_pi_state(struct futex_pi_state *pi_state) { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + struct task_struct *owner; - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) + if (current->pi_state_cache) { kfree(pi_state); - else { + } else { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); continue; } @@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock(&curr->pi_lock); get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ pi_state->owner = p; raw_spin_unlock_irq(&p->pi_lock); @@ -2878,6 +2888,7 @@ retry: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); put_pi_state(pi_state); -- cgit v1.2.3 From 2827a418ca1b23e432e62c9b3d0e7cf3255dfe88 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 19 Sep 2017 22:04:12 +0200 Subject: genirq: Check __free_irq() return value for NULL __free_irq() can return a NULL irqaction for example when trying to free already-free IRQ, but the callsite unconditionally dereferences the returned pointer. Fix this by adding a check and return NULL. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170919200412.GA29985@gmail.com --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) #endif action = __free_irq(irq, dev_id); + + if (!action) + return NULL; + devname = action->name; kfree(action); return devname; -- cgit v1.2.3 From 115ef3b7e61ac64e32827611a127002672ed3725 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 25 Sep 2017 20:21:54 +0200 Subject: watchdog/hardlockup/perf: Cure UP damage for_each_cpu() unintuitively reports CPU0 as set independend of the actual cpumask content on UP kernels. That leads to a NULL pointer dereference when the cleanup function is invoked and there is no event to clean up. Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner --- kernel/watchdog_hld.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index b2931154b5f2..204a8cadb717 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -220,8 +220,13 @@ void hardlockup_detector_perf_cleanup(void) for_each_cpu(cpu, &dead_events_mask) { struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* + * Required because for_each_cpu() reports unconditionally + * CPU0 as set on UP kernels. Sigh. + */ + if (event) + perf_event_release_kernel(event); per_cpu(watchdog_ev, cpu) = NULL; - perf_event_release_kernel(event); } cpumask_clear(&dead_events_mask); } -- cgit v1.2.3 From 7755d83e48397e822aac751b1545f8bcf71d133e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: irqdomain: Add __rcu annotations to radix tree accessors Fix various address spaces warning of sparse. kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/1506082841-11530-1-git-send-email-yamada.masahiro@socionext.com --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ -- cgit v1.2.3 From 96abb968549cdefd0964d1f7af0a79f4e6e7f897 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:16 +0200 Subject: smp/hotplug: Allow external multi-instance rollback Currently the rollback of multi-instance states is handled inside cpuhp_invoke_callback(). The problem is that when we want to allow an explicit state change for rollback, we need to return from the function without doing the rollback. Change cpuhp_invoke_callback() to optionally return the multi-instance state, such that rollback can be done from a subsequent call. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.720361181@infradead.org --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..323b71050b54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -123,13 +123,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked - * @step: The step in the state machine + * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked + * @node: For multi-instance, do a single entry callback for install/remove + * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup, struct hlist_node *node) + bool bringup, struct hlist_node *node, + struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); @@ -138,6 +141,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { + WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; @@ -152,6 +156,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* Single invocation for instance add/remove */ if (node) { + WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +166,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { + if (lastp && node == *lastp) + break; + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); - if (ret) - goto err; + if (ret) { + if (!lastp) + goto err; + + *lastp = node; + return ret; + } cnt++; } + if (lastp) + *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ @@ -323,7 +338,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -334,7 +349,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -350,7 +365,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } } @@ -362,7 +377,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -428,11 +443,13 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -472,7 +489,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup, node); + return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->cb_state = state; st->single = true; @@ -595,7 +612,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -776,7 +793,7 @@ void notify_cpu_starting(unsigned int cpu) rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -1307,9 +1324,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; -- cgit v1.2.3 From 4dddfb5faa6118564b0c54a163353d13882299d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:17 +0200 Subject: smp/hotplug: Rewrite AP state machine core There is currently no explicit state change on rollback. That is, st->bringup, st->rollback and st->target are not consistent when doing the rollback. Rework the AP state handling to be more coherent. This does mean we have to do a second AP kick-and-wait for rollback, but since rollback is the slow path of a slowpath, this really should not matter. Take this opportunity to simplify the AP thread function to only run a single callback per invocation. This unifies the three single/up/down modes is supports. The looping it used to do for up/down are achieved by retaining should_run and relying on the main smpboot_thread_fn() loop. (I have most of a patch that does the same for the BP state handling, but that's not critical and gets a little complicated because CPUHP_BRINGUP_CPU does the AP handoff from a callback, which gets recursive @st usage, I still have de-fugly that.) [ tglx: Move cpuhp_down_callbacks() et al. into the HOTPLUG_CPU section to avoid gcc complaining about unused functions. Make the HOTPLUG_CPU one piece instead of having two consecutive ifdef sections of the same type. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.769658088@infradead.org --- kernel/cpu.c | 321 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 323b71050b54..1139063de5af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,6 +58,7 @@ struct cpuhp_cpu_state { bool single; bool bringup; struct hlist_node *node; + struct hlist_node *last; enum cpuhp_state cb_state; int result; struct completion done; @@ -112,6 +113,14 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -286,7 +295,72 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + + st->rollback = false; + st->last = NULL; + + st->target = target; + st->single = false; + st->bringup = st->state < target; + + return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ + st->rollback = true; + + /* + * If we have st->last we need to undo partial multi_instance of this + * state first. Otherwise start undo at the previous state. + */ + if (!st->last) { + if (st->bringup) + st->state--; + else + st->state++; + } + + st->target = prev_state; + st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ + if (!st->single && st->state == st->target) + return; + + st->result = 0; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state; + int ret; + + prev_state = cpuhp_set_state(st, target); + __cpuhp_kick_ap(st); + if ((ret = st->result)) { + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); + } + + return ret; +} static int bringup_wait_for_ap(unsigned int cpu) { @@ -301,12 +375,10 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) { - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - } - return st->result; + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -332,32 +404,6 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); - } -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - enum cpuhp_state target) -{ - enum cpuhp_state prev_state = st->state; - int ret = 0; - - for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } - return ret; -} static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { @@ -404,71 +450,90 @@ static int cpuhp_should_run(unsigned int cpu) return st->should_run; } -/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - - return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - return cpuhp_up_callbacks(cpu, st, st->target); -} - /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + * - single: runs st->cb_state + * - up: runs ++st->state, while st->state < st->target + * - down: runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - int ret = 0; + bool bringup = st->bringup; + enum cpuhp_state state; /* - * Paired with the mb() in cpuhp_kick_ap_work and - * cpuhp_invoke_ap_callback, so the work set is consistent visible. + * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures + * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (!st->should_run) - return; - st->should_run = false; + if (WARN_ON_ONCE(!st->should_run)) + return; lock_map_acquire(&cpuhp_state_lock_map); - /* Single callback invocation for [un]install ? */ + if (st->single) { - if (st->cb_state < CPUHP_AP_ONLINE) { - local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); - local_irq_enable(); + state = st->cb_state; + st->should_run = false; + } else { + if (bringup) { + st->state++; + state = st->state; + st->should_run = (st->state < st->target); + WARN_ON_ONCE(st->state > st->target); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); + state = st->state; + st->state--; + st->should_run = (st->state > st->target); + WARN_ON_ONCE(st->state < st->target); } - } else if (st->rollback) { - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + } + + WARN_ON_ONCE(!cpuhp_is_ap_state(state)); + + if (st->rollback) { + struct cpuhp_step *step = cpuhp_get_step(state); + if (step->skip_onerr) + goto next; + } + + if (cpuhp_is_atomic_state(state)) { + local_irq_disable(); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + local_irq_enable(); - undo_cpu_down(cpu, st); - st->rollback = false; + /* + * STARTING/DYING must not fail! + */ + WARN_ON_ONCE(st->result); } else { - /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - /* Regular hotplug work */ - if (st->state < st->target) - ret = cpuhp_ap_online(cpu, st); - else if (st->state > st->target) - ret = cpuhp_ap_offline(cpu, st); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + } + + if (st->result) { + /* + * If we fail on a rollback, we're up a creek without no + * paddle, no way forward, no way back. We loose, thanks for + * playing. + */ + WARN_ON_ONCE(st->rollback); + st->should_run = false; } + +next: lock_map_release(&cpuhp_state_lock_map); - st->result = ret; - complete(&st->done); + + if (!st->should_run) + complete(&st->done); } /* Invoke a single callback on a remote cpu */ @@ -477,6 +542,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; if (!cpu_online(cpu)) return 0; @@ -491,48 +557,43 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + st->rollback = false; + st->last = NULL; + + st->node = node; + st->bringup = bringup; st->cb_state = state; st->single = true; - st->bringup = bringup; - st->node = node; - /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() - */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); - wait_for_completion(&st->done); - return st->result; -} + __cpuhp_kick_ap(st); -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ - st->result = 0; - st->single = false; /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() + * If we failed and did a partial, do a rollback. */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); + if ((ret = st->result) && st->last) { + st->rollback = true; + st->bringup = !bringup; + + __cpuhp_kick_ap(st); + } + + return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; + enum cpuhp_state prev_state = st->state; + int ret; - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); lock_map_acquire(&cpuhp_state_lock_map); lock_map_release(&cpuhp_state_lock_map); - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - trace_cpuhp_exit(cpu, st->state, state, st->result); - return st->result; + + trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); + ret = cpuhp_kick_ap(st, st->target); + trace_cpuhp_exit(cpu, st->state, prev_state, ret); + + return ret; } static struct smp_hotplug_thread cpuhp_threads = { @@ -693,11 +754,32 @@ void cpuhp_report_idle_dead(void) cpuhp_complete_idle_dead, st, 0); } -#else -#define takedown_cpu NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + return ret; +} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -716,13 +798,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; - st->target = target; + prev_state = cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { + st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -737,6 +819,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; + + st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -744,9 +828,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { - st->target = prev_state; - st->rollback = true; - cpuhp_kick_ap_work(cpu); + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); } out: @@ -771,11 +854,15 @@ out: cpu_maps_update_done(); return err; } + int cpu_down(unsigned int cpu) { return do_cpu_down(cpu, CPUHP_OFFLINE); } EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** @@ -846,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - st->target = target; + cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1313,6 +1400,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; + /* + * If there's nothing to do, we done. + * Relies on the union for multi_instance. + */ if ((bringup && !sp->startup.single) || (!bringup && !sp->teardown.single)) return 0; -- cgit v1.2.3 From 724a86881d03ee5794148e65142e24ed3621be66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:18 +0200 Subject: smp/hotplug: Callback vs state-machine consistency While the generic callback functions have an 'int' return and thus appear to be allowed to return error, this is not true for all states. Specifically, what used to be STARTING/DYING are ran with IRQs disabled from critical parts of CPU bringup/teardown and are not allowed to fail. Add WARNs to enforce this rule. But since some callbacks are indeed allowed to fail, we have the situation where a state-machine rollback encounters a failure, in this case we're stuck, we can't go forward and we can't go back. Also add a WARN for that case. AFAICT this is a fundamental 'problem' with no real obvious solution. We want the 'prepare' callbacks to allow failure on either up or down. Typically on prepare-up this would be things like -ENOMEM from resource allocations, and the typical usage in prepare-down would be something like -EBUSY to avoid CPUs being taken away. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.819539119@infradead.org --- kernel/cpu.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1139063de5af..d6f1b8c36400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -202,7 +202,14 @@ err: hlist_for_each(node, &step->list) { if (!cnt--) break; - cbm(cpu, node); + + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + /* + * Rollback must not fail, + */ + WARN_ON_ONCE(ret); } return ret; } @@ -659,6 +666,7 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); + int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -672,8 +680,13 @@ static int take_cpu_down(void *_param) WARN_ON(st->state != CPUHP_TEARDOWN_CPU); st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + /* + * DYING must not fail! + */ + WARN_ON_ONCE(ret); + } /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -876,11 +889,16 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + /* + * STARTING must not fail! + */ + WARN_ON_ONCE(ret); } } -- cgit v1.2.3 From 5f4b55e10645b7371322c800a5ec745cab487a6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:20 +0200 Subject: smp/hotplug: Differentiate the AP-work lockdep class between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. CPU0 CPU1 CPU2 cpuhp_up_callbacks: takedown_cpu: cpuhp_thread_fun: cpuhp_state irq_lock_sparse() irq_lock_sparse() wait_for_completion() cpuhp_state complete() Now that we have consistent AP state, we can trivially separate the AP-work class between up and down using st->bringup. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.922524234@infradead.org --- kernel/cpu.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d6f1b8c36400..d5c09985fbb6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -68,9 +68,26 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = - STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct lockdep_map cpuhp_state_up_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ + lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ + lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } + #endif /** @@ -486,7 +503,7 @@ static void cpuhp_thread_fun(unsigned int cpu) if (WARN_ON_ONCE(!st->should_run)) return; - lock_map_acquire(&cpuhp_state_lock_map); + cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; @@ -537,7 +554,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } next: - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_release(bringup); if (!st->should_run) complete(&st->done); @@ -554,8 +571,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls @@ -593,8 +613,11 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state prev_state = st->state; int ret; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(st, st->target); -- cgit v1.2.3 From 5ebe7742fff8be5f1359bc50f5d43fb6ff7bd060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:19 +0200 Subject: smp/hotplug: Differentiate the AP completion between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. takedown_cpu() irq_lock_sparse() wait_for_completion(&st->done) cpuhp_thread_fun cpuhp_up_callback cpuhp_invoke_callback irq_affinity_online_cpu irq_local_spare() irq_unlock_sparse() complete(&st->done) Now that we have consistent AP state, we can trivially separate the AP completion between up and down using st->bringup. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.872472799@infradead.org --- kernel/cpu.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d5c09985fbb6..6bbe261b851f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -46,7 +46,8 @@ * @bringup: Single callback bringup or teardown selector * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation - * @done: Signal completion to the issuer of the task + * @done_up: Signal completion to the issuer of the task for cpu-up + * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; @@ -61,7 +62,8 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; - struct completion done; + struct completion done_up; + struct completion done_down; #endif }; @@ -130,14 +132,6 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -232,6 +226,26 @@ err: } #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -368,7 +382,7 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) smp_mb(); st->should_run = true; wake_up_process(st->thread); - wait_for_completion(&st->done); + wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) @@ -391,7 +405,7 @@ static int bringup_wait_for_ap(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; @@ -464,7 +478,8 @@ static void cpuhp_create(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - init_completion(&st->done); + init_completion(&st->done_up); + init_completion(&st->done_down); } static int cpuhp_should_run(unsigned int cpu) @@ -557,7 +572,7 @@ next: cpuhp_lock_release(bringup); if (!st->should_run) - complete(&st->done); + complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ @@ -753,7 +768,7 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -772,7 +787,7 @@ static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; - complete(&st->done); + complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) @@ -939,7 +954,7 @@ void cpuhp_online_idle(enum cpuhp_state state) return; st->state = CPUHP_AP_ONLINE_IDLE; - complete(&st->done); + complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ -- cgit v1.2.3 From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- include/linux/cpuhotplug.h | 3 ++- kernel/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 477b2e6f60f7..6d508767e144 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,7 +22,8 @@ */ enum cpuhp_state { - CPUHP_OFFLINE, + CPUHP_INVALID = -1, + CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, diff --git a/kernel/cpu.c b/kernel/cpu.c index 6bbe261b851f..8de11a29e495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -67,7 +68,9 @@ struct cpuhp_cpu_state { #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = @@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; @@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. + */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; -- cgit v1.2.3 From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Thu, 28 Sep 2017 12:37:31 +0800 Subject: irq/generic-chip: Don't replace domain's name When generic irq chips are allocated for an irq domain the domain name is set to the irq chip name. That was done to have named domains before the recent changes which enforce domain naming were done. Since then the overwrite causes a memory leak when the domain name is dynamically allocated and even worse it would cause the domain free code to free the wrong name pointer, which might point to a constant. Remove the name assignment to prevent this. Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } - d->name = name; return 0; } EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); -- cgit v1.2.3 From 77c01d11bbb2b5c005347061bf543ab94878314c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 26 Sep 2017 10:36:03 +0100 Subject: watchdog/hardlockup/perf: Fix spelling mistake: "permanetely" -> "permanently" Trivial fix to spelling mistake in pr_info message Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Acked-by: Don Zickus Cc: Andrew Morton Cc: Babu Moger Link: https://lkml.kernel.org/r/20170926093603.7756-1-colin.king@canonical.com --- kernel/watchdog_hld.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 204a8cadb717..71a62ceacdc8 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -280,7 +280,7 @@ int __init hardlockup_detector_perf_init(void) int ret = hardlockup_detector_event_create(); if (ret) { - pr_info("Perf NMI watchdog permanetely disabled\n"); + pr_info("Perf NMI watchdog permanently disabled\n"); } else { perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); -- cgit v1.2.3 From 87cbde8d9081b91df86a21d0d743cd700e04890a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 01:45:10 +0200 Subject: PM / s2idle: Invoke the ->wake() platform callback earlier The role of the ->wake() platform callback for suspend-to-idle is to deal with possible spurious wakeups, among other things. The ACPI implementation of it, acpi_s2idle_wake(), additionally checks the conditions for entering the Low Power S0 Idle state by the platform and reports the ones that have not been met. However, the ->wake() platform callback is invoked after calling dpm_noirq_resume_devices(), which means that the power states of some devices may have changed since s2idle_enter() returned, so some unmet Low Power S0 Idle conditions may be reported incorrectly as a result of that. To avoid these false positives, reorder the invocations of the dpm_noirq_resume_devices() routine and the ->wake() platform callback in s2idle_loop(). Fixes: 726fb6b4f2a8 (ACPI / PM: Check low power idle constraints for debug only) Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -120,22 +120,26 @@ static void s2idle_loop(void) * frozen processes + suspended devices + idle processors. * Thus s2idle_enter() should be called right after * all devices have been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, + * so prevent them from terminating the loop right away. */ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) s2idle_enter(); + else if (error == -EBUSY && pm_wakeup_pending()) + error = 0; - dpm_noirq_resume_devices(PMSG_RESUME); - if (error && (error != -EBUSY || !pm_wakeup_pending())) { - dpm_noirq_end(); - break; - } - - if (s2idle_ops && s2idle_ops->wake) + if (!error && s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); + if (error) + break; + if (s2idle_ops && s2idle_ops->sync) s2idle_ops->sync(); -- cgit v1.2.3 From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit v1.2.3 From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Thu, 7 Sep 2017 20:00:58 +0530 Subject: locking/rwsem-xadd: Fix missed wakeup due to reordering of load If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -612,6 +612,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); + /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize -- cgit v1.2.3 From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2017 13:43:15 -0400 Subject: fix infoleak in waitid(2) kernel_waitid() can return a PID, an error or 0. rusage is filled in the first case and waitid(2) rusage should've been copied out exactly in that case, *not* whenever kernel_waitid() has not returned an error. Compat variant shares that braino; none of kernel_wait4() callers do, so the below ought to fix it. Reported-and-tested-by: Alexander Potapenko Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland") Cc: stable@vger.kernel.org # v4.13 Signed-off-by: Al Viro --- kernel/exit.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f2cd53e92147 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; + if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } @@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err && uru) { - /* kernel_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - err = copy_to_user(uru, &ru, sizeof(ru)); - else - err = put_compat_rusage(&ru, uru); - if (err) - return -EFAULT; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } } if (!infop) -- cgit v1.2.3 From 2b0b8499ae75df91455bbeb7491d45affc384fb0 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 12 Sep 2017 10:14:54 +0800 Subject: ftrace: Fix kmemleak in unregister_ftrace_graph The trampoline allocated by function tracer was overwriten by function_graph tracer, and caused a memory leak. The save_global_trampoline should have saved the previous trampoline in register_ftrace_graph() and restored it in unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was only used in unregister_ftrace_graph as default value 0, and it overwrote the previous trampoline's value. Causing the previous allocated trampoline to be lost. kmmeleak backtrace: kmemleak_vmalloc+0x77/0xc0 __vmalloc_node_range+0x1b5/0x2c0 module_alloc+0x7c/0xd0 arch_ftrace_update_trampoline+0xb5/0x290 ftrace_startup+0x78/0x210 register_ftrace_function+0x8b/0xd0 function_trace_init+0x4f/0x80 tracing_set_tracer+0xe6/0x170 tracing_set_trace_write+0x90/0xd0 __vfs_write+0x37/0x170 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 return_from_SYSCALL_64+0x0/0x6a [ Looking further into this, I found that this was left over from when the function and function graph tracers shared the same ftrace_ops. But in commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer together"), the two were separated, and the save_global_trampoline no longer was necessary (and it may have been broken back then too). -- Steven Rostedt ] Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com Cc: stable@vger.kernel.org Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together") Signed-off-by: Shu Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From f39b536ce9248e9799ff900358d6f073ab2e6c55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Sep 2017 20:19:02 -0700 Subject: rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}() The read of ->dynticks_nmi_nesting in rcu_irq_enter() and rcu_irq_exit() is currently protected with READ_ONCE(). However, this protection is unnecessary because (1) ->dynticks_nmi_nesting is updated only by the current CPU, (2) Although NMI handlers can update this field, they reset it back to its old value before return, and (3) Interrupts are disabled, so nothing else can modify it. The value of ->dynticks_nmi_nesting is thus effectively constant, and so no protection is required. This commit therefore removes the READ_ONCE() protection from these two accesses. Link: http://lkml.kernel.org/r/20170926031902.GA2074@linux.vnet.ibm.com Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 63bee8e1b193..c03152f7e458 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -890,7 +890,7 @@ void rcu_irq_exit(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -1027,7 +1027,7 @@ void rcu_irq_enter(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; oldval = rdtp->dynticks_nesting; -- cgit v1.2.3 From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Oct 2017 15:37:20 -0700 Subject: bpf: fix bpf_tail_call() x64 JIT - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 4 ++-- include/uapi/linux/bpf.h | 2 +- kernel/bpf/core.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c9573660d51..0554e8aef4d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..f90860d1f897 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,7 +312,7 @@ union bpf_attr { * jump into another BPF program * @ctx: context pointer passed to next program * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run + * @index: 32-bit index inside array that selects specific program to run * Return: 0 on success or negative error * * int bpf_clone_redirect(skb, ifindex, flags) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..7b62df86be1d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1022,7 +1022,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; -- cgit v1.2.3 From 630cc2b30a42c70628368a412beb4a5e5dd71abe Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:14:18 -0700 Subject: kernel/params.c: align add_sysfs_param documentation with code This parameter is named kp, so the documentation should use that. Fixes: 9b473de87209 ("param: Fix duplicate module prefixes") Link: http://lkml.kernel.org/r/20170919142656.64aea59e@endymion Signed-off-by: Jean Delvare Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..1cd8f1a895a8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs + * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and -- cgit v1.2.3 From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 3 Oct 2017 16:15:00 -0700 Subject: android: binder: drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. Also restore mmput_async, which was initially introduced in commit ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom reaper context"), and was removed in commit 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently"). Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Signed-off-by: Sherry Yang Signed-off-by: Greg Kroah-Hartman Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Acked-by: Michal Hocko Cc: Martijn Coenen Cc: Todd Kjos Cc: Riley Andrews Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Hoeun Ryu Cc: Christopher Lameter Cc: Vegard Nossum Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/android/binder_alloc.c | 18 ++++++++++++------ include/linux/sched/mm.h | 6 ++++++ kernel/fork.c | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 8fe165844e47..064f5e31ec55 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + + list_lru_isolate(lru, item); + spin_unlock(lock); + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE); @@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3a19c253bdb1..ae53e413fb13 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. Can + * be called from the atomic context as well + */ +void mmput_async(struct mm_struct *); +#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..e702cb9ffbd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, + async_put_work); + + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} +#endif + /** * set_mm_exe_file - change a reference to the mm's executable file * -- cgit v1.2.3 From 3181c38e4df257852a0c0a53552fd5c869402886 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 3 Oct 2017 16:16:07 -0700 Subject: kernel/sysctl.c: remove duplicate UINT_MAX check on do_proc_douintvec_conv() do_proc_douintvec_conv() has two UINT_MAX checks, we can remove one. This has no functional changes other than fixing a compiler warning: kernel/sysctl.c:2190]: (warning) Identical condition '*lvalp>UINT_MAX', second condition is always false Fixes: 4f2fec00afa60 ("sysctl: simplify unsigned int support") Link: http://lkml.kernel.org/r/20170919072918.12066-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reported-by: David Binderman Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 423554ad3610..4da9e622471f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2186,8 +2186,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; -- cgit v1.2.3 From 1fdcce6e16c54facc4f0688630d3b9ecfcaa411f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:23 -0700 Subject: memremap: add scheduling point to devm_memremap_pages devm_memremap_pages is initializing struct pages in for_each_device_pfn and that can take quite some time. We have even seen a soft lockup triggering on a non preemptive kernel NMI watchdog: BUG: soft lockup - CPU#61 stuck for 22s! [kworker/u641:11:1808] [...] RIP: 0010:[] [] devm_memremap_pages+0x327/0x430 [...] Call Trace: pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 fix this by adding cond_resched every 1024 pages. Link: http://lkml.kernel.org/r/20170918121410.24466-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; - int error, nid, is_ram; + int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, list_del(&page->lru); page->pgmap = pgmap; percpu_ref_get(ref); + if (!(++i % 1024)) + cond_resched(); } devres_add(dev, page_map); return __va(res->start); -- cgit v1.2.3 From c9653850c90d6050197bc76ba672e00a7771aea5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 3 Oct 2017 16:16:26 -0700 Subject: kernel/kcmp.c: drop branch leftover typo The else branch been left over and escaped the source code refresh. Not a problem but better clean it up. Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files") Link: http://lkml.kernel.org/r/20170917165838.GA1887@uranus.lan Reported-by: Eugene Syromiatnikov Signed-off-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..055bb2962a0b 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1, if (filp_epoll) { filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); - } else + } if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); -- cgit v1.2.3 From 90ceb2a3ad868f800eb1c9f4ede650daddd94b77 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:35 -0700 Subject: kernel/params.c: fix the maximum length in param_get_string The length parameter of strlcpy() is supposed to reflect the size of the target buffer, not of the source string. Harmless in this case as the buffer is PAGE_SIZE long and the source string is always much shorter than this, but conceptually wrong, so let's fix it. Link: http://lkml.kernel.org/r/20170928162515.24846b4f@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 1cd8f1a895a8..8283ba045f4f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); + return strlcpy(buffer, kps->string, PAGE_SIZE); } EXPORT_SYMBOL(param_get_string); -- cgit v1.2.3 From 96802e6b1dbf29d3012b39503c5dd6d9d8e82955 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:38 -0700 Subject: kernel/params.c: fix an overflow in param_attr_show Function param_attr_show could overflow the buffer it is operating on. The buffer size is PAGE_SIZE, and the string returned by attribute->param->ops->get is generated by scnprintf(buffer, PAGE_SIZE, ...) so it could be PAGE_SIZE - 1 long, with the terminating '\0' at the very end of the buffer. Calling strcat(..., "\n") on this isn't safe, as the '\0' will be replaced by '\n' (OK) and then another '\0' will be added past the end of the buffer (not OK.) Simply add the trailing '\n' when writing the attribute contents to the buffer originally. This is safe, and also faster. Credits to Teradata for discovering this issue. Link: http://lkml.kernel.org/r/20170928162602.60c379c7@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 8283ba045f4f..0cca488263dd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -224,7 +224,7 @@ char *parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return scnprintf(buffer, PAGE_SIZE, format, \ + return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ @@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); + return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); @@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { + /* Replace \n with comma */ if (i) - buffer[off++] = ','; + buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); @@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, PAGE_SIZE); + return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); @@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr, kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } return count; } -- cgit v1.2.3 From e0596c80f442d1e1221c17dbb71b2aed43909221 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:41 -0700 Subject: kernel/params.c: improve STANDARD_PARAM_DEF readability Align the parameters passed to STANDARD_PARAM_DEF for clarity. Link: http://lkml.kernel.org/r/20170928162728.756143cc@endymion Signed-off-by: Jean Delvare Suggested-by: Ingo Molnar Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 0cca488263dd..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -236,14 +236,14 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit v1.2.3 From 6b9dc4806b28214a4a260517e59439e0ac12a15e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Oct 2017 12:34:50 +0200 Subject: watchdog/core, powerpc: Replace watchdog_nmi_reconfigure() The recent cleanup of the watchdog code split watchdog_nmi_reconfigure() into two stages. One to stop the NMI and one to restart it after reconfiguration. That was done by adding a boolean 'run' argument to the code, which is functionally correct but not necessarily a piece of art. Replace it by two explicit functions: watchdog_nmi_stop() and watchdog_nmi_start(). Fixes: 6592ad2fcc8f ("watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage") Requested-by: Linus 'Nursing his pet-peeve' Torvalds Signed-off-by: Thomas 'Mopping up garbage' Gleixner Acked-by: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710021957480.2114@nanos --- arch/powerpc/kernel/watchdog.c | 23 ++++++++++++++--------- include/linux/nmi.h | 3 ++- kernel/watchdog.c | 33 ++++++++++++++++++--------------- 3 files changed, 34 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index dfb067764480..2673ec8bec00 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -355,19 +355,24 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_reconfigure(bool run) +void watchdog_nmi_stop(void) { int cpu; cpus_read_lock(); - if (!run) { - for_each_cpu(cpu, &wd_cpus_enabled) - stop_wd_on_cpu(cpu); - } else { - watchdog_calc_timeouts(); - for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) - start_wd_on_cpu(cpu); - } + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + cpus_read_unlock(); +} + +void watchdog_nmi_start(void) +{ + int cpu; + + cpus_read_lock(); + watchdog_calc_timeouts(); + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); cpus_read_unlock(); } diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 89ba8b23c6fe..0c9ed49fb21a 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -109,7 +109,8 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } # endif #endif -void watchdog_nmi_reconfigure(bool run); +void watchdog_nmi_stop(void); +void watchdog_nmi_start(void); /** * touch_nmi_watchdog - restart NMI watchdog timeout. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f6ef163b72cd..6ad6226535d0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -123,24 +123,27 @@ int __weak __init watchdog_nmi_probe(void) } /** - * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs - * @run: If false stop the watchdogs on all enabled CPUs - * If true start the watchdogs on all enabled CPUs + * watchdog_nmi_stop - Stop the watchdog for reconfiguration * - * The core call order is: - * watchdog_nmi_reconfigure(false); + * The reconfiguration steps are: + * watchdog_nmi_stop(); * update_variables(); - * watchdog_nmi_reconfigure(true); + * watchdog_nmi_start(); + */ +void __weak watchdog_nmi_stop(void) { } + +/** + * watchdog_nmi_start - Start the watchdog after reconfiguration * - * The second call which starts the watchdogs again guarantees that the - * following variables are stable across the call. + * Counterpart to watchdog_nmi_stop(). + * + * The following variables have been updated in update_variables() and + * contain the currently valid configuration: * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - * After the call the variables can be changed again. */ -void __weak watchdog_nmi_reconfigure(bool run) { } +void __weak watchdog_nmi_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit @@ -551,13 +554,13 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(void) { - watchdog_nmi_reconfigure(false); + watchdog_nmi_stop(); softlockup_park_all_threads(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); - watchdog_nmi_reconfigure(true); + watchdog_nmi_start(); } /* @@ -602,9 +605,9 @@ static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { - watchdog_nmi_reconfigure(false); + watchdog_nmi_stop(); lockup_detector_update_enable(); - watchdog_nmi_reconfigure(true); + watchdog_nmi_start(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -- cgit v1.2.3 From e31d6883f21c1cdfe5bc64e28411f8a92b783fde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Oct 2017 16:37:53 +0200 Subject: watchdog/core, powerpc: Lock cpus across reconfiguration Instead of dropping the cpu hotplug lock after stopping NMI watchdog and threads and reaquiring for restart, the code and the protection rules become more obvious when holding cpu hotplug lock across the full reconfiguration. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710022105570.2114@nanos --- arch/powerpc/kernel/watchdog.c | 4 ---- kernel/smpboot.c | 3 +-- kernel/watchdog.c | 10 +++++++++- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2673ec8bec00..f9b4c6352d24 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -359,21 +359,17 @@ void watchdog_nmi_stop(void) { int cpu; - cpus_read_lock(); for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); - cpus_read_unlock(); } void watchdog_nmi_start(void) { int cpu; - cpus_read_lock(); watchdog_calc_timeouts(); for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); - cpus_read_unlock(); } /* diff --git a/kernel/smpboot.c b/kernel/smpboot.c index ed7507b69b48..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -351,7 +351,7 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread static struct cpumask tmp; unsigned int cpu; - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ @@ -367,7 +367,6 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); } static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6ad6226535d0..fff90fe10007 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -535,7 +535,6 @@ static void softlockup_update_smpboot_threads(void) smpboot_update_cpumask_percpu_thread(&watchdog_threads, &watchdog_allowed_mask); - __lockup_detector_cleanup(); } /* Temporarily park all watchdog threads */ @@ -554,6 +553,7 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(void) { + cpus_read_lock(); watchdog_nmi_stop(); softlockup_park_all_threads(); set_sample_period(); @@ -561,6 +561,12 @@ static void softlockup_reconfigure_threads(void) if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); watchdog_nmi_start(); + cpus_read_unlock(); + /* + * Must be called outside the cpus locked section to prevent + * recursive locking in the perf code. + */ + __lockup_detector_cleanup(); } /* @@ -605,9 +611,11 @@ static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { + cpus_read_lock(); watchdog_nmi_stop(); lockup_detector_update_enable(); watchdog_nmi_start(); + cpus_read_unlock(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -- cgit v1.2.3 From 34ddaa3e5c0096fef52485186c7eb6cf56ddc686 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Oct 2017 16:39:02 +0200 Subject: powerpc/watchdog: Make use of watchdog_nmi_probe() The rework of the core hotplug code triggers the WARN_ON in start_wd_cpu() on powerpc because it is called multiple times for the boot CPU. The first call is via: start_wd_on_cpu+0x80/0x2f0 watchdog_nmi_reconfigure+0x124/0x170 softlockup_reconfigure_threads+0x110/0x130 lockup_detector_init+0xbc/0xe0 kernel_init_freeable+0x18c/0x37c kernel_init+0x2c/0x160 ret_from_kernel_thread+0x5c/0xbc And then again via the CPU hotplug registration: start_wd_on_cpu+0x80/0x2f0 cpuhp_invoke_callback+0x194/0x620 cpuhp_thread_fun+0x7c/0x1b0 smpboot_thread_fn+0x290/0x2a0 kthread+0x168/0x1b0 ret_from_kernel_thread+0x5c/0xbc This can be avoided by setting up the cpu hotplug state with nocalls and move the initialization to the watchdog_nmi_probe() function. That initializes the hotplug callbacks without invoking the callback and the following core initialization function then configures the watchdog for the online CPUs (in this case CPU0) via softlockup_reconfigure_threads(). Reported-and-tested-by: Michael Ellerman Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org --- arch/powerpc/kernel/watchdog.c | 17 ++++++++--------- include/linux/nmi.h | 1 + kernel/watchdog.c | 5 ++++- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index f9b4c6352d24..c702a8981452 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -373,22 +373,21 @@ void watchdog_nmi_start(void) } /* - * This runs after lockup_detector_init() which sets up watchdog_cpumask. + * Invoked from core watchdog init. */ -static int __init powerpc_watchdog_init(void) +int __init watchdog_nmi_probe(void) { int err; - watchdog_calc_timeouts(); - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", - start_wd_on_cpu, stop_wd_on_cpu); - if (err < 0) + err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) { pr_warn("Watchdog could not be initialized"); - + return err; + } return 0; } -arch_initcall(powerpc_watchdog_init); static void handle_backtrace_ipi(struct pt_regs *regs) { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 0c9ed49fb21a..27e249ed7c5c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -111,6 +111,7 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } void watchdog_nmi_stop(void); void watchdog_nmi_start(void); +int watchdog_nmi_probe(void); /** * touch_nmi_watchdog - restart NMI watchdog timeout. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fff90fe10007..5c6fb7cd9ae8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -608,7 +608,6 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { cpus_read_lock(); @@ -617,6 +616,10 @@ static void softlockup_reconfigure_threads(void) watchdog_nmi_start(); cpus_read_unlock(); } +static inline void softlockup_init_threads(void) +{ + softlockup_reconfigure_threads(); +} #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) -- cgit v1.2.3 From 5587185ddb4b9f413299dfec0a022ad0212513e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Oct 2017 10:03:04 +0200 Subject: watchdog/core: Rename some softlockup_* functions The function names made sense up to the point where the watchdog (re)configuration was unified to use softlockup_reconfigure_threads() for all configuration purposes. But that includes scenarios which solely configure the nmi watchdog. Rename softlockup_reconfigure_threads() and softlockup_init_threads() so the function names match the functionality. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus --- kernel/watchdog.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5c6fb7cd9ae8..d241bd99cee1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -551,7 +551,7 @@ static void softlockup_unpark_threads(void) softlockup_update_smpboot_threads(); } -static void softlockup_reconfigure_threads(void) +static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -570,13 +570,13 @@ static void softlockup_reconfigure_threads(void) } /* - * Create the watchdog thread infrastructure. + * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When * the threads are sucessfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. */ -static __init void softlockup_init_threads(void) +static __init void lockup_detector_setup(void) { int ret; @@ -599,7 +599,7 @@ static __init void softlockup_init_threads(void) mutex_lock(&watchdog_mutex); softlockup_threads_initialized = true; - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); mutex_unlock(&watchdog_mutex); } @@ -608,7 +608,7 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static void softlockup_reconfigure_threads(void) +static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -616,9 +616,9 @@ static void softlockup_reconfigure_threads(void) watchdog_nmi_start(); cpus_read_unlock(); } -static inline void softlockup_init_threads(void) +static inline void lockup_detector_setup(void) { - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -658,7 +658,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); } /* @@ -785,5 +785,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; - softlockup_init_threads(); + lockup_detector_setup(); } -- cgit v1.2.3 From 0b62bf862dc93a05fea97b6ca6ffca072e2f30c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Oct 2017 20:59:09 +0200 Subject: watchdog/core: Put softlockup_threads_initialized under ifdef guard The variable is unused when the softlockup detector is disabled in Kconfig. Signed-off-by: Thomas Gleixner --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d241bd99cee1..6bcb854909c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,7 +47,6 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly nmi_watchdog_available; struct cpumask watchdog_allowed_mask __read_mostly; -static bool softlockup_threads_initialized __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -168,6 +167,7 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -- cgit v1.2.3 From 8fe2d6ccd52b086268f2f36e5e2fc0fe3aeffa80 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 Oct 2017 16:20:56 -0700 Subject: bpf: fix liveness marking while processing Rx = Ry instruction the verifier does regs[insn->dst_reg] = regs[insn->src_reg] which often clears write mark (when Ry doesn't have it) that was just set by check_reg_arg(Rx) prior to the assignment. That causes mark_reg_read() to keep marking Rx in this block as REG_LIVE_READ (since the logic incorrectly misses that it's screened by the write) and in many of its parents (until lucky write into the same Rx or beginning of the program). That causes is_state_visited() logic to miss many pruning opportunities. Furthermore mark_reg_read() logic propagates the read mark for BPF_REG_FP as well (though it's readonly) which causes harmless but unnecssary work during is_state_visited(). Note that do_propagate_liveness() skips FP correctly, so do the same in mark_reg_read() as well. It saves 0.2 seconds for the test below program before after bpf_lb-DLB_L3.o 2604 2304 bpf_lb-DLB_L4.o 11159 3723 bpf_lb-DUNKNOWN.o 1116 1110 bpf_lxc-DDROP_ALL.o 34566 28004 bpf_lxc-DUNKNOWN.o 53267 39026 bpf_netdev.o 17843 16943 bpf_overlay.o 8672 7929 time ~11 sec ~4 sec Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning") Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..8b8d6ba39e23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; + if (regno == BPF_REG_FP) + /* We don't need to worry about FP liveness because it's read-only */ + return; + while (parent) { /* if read wasn't screened by an earlier write ... */ if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -2345,6 +2349,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ regs[insn->dst_reg] = regs[insn->src_reg]; + regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { -- cgit v1.2.3 From 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 9 Oct 2017 15:27:15 +0300 Subject: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1' Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced support for attaching an eBPF object by an fd, with the 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each IPT_SO_SET_REPLACE call. However this breaks subsequent iptables calls: # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT # iptables -A INPUT -s 5.6.7.8 -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. That's because iptables works by loading existing rules using IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with the replacement set. However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number (from the initial "iptables -m bpf" invocation) - so when 2nd invocation occurs, userspace passes a bogus fd number, which leads to 'bpf_mt_check_v1' to fail. One suggested solution [1] was to hack iptables userspace, to perform a "entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new, process-local fd per every 'xt_bpf_info_v1' entry seen. However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects. This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given '.fd' and instead perform an in-kernel lookup for the bpf object given the provided '.path'. It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is expected to provide the path of the pinned object. Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved. References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2 [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2 Reported-by: Rafael Buchbinder Signed-off-by: Shmulik Ladkani Acked-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/linux/bpf.h | 5 +++++ include/uapi/linux/netfilter/xt_bpf.h | 1 + kernel/bpf/inode.c | 1 + net/netfilter/xt_bpf.c | 22 ++++++++++++++++++++-- 4 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8390859e79e7..f1af7d63d678 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -368,6 +368,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } +static inline int bpf_obj_get_user(const char __user *pathname) +{ + return -EOPNOTSUPP; +} + static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h index b97725af2ac0..da161b56c79e 100644 --- a/include/uapi/linux/netfilter/xt_bpf.h +++ b/include/uapi/linux/netfilter/xt_bpf.h @@ -23,6 +23,7 @@ enum xt_bpf_modes { XT_BPF_MODE_FD_PINNED, XT_BPF_MODE_FD_ELF, }; +#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED struct xt_bpf_info_v1 { __u16 mode; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..be1dde967208 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -363,6 +363,7 @@ out: putname(pname); return ret; } +EXPORT_SYMBOL_GPL(bpf_obj_get_user); static void bpf_evict_inode(struct inode *inode) { diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 38986a95216c..29123934887b 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) return 0; } +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) +{ + mm_segment_t oldfs = get_fs(); + int retval, fd; + + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path); + set_fs(oldfs); + if (fd < 0) + return fd; + + retval = __bpf_mt_check_fd(fd, ret); + sys_close(fd); + return retval; +} + static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; @@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par) return __bpf_mt_check_bytecode(info->bpf_program, info->bpf_program_num_elem, &info->filter); - else if (info->mode == XT_BPF_MODE_FD_PINNED || - info->mode == XT_BPF_MODE_FD_ELF) + else if (info->mode == XT_BPF_MODE_FD_ELF) return __bpf_mt_check_fd(info->fd, &info->filter); + else if (info->mode == XT_BPF_MODE_PATH_PINNED) + return __bpf_mt_check_path(info->path, &info->filter); else return -EINVAL; } -- cgit v1.2.3 From d153b153446f7d8832bb2ebd92309c8a6003b3bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Sep 2017 11:35:30 +0200 Subject: sched/core: Fix wake_affine() performance regression Eric reported a sysbench regression against commit: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed against his v3.10 enterprise kernel. PRE (current tip/master): ivb-ep sysbench: 2: [30 secs] transactions: 64110 (2136.94 per sec.) 5: [30 secs] transactions: 143644 (4787.99 per sec.) 10: [30 secs] transactions: 274298 (9142.93 per sec.) 20: [30 secs] transactions: 418683 (13955.45 per sec.) 40: [30 secs] transactions: 320731 (10690.15 per sec.) 80: [30 secs] transactions: 355096 (11834.28 per sec.) hsw-ex NAS: OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93 lu.C.x_threads_144_run_1.log: Time in seconds = 434.68 lu.C.x_threads_144_run_2.log: Time in seconds = 405.36 lu.C.x_threads_144_run_3.log: Time in seconds = 433.83 POST (+patch): ivb-ep sysbench: 2: [30 secs] transactions: 64494 (2149.75 per sec.) 5: [30 secs] transactions: 145114 (4836.99 per sec.) 10: [30 secs] transactions: 278311 (9276.69 per sec.) 20: [30 secs] transactions: 437169 (14571.60 per sec.) 40: [30 secs] transactions: 669837 (22326.73 per sec.) 80: [30 secs] transactions: 631739 (21055.88 per sec.) hsw-ex NAS: lu.C.x_threads_144_run_1.log: Time in seconds = 23.36 lu.C.x_threads_144_run_2.log: Time in seconds = 22.96 lu.C.x_threads_144_run_3.log: Time in seconds = 22.52 This patch takes out all the shiny wake_affine() stuff and goes back to utter basics. Between the two CPUs involved with the wakeup (the CPU doing the wakeup and the CPU we ran on previously) pick the CPU we can run on _now_. This restores much of the regressions against the older kernels, but leaves some ground in the overloaded case. The default-enabled WA_WEIGHT (which will be introduced in the next patch) is an attempt to address the overloaded situation. Reported-by: Eric Farman Signed-off-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Matthew Rosato Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: jinpuwang@gmail.com Cc: vcaputo@pengaru.com Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 8 --- kernel/sched/fair.c | 126 +++++------------------------------------ kernel/sched/features.h | 1 + 3 files changed, 16 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index d7b6dab956ec..7d065abc7a47 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -71,14 +71,6 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - - /* - * Some variables from the most recent sd_lb_stats for this domain, - * used by wake_affine(). - */ - unsigned long nr_running; - unsigned long load; - unsigned long capacity; }; struct sched_domain { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..28cabed85387 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p) return 1; } -struct llc_stats { - unsigned long nr_running; - unsigned long load; - unsigned long capacity; - int has_capacity; -}; - -static bool get_llc_stats(struct llc_stats *stats, int cpu) -{ - struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - - if (!sds) - return false; - - stats->nr_running = READ_ONCE(sds->nr_running); - stats->load = READ_ONCE(sds->load); - stats->capacity = READ_ONCE(sds->capacity); - stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); - - return true; -} - /* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous + * CPU. * - * Since we're running on 'stale' values, we might in fact create an imbalance - * but recomputing these values is expensive, as that'd mean iteration 2 cache - * domains worth of CPUs. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or + * will be) idle. */ + static bool -wake_affine_llc(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +wake_affine_idle(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) { - struct llc_stats prev_stats, this_stats; - s64 this_eff_load, prev_eff_load; - unsigned long task_load; - - if (!get_llc_stats(&prev_stats, prev_cpu) || - !get_llc_stats(&this_stats, this_cpu)) - return false; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current LLC. - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - /* in this case load hits 0 and this LLC is considered 'idle' */ - if (current_load > this_stats.load) - return true; - - this_stats.load -= current_load; - } - - /* - * The has_capacity stuff is not SMT aware, but by trying to balance - * the nr_running on both ends we try and fill the domain at equal - * rates, thereby first consuming cores before siblings. - */ - - /* if the old cache has capacity, stay there */ - if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) - return false; - - /* if this cache has capacity, come here */ - if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) + if (idle_cpu(this_cpu)) return true; - /* - * Check to see if we can move the load without causing too much - * imbalance. - */ - task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_stats.capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_stats.capacity; - - this_eff_load *= this_stats.load + task_load; - prev_eff_load *= prev_stats.load - task_load; + if (sync && cpu_rq(this_cpu)->nr_running == 1) + return true; - return this_eff_load <= prev_eff_load; + return false; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine; - - /* - * Default to no affine wakeups; wake_affine() should not effect a task - * placement the load-balancer feels inclined to undo. The conservative - * option is therefore to not move tasks when they wake up. - */ - affine = false; + bool affine = false; - /* - * If the wakeup is across cache domains, try to evaluate if movement - * makes sense, otherwise rely on select_idle_siblings() to do - * placement inside the cache domain. - */ - if (!cpus_share_cache(prev_cpu, this_cpu)) - affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE) && !affine) + affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7672,22 +7592,6 @@ next_group: if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; } - - if (!shared) - return; - - /* - * Since these are sums over groups they can contain some CPUs - * multiple times for the NUMA domains. - * - * Currently only wake_affine_llc() and find_busiest_group() - * uses these numbers, only the last is affected by this problem. - * - * XXX fix that. - */ - WRITE_ONCE(shared->nr_running, sds->total_running); - WRITE_ONCE(shared->load, sds->total_load); - WRITE_ONCE(shared->capacity, sds->total_capacity); } /** diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d3fb15555291..0a519f8c224d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +SCHED_FEAT(WA_IDLE, true) -- cgit v1.2.3 From f2cdd9cc6c97e617b95f430f527a6e3165e1bee8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Oct 2017 09:23:24 +0200 Subject: sched/core: Address more wake_affine() regressions The trivial wake_affine_idle() implementation is very good for a number of workloads, but it comes apart at the moment there are no idle CPUs left, IOW. the overloaded case. hackbench: NO_WA_WEIGHT WA_WEIGHT hackbench-20 : 7.362717561 seconds 6.450509391 seconds (win) netperf: NO_WA_WEIGHT WA_WEIGHT TCP_SENDFILE-1 : Avg: 54524.6 Avg: 52224.3 TCP_SENDFILE-10 : Avg: 48185.2 Avg: 46504.3 TCP_SENDFILE-20 : Avg: 29031.2 Avg: 28610.3 TCP_SENDFILE-40 : Avg: 9819.72 Avg: 9253.12 TCP_SENDFILE-80 : Avg: 5355.3 Avg: 4687.4 TCP_STREAM-1 : Avg: 41448.3 Avg: 42254 TCP_STREAM-10 : Avg: 24123.2 Avg: 25847.9 TCP_STREAM-20 : Avg: 15834.5 Avg: 18374.4 TCP_STREAM-40 : Avg: 5583.91 Avg: 5599.57 TCP_STREAM-80 : Avg: 2329.66 Avg: 2726.41 TCP_RR-1 : Avg: 80473.5 Avg: 82638.8 TCP_RR-10 : Avg: 72660.5 Avg: 73265.1 TCP_RR-20 : Avg: 52607.1 Avg: 52634.5 TCP_RR-40 : Avg: 57199.2 Avg: 56302.3 TCP_RR-80 : Avg: 25330.3 Avg: 26867.9 UDP_RR-1 : Avg: 108266 Avg: 107844 UDP_RR-10 : Avg: 95480 Avg: 95245.2 UDP_RR-20 : Avg: 68770.8 Avg: 68673.7 UDP_RR-40 : Avg: 76231 Avg: 75419.1 UDP_RR-80 : Avg: 34578.3 Avg: 35639.1 UDP_STREAM-1 : Avg: 64684.3 Avg: 66606 UDP_STREAM-10 : Avg: 52701.2 Avg: 52959.5 UDP_STREAM-20 : Avg: 30376.4 Avg: 29704 UDP_STREAM-40 : Avg: 15685.8 Avg: 15266.5 UDP_STREAM-80 : Avg: 8415.13 Avg: 7388.97 (wins and losses) sysbench: NO_WA_WEIGHT WA_WEIGHT sysbench-mysql-2 : 2135.17 per sec. 2142.51 per sec. sysbench-mysql-5 : 4809.68 per sec. 4800.19 per sec. sysbench-mysql-10 : 9158.59 per sec. 9157.05 per sec. sysbench-mysql-20 : 14570.70 per sec. 14543.55 per sec. sysbench-mysql-40 : 22130.56 per sec. 22184.82 per sec. sysbench-mysql-80 : 20995.56 per sec. 21904.18 per sec. sysbench-psql-2 : 1679.58 per sec. 1705.06 per sec. sysbench-psql-5 : 3797.69 per sec. 3879.93 per sec. sysbench-psql-10 : 7253.22 per sec. 7258.06 per sec. sysbench-psql-20 : 11166.75 per sec. 11220.00 per sec. sysbench-psql-40 : 17277.28 per sec. 17359.78 per sec. sysbench-psql-80 : 17112.44 per sec. 17221.16 per sec. (increase on the top end) tbench: NO_WA_WEIGHT Throughput 685.211 MB/sec 2 clients 2 procs max_latency=0.123 ms Throughput 1596.64 MB/sec 5 clients 5 procs max_latency=0.119 ms Throughput 2985.47 MB/sec 10 clients 10 procs max_latency=0.262 ms Throughput 4521.15 MB/sec 20 clients 20 procs max_latency=0.506 ms Throughput 9438.1 MB/sec 40 clients 40 procs max_latency=2.052 ms Throughput 8210.5 MB/sec 80 clients 80 procs max_latency=8.310 ms WA_WEIGHT Throughput 697.292 MB/sec 2 clients 2 procs max_latency=0.127 ms Throughput 1596.48 MB/sec 5 clients 5 procs max_latency=0.080 ms Throughput 2975.22 MB/sec 10 clients 10 procs max_latency=0.254 ms Throughput 4575.14 MB/sec 20 clients 20 procs max_latency=0.502 ms Throughput 9468.65 MB/sec 40 clients 40 procs max_latency=2.069 ms Throughput 8631.73 MB/sec 80 clients 80 procs max_latency=8.605 ms (increase on the top end) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Rik van Riel Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 2 ++ 2 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28cabed85387..ed2ab474ec93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5363,6 +5363,10 @@ static int wake_wide(struct task_struct *p) * * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or * will be) idle. + * + * wake_affine_weight() - considers the weight to reflect the average + * scheduling latency of the CPUs. This seems to work + * for the overloaded case. */ static bool @@ -5378,6 +5382,40 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p, return false; } +static bool +wake_affine_weight(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + this_eff_load = target_load(this_cpu, sd->wake_idx); + prev_eff_load = source_load(prev_cpu, sd->wake_idx); + + if (sync) { + unsigned long current_load = task_h_load(current); + + if (current_load > this_eff_load) + return true; + + this_eff_load -= current_load; + } + + task_load = task_h_load(p); + + this_eff_load += task_load; + if (sched_feat(WA_BIAS)) + this_eff_load *= 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load -= task_load; + if (sched_feat(WA_BIAS)) + prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { @@ -5387,6 +5425,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_IDLE) && !affine) affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && !affine) + affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { schedstat_inc(sd->ttwu_move_affine); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 0a519f8c224d..319ed0e8a347 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -82,3 +82,5 @@ SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) -- cgit v1.2.3 From 024c9d2faebdad3fb43fe49ad68e91a36190f1e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Oct 2017 10:36:53 +0200 Subject: sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed2ab474ec93..d3f3094856fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8042,6 +8042,13 @@ static int should_we_balance(struct lb_env *env) struct sched_group *sg = env->sd->groups; int cpu, balance_cpu = -1; + /* + * Ensure the balancing environment is consistent; can happen + * when the softirq triggers 'during' hotplug. + */ + if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) + return 0; + /* * In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. -- cgit v1.2.3