From d0b6e0a8ef24b1b07078ababe5d91bcdf4f4264a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Sep 2017 21:36:55 +0200
Subject: watchdog/hardlockup: Provide interface to stop/restart perf events

Provide an interface to stop and restart perf NMI watchdog events on all
CPUs. This is only usable during init and especially for handling the perf
HT bug on Intel machines. It's safe to use it this way as nothing can
start/stop the NMI watchdog in parallel.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.167649596@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog_hld.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'kernel')

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 3a09ea1b1d3d..c9586ebc2e98 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -261,3 +261,44 @@ void watchdog_nmi_disable(unsigned int cpu)
 			firstcpu_err = 0;
 	}
 }
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+	int cpu;
+
+	lockdep_assert_cpus_held();
+
+	for_each_online_cpu(cpu) {
+		struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+		if (event)
+			perf_event_disable(event);
+	}
+}
+
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+	int cpu;
+
+	lockdep_assert_cpus_held();
+
+	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+		return;
+
+	for_each_online_cpu(cpu) {
+		struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+		if (event)
+			perf_event_enable(event);
+	}
+}
-- 
cgit v1.2.3


From 6554fd8cf06db86f861bb24d7487b2873ca444c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:36:57 +0200
Subject: watchdog/core: Provide interface to stop from poweroff()

PARISC has a a busy looping power off routine. If the watchdog is enabled
the watchdog timer will still fire, but the thread is not running, which
causes the softlockup watchdog to trigger.

Provide a interface which allows to turn the watchdog off.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: linux-parisc@vger.kernel.org
Link: http://lkml.kernel.org/r/20170912194146.327343752@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h |  6 +++---
 kernel/watchdog.c   | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b24d4a58674a..85bb268be39c 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -12,10 +12,10 @@
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
+void lockup_detector_soft_poweroff(void);
 #else
-static inline void lockup_detector_init(void)
-{
-}
+static inline void lockup_detector_init(void) { }
+static inline void lockup_detector_soft_poweroff(void) { }
 #endif
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..f23e373aa3bf 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -333,7 +333,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-	if (atomic_read(&watchdog_park_in_progress) != 0)
+	if (!watchdog_enabled ||
+	    atomic_read(&watchdog_park_in_progress) != 0)
 		return HRTIMER_NORESTART;
 
 	/* kick the hardlockup detector */
@@ -660,6 +661,17 @@ static void set_sample_period(void)
 }
 #endif /* SOFTLOCKUP */
 
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_poweroff() function which busy loops forever.
+ */
+void lockup_detector_soft_poweroff(void)
+{
+	watchdog_enabled = 0;
+}
+
 /*
  * Suspend the hard and soft lockup detector by parking the watchdog threads.
  */
-- 
cgit v1.2.3


From 5490125d77a43016b26f629d4b485e2c62172551 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:36:59 +0200
Subject: watchdog/core: Remove broken suspend/resume interfaces

This interface has several issues:

 - It's causing recursive locking of the hotplug lock.

 - It's complete overkill to teardown all threads and then recreate them

The same can be achieved with the simple hardlockup_detector_perf_stop /
restart() interfaces. The abuse from the busy looping poweroff() loop of
PARISC has been solved as well.

Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.487537732@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/watchdog.c |  3 --
 include/linux/nmi.h            | 12 ------
 kernel/watchdog.c              | 89 +-----------------------------------------
 3 files changed, 1 insertion(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..5ded171f02d6 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu)
 	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
 		return 0;
 
-	if (watchdog_suspended)
-		return 0;
-
 	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
 		return 0;
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 85bb268be39c..7eefe7abf44b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -164,7 +164,6 @@ extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern struct cpumask watchdog_cpumask;
 extern unsigned long *watchdog_cpumask_bits;
-extern int __read_mostly watchdog_suspended;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
@@ -192,17 +191,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-extern int lockup_detector_suspend(void);
-extern void lockup_detector_resume(void);
-#else
-static inline int lockup_detector_suspend(void)
-{
-	return 0;
-}
-
-static inline void lockup_detector_resume(void)
-{
-}
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f23e373aa3bf..b2d46757917e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -97,19 +97,6 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
  * unregistered/stopped, so it is an indicator whether the threads exist.
  */
 static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
 
 /*
  * These functions can be overridden if an architecture implements its
@@ -136,7 +123,6 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
  * - watchdog_cpumask
  * - sysctl_hardlockup_all_cpu_backtrace
  * - hardlockup_panic
- * - watchdog_suspended
  */
 void __weak watchdog_nmi_reconfigure(void)
 {
@@ -672,61 +658,6 @@ void lockup_detector_soft_poweroff(void)
 	watchdog_enabled = 0;
 }
 
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
- */
-int lockup_detector_suspend(void)
-{
-	int ret = 0;
-
-	get_online_cpus();
-	mutex_lock(&watchdog_proc_mutex);
-	/*
-	 * Multiple suspend requests can be active in parallel (counted by
-	 * the 'watchdog_suspended' variable). If the watchdog threads are
-	 * running, the first caller takes care that they will be parked.
-	 * The state of 'watchdog_running' cannot change while a suspend
-	 * request is active (see related code in 'proc' handlers).
-	 */
-	if (watchdog_running && !watchdog_suspended)
-		ret = watchdog_park_threads();
-
-	if (ret == 0)
-		watchdog_suspended++;
-	else {
-		watchdog_disable_all_cpus();
-		pr_err("Failed to suspend lockup detectors, disabled\n");
-		watchdog_enabled = 0;
-	}
-
-	watchdog_nmi_reconfigure();
-
-	mutex_unlock(&watchdog_proc_mutex);
-
-	return ret;
-}
-
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
- */
-void lockup_detector_resume(void)
-{
-	mutex_lock(&watchdog_proc_mutex);
-
-	watchdog_suspended--;
-	/*
-	 * The watchdog threads are unparked if they were previously running
-	 * and if there is no more active suspend request.
-	 */
-	if (watchdog_running && !watchdog_suspended)
-		watchdog_unpark_threads();
-
-	watchdog_nmi_reconfigure();
-
-	mutex_unlock(&watchdog_proc_mutex);
-	put_online_cpus();
-}
-
 #ifdef CONFIG_SYSCTL
 
 /*
@@ -775,12 +706,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 	get_online_cpus();
 	mutex_lock(&watchdog_proc_mutex);
 
-	if (watchdog_suspended) {
-		/* no parameter changes allowed while watchdog is suspended */
-		err = -EAGAIN;
-		goto out;
-	}
-
 	/*
 	 * If the parameter is being read return the state of the corresponding
 	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,12 +797,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 	get_online_cpus();
 	mutex_lock(&watchdog_proc_mutex);
 
-	if (watchdog_suspended) {
-		/* no parameter changes allowed while watchdog is suspended */
-		err = -EAGAIN;
-		goto out;
-	}
-
 	old = ACCESS_ONCE(watchdog_thresh);
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -917,12 +836,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	get_online_cpus();
 	mutex_lock(&watchdog_proc_mutex);
 
-	if (watchdog_suspended) {
-		/* no parameter changes allowed while watchdog is suspended */
-		err = -EAGAIN;
-		goto out;
-	}
-
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
 	if (!err && write) {
 		/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -941,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 
 		watchdog_nmi_reconfigure();
 	}
-out:
+
 	mutex_unlock(&watchdog_proc_mutex);
 	put_online_cpus();
 	return err;
-- 
cgit v1.2.3


From b7a349819d4b9b5db64e523351e66a79a758eaa5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:00 +0200
Subject: watchdog/core: Rework CPU hotplug locking

The watchdog proc interface causes extensive recursive locking of the CPU
hotplug percpu rwsem, which is deadlock prone.

Replace the get/put_online_cpus() pairs with cpu_hotplug_disable()/enable()
calls for now. Later patches will remove that requirement completely.

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.568079057@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b2d46757917e..cd79f644ea34 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -703,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 	int err, old, new;
 	int *watchdog_param = (int *)table->data;
 
-	get_online_cpus();
+	cpu_hotplug_disable();
 	mutex_lock(&watchdog_proc_mutex);
 
 	/*
@@ -752,7 +752,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 	}
 out:
 	mutex_unlock(&watchdog_proc_mutex);
-	put_online_cpus();
+	cpu_hotplug_enable();
 	return err;
 }
 
@@ -794,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 {
 	int err, old, new;
 
-	get_online_cpus();
+	cpu_hotplug_disable();
 	mutex_lock(&watchdog_proc_mutex);
 
 	old = ACCESS_ONCE(watchdog_thresh);
@@ -818,7 +818,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 	}
 out:
 	mutex_unlock(&watchdog_proc_mutex);
-	put_online_cpus();
+	cpu_hotplug_enable();
 	return err;
 }
 
@@ -833,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 {
 	int err;
 
-	get_online_cpus();
+	cpu_hotplug_disable();
 	mutex_lock(&watchdog_proc_mutex);
 
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
@@ -856,7 +856,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	}
 
 	mutex_unlock(&watchdog_proc_mutex);
-	put_online_cpus();
+	cpu_hotplug_enable();
 	return err;
 }
 
-- 
cgit v1.2.3


From 946d197794b23202b8b46c43016747c72fe23393 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:01 +0200
Subject: watchdog/core: Rename watchdog_proc_mutex

Following patches will use the mutex for other purposes as well. Rename it
as it is not longer a proc specific thing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.647714850@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index cd79f644ea34..7c3a0a76b41b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,8 +29,7 @@
 #include <linux/kvm_para.h>
 #include <linux/kthread.h>
 
-/* Watchdog configuration */
-static DEFINE_MUTEX(watchdog_proc_mutex);
+static DEFINE_MUTEX(watchdog_mutex);
 
 int __read_mostly nmi_watchdog_enabled;
 
@@ -704,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 	int *watchdog_param = (int *)table->data;
 
 	cpu_hotplug_disable();
-	mutex_lock(&watchdog_proc_mutex);
+	mutex_lock(&watchdog_mutex);
 
 	/*
 	 * If the parameter is being read return the state of the corresponding
@@ -751,7 +750,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 		err = proc_watchdog_update();
 	}
 out:
-	mutex_unlock(&watchdog_proc_mutex);
+	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
 	return err;
 }
@@ -795,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 	int err, old, new;
 
 	cpu_hotplug_disable();
-	mutex_lock(&watchdog_proc_mutex);
+	mutex_lock(&watchdog_mutex);
 
 	old = ACCESS_ONCE(watchdog_thresh);
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -817,7 +816,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 		set_sample_period();
 	}
 out:
-	mutex_unlock(&watchdog_proc_mutex);
+	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
 	return err;
 }
@@ -834,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	int err;
 
 	cpu_hotplug_disable();
-	mutex_lock(&watchdog_proc_mutex);
+	mutex_lock(&watchdog_mutex);
 
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
 	if (!err && write) {
@@ -855,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 		watchdog_nmi_reconfigure();
 	}
 
-	mutex_unlock(&watchdog_proc_mutex);
+	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
 	return err;
 }
-- 
cgit v1.2.3


From 7a3558200739e1378800a7a6d7f63c031115f7a4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:02 +0200
Subject: watchdog/core: Mark hardlockup_detector_disable() __init

The function is only used by the KVM init code. Mark it __init to prevent
creative abuse.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.727134632@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7c3a0a76b41b..1c185d9dd468 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -55,7 +55,7 @@ unsigned int __read_mostly hardlockup_panic =
  * kernel command line parameters are parsed, because otherwise it is not
  * possible to override this in hardlockup_panic_setup().
  */
-void hardlockup_detector_disable(void)
+void __init hardlockup_detector_disable(void)
 {
 	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
 }
-- 
cgit v1.2.3


From 20d853fd0703b1d73c35a22024c0d4fcbcc57c8c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:03 +0200
Subject: watchdog/hardlockup/perf: Remove broken self disable on failure

The self disabling feature is broken vs. CPU hotplug locking:

CPU 0			   CPU 1
cpus_write_lock();
 cpu_up(1)
   wait_for_completion()
			   ....
			   unpark_watchdog()
			   ->unpark()
			     perf_event_create() <- fails
			       watchdog_enable &= ~NMI_WATCHDOG;
			   ....
cpus_write_unlock();
			   CPU 2
cpus_write_lock()
 cpu_down(2)
   wait_for_completion()
			   wakeup(watchdog);
			     watchdog()
			     if (!(watchdog_enable & NMI_WATCHDOG))
				watchdog_nmi_disable()
				  perf_event_disable()
				  ....
				  cpus_read_lock();

			   stop_smpboot_threads()
			     park_watchdog();
			       wait_for_completion(watchdog->parked);

Result: End of hotplug and instantaneous full lockup of the machine.

There is a similar problem with disabling the watchdog via the user space
interface as the sysctl function fiddles with watchdog_enable directly.

It's very debatable whether this is required at all. If the watchdog works
nicely on N CPUs and it fails to enable on the N + 1 CPU either during
hotplug or because the user space interface disabled it via sysctl cpumask
and then some perf user grabbed the counter which is then unavailable for
the watchdog when the sysctl cpumask gets changed back.

There is no real justification for this.

One of the reasons WHY this is done is the utter stupidity of the init code
of the perf NMI watchdog. Instead of checking upfront at boot whether PERF
is available and functional at all, it just does this check at run time
over and over when user space fiddles with the sysctl. That's broken beyond
repair along with the idiotic error code dependent warn level printks and
the even more silly printk rate limiting.

If the init code checks whether perf works at boot time, then this mess can
be more or less avoided completely. Perf does not come magically into life
at runtime. Brain usage while coding is overrated.

Remove the cruft and add a temporary safe guard which gets removed later.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.806708429@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c     | 15 ---------------
 kernel/watchdog_hld.c | 20 +++++++-------------
 2 files changed, 7 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1c185d9dd468..af000956286c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -485,21 +485,6 @@ static void watchdog(unsigned int cpu)
 	__this_cpu_write(soft_lockup_hrtimer_cnt,
 			 __this_cpu_read(hrtimer_interrupts));
 	__touch_watchdog();
-
-	/*
-	 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
-	 * failure path. Check for failures that can occur asynchronously -
-	 * for example, when CPUs are on-lined - and shut down the hardware
-	 * perf event on each CPU accordingly.
-	 *
-	 * The only non-obvious place this bit can be cleared is through
-	 * watchdog_nmi_enable(), so a pr_info() is placed there.  Placing a
-	 * pr_info here would be too noisy as it would result in a message
-	 * every few seconds if the hardlockup was disabled but the softlockup
-	 * enabled.
-	 */
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		watchdog_nmi_disable(cpu);
 }
 
 static struct smp_hotplug_thread watchdog_threads = {
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index c9586ebc2e98..7b602714ea53 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -23,6 +23,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 
 static unsigned long hardlockup_allcpu_dumped;
+static bool hardlockup_detector_disabled;
 
 void arch_touch_nmi_watchdog(void)
 {
@@ -178,6 +179,10 @@ int watchdog_nmi_enable(unsigned int cpu)
 	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
 		goto out;
 
+	/* A failure disabled the hardlockup detector permanently */
+	if (hardlockup_detector_disabled)
+		return -ENODEV;
+
 	/* is it already setup and enabled? */
 	if (event && event->state > PERF_EVENT_STATE_OFF)
 		goto out;
@@ -206,18 +211,6 @@ int watchdog_nmi_enable(unsigned int cpu)
 		goto out_save;
 	}
 
-	/*
-	 * Disable the hard lockup detector if _any_ CPU fails to set up
-	 * set up the hardware perf event. The watchdog() function checks
-	 * the NMI_WATCHDOG_ENABLED bit periodically.
-	 *
-	 * The barriers are for syncing up watchdog_enabled across all the
-	 * cpus, as clear_bit() does not use barriers.
-	 */
-	smp_mb__before_atomic();
-	clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
-	smp_mb__after_atomic();
-
 	/* skip displaying the same error again */
 	if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
 		return PTR_ERR(event);
@@ -232,7 +225,8 @@ int watchdog_nmi_enable(unsigned int cpu)
 		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
 			cpu, PTR_ERR(event));
 
-	pr_info("Shutting down hard lockup detector on all cpus\n");
+	pr_info("Disabling hard lockup detector permanently\n");
+	hardlockup_detector_disabled = true;
 
 	return PTR_ERR(event);
 
-- 
cgit v1.2.3


From 941154bd6937a710ae9193a3c733c0029e5ae7b8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:04 +0200
Subject: watchdog/hardlockup/perf: Prevent CPU hotplug deadlock

The following deadlock is possible in the watchdog hotplug code:

  cpus_write_lock()
    ...
      takedown_cpu()
        smpboot_park_threads()
          smpboot_park_thread()
            kthread_park()
              ->park() := watchdog_disable()
                watchdog_nmi_disable()
                  perf_event_release_kernel();
                    put_event()
                      _free_event()
                        ->destroy() := hw_perf_event_destroy()
                          x86_release_hardware()
                            release_ds_buffers()
                              get_online_cpus()

when a per cpu watchdog perf event is destroyed which drops the last
reference to the PMU hardware. The cleanup code there invokes
get_online_cpus() which instantly deadlocks because the hotplug percpu
rwsem is write locked.

To solve this add a deferring mechanism:

  cpus_write_lock()
			   kthread_park()
			    watchdog_nmi_disable(deferred)
			      perf_event_disable(event);
			      move_event_to_deferred(event);
			   ....
  cpus_write_unlock()
  cleaup_deferred_events()
    perf_event_release_kernel()

This is still properly serialized against concurrent hotplug via the
cpu_add_remove_lock, which is held by the task which initiated the hotplug
event.

This is also used to handle event destruction when the watchdog threads are
parked via other mechanisms than CPU hotplug.

Analyzed-by: Peter Zijlstra <peterz@infradead.org>

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.884469246@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  6 ++++++
 kernel/cpu.c          |  6 ++++++
 kernel/watchdog.c     | 25 +++++++++++++++++++++++++
 kernel/watchdog_hld.c | 34 ++++++++++++++++++++++++++++------
 4 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 7eefe7abf44b..80354e6fa86d 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -13,9 +13,11 @@
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
 void lockup_detector_soft_poweroff(void);
+void lockup_detector_cleanup(void);
 #else
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
+static inline void lockup_detector_cleanup(void) { }
 #endif
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -77,9 +79,13 @@ static inline void hardlockup_detector_disable(void) {}
 extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
+extern void hardlockup_detector_perf_disable(void);
+extern void hardlockup_detector_perf_cleanup(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
+static inline void hardlockup_detector_perf_disable(void) { }
+static inline void hardlockup_detector_perf_cleanup(void) { }
 #if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline void arch_touch_nmi_watchdog(void) {}
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..a96b348591df 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
@@ -734,6 +735,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 out:
 	cpus_write_unlock();
+	/*
+	 * Do post unplug cleanup. This is still protected against
+	 * concurrent CPU hotplug via cpu_add_remove_lock.
+	 */
+	lockup_detector_cleanup();
 	return ret;
 }
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index af000956286c..dd1fd59683c5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -109,8 +109,10 @@ int __weak watchdog_nmi_enable(unsigned int cpu)
 {
 	return 0;
 }
+
 void __weak watchdog_nmi_disable(unsigned int cpu)
 {
+	hardlockup_detector_perf_disable();
 }
 
 /*
@@ -193,6 +195,8 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
 #endif
 
+static void __lockup_detector_cleanup(void);
+
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
  * lockups can have false positives under extreme conditions. So we generally
@@ -631,6 +635,24 @@ static void set_sample_period(void)
 }
 #endif /* SOFTLOCKUP */
 
+static void __lockup_detector_cleanup(void)
+{
+	lockdep_assert_held(&watchdog_mutex);
+	hardlockup_detector_perf_cleanup();
+}
+
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
+ */
+void lockup_detector_cleanup(void)
+{
+	mutex_lock(&watchdog_mutex);
+	__lockup_detector_cleanup();
+	mutex_unlock(&watchdog_mutex);
+}
+
 /**
  * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
  *
@@ -665,6 +687,8 @@ static int proc_watchdog_update(void)
 
 	watchdog_nmi_reconfigure();
 
+	__lockup_detector_cleanup();
+
 	return err;
 
 }
@@ -837,6 +861,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 		}
 
 		watchdog_nmi_reconfigure();
+		__lockup_detector_cleanup();
 	}
 
 	mutex_unlock(&watchdog_mutex);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 7b602714ea53..94111ccb09b5 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,6 +21,8 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
 static bool hardlockup_detector_disabled;
@@ -239,16 +241,18 @@ out:
 	return 0;
 }
 
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
 {
-	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+	struct perf_event *event = this_cpu_read(watchdog_ev);
 
 	if (event) {
 		perf_event_disable(event);
-		per_cpu(watchdog_ev, cpu) = NULL;
-
-		/* should be in cleanup, but blocks oprofile */
-		perf_event_release_kernel(event);
+		this_cpu_write(watchdog_ev, NULL);
+		this_cpu_write(dead_event, event);
+		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
 
 		/* watchdog_nmi_enable() expects this to be zero initially. */
 		if (atomic_dec_and_test(&watchdog_cpus))
@@ -256,6 +260,24 @@ void watchdog_nmi_disable(unsigned int cpu)
 	}
 }
 
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &dead_events_mask) {
+		struct perf_event *event = per_cpu(dead_event, cpu);
+
+		per_cpu(dead_event, cpu) = NULL;
+		perf_event_release_kernel(event);
+	}
+	cpumask_clear(&dead_events_mask);
+}
+
 /**
  * hardlockup_detector_perf_stop - Globally stop watchdog events
  *
-- 
cgit v1.2.3


From 01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:05 +0200
Subject: watchdog/core: Remove the park_in_progress obfuscation

Commit:

  b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system")

tries to fix the following issue:

proc_write()
   set_sample_period()    <--- New sample period becoms visible
			  <----- Broken starts
   proc_watchdog_update()
     watchdog_enable_all_cpus()		watchdog_hrtimer_fn()
     update_watchdog_all_cpus()		   restart_timer(sample_period)
        watchdog_park_threads()

					thread->park()
					  disable_nmi()
			  <----- Broken ends

The reason why this is broken is that the update of the watchdog threshold
becomes immediately effective and visible for the hrtimer function which
uses that value to rearm the timer. But the NMI/perf side still uses the
old value up to the point where it is disabled. If the rate has been
lowered then the NMI can run fast enough to 'detect' a hard lockup because
the timer has not fired due to the longer period.

The patch 'fixed' this by adding a variable:

proc_write()
   set_sample_period()
					<----- Broken starts
   proc_watchdog_update()
     watchdog_enable_all_cpus()		watchdog_hrtimer_fn()
     update_watchdog_all_cpus()		   restart_timer(sample_period)
         watchdog_park_threads()
	  park_in_progress = 1
					<----- Broken ends
				        nmi_watchdog()
					  if (park_in_progress)
					     return;

The only effect of this variable was to make the window where the breakage
can hit small enough that it was not longer observable in testing. From a
correctness point of view it is a pointless bandaid which merily papers
over the root cause: the unsychronized update of the variable.

Looking deeper into the related code pathes unearthed similar problems in
the watchdog_start()/stop() functions.

 watchdog_start()
	perf_nmi_event_start()
	hrtimer_start()

 watchdog_stop()
	hrtimer_cancel()
	perf_nmi_event_stop()

In both cases the call order is wrong because if the tasks gets preempted
or the VM gets scheduled out long enough after the first call, then there is
a chance that the next NMI will see a stale hrtimer interrupt count and
trigger a false positive hard lockup splat.

Get rid of park_in_progress so the code can be gradually deobfuscated and
pruned from several layers of duct tape papering over the root cause,
which has been either ignored or not understood at all.

Once this is removed the underlying problem will be fixed by rewriting the
proc interface to do a proper synchronized update.

Address the start/stop() ordering problem as well by reverting the call
order, so this part is at least correct now.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  1 -
 kernel/watchdog.c     | 37 +++++++++++++++++--------------------
 kernel/watchdog_hld.c |  7 ++-----
 3 files changed, 19 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 80354e6fa86d..91a3a4a4c8ae 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
 extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
 #else
 static inline void touch_softlockup_watchdog_sched(void)
 {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dd1fd59683c5..c290135fb415 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void)
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-	if (!watchdog_enabled ||
-	    atomic_read(&watchdog_park_in_progress) != 0)
+	if (!watchdog_enabled)
 		return HRTIMER_NORESTART;
 
 	/* kick the hardlockup detector */
@@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
 
 static void watchdog_enable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
-	/* kick off the timer for the hardlockup detector */
+	/*
+	 * Start the timer first to prevent the NMI watchdog triggering
+	 * before the timer has a chance to fire.
+	 */
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = watchdog_timer_fn;
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+		      HRTIMER_MODE_REL_PINNED);
 
+	/* Initialize timestamp */
+	__touch_watchdog();
 	/* Enable the perf event */
 	watchdog_nmi_enable(cpu);
 
-	/* done here because hrtimer_start can only pin to smp_processor_id() */
-	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
-		      HRTIMER_MODE_REL_PINNED);
-
-	/* initialize timestamp */
 	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
-	__touch_watchdog();
 }
 
 static void watchdog_disable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
 	watchdog_set_prio(SCHED_NORMAL, 0);
-	hrtimer_cancel(hrtimer);
-	/* disable the perf event */
+	/*
+	 * Disable the perf event first. That prevents that a large delay
+	 * between disabling the timer and disabling the perf event causes
+	 * the perf NMI to detect a false positive.
+	 */
 	watchdog_nmi_disable(cpu);
+	hrtimer_cancel(hrtimer);
 }
 
 static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -518,16 +520,11 @@ static int watchdog_park_threads(void)
 {
 	int cpu, ret = 0;
 
-	atomic_set(&watchdog_park_in_progress, 1);
-
 	for_each_watchdog_cpu(cpu) {
 		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
 		if (ret)
 			break;
 	}
-
-	atomic_set(&watchdog_park_in_progress, 0);
-
 	return ret;
 }
 
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 94111ccb09b5..0aa191ee3d51 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = {
 
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
-		 struct perf_sample_data *data,
-		 struct pt_regs *regs)
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
 {
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
-	if (atomic_read(&watchdog_park_in_progress) != 0)
-		return;
-
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;
-- 
cgit v1.2.3


From 2b9d7f233b835663cbc7b6b3f88dd20f61118d1e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:06 +0200
Subject: watchdog/core: Clean up stub functions

Having stub functions which take a full page is not helping the
readablility of code.

Condense them and move the doubled #ifdef variant into the SYSFS section.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.045545271@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 68 ++++++++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c290135fb415..af37c040436c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -125,10 +125,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
  * - sysctl_hardlockup_all_cpu_backtrace
  * - hardlockup_panic
  */
-void __weak watchdog_nmi_reconfigure(void)
-{
-}
-
+void __weak watchdog_nmi_reconfigure(void) { }
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
@@ -136,6 +133,11 @@ void __weak watchdog_nmi_reconfigure(void)
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
+/* Global variables, exported for sysctl */
+unsigned int __read_mostly softlockup_panic =
+			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+int __read_mostly soft_watchdog_enabled;
+
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -149,13 +151,9 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static unsigned long soft_lockup_nmi_warn;
 
-unsigned int __read_mostly softlockup_panic =
-			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
 static int __init softlockup_panic_setup(char *str)
 {
 	softlockup_panic = simple_strtoul(str, NULL, 0);
-
 	return 1;
 }
 __setup("softlockup_panic=", softlockup_panic_setup);
@@ -593,44 +591,13 @@ static void watchdog_disable_all_cpus(void)
 	}
 }
 
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
-	return smpboot_update_cpumask_percpu_thread(
-		    &watchdog_threads, &watchdog_cpumask);
-}
-#endif
-
-#else /* SOFTLOCKUP */
-static int watchdog_park_threads(void)
-{
-	return 0;
-}
-
-static void watchdog_unpark_threads(void)
-{
-}
-
-static int watchdog_enable_all_cpus(void)
-{
-	return 0;
-}
-
-static void watchdog_disable_all_cpus(void)
-{
-}
-
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
-	return 0;
-}
-#endif
-
-static void set_sample_period(void)
-{
-}
-#endif /* SOFTLOCKUP */
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
+static inline int watchdog_park_threads(void) { return 0; }
+static inline void watchdog_unpark_threads(void) { }
+static inline int watchdog_enable_all_cpus(void) { return 0; }
+static inline void watchdog_disable_all_cpus(void) { }
+static inline void set_sample_period(void) { }
+#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)
 {
@@ -827,6 +794,15 @@ out:
 	return err;
 }
 
+static int watchdog_update_cpus(void)
+{
+	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) {
+		return smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+							    &watchdog_cpumask);
+	}
+	return 0;
+}
+
 /*
  * The cpumask is the mask of possible cpus that the watchdog can run
  * on, not the mask of cpus it is actually running on.  This allows the
-- 
cgit v1.2.3


From 368a7e2ce8ff0ddcdcb37eadb76530b033f6eb2d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:07 +0200
Subject: watchdog/core: Clean up the #ifdef maze

The #ifdef maze in this file is horrible, group stuff at least a bit so one
can figure out what belongs to what.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.139629546@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index af37c040436c..a9bdfde73e4b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -41,7 +41,6 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
 #endif
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-/* boot commands */
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
  */
@@ -74,19 +73,21 @@ static int __init hardlockup_panic_setup(char *str)
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
 
-#endif
+# ifdef CONFIG_SMP
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
 
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-int __read_mostly soft_watchdog_enabled;
-#endif
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+	sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
+	return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 int __read_mostly watchdog_user_enabled;
 int __read_mostly watchdog_thresh = 10;
 
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
-int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#endif
 struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
@@ -173,22 +174,14 @@ static int __init nosoftlockup_setup(char *str)
 __setup("nosoftlockup", nosoftlockup_setup);
 
 #ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+
 static int __init softlockup_all_cpu_backtrace_setup(char *str)
 {
-	sysctl_softlockup_all_cpu_backtrace =
-		!!simple_strtol(str, NULL, 0);
+	sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
 	return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
-	sysctl_hardlockup_all_cpu_backtrace =
-		!!simple_strtol(str, NULL, 0);
-	return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-#endif
 #endif
 
 static void __lockup_detector_cleanup(void);
-- 
cgit v1.2.3


From 05ba3de74a3f499dcaa37b186220aaf174c95a4b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:08 +0200
Subject: watchdog/core: Split out cpumask write function

Split the write part of the cpumask proc handler out into a separate helper
to avoid deep indentation. This also reduces the patch complexity in the
following cleanups.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.218075991@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a9bdfde73e4b..cedf45ab4d81 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -792,10 +792,29 @@ static int watchdog_update_cpus(void)
 	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) {
 		return smpboot_update_cpumask_percpu_thread(&watchdog_threads,
 							    &watchdog_cpumask);
+		__lockup_detector_cleanup();
 	}
 	return 0;
 }
 
+static void proc_watchdog_cpumask_update(void)
+{
+	/* Remove impossible cpus to keep sysctl output clean. */
+	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
+
+	if (watchdog_running) {
+		/*
+		 * Failure would be due to being unable to allocate a
+		 * temporary cpumask, so we are likely not in a position to
+		 * do much else to make things better.
+		 */
+		if (watchdog_update_cpus() != 0)
+			pr_err("cpumask update failed\n");
+	}
+
+	watchdog_nmi_reconfigure();
+}
+
 /*
  * The cpumask is the mask of possible cpus that the watchdog can run
  * on, not the mask of cpus it is actually running on.  This allows the
@@ -811,30 +830,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	mutex_lock(&watchdog_mutex);
 
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
-	if (!err && write) {
-		/* Remove impossible cpus to keep sysctl output cleaner. */
-		cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
-			    cpu_possible_mask);
-
-		if (watchdog_running) {
-			/*
-			 * Failure would be due to being unable to allocate
-			 * a temporary cpumask, so we are likely not in a
-			 * position to do much else to make things better.
-			 */
-			if (watchdog_update_cpus() != 0)
-				pr_err("cpumask update failed\n");
-		}
-
-		watchdog_nmi_reconfigure();
-		__lockup_detector_cleanup();
-	}
+	if (!err && write)
+		proc_watchdog_cpumask_update();
 
 	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
 	return err;
 }
-
 #endif /* CONFIG_SYSCTL */
 
 void __init lockup_detector_init(void)
-- 
cgit v1.2.3


From 0d85923c7a81719567311ba0eae8ecb2efd4c8a0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:09 +0200
Subject: smpboot/threads, watchdog/core: Avoid runtime allocation

smpboot_update_cpumask_threads_percpu() allocates a temporary cpumask at
runtime. This is suboptimal because the call site needs more code size for
proper error handling than a statically allocated temporary mask requires
data size.

Add static temporary cpumask. The function is globaly serialized, so no
further protection required.

Remove the half baken error handling in the watchdog code and get rid of
the export as there are no in tree modular users of that function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.297288838@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/smpboot.h |  4 ++--
 kernel/smpboot.c        | 22 +++++++---------------
 kernel/watchdog.c       | 21 +++++----------------
 3 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index 12910cf19869..c149aa7bedf3 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 }
 
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-					 const struct cpumask *);
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					  const struct cpumask *);
 
 #endif
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..ed7507b69b48 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,31 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
  * by the client, but only by calling this function.
  * This function can only be called on a registered smp_hotplug_thread.
  */
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-					 const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					  const struct cpumask *new)
 {
 	struct cpumask *old = plug_thread->cpumask;
-	cpumask_var_t tmp;
+	static struct cpumask tmp;
 	unsigned int cpu;
 
-	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
-		return -ENOMEM;
-
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
 
 	/* Park threads that were exclusively enabled on the old mask. */
-	cpumask_andnot(tmp, old, new);
-	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+	cpumask_andnot(&tmp, old, new);
+	for_each_cpu_and(cpu, &tmp, cpu_online_mask)
 		smpboot_park_thread(plug_thread, cpu);
 
 	/* Unpark threads that are exclusively enabled on the new mask. */
-	cpumask_andnot(tmp, new, old);
-	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+	cpumask_andnot(&tmp, new, old);
+	for_each_cpu_and(cpu, &tmp, cpu_online_mask)
 		smpboot_unpark_thread(plug_thread, cpu);
 
 	cpumask_copy(old, new);
 
 	mutex_unlock(&smpboot_threads_lock);
 	put_online_cpus();
-
-	free_cpumask_var(tmp);
-
-	return 0;
 }
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
 
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index cedf45ab4d81..8935a3a4c2fb 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -787,31 +787,20 @@ out:
 	return err;
 }
 
-static int watchdog_update_cpus(void)
+static void watchdog_update_cpus(void)
 {
-	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) {
-		return smpboot_update_cpumask_percpu_thread(&watchdog_threads,
-							    &watchdog_cpumask);
+	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) {
+		smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+						     &watchdog_cpumask);
 		__lockup_detector_cleanup();
 	}
-	return 0;
 }
 
 static void proc_watchdog_cpumask_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-
-	if (watchdog_running) {
-		/*
-		 * Failure would be due to being unable to allocate a
-		 * temporary cpumask, so we are likely not in a position to
-		 * do much else to make things better.
-		 */
-		if (watchdog_update_cpus() != 0)
-			pr_err("cpumask update failed\n");
-	}
-
+	watchdog_update_cpus();
 	watchdog_nmi_reconfigure();
 }
 
-- 
cgit v1.2.3


From 2eb2527f847d1bd8d8fb9db1e8139db5d6eddb36 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:10 +0200
Subject: watchdog/core: Create new thread handling infrastructure

The lockup detector reconfiguration tears down all watchdog threads when
the watchdog is disabled and sets them up again when its enabled.

That's a pointless exercise. The watchdog threads are not consuming an
insane amount of resources, so it's enough to set them up at init time and
keep them in parked position when the watchdog is disabled and unpark them
when it is reenabled. The smpboot thread infrastructure takes care of
keeping the force parked threads in place even across cpu hotplug.

Another horrible mechanism are the open coded park/unpark loops which are
used for reconfiguration of the watchdog. The smpboot infrastructure allows
exactly the same via smpboot_update_cpumask_thread_percpu(), which is cpu
hotplug safe. Using that instead of the open coded loops allows to get rid
of the hotplug locking mess in the watchdog code.

Implement a clean infrastructure which allows to replace the open coded
nonsense.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.377182587@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8935a3a4c2fb..b35518375fb7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -139,6 +139,9 @@ unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
 int __read_mostly soft_watchdog_enabled;
 
+struct cpumask watchdog_allowed_mask __read_mostly;
+static bool softlockup_threads_initialized __read_mostly;
+
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -584,12 +587,84 @@ static void watchdog_disable_all_cpus(void)
 	}
 }
 
+static void softlockup_update_smpboot_threads(void)
+{
+	lockdep_assert_held(&watchdog_mutex);
+
+	if (!softlockup_threads_initialized)
+		return;
+
+	smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+					     &watchdog_allowed_mask);
+	__lockup_detector_cleanup();
+}
+
+/* Temporarily park all watchdog threads */
+static void softlockup_park_all_threads(void)
+{
+	cpumask_clear(&watchdog_allowed_mask);
+	softlockup_update_smpboot_threads();
+}
+
+/*
+ * Park threads which are not longer enabled and unpark threads which have
+ * been newly enabled.
+ */
+static void softlockup_update_threads(void)
+{
+	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+	softlockup_update_smpboot_threads();
+}
+
+static void softlockup_reconfigure_threads(bool enabled)
+{
+	softlockup_park_all_threads();
+	set_sample_period();
+	if (enabled)
+		softlockup_update_threads();
+}
+
+/*
+ * Create the watchdog thread infrastructure.
+ *
+ * The threads are not unparked as watchdog_allowed_mask is empty.  When
+ * the threads are sucessfully initialized, take the proper locks and
+ * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ */
+static __init void softlockup_init_threads(void)
+{
+	int ret;
+
+	/*
+	 * If sysctl is off and watchdog got disabled on the command line,
+	 * nothing to do here.
+	 */
+	if (!IS_ENABLED(CONFIG_SYSCTL) &&
+	    !(watchdog_enabled && watchdog_thresh))
+		return;
+
+	ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+						     &watchdog_allowed_mask);
+	if (ret) {
+		pr_err("Failed to initialize soft lockup detector threads\n");
+		return;
+	}
+
+	mutex_lock(&watchdog_mutex);
+	softlockup_threads_initialized = true;
+	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
+	mutex_unlock(&watchdog_mutex);
+}
+
 #else /* CONFIG_SOFTLOCKUP_DETECTOR */
 static inline int watchdog_park_threads(void) { return 0; }
 static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
 static inline void set_sample_period(void) { }
+static inline void softlockup_init_threads(void) { }
+static inline void softlockup_update_threads(void) { }
+static inline void softlockup_reconfigure_threads(bool enabled) { }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)
-- 
cgit v1.2.3


From d57108d4f6791291e89d980e7f7a3566c32ab188 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:11 +0200
Subject: watchdog/core: Get rid of the thread teardown/setup dance

The lockup detector reconfiguration tears down all watchdog threads when
the watchdog is disabled and sets them up again when its enabled.

That's a pointless exercise. The watchdog threads are not consuming an
insane amount of resources, so it's enough to set them up at init time and
keep them in parked position when the watchdog is disabled and unpark them
when it is reenabled. The smpboot thread infrastructure takes care of
keeping the force parked threads in place even across cpu hotplug.

Aside of that the code implements the park/unpark facility of smp hotplug
threads on its own, which is even more pointless. We have functionality in
the smpboot thread code to do so.

Use the new thread management functions and get rid of the unholy mess.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.470370113@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 190 ++++++------------------------------------------------
 1 file changed, 19 insertions(+), 171 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b35518375fb7..762d3ed82a08 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -91,13 +91,6 @@ int __read_mostly watchdog_thresh = 10;
 struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
-/*
- * The 'watchdog_running' variable is set to 1 when the watchdog threads
- * are registered/started and is set to 0 when the watchdog threads are
- * unregistered/stopped, so it is an indicator whether the threads exist.
- */
-static int __read_mostly watchdog_running;
-
 /*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
@@ -130,10 +123,6 @@ void __weak watchdog_nmi_reconfigure(void) { }
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
-	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -259,11 +248,15 @@ void touch_all_softlockup_watchdogs(void)
 	int cpu;
 
 	/*
-	 * this is done lockless
-	 * do we care if a 0 races with a timestamp?
-	 * all it means is the softlock check starts one cycle later
+	 * watchdog_mutex cannpt be taken here, as this might be called
+	 * from (soft)interrupt context, so the access to
+	 * watchdog_allowed_cpumask might race with a concurrent update.
+	 *
+	 * The watchdog time stamp can race against a concurrent real
+	 * update as well, the only side effect might be a cycle delay for
+	 * the softlockup check.
 	 */
-	for_each_watchdog_cpu(cpu)
+	for_each_cpu(cpu, &watchdog_allowed_mask)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
 	wq_watchdog_touch(-1);
 }
@@ -303,9 +296,6 @@ static void watchdog_interrupt_count(void)
 	__this_cpu_inc(hrtimer_interrupts);
 }
 
-static int watchdog_enable_all_cpus(void);
-static void watchdog_disable_all_cpus(void);
-
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -498,95 +488,6 @@ static struct smp_hotplug_thread watchdog_threads = {
 	.unpark			= watchdog_enable,
 };
 
-/*
- * park all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function returns an error if kthread_park() of a watchdog thread
- * fails. In this situation, the watchdog threads of some CPUs can already
- * be parked and the watchdog threads of other CPUs can still be runnable.
- * Callers are expected to handle this special condition as appropriate in
- * their context.
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static int watchdog_park_threads(void)
-{
-	int cpu, ret = 0;
-
-	for_each_watchdog_cpu(cpu) {
-		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
-		if (ret)
-			break;
-	}
-	return ret;
-}
-
-/*
- * unpark all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static void watchdog_unpark_threads(void)
-{
-	int cpu;
-
-	for_each_watchdog_cpu(cpu)
-		kthread_unpark(per_cpu(softlockup_watchdog, cpu));
-}
-
-static int update_watchdog_all_cpus(void)
-{
-	int ret;
-
-	ret = watchdog_park_threads();
-	if (ret)
-		return ret;
-
-	watchdog_unpark_threads();
-
-	return 0;
-}
-
-static int watchdog_enable_all_cpus(void)
-{
-	int err = 0;
-
-	if (!watchdog_running) {
-		err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
-							     &watchdog_cpumask);
-		if (err)
-			pr_err("Failed to create watchdog threads, disabled\n");
-		else
-			watchdog_running = 1;
-	} else {
-		/*
-		 * Enable/disable the lockup detectors or
-		 * change the sample period 'on the fly'.
-		 */
-		err = update_watchdog_all_cpus();
-
-		if (err) {
-			watchdog_disable_all_cpus();
-			pr_err("Failed to update lockup detectors, disabled\n");
-		}
-	}
-
-	if (err)
-		watchdog_enabled = 0;
-
-	return err;
-}
-
-static void watchdog_disable_all_cpus(void)
-{
-	if (watchdog_running) {
-		watchdog_running = 0;
-		smpboot_unregister_percpu_thread(&watchdog_threads);
-	}
-}
-
 static void softlockup_update_smpboot_threads(void)
 {
 	lockdep_assert_held(&watchdog_mutex);
@@ -661,7 +562,6 @@ static inline int watchdog_park_threads(void) { return 0; }
 static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
-static inline void set_sample_period(void) { }
 static inline void softlockup_init_threads(void) { }
 static inline void softlockup_update_threads(void) { }
 static inline void softlockup_reconfigure_threads(bool enabled) { }
@@ -701,28 +601,10 @@ void lockup_detector_soft_poweroff(void)
 /*
  * Update the run state of the lockup detectors.
  */
-static int proc_watchdog_update(void)
+static void proc_watchdog_update(void)
 {
-	int err = 0;
-
-	/*
-	 * Watchdog threads won't be started if they are already active.
-	 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
-	 * care of this. If those threads are already active, the sample
-	 * period will be updated and the lockup detectors will be enabled
-	 * or disabled 'on the fly'.
-	 */
-	if (watchdog_enabled && watchdog_thresh)
-		err = watchdog_enable_all_cpus();
-	else
-		watchdog_disable_all_cpus();
-
+	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
 	watchdog_nmi_reconfigure();
-
-	__lockup_detector_cleanup();
-
-	return err;
-
 }
 
 /*
@@ -778,17 +660,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 				new = old & ~which;
 		} while (cmpxchg(&watchdog_enabled, old, new) != old);
 
-		/*
-		 * Update the run state of the lockup detectors. There is _no_
-		 * need to check the value returned by proc_watchdog_update()
-		 * and to restore the previous value of 'watchdog_enabled' as
-		 * both lockup detectors are disabled if proc_watchdog_update()
-		 * returns an error.
-		 */
-		if (old == new)
-			goto out;
-
-		err = proc_watchdog_update();
+		if (old != new)
+			proc_watchdog_update();
 	}
 out:
 	mutex_unlock(&watchdog_mutex);
@@ -832,50 +705,28 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
 int proc_watchdog_thresh(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	int err, old, new;
+	int err, old;
 
 	cpu_hotplug_disable();
 	mutex_lock(&watchdog_mutex);
 
-	old = ACCESS_ONCE(watchdog_thresh);
+	old = READ_ONCE(watchdog_thresh);
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
-	if (err || !write)
-		goto out;
+	if (!err && write && old != READ_ONCE(watchdog_thresh))
+		proc_watchdog_update();
 
-	/*
-	 * Update the sample period. Restore on failure.
-	 */
-	new = ACCESS_ONCE(watchdog_thresh);
-	if (old == new)
-		goto out;
-
-	set_sample_period();
-	err = proc_watchdog_update();
-	if (err) {
-		watchdog_thresh = old;
-		set_sample_period();
-	}
-out:
 	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
 	return err;
 }
 
-static void watchdog_update_cpus(void)
-{
-	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) {
-		smpboot_update_cpumask_percpu_thread(&watchdog_threads,
-						     &watchdog_cpumask);
-		__lockup_detector_cleanup();
-	}
-}
-
 static void proc_watchdog_cpumask_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-	watchdog_update_cpus();
+
+	softlockup_update_threads();
 	watchdog_nmi_reconfigure();
 }
 
@@ -905,8 +756,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 
 void __init lockup_detector_init(void)
 {
-	set_sample_period();
-
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_enabled()) {
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -917,6 +766,5 @@ void __init lockup_detector_init(void)
 	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
 #endif
 
-	if (watchdog_enabled)
-		watchdog_enable_all_cpus();
+	softlockup_init_threads();
 }
-- 
cgit v1.2.3


From e8b62b2dd14f8f2427856ba24cb7db922bda9bfd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:12 +0200
Subject: watchdog/core: Further simplify sysctl handling

Use a single function to update sysctl changes. This is not a high
frequency user space interface and it's root only.

Preparatory patch to cleanup the sysctl variable handling.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.549114957@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 762d3ed82a08..ca8747221e87 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -507,11 +507,8 @@ static void softlockup_park_all_threads(void)
 	softlockup_update_smpboot_threads();
 }
 
-/*
- * Park threads which are not longer enabled and unpark threads which have
- * been newly enabled.
- */
-static void softlockup_update_threads(void)
+/* Unpark enabled threads */
+static void softlockup_unpark_threads(void)
 {
 	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
 	softlockup_update_smpboot_threads();
@@ -522,7 +519,7 @@ static void softlockup_reconfigure_threads(bool enabled)
 	softlockup_park_all_threads();
 	set_sample_period();
 	if (enabled)
-		softlockup_update_threads();
+		softlockup_unpark_threads();
 }
 
 /*
@@ -563,7 +560,6 @@ static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
-static inline void softlockup_update_threads(void) { }
 static inline void softlockup_reconfigure_threads(bool enabled) { }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
@@ -598,11 +594,11 @@ void lockup_detector_soft_poweroff(void)
 
 #ifdef CONFIG_SYSCTL
 
-/*
- * Update the run state of the lockup detectors.
- */
+/* Propagate any changes to the watchdog threads */
 static void proc_watchdog_update(void)
 {
+	/* Remove impossible cpus to keep sysctl output clean. */
+	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
 	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
 	watchdog_nmi_reconfigure();
 }
@@ -721,15 +717,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 	return err;
 }
 
-static void proc_watchdog_cpumask_update(void)
-{
-	/* Remove impossible cpus to keep sysctl output clean. */
-	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-
-	softlockup_update_threads();
-	watchdog_nmi_reconfigure();
-}
-
 /*
  * The cpumask is the mask of possible cpus that the watchdog can run
  * on, not the mask of cpus it is actually running on.  This allows the
@@ -746,7 +733,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
 	if (!err && write)
-		proc_watchdog_cpumask_update();
+		proc_watchdog_update();
 
 	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
-- 
cgit v1.2.3


From 51d4052b01ca555e0d1d5fe297b309beb6c64aa0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:14 +0200
Subject: watchdog/sysctl: Get rid of the #ifdeffery

The sysctl of the nmi_watchdog file prevents writes by setting:

    min = max = 0

if none of the users is enabled. That involves ifdeffery and is competely
non obvious.

If none of the facilities is enabeld, then the file can simply be made read
only. Move the ifdeffery into the header and use a constant for file
permissions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.706073616@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 6 ++++++
 kernel/sysctl.c     | 6 +-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index cfebb3bc4eed..5774b443dba1 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -81,6 +81,12 @@ extern unsigned int hardlockup_panic;
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+# define NMI_WATCHDOG_SYSCTL_PERM	0644
+#else
+# define NMI_WATCHDOG_SYSCTL_PERM	0444
+#endif
+
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..539cb4e97bb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -891,14 +891,10 @@ static struct ctl_table kern_table[] = {
 		.procname       = "nmi_watchdog",
 		.data           = &nmi_watchdog_enabled,
 		.maxlen         = sizeof (int),
-		.mode           = 0644,
+		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
 		.proc_handler   = proc_nmi_watchdog,
 		.extra1		= &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
 		.extra2		= &one,
-#else
-		.extra2		= &zero,
-#endif
 	},
 	{
 		.procname	= "watchdog_cpumask",
-- 
cgit v1.2.3


From 7feeb9cd4f5b34476ffb9e6d58d58c5416375b19 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:15 +0200
Subject: watchdog/sysctl: Clean up sysctl variable name space

Reflect that these variables are user interface related and remove the
whitespace damage in the sysctl table while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.783210221@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 16 ++++++++--------
 kernel/sysctl.c     | 16 ++++++++--------
 kernel/watchdog.c   | 41 ++++++++++++++++++++---------------------
 3 files changed, 36 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5774b443dba1..4a8d1037364e 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,8 +17,8 @@ void lockup_detector_cleanup(void);
 bool is_hardlockup(void);
 
 extern int watchdog_user_enabled;
-extern int nmi_watchdog_enabled;
-extern int soft_watchdog_enabled;
+extern int nmi_watchdog_user_enabled;
+extern int soft_watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 
@@ -62,12 +62,12 @@ static inline void reset_hung_task_detector(void) { }
  * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
  * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
  *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
+ * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and
+ * 'soft_watchdog_user_enabled' are variables that are only used as an
+ * 'interface' between the parameters in /proc/sys/kernel and the internal
+ * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
+ * handled differently because its value is not boolean, and the lockup
+ * detectors are 'suspended' while 'watchdog_thresh' is equal zero.
  */
 #define NMI_WATCHDOG_ENABLED_BIT   0
 #define SOFT_WATCHDOG_ENABLED_BIT  1
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 539cb4e97bb8..4c08ed4a379e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -871,9 +871,9 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_LOCKUP_DETECTOR)
 	{
 		.procname       = "watchdog",
-		.data           = &watchdog_user_enabled,
-		.maxlen         = sizeof (int),
-		.mode           = 0644,
+		.data		= &watchdog_user_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
 		.proc_handler   = proc_watchdog,
 		.extra1		= &zero,
 		.extra2		= &one,
@@ -889,8 +889,8 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname       = "nmi_watchdog",
-		.data           = &nmi_watchdog_enabled,
-		.maxlen         = sizeof (int),
+		.data		= &nmi_watchdog_user_enabled,
+		.maxlen		= sizeof(int),
 		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
 		.proc_handler   = proc_nmi_watchdog,
 		.extra1		= &zero,
@@ -906,9 +906,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 	{
 		.procname       = "soft_watchdog",
-		.data           = &soft_watchdog_enabled,
-		.maxlen         = sizeof (int),
-		.mode           = 0644,
+		.data		= &soft_watchdog_user_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
 		.proc_handler   = proc_soft_watchdog,
 		.extra1		= &zero,
 		.extra2		= &one,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index ca8747221e87..baae9fc95031 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,8 +31,6 @@
 
 static DEFINE_MUTEX(watchdog_mutex);
 
-int __read_mostly nmi_watchdog_enabled;
-
 #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
 unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
 						NMI_WATCHDOG_ENABLED;
@@ -40,6 +38,17 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
 unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
 #endif
 
+int __read_mostly nmi_watchdog_user_enabled;
+int __read_mostly soft_watchdog_user_enabled;
+int __read_mostly watchdog_user_enabled;
+int __read_mostly watchdog_thresh = 10;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+static bool softlockup_threads_initialized __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -85,12 +94,6 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 # endif /* CONFIG_SMP */
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-
 /*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
@@ -113,7 +116,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
  * watchdog_nmi_reconfigure can be implemented to be notified after any
  * watchdog configuration change. The arch hardlockup watchdog should
  * respond to the following variables:
- * - nmi_watchdog_enabled
+ * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
  * - sysctl_hardlockup_all_cpu_backtrace
@@ -126,10 +129,6 @@ void __weak watchdog_nmi_reconfigure(void) { }
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-int __read_mostly soft_watchdog_enabled;
-
-struct cpumask watchdog_allowed_mask __read_mostly;
-static bool softlockup_threads_initialized __read_mostly;
 
 static u64 __read_mostly sample_period;
 
@@ -606,14 +605,14 @@ static void proc_watchdog_update(void)
 /*
  * common function for watchdog, nmi_watchdog and soft_watchdog parameter
  *
- * caller             | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- *                    |                       | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller             | table->data points to      | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog      | watchdog_user_enabled      | NMI_WATCHDOG_ENABLED |
+ *                    |                            | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog  | nmi_watchdog_user_enabled  | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
  */
 static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
-- 
cgit v1.2.3


From 6592ad2fcc8f15b4f99b36c1db7d9f65510c203b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:16 +0200
Subject: watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage

Both the perf reconfiguration and the powerpc watchdog_nmi_reconfigure()
need to be done in two steps.

     1) Stop all NMIs
     2) Read the new parameters and start NMIs

Right now watchdog_nmi_reconfigure() is a combination of both. To allow a
clean reconfiguration add a 'run' argument and split the functionality in
powerpc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20170912194147.862865570@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/watchdog.c | 17 +++++++++--------
 include/linux/nmi.h            |  2 ++
 kernel/watchdog.c              | 31 ++++++++++++++++++++++---------
 3 files changed, 33 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 5ded171f02d6..291af79a9826 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -355,17 +355,18 @@ static void watchdog_calc_timeouts(void)
 	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
 }
 
-void watchdog_nmi_reconfigure(void)
+void watchdog_nmi_reconfigure(bool run)
 {
 	int cpu;
 
-	watchdog_calc_timeouts();
-
-	for_each_cpu(cpu, &wd_cpus_enabled)
-		stop_wd_on_cpu(cpu);
-
-	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
-		start_wd_on_cpu(cpu);
+	if (!run) {
+		for_each_cpu(cpu, &wd_cpus_enabled)
+			stop_wd_on_cpu(cpu);
+	} else {
+		watchdog_calc_timeouts();
+		for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
+			start_wd_on_cpu(cpu);
+	}
 }
 
 /*
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 4a8d1037364e..eee255bc0fd6 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -103,6 +103,8 @@ static inline void arch_touch_nmi_watchdog(void) {}
 #endif
 #endif
 
+void watchdog_nmi_reconfigure(bool run);
+
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
  *
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index baae9fc95031..5693afd2b8ea 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -112,17 +112,25 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
 	hardlockup_detector_perf_disable();
 }
 
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
+/**
+ * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs
+ * @run:	If false stop the watchdogs on all enabled CPUs
+ *		If true start the watchdogs on all enabled CPUs
+ *
+ * The core call order is:
+ * watchdog_nmi_reconfigure(false);
+ * update_variables();
+ * watchdog_nmi_reconfigure(true);
+ *
+ * The second call which starts the watchdogs again guarantees that the
+ * following variables are stable across the call.
  * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
+ *
+ * After the call the variables can be changed again.
  */
-void __weak watchdog_nmi_reconfigure(void) { }
+void __weak watchdog_nmi_reconfigure(bool run) { }
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
@@ -515,10 +523,12 @@ static void softlockup_unpark_threads(void)
 
 static void softlockup_reconfigure_threads(bool enabled)
 {
+	watchdog_nmi_reconfigure(false);
 	softlockup_park_all_threads();
 	set_sample_period();
 	if (enabled)
 		softlockup_unpark_threads();
+	watchdog_nmi_reconfigure(true);
 }
 
 /*
@@ -559,7 +569,11 @@ static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
-static inline void softlockup_reconfigure_threads(bool enabled) { }
+static void softlockup_reconfigure_threads(bool enabled)
+{
+	watchdog_nmi_reconfigure(false);
+	watchdog_nmi_reconfigure(true);
+}
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)
@@ -599,7 +613,6 @@ static void proc_watchdog_update(void)
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
 	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
-	watchdog_nmi_reconfigure();
 }
 
 /*
-- 
cgit v1.2.3


From 091549858ed881e5f3054374af4f5b1cac681d50 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:17 +0200
Subject: watchdog/core: Get rid of the racy update loop

Letting user space poke directly at variables which are used at run time is
stupid and causes a lot of race conditions and other issues.

Seperate the user variables and on change invoke the reconfiguration, which
then stops the watchdogs, reevaluates the new user value and restarts the
watchdogs with the new parameters.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.939985640@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 95 +++++++++++++++++++++++++++----------------------------
 1 file changed, 47 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5693afd2b8ea..84886319d7b0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -32,15 +32,17 @@
 static DEFINE_MUTEX(watchdog_mutex);
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
-						NMI_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT	(SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT	1
 #else
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT	(SOFT_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT	0
 #endif
 
-int __read_mostly nmi_watchdog_user_enabled;
-int __read_mostly soft_watchdog_user_enabled;
-int __read_mostly watchdog_user_enabled;
+unsigned long __read_mostly watchdog_enabled;
+int __read_mostly watchdog_user_enabled = 1;
+int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __read_mostly soft_watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 
 struct cpumask watchdog_allowed_mask __read_mostly;
@@ -65,7 +67,7 @@ unsigned int __read_mostly hardlockup_panic =
  */
 void __init hardlockup_detector_disable(void)
 {
-	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+	nmi_watchdog_user_enabled = 0;
 }
 
 static int __init hardlockup_panic_setup(char *str)
@@ -75,9 +77,9 @@ static int __init hardlockup_panic_setup(char *str)
 	else if (!strncmp(str, "nopanic", 7))
 		hardlockup_panic = 0;
 	else if (!strncmp(str, "0", 1))
-		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+		nmi_watchdog_user_enabled = 0;
 	else if (!strncmp(str, "1", 1))
-		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+		nmi_watchdog_user_enabled = 1;
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -132,6 +134,23 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
  */
 void __weak watchdog_nmi_reconfigure(bool run) { }
 
+/**
+ * lockup_detector_update_enable - Update the sysctl enable bit
+ *
+ * Caller needs to make sure that the NMI/perf watchdogs are off, so this
+ * can't race with watchdog_nmi_disable().
+ */
+static void lockup_detector_update_enable(void)
+{
+	watchdog_enabled = 0;
+	if (!watchdog_user_enabled)
+		return;
+	if (nmi_watchdog_user_enabled)
+		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+	if (soft_watchdog_user_enabled)
+		watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
+}
+
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
 /* Global variables, exported for sysctl */
@@ -160,14 +179,14 @@ __setup("softlockup_panic=", softlockup_panic_setup);
 
 static int __init nowatchdog_setup(char *str)
 {
-	watchdog_enabled = 0;
+	watchdog_user_enabled = 0;
 	return 1;
 }
 __setup("nowatchdog", nowatchdog_setup);
 
 static int __init nosoftlockup_setup(char *str)
 {
-	watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+	soft_watchdog_user_enabled = 0;
 	return 1;
 }
 __setup("nosoftlockup", nosoftlockup_setup);
@@ -521,12 +540,13 @@ static void softlockup_unpark_threads(void)
 	softlockup_update_smpboot_threads();
 }
 
-static void softlockup_reconfigure_threads(bool enabled)
+static void softlockup_reconfigure_threads(void)
 {
 	watchdog_nmi_reconfigure(false);
 	softlockup_park_all_threads();
 	set_sample_period();
-	if (enabled)
+	lockup_detector_update_enable();
+	if (watchdog_enabled && watchdog_thresh)
 		softlockup_unpark_threads();
 	watchdog_nmi_reconfigure(true);
 }
@@ -546,6 +566,8 @@ static __init void softlockup_init_threads(void)
 	 * If sysctl is off and watchdog got disabled on the command line,
 	 * nothing to do here.
 	 */
+	lockup_detector_update_enable();
+
 	if (!IS_ENABLED(CONFIG_SYSCTL) &&
 	    !(watchdog_enabled && watchdog_thresh))
 		return;
@@ -559,7 +581,7 @@ static __init void softlockup_init_threads(void)
 
 	mutex_lock(&watchdog_mutex);
 	softlockup_threads_initialized = true;
-	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
+	softlockup_reconfigure_threads();
 	mutex_unlock(&watchdog_mutex);
 }
 
@@ -569,9 +591,10 @@ static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
-static void softlockup_reconfigure_threads(bool enabled)
+static void softlockup_reconfigure_threads(void)
 {
 	watchdog_nmi_reconfigure(false);
+	lockup_detector_update_enable();
 	watchdog_nmi_reconfigure(true);
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -612,7 +635,7 @@ static void proc_watchdog_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
+	softlockup_reconfigure_threads();
 }
 
 /*
@@ -630,48 +653,24 @@ static void proc_watchdog_update(void)
 static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	int err, old, new;
-	int *watchdog_param = (int *)table->data;
+	int err, old, *param = table->data;
 
 	cpu_hotplug_disable();
 	mutex_lock(&watchdog_mutex);
 
-	/*
-	 * If the parameter is being read return the state of the corresponding
-	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
-	 * run state of the lockup detectors.
-	 */
 	if (!write) {
-		*watchdog_param = (watchdog_enabled & which) != 0;
+		/*
+		 * On read synchronize the userspace interface. This is a
+		 * racy snapshot.
+		 */
+		*param = (watchdog_enabled & which) != 0;
 		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	} else {
+		old = READ_ONCE(*param);
 		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-		if (err)
-			goto out;
-
-		/*
-		 * There is a race window between fetching the current value
-		 * from 'watchdog_enabled' and storing the new value. During
-		 * this race window, watchdog_nmi_enable() can sneak in and
-		 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
-		 * The 'cmpxchg' detects this race and the loop retries.
-		 */
-		do {
-			old = watchdog_enabled;
-			/*
-			 * If the parameter value is not zero set the
-			 * corresponding bit(s), else clear it(them).
-			 */
-			if (*watchdog_param)
-				new = old | which;
-			else
-				new = old & ~which;
-		} while (cmpxchg(&watchdog_enabled, old, new) != old);
-
-		if (old != new)
+		if (!err && old != READ_ONCE(*param))
 			proc_watchdog_update();
 	}
-out:
 	mutex_unlock(&watchdog_mutex);
 	cpu_hotplug_enable();
 	return err;
-- 
cgit v1.2.3


From 178b9f7a36d2c74a38274b66dd89f53611298a19 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:18 +0200
Subject: watchdog/hardlockup/perf: Implement init time perf validation

The watchdog tries to create perf events even after it figured out that
perf is not functional or the requested event is not supported.

That's braindead as this can be done once at init time and if not supported
the NMI watchdog can be turned off unconditonally.

Implement the perf hardlockup detector functionality for that. This creates
a new event create function, which will replace the unholy mess of the
existing one in later patches.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.019090547@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  8 ++++++--
 kernel/watchdog_hld.c | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index eee255bc0fd6..72c62a809e92 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -93,14 +93,18 @@ extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_disable(void);
 extern void hardlockup_detector_perf_cleanup(void);
+extern int hardlockup_detector_perf_init(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_disable(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
-#if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
 static inline void arch_touch_nmi_watchdog(void) {}
-#endif
+# else
+static inline int hardlockup_detector_perf_init(void) { return 0; }
+# endif
 #endif
 
 void watchdog_nmi_reconfigure(bool run);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 0aa191ee3d51..f7e752e6e9b4 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -238,6 +238,27 @@ out:
 	return 0;
 }
 
+static int hardlockup_detector_event_create(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct perf_event_attr *wd_attr;
+	struct perf_event *evt;
+
+	wd_attr = &wd_hw_attr;
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+	/* Try to register using hardware perf events */
+	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+					       watchdog_overflow_callback, NULL);
+	if (IS_ERR(evt)) {
+		pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+			PTR_ERR(evt));
+		return PTR_ERR(evt);
+	}
+	this_cpu_write(watchdog_ev, evt);
+	return 0;
+}
+
 /**
  * hardlockup_detector_perf_disable - Disable the local event
  */
@@ -315,3 +336,19 @@ void __init hardlockup_detector_perf_restart(void)
 			perf_event_enable(event);
 	}
 }
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+	int ret = hardlockup_detector_event_create();
+
+	if (ret) {
+		pr_info("Perf NMI watchdog permanetely disabled\n");
+	} else {
+		perf_event_release_kernel(this_cpu_read(watchdog_ev));
+		this_cpu_write(watchdog_ev, NULL);
+	}
+	return ret;
+}
-- 
cgit v1.2.3


From a994a3147e4c0c9c50a46e6cace7586254975e20 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:19 +0200
Subject: watchdog/hardlockup/perf: Implement init time detection of perf

Use the init time detection of the perf NMI watchdog to determine whether
the perf NMI watchdog is functional. If not disable it permanentely. It
won't come back magically at runtime.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.099799541@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 84886319d7b0..fd8a998eb197 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -44,6 +44,7 @@ int __read_mostly watchdog_user_enabled = 1;
 int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
 int __read_mostly soft_watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+int __read_mostly nmi_watchdog_available;
 
 struct cpumask watchdog_allowed_mask __read_mostly;
 static bool softlockup_threads_initialized __read_mostly;
@@ -114,6 +115,12 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
 	hardlockup_detector_perf_disable();
 }
 
+/* Return 0, if a NMI watchdog is available. Error code otherwise */
+int __weak __init watchdog_nmi_probe(void)
+{
+	return hardlockup_detector_perf_init();
+}
+
 /**
  * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs
  * @run:	If false stop the watchdogs on all enabled CPUs
@@ -145,7 +152,7 @@ static void lockup_detector_update_enable(void)
 	watchdog_enabled = 0;
 	if (!watchdog_user_enabled)
 		return;
-	if (nmi_watchdog_user_enabled)
+	if (nmi_watchdog_available && nmi_watchdog_user_enabled)
 		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
 	if (soft_watchdog_user_enabled)
 		watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
@@ -692,6 +699,8 @@ int proc_watchdog(struct ctl_table *table, int write,
 int proc_nmi_watchdog(struct ctl_table *table, int write,
 		      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+	if (!nmi_watchdog_available && write)
+		return -ENOTSUPP;
 	return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
 				    table, write, buffer, lenp, ppos);
 }
@@ -764,5 +773,7 @@ void __init lockup_detector_init(void)
 	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
 #endif
 
+	if (!watchdog_nmi_probe())
+		nmi_watchdog_available = true;
 	softlockup_init_threads();
 }
-- 
cgit v1.2.3


From 2a1b8ee4f5665b4291e43e4a25d964c3eb2f4c32 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:20 +0200
Subject: watchdog/hardlockup/perf: Implement CPU enable replacement

watchdog_nmi_enable() is an unparseable mess, Provide a clean perf specific
implementation, which will be used when the existing setup/teardown mess is
replaced.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.180215498@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  2 ++
 kernel/watchdog_hld.c | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 72c62a809e92..89ba8b23c6fe 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -92,12 +92,14 @@ extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_disable(void);
+extern void hardlockup_detector_perf_enable(void);
 extern void hardlockup_detector_perf_cleanup(void);
 extern int hardlockup_detector_perf_init(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_disable(void) { }
+static inline void hardlockup_detector_perf_enable(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
 # if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index f7e752e6e9b4..99a3f22e48cc 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -259,6 +259,17 @@ static int hardlockup_detector_event_create(void)
 	return 0;
 }
 
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+	if (hardlockup_detector_event_create())
+		return;
+
+	perf_event_enable(this_cpu_read(watchdog_ev));
+}
+
 /**
  * hardlockup_detector_perf_disable - Disable the local event
  */
-- 
cgit v1.2.3


From 146c9d0e9dfdb62ed6afd43cc263efafbbfd1dcf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:21 +0200
Subject: watchdog/hardlockup/perf: Use new perf CPU enable mechanism

Get rid of the hodgepodge which tries to be smart about perf being
unavailable and error printout rate limiting.

That's all not required simply because this is never invoked when the perf
NMI watchdog is not functional.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.259651788@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c     |  4 ++-
 kernel/watchdog_hld.c | 88 +++------------------------------------------------
 2 files changed, 8 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index fd8a998eb197..5eb11960e4a2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -107,6 +107,7 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
+	hardlockup_detector_perf_enable();
 	return 0;
 }
 
@@ -465,7 +466,8 @@ static void watchdog_enable(unsigned int cpu)
 	/* Initialize timestamp */
 	__touch_watchdog();
 	/* Enable the perf event */
-	watchdog_nmi_enable(cpu);
+	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
+		watchdog_nmi_enable(cpu);
 
 	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
 }
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 99a3f22e48cc..509bb6b59c41 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -25,7 +25,7 @@ static DEFINE_PER_CPU(struct perf_event *, dead_event);
 static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
-static bool hardlockup_detector_disabled;
+static unsigned int watchdog_cpus;
 
 void arch_touch_nmi_watchdog(void)
 {
@@ -160,84 +160,6 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	return;
 }
 
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long firstcpu_err;
-static atomic_t watchdog_cpus;
-
-int watchdog_nmi_enable(unsigned int cpu)
-{
-	struct perf_event_attr *wd_attr;
-	struct perf_event *event = per_cpu(watchdog_ev, cpu);
-	int firstcpu = 0;
-
-	/* nothing to do if the hard lockup detector is disabled */
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		goto out;
-
-	/* A failure disabled the hardlockup detector permanently */
-	if (hardlockup_detector_disabled)
-		return -ENODEV;
-
-	/* is it already setup and enabled? */
-	if (event && event->state > PERF_EVENT_STATE_OFF)
-		goto out;
-
-	/* it is setup but not enabled */
-	if (event != NULL)
-		goto out_enable;
-
-	if (atomic_inc_return(&watchdog_cpus) == 1)
-		firstcpu = 1;
-
-	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
-	/* Try to register using hardware perf events */
-	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-
-	/* save the first cpu's error for future comparision */
-	if (firstcpu && IS_ERR(event))
-		firstcpu_err = PTR_ERR(event);
-
-	if (!IS_ERR(event)) {
-		/* only print for the first cpu initialized */
-		if (firstcpu || firstcpu_err)
-			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
-		goto out_save;
-	}
-
-	/* skip displaying the same error again */
-	if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
-		return PTR_ERR(event);
-
-	/* vary the KERN level based on the returned errno */
-	if (PTR_ERR(event) == -EOPNOTSUPP)
-		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
-	else if (PTR_ERR(event) == -ENOENT)
-		pr_warn("disabled (cpu%i): hardware events not enabled\n",
-			 cpu);
-	else
-		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
-			cpu, PTR_ERR(event));
-
-	pr_info("Disabling hard lockup detector permanently\n");
-	hardlockup_detector_disabled = true;
-
-	return PTR_ERR(event);
-
-	/* success path */
-out_save:
-	per_cpu(watchdog_ev, cpu) = event;
-out_enable:
-	perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
-	return 0;
-}
-
 static int hardlockup_detector_event_create(void)
 {
 	unsigned int cpu = smp_processor_id();
@@ -267,6 +189,9 @@ void hardlockup_detector_perf_enable(void)
 	if (hardlockup_detector_event_create())
 		return;
 
+	if (!watchdog_cpus++)
+		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
+
 	perf_event_enable(this_cpu_read(watchdog_ev));
 }
 
@@ -282,10 +207,7 @@ void hardlockup_detector_perf_disable(void)
 		this_cpu_write(watchdog_ev, NULL);
 		this_cpu_write(dead_event, event);
 		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
-
-		/* watchdog_nmi_enable() expects this to be zero initially. */
-		if (atomic_dec_and_test(&watchdog_cpus))
-			firstcpu_err = 0;
+		watchdog_cpus--;
 	}
 }
 
-- 
cgit v1.2.3


From a33d44843d4574ec05bec39527d8a87b7af2072c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:22 +0200
Subject: watchdog/hardlockup/perf: Simplify deferred event destroy

Now that all functionality is properly serialized against CPU hotplug,
remove the extra per cpu storage which holds the disabled events for
cleanup. The core makes sure that cleanup happens before new events are
created.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.340708074@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog_hld.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 509bb6b59c41..b2931154b5f2 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,7 +21,6 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
 static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
@@ -204,8 +203,6 @@ void hardlockup_detector_perf_disable(void)
 
 	if (event) {
 		perf_event_disable(event);
-		this_cpu_write(watchdog_ev, NULL);
-		this_cpu_write(dead_event, event);
 		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
 		watchdog_cpus--;
 	}
@@ -221,9 +218,9 @@ void hardlockup_detector_perf_cleanup(void)
 	int cpu;
 
 	for_each_cpu(cpu, &dead_events_mask) {
-		struct perf_event *event = per_cpu(dead_event, cpu);
+		struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
-		per_cpu(dead_event, cpu) = NULL;
+		per_cpu(watchdog_ev, cpu) = NULL;
 		perf_event_release_kernel(event);
 	}
 	cpumask_clear(&dead_events_mask);
-- 
cgit v1.2.3


From ab5fe3ff38ff9653490910cc71dbbedc95a86e41 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:23 +0200
Subject: watchdog/hardlockup: Clean up hotplug locking mess

All watchdog thread related functions are delegated to the smpboot thread
infrastructure, which handles serialization against CPU hotplug correctly.

The sysctl interface is completely decoupled from anything which requires
CPU hotplug protection.

No need to protect the sysctl writes against cpu hotplug anymore. Remove it
and add the now required protection to the powerpc arch_nmi_watchdog
implementation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20170912194148.418497420@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/watchdog.c | 2 ++
 kernel/watchdog.c              | 6 ------
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 291af79a9826..dfb067764480 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -359,6 +359,7 @@ void watchdog_nmi_reconfigure(bool run)
 {
 	int cpu;
 
+	cpus_read_lock();
 	if (!run) {
 		for_each_cpu(cpu, &wd_cpus_enabled)
 			stop_wd_on_cpu(cpu);
@@ -367,6 +368,7 @@ void watchdog_nmi_reconfigure(bool run)
 		for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
 			start_wd_on_cpu(cpu);
 	}
+	cpus_read_unlock();
 }
 
 /*
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5eb11960e4a2..f6ef163b72cd 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -664,7 +664,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 {
 	int err, old, *param = table->data;
 
-	cpu_hotplug_disable();
 	mutex_lock(&watchdog_mutex);
 
 	if (!write) {
@@ -681,7 +680,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 			proc_watchdog_update();
 	}
 	mutex_unlock(&watchdog_mutex);
-	cpu_hotplug_enable();
 	return err;
 }
 
@@ -725,7 +723,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 {
 	int err, old;
 
-	cpu_hotplug_disable();
 	mutex_lock(&watchdog_mutex);
 
 	old = READ_ONCE(watchdog_thresh);
@@ -735,7 +732,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 		proc_watchdog_update();
 
 	mutex_unlock(&watchdog_mutex);
-	cpu_hotplug_enable();
 	return err;
 }
 
@@ -750,7 +746,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 {
 	int err;
 
-	cpu_hotplug_disable();
 	mutex_lock(&watchdog_mutex);
 
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
@@ -758,7 +753,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 		proc_watchdog_update();
 
 	mutex_unlock(&watchdog_mutex);
-	cpu_hotplug_enable();
 	return err;
 }
 #endif /* CONFIG_SYSCTL */
-- 
cgit v1.2.3


From c4fa6c43ce4b427350cfbb659436bfe3d9e09a1d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 21 Sep 2017 09:54:13 -0400
Subject: cgroup: Reinit cgroup_taskset structure before
 cgroup_migrate_execute() returns

The cgroup_taskset structure within the larger cgroup_mgctx structure
is supposed to be used once and then discarded. That is not really the
case in the hotplug code path:

cpuset_hotplug_workfn()
 - cgroup_transfer_tasks()
   - cgroup_migrate()
     - cgroup_migrate_add_task()
     - cgroup_migrate_execute()

In this case, the cgroup_migrate() function is called multiple time
with the same cgroup_mgctx structure to transfer the tasks from
one cgroup to another one-by-one. The second time cgroup_migrate()
is called, the cgroup_taskset will be in an incorrect state and so
may cause the system to panic. For example,

  [  150.888410] Faulting instruction address: 0xc0000000001db648
  [  150.888414] Oops: Kernel access of bad area, sig: 11 [#1]
  [  150.888417] SMP NR_CPUS=2048
  [  150.888417] NUMA
  [  150.888419] pSeries
    :
  [  150.888545] NIP [c0000000001db648] cpuset_can_attach+0x58/0x1b0
  [  150.888548] LR [c0000000001db638] cpuset_can_attach+0x48/0x1b0
  [  150.888551] Call Trace:
  [  150.888554] [c0000005f65cb940] [c0000000001db638] cpuset_can_attach+0x48/0x1b 0 (unreliable)
  [  150.888559] [c0000005f65cb9a0] [c0000000001cff04] cgroup_migrate_execute+0xc4/0x4b0
  [  150.888563] [c0000005f65cba20] [c0000000001d7d14] cgroup_transfer_tasks+0x1d4/0x370
  [  150.888568] [c0000005f65cbb70] [c0000000001ddcb0] cpuset_hotplug_workfn+0x710/0x8f0
  [  150.888572] [c0000005f65cbc80] [c00000000012032c] process_one_work+0x1ac/0x4d0
  [  150.888576] [c0000005f65cbd20] [c0000000001206f8] worker_thread+0xa8/0x5b0
  [  150.888580] [c0000005f65cbdc0] [c0000000001293f8] kthread+0x168/0x1b0
  [  150.888584] [c0000005f65cbe30] [c00000000000b368] ret_from_kernel_thread+0x5c/0x74

To allow reuse of the cgroup_mgctx structure, some fields in that
structure are now re-initialized at the end of cgroup_migrate_execute()
function call so that the structure can be reused again in a later
iteration without causing problem.

This bug was introduced in the commit e595cd706982 ("group: track
migration context in cgroup_mgctx") in 4.11. This commit moves the
cgroup_taskset initialization out of cgroup_migrate(). The commit
10467270fb3 ("cgroup: don't call migration methods if there are no
tasks to migrate") helped, but did not completely resolve the problem.

Fixes: e595cd706982bff0211e6fafe5a108421e747fbc ("group: track migration context in cgroup_mgctx")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org # v4.11+
---
 kernel/cgroup/cgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..44857278eb8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2311,6 +2311,14 @@ out_release_tset:
 		list_del_init(&cset->mg_node);
 	}
 	spin_unlock_irq(&css_set_lock);
+
+	/*
+	 * Re-initialize the cgroup_taskset structure in case it is reused
+	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+	 * iteration.
+	 */
+	tset->nr_tasks = 0;
+	tset->csets    = &tset->src_csets;
 	return ret;
 }
 
-- 
cgit v1.2.3


From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 22 Sep 2017 17:48:06 +0200
Subject: futex: Fix pi_state->owner serialization

There was a reported suspicion about a race between exit_pi_state_list()
and put_pi_state(). The same report mentioned the comment with
put_pi_state() said it should be called with hb->lock held, and it no
longer is in all places.

As it turns out, the pi_state->owner serialization is indeed broken. As per
the new rules:

  734009e96d19 ("futex: Change locking rules")

pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock.
For the sites setting pi_state->owner we already hold wait_lock (where
required) but exit_pi_state_list() and put_pi_state() were not and
raced on clearing it.

Fixes: 734009e96d19 ("futex: Change locking rules")
Reported-by: Gratian Crisan <gratian.crisan@ni.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: dvhart@infradead.org
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net
---
 kernel/futex.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
 /*
  * Drops a reference to the pi_state object and frees or caches it
  * when the last reference is gone.
- *
- * Must be called with the hb lock held.
  */
 static void put_pi_state(struct futex_pi_state *pi_state)
 {
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
 	 * and has cleaned up the pi_state already
 	 */
 	if (pi_state->owner) {
-		raw_spin_lock_irq(&pi_state->owner->pi_lock);
-		list_del_init(&pi_state->list);
-		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+		struct task_struct *owner;
 
-		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+		owner = pi_state->owner;
+		if (owner) {
+			raw_spin_lock(&owner->pi_lock);
+			list_del_init(&pi_state->list);
+			raw_spin_unlock(&owner->pi_lock);
+		}
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	}
 
-	if (current->pi_state_cache)
+	if (current->pi_state_cache) {
 		kfree(pi_state);
-	else {
+	} else {
 		/*
 		 * pi_state->list is already empty.
 		 * clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
 		raw_spin_unlock_irq(&curr->pi_lock);
 
 		spin_lock(&hb->lock);
-
-		raw_spin_lock_irq(&curr->pi_lock);
+		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+		raw_spin_lock(&curr->pi_lock);
 		/*
 		 * We dropped the pi-lock, so re-check whether this
 		 * task still owns the PI-state:
 		 */
 		if (head->next != next) {
+			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 			spin_unlock(&hb->lock);
 			continue;
 		}
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
 		WARN_ON(list_empty(&pi_state->list));
 		list_del_init(&pi_state->list);
 		pi_state->owner = NULL;
-		raw_spin_unlock_irq(&curr->pi_lock);
+		raw_spin_unlock(&curr->pi_lock);
 
 		get_pi_state(pi_state);
+		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 		spin_unlock(&hb->lock);
 
 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
+	/*
+	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+	 * because there is no concurrency as the object is not published yet.
+	 */
 	pi_state->owner = p;
 	raw_spin_unlock_irq(&p->pi_lock);
 
@@ -2878,6 +2888,7 @@ retry:
 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 		spin_unlock(&hb->lock);
 
+		/* drops pi_state->pi_mutex.wait_lock */
 		ret = wake_futex_pi(uaddr, uval, pi_state);
 
 		put_pi_state(pi_state);
-- 
cgit v1.2.3


From 2827a418ca1b23e432e62c9b3d0e7cf3255dfe88 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 19 Sep 2017 22:04:12 +0200
Subject: genirq: Check __free_irq() return value for NULL

__free_irq() can return a NULL irqaction for example when trying to free
already-free IRQ, but the callsite unconditionally dereferences the
returned pointer.

Fix this by adding a check and return NULL.

Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20170919200412.GA29985@gmail.com
---
 kernel/irq/manage.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..d00132b5c325 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
 #endif
 
 	action = __free_irq(irq, dev_id);
+
+	if (!action)
+		return NULL;
+
 	devname = action->name;
 	kfree(action);
 	return devname;
-- 
cgit v1.2.3


From 115ef3b7e61ac64e32827611a127002672ed3725 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 25 Sep 2017 20:21:54 +0200
Subject: watchdog/hardlockup/perf: Cure UP damage

for_each_cpu() unintuitively reports CPU0 as set independend of the actual
cpumask content on UP kernels. That leads to a NULL pointer dereference
when the cleanup function is invoked and there is no event to clean up.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/watchdog_hld.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index b2931154b5f2..204a8cadb717 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -220,8 +220,13 @@ void hardlockup_detector_perf_cleanup(void)
 	for_each_cpu(cpu, &dead_events_mask) {
 		struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
+		/*
+		 * Required because for_each_cpu() reports  unconditionally
+		 * CPU0 as set on UP kernels. Sigh.
+		 */
+		if (event)
+			perf_event_release_kernel(event);
 		per_cpu(watchdog_ev, cpu) = NULL;
-		perf_event_release_kernel(event);
 	}
 	cpumask_clear(&dead_events_mask);
 }
-- 
cgit v1.2.3


From 7755d83e48397e822aac751b1545f8bcf71d133e Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 22 Sep 2017 21:20:41 +0900
Subject: irqdomain: Add __rcu annotations to radix tree accessors

Fix various address spaces warning of sparse.

kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces)
kernel/irq/irqdomain.c:1463:14:    expected void **slot
kernel/irq/irqdomain.c:1463:14:    got void [noderef] <asn:4>**
kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces)
kernel/irq/irqdomain.c:1465:66:    expected void [noderef] <asn:4>**slot
kernel/irq/irqdomain.c:1465:66:    got void **slot

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: https://lkml.kernel.org/r/1506082841-11530-1-git-send-email-yamada.masahiro@socionext.com
---
 kernel/irq/irqdomain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..ac4644e92b49 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
 	struct irq_desc *desc;
 	struct irq_domain *domain;
 	struct radix_tree_iter iter;
-	void **slot;
+	void __rcu **slot;
 	int i;
 
 	seq_printf(m, " %-16s  %-6s  %-10s  %-10s  %s\n",
@@ -1453,7 +1453,7 @@ out_free_desc:
 /* The irq_data was moved, fix the revmap to refer to the new location */
 static void irq_domain_fix_revmap(struct irq_data *d)
 {
-	void **slot;
+	void __rcu **slot;
 
 	if (d->hwirq < d->domain->revmap_size)
 		return; /* Not using radix tree. */
-- 
cgit v1.2.3


From 96abb968549cdefd0964d1f7af0a79f4e6e7f897 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Sep 2017 19:00:16 +0200
Subject: smp/hotplug: Allow external multi-instance rollback

Currently the rollback of multi-instance states is handled inside
cpuhp_invoke_callback(). The problem is that when we want to allow an
explicit state change for rollback, we need to return from the
function without doing the rollback.

Change cpuhp_invoke_callback() to optionally return the multi-instance
state, such that rollback can be done from a subsequent call.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: bigeasy@linutronix.de
Cc: efault@gmx.de
Cc: rostedt@goodmis.org
Cc: max.byungchul.park@gmail.com
Link: https://lkml.kernel.org/r/20170920170546.720361181@infradead.org
---
 kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..323b71050b54 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -123,13 +123,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 /**
  * cpuhp_invoke_callback _ Invoke the callbacks for a given state
  * @cpu:	The cpu for which the callback should be invoked
- * @step:	The step in the state machine
+ * @state:	The state to do callbacks for
  * @bringup:	True if the bringup callback should be invoked
+ * @node:	For multi-instance, do a single entry callback for install/remove
+ * @lastp:	For multi-instance rollback, remember how far we got
  *
  * Called from cpu hotplug and from the state register machinery.
  */
 static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
-				 bool bringup, struct hlist_node *node)
+				 bool bringup, struct hlist_node *node,
+				 struct hlist_node **lastp)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	struct cpuhp_step *step = cpuhp_get_step(state);
@@ -138,6 +141,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	int ret, cnt;
 
 	if (!step->multi_instance) {
+		WARN_ON_ONCE(lastp && *lastp);
 		cb = bringup ? step->startup.single : step->teardown.single;
 		if (!cb)
 			return 0;
@@ -152,6 +156,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 
 	/* Single invocation for instance add/remove */
 	if (node) {
+		WARN_ON_ONCE(lastp && *lastp);
 		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 		ret = cbm(cpu, node);
 		trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +166,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	/* State transition. Invoke on all instances */
 	cnt = 0;
 	hlist_for_each(node, &step->list) {
+		if (lastp && node == *lastp)
+			break;
+
 		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 		ret = cbm(cpu, node);
 		trace_cpuhp_exit(cpu, st->state, state, ret);
-		if (ret)
-			goto err;
+		if (ret) {
+			if (!lastp)
+				goto err;
+
+			*lastp = node;
+			return ret;
+		}
 		cnt++;
 	}
+	if (lastp)
+		*lastp = NULL;
 	return 0;
 err:
 	/* Rollback the instances if one failed */
@@ -323,7 +338,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
 		struct cpuhp_step *step = cpuhp_get_step(st->state);
 
 		if (!step->skip_onerr)
-			cpuhp_invoke_callback(cpu, st->state, true, NULL);
+			cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 	}
 }
 
@@ -334,7 +349,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 	int ret = 0;
 
 	for (; st->state > target; st->state--) {
-		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
+		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
 		if (ret) {
 			st->target = prev_state;
 			undo_cpu_down(cpu, st);
@@ -350,7 +365,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 		struct cpuhp_step *step = cpuhp_get_step(st->state);
 
 		if (!step->skip_onerr)
-			cpuhp_invoke_callback(cpu, st->state, false, NULL);
+			cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
 	}
 }
 
@@ -362,7 +377,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 
 	while (st->state < target) {
 		st->state++;
-		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 		if (ret) {
 			st->target = prev_state;
 			undo_cpu_up(cpu, st);
@@ -428,11 +443,13 @@ static void cpuhp_thread_fun(unsigned int cpu)
 		if (st->cb_state < CPUHP_AP_ONLINE) {
 			local_irq_disable();
 			ret = cpuhp_invoke_callback(cpu, st->cb_state,
-						    st->bringup, st->node);
+						    st->bringup, st->node,
+						    NULL);
 			local_irq_enable();
 		} else {
 			ret = cpuhp_invoke_callback(cpu, st->cb_state,
-						    st->bringup, st->node);
+						    st->bringup, st->node,
+						    NULL);
 		}
 	} else if (st->rollback) {
 		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
@@ -472,7 +489,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 	 * we invoke the thread function directly.
 	 */
 	if (!st->thread)
-		return cpuhp_invoke_callback(cpu, state, bringup, node);
+		return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 
 	st->cb_state = state;
 	st->single = true;
@@ -595,7 +612,7 @@ static int take_cpu_down(void *_param)
 	st->state--;
 	/* Invoke the former CPU_DYING callbacks */
 	for (; st->state > target; st->state--)
-		cpuhp_invoke_callback(cpu, st->state, false, NULL);
+		cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
 
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
@@ -776,7 +793,7 @@ void notify_cpu_starting(unsigned int cpu)
 	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
 	while (st->state < target) {
 		st->state++;
-		cpuhp_invoke_callback(cpu, st->state, true, NULL);
+		cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 	}
 }
 
@@ -1307,9 +1324,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
 	if (cpuhp_is_ap_state(state))
 		ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
 	else
-		ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+		ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 #else
-	ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+	ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 #endif
 	BUG_ON(ret && !bringup);
 	return ret;
-- 
cgit v1.2.3


From 4dddfb5faa6118564b0c54a163353d13882299d8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Sep 2017 19:00:17 +0200
Subject: smp/hotplug: Rewrite AP state machine core

There is currently no explicit state change on rollback. That is,
st->bringup, st->rollback and st->target are not consistent when doing
the rollback.

Rework the AP state handling to be more coherent. This does mean we
have to do a second AP kick-and-wait for rollback, but since rollback
is the slow path of a slowpath, this really should not matter.

Take this opportunity to simplify the AP thread function to only run a
single callback per invocation. This unifies the three single/up/down
modes is supports. The looping it used to do for up/down are achieved
by retaining should_run and relying on the main smpboot_thread_fn()
loop.

(I have most of a patch that does the same for the BP state handling,
but that's not critical and gets a little complicated because
CPUHP_BRINGUP_CPU does the AP handoff from a callback, which gets
recursive @st usage, I still have de-fugly that.)

[ tglx: Move cpuhp_down_callbacks() et al. into the HOTPLUG_CPU section to
  	avoid gcc complaining about unused functions. Make the HOTPLUG_CPU
  	one piece instead of having two consecutive ifdef sections of the
  	same type. ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: bigeasy@linutronix.de
Cc: efault@gmx.de
Cc: rostedt@goodmis.org
Cc: max.byungchul.park@gmail.com
Link: https://lkml.kernel.org/r/20170920170546.769658088@infradead.org
---
 kernel/cpu.c | 321 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 206 insertions(+), 115 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 323b71050b54..1139063de5af 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,6 +58,7 @@ struct cpuhp_cpu_state {
 	bool			single;
 	bool			bringup;
 	struct hlist_node	*node;
+	struct hlist_node	*last;
 	enum cpuhp_state	cb_state;
 	int			result;
 	struct completion	done;
@@ -112,6 +113,14 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state)
 	return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
 }
 
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+	return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 {
 	struct cpuhp_step *sp;
@@ -286,7 +295,72 @@ void cpu_hotplug_enable(void)
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 #endif	/* CONFIG_HOTPLUG_CPU */
 
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+	enum cpuhp_state prev_state = st->state;
+
+	st->rollback = false;
+	st->last = NULL;
+
+	st->target = target;
+	st->single = false;
+	st->bringup = st->state < target;
+
+	return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+	st->rollback = true;
+
+	/*
+	 * If we have st->last we need to undo partial multi_instance of this
+	 * state first. Otherwise start undo at the previous state.
+	 */
+	if (!st->last) {
+		if (st->bringup)
+			st->state--;
+		else
+			st->state++;
+	}
+
+	st->target = prev_state;
+	st->bringup = !st->bringup;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+	if (!st->single && st->state == st->target)
+		return;
+
+	st->result = 0;
+	/*
+	 * Make sure the above stores are visible before should_run becomes
+	 * true. Paired with the mb() above in cpuhp_thread_fun()
+	 */
+	smp_mb();
+	st->should_run = true;
+	wake_up_process(st->thread);
+	wait_for_completion(&st->done);
+}
+
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+	enum cpuhp_state prev_state;
+	int ret;
+
+	prev_state = cpuhp_set_state(st, target);
+	__cpuhp_kick_ap(st);
+	if ((ret = st->result)) {
+		cpuhp_reset_state(st, prev_state);
+		__cpuhp_kick_ap(st);
+	}
+
+	return ret;
+}
 
 static int bringup_wait_for_ap(unsigned int cpu)
 {
@@ -301,12 +375,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
 	stop_machine_unpark(cpu);
 	kthread_unpark(st->thread);
 
-	/* Should we go further up ? */
-	if (st->target > CPUHP_AP_ONLINE_IDLE) {
-		__cpuhp_kick_ap_work(st);
-		wait_for_completion(&st->done);
-	}
-	return st->result;
+	if (st->target <= CPUHP_AP_ONLINE_IDLE)
+		return 0;
+
+	return cpuhp_kick_ap(st, st->target);
 }
 
 static int bringup_cpu(unsigned int cpu)
@@ -332,32 +404,6 @@ static int bringup_cpu(unsigned int cpu)
 /*
  * Hotplug state machine related functions
  */
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
-	for (st->state++; st->state < st->target; st->state++) {
-		struct cpuhp_step *step = cpuhp_get_step(st->state);
-
-		if (!step->skip_onerr)
-			cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
-	}
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
-				enum cpuhp_state target)
-{
-	enum cpuhp_state prev_state = st->state;
-	int ret = 0;
-
-	for (; st->state > target; st->state--) {
-		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
-		if (ret) {
-			st->target = prev_state;
-			undo_cpu_down(cpu, st);
-			break;
-		}
-	}
-	return ret;
-}
 
 static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
@@ -404,71 +450,90 @@ static int cpuhp_should_run(unsigned int cpu)
 	return st->should_run;
 }
 
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
-	enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
-	return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
-	return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
 /*
  * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
  * callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ *  - single: runs st->cb_state
+ *  - up:     runs ++st->state, while st->state < st->target
+ *  - down:   runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
  */
 static void cpuhp_thread_fun(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
-	int ret = 0;
+	bool bringup = st->bringup;
+	enum cpuhp_state state;
 
 	/*
-	 * Paired with the mb() in cpuhp_kick_ap_work and
-	 * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+	 * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+	 * that if we see ->should_run we also see the rest of the state.
 	 */
 	smp_mb();
-	if (!st->should_run)
-		return;
 
-	st->should_run = false;
+	if (WARN_ON_ONCE(!st->should_run))
+		return;
 
 	lock_map_acquire(&cpuhp_state_lock_map);
-	/* Single callback invocation for [un]install ? */
+
 	if (st->single) {
-		if (st->cb_state < CPUHP_AP_ONLINE) {
-			local_irq_disable();
-			ret = cpuhp_invoke_callback(cpu, st->cb_state,
-						    st->bringup, st->node,
-						    NULL);
-			local_irq_enable();
+		state = st->cb_state;
+		st->should_run = false;
+	} else {
+		if (bringup) {
+			st->state++;
+			state = st->state;
+			st->should_run = (st->state < st->target);
+			WARN_ON_ONCE(st->state > st->target);
 		} else {
-			ret = cpuhp_invoke_callback(cpu, st->cb_state,
-						    st->bringup, st->node,
-						    NULL);
+			state = st->state;
+			st->state--;
+			st->should_run = (st->state > st->target);
+			WARN_ON_ONCE(st->state < st->target);
 		}
-	} else if (st->rollback) {
-		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+	}
+
+	WARN_ON_ONCE(!cpuhp_is_ap_state(state));
+
+	if (st->rollback) {
+		struct cpuhp_step *step = cpuhp_get_step(state);
+		if (step->skip_onerr)
+			goto next;
+	}
+
+	if (cpuhp_is_atomic_state(state)) {
+		local_irq_disable();
+		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+		local_irq_enable();
 
-		undo_cpu_down(cpu, st);
-		st->rollback = false;
+		/*
+		 * STARTING/DYING must not fail!
+		 */
+		WARN_ON_ONCE(st->result);
 	} else {
-		/* Cannot happen .... */
-		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
-
-		/* Regular hotplug work */
-		if (st->state < st->target)
-			ret = cpuhp_ap_online(cpu, st);
-		else if (st->state > st->target)
-			ret = cpuhp_ap_offline(cpu, st);
+		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+	}
+
+	if (st->result) {
+		/*
+		 * If we fail on a rollback, we're up a creek without no
+		 * paddle, no way forward, no way back. We loose, thanks for
+		 * playing.
+		 */
+		WARN_ON_ONCE(st->rollback);
+		st->should_run = false;
 	}
+
+next:
 	lock_map_release(&cpuhp_state_lock_map);
-	st->result = ret;
-	complete(&st->done);
+
+	if (!st->should_run)
+		complete(&st->done);
 }
 
 /* Invoke a single callback on a remote cpu */
@@ -477,6 +542,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 			 struct hlist_node *node)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	int ret;
 
 	if (!cpu_online(cpu))
 		return 0;
@@ -491,48 +557,43 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 	if (!st->thread)
 		return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 
+	st->rollback = false;
+	st->last = NULL;
+
+	st->node = node;
+	st->bringup = bringup;
 	st->cb_state = state;
 	st->single = true;
-	st->bringup = bringup;
-	st->node = node;
 
-	/*
-	 * Make sure the above stores are visible before should_run becomes
-	 * true. Paired with the mb() above in cpuhp_thread_fun()
-	 */
-	smp_mb();
-	st->should_run = true;
-	wake_up_process(st->thread);
-	wait_for_completion(&st->done);
-	return st->result;
-}
+	__cpuhp_kick_ap(st);
 
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
-	st->result = 0;
-	st->single = false;
 	/*
-	 * Make sure the above stores are visible before should_run becomes
-	 * true. Paired with the mb() above in cpuhp_thread_fun()
+	 * If we failed and did a partial, do a rollback.
 	 */
-	smp_mb();
-	st->should_run = true;
-	wake_up_process(st->thread);
+	if ((ret = st->result) && st->last) {
+		st->rollback = true;
+		st->bringup = !bringup;
+
+		__cpuhp_kick_ap(st);
+	}
+
+	return ret;
 }
 
 static int cpuhp_kick_ap_work(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-	enum cpuhp_state state = st->state;
+	enum cpuhp_state prev_state = st->state;
+	int ret;
 
-	trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
 	lock_map_acquire(&cpuhp_state_lock_map);
 	lock_map_release(&cpuhp_state_lock_map);
-	__cpuhp_kick_ap_work(st);
-	wait_for_completion(&st->done);
-	trace_cpuhp_exit(cpu, st->state, state, st->result);
-	return st->result;
+
+	trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+	ret = cpuhp_kick_ap(st, st->target);
+	trace_cpuhp_exit(cpu, st->state, prev_state, ret);
+
+	return ret;
 }
 
 static struct smp_hotplug_thread cpuhp_threads = {
@@ -693,11 +754,32 @@ void cpuhp_report_idle_dead(void)
 				 cpuhp_complete_idle_dead, st, 0);
 }
 
-#else
-#define takedown_cpu		NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+	for (st->state++; st->state < st->target; st->state++) {
+		struct cpuhp_step *step = cpuhp_get_step(st->state);
 
-#ifdef CONFIG_HOTPLUG_CPU
+		if (!step->skip_onerr)
+			cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+	}
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+				enum cpuhp_state target)
+{
+	enum cpuhp_state prev_state = st->state;
+	int ret = 0;
+
+	for (; st->state > target; st->state--) {
+		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+		if (ret) {
+			st->target = prev_state;
+			undo_cpu_down(cpu, st);
+			break;
+		}
+	}
+	return ret;
+}
 
 /* Requires cpu_add_remove_lock to be held */
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -716,13 +798,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	prev_state = st->state;
-	st->target = target;
+	prev_state = cpuhp_set_state(st, target);
 	/*
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread.
 	 */
 	if (st->state > CPUHP_TEARDOWN_CPU) {
+		st->target = max((int)target, CPUHP_TEARDOWN_CPU);
 		ret = cpuhp_kick_ap_work(cpu);
 		/*
 		 * The AP side has done the error rollback already. Just
@@ -737,6 +819,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 		 */
 		if (st->state > CPUHP_TEARDOWN_CPU)
 			goto out;
+
+		st->target = target;
 	}
 	/*
 	 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -744,9 +828,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	 */
 	ret = cpuhp_down_callbacks(cpu, st, target);
 	if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
-		st->target = prev_state;
-		st->rollback = true;
-		cpuhp_kick_ap_work(cpu);
+		cpuhp_reset_state(st, prev_state);
+		__cpuhp_kick_ap(st);
 	}
 
 out:
@@ -771,11 +854,15 @@ out:
 	cpu_maps_update_done();
 	return err;
 }
+
 int cpu_down(unsigned int cpu)
 {
 	return do_cpu_down(cpu, CPUHP_OFFLINE);
 }
 EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu		NULL
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /**
@@ -846,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	st->target = target;
+	cpuhp_set_state(st, target);
 	/*
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread once more.
@@ -1313,6 +1400,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
 	struct cpuhp_step *sp = cpuhp_get_step(state);
 	int ret;
 
+	/*
+	 * If there's nothing to do, we done.
+	 * Relies on the union for multi_instance.
+	 */
 	if ((bringup && !sp->startup.single) ||
 	    (!bringup && !sp->teardown.single))
 		return 0;
-- 
cgit v1.2.3


From 724a86881d03ee5794148e65142e24ed3621be66 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Sep 2017 19:00:18 +0200
Subject: smp/hotplug: Callback vs state-machine consistency

While the generic callback functions have an 'int' return and thus
appear to be allowed to return error, this is not true for all states.

Specifically, what used to be STARTING/DYING are ran with IRQs
disabled from critical parts of CPU bringup/teardown and are not
allowed to fail. Add WARNs to enforce this rule.

But since some callbacks are indeed allowed to fail, we have the
situation where a state-machine rollback encounters a failure, in this
case we're stuck, we can't go forward and we can't go back. Also add a
WARN for that case.

AFAICT this is a fundamental 'problem' with no real obvious solution.
We want the 'prepare' callbacks to allow failure on either up or down.
Typically on prepare-up this would be things like -ENOMEM from
resource allocations, and the typical usage in prepare-down would be
something like -EBUSY to avoid CPUs being taken away.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: bigeasy@linutronix.de
Cc: efault@gmx.de
Cc: rostedt@goodmis.org
Cc: max.byungchul.park@gmail.com
Link: https://lkml.kernel.org/r/20170920170546.819539119@infradead.org
---
 kernel/cpu.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1139063de5af..d6f1b8c36400 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -202,7 +202,14 @@ err:
 	hlist_for_each(node, &step->list) {
 		if (!cnt--)
 			break;
-		cbm(cpu, node);
+
+		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+		ret = cbm(cpu, node);
+		trace_cpuhp_exit(cpu, st->state, state, ret);
+		/*
+		 * Rollback must not fail,
+		 */
+		WARN_ON_ONCE(ret);
 	}
 	return ret;
 }
@@ -659,6 +666,7 @@ static int take_cpu_down(void *_param)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 	enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
 	int err, cpu = smp_processor_id();
+	int ret;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
@@ -672,8 +680,13 @@ static int take_cpu_down(void *_param)
 	WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
 	st->state--;
 	/* Invoke the former CPU_DYING callbacks */
-	for (; st->state > target; st->state--)
-		cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+	for (; st->state > target; st->state--) {
+		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+		/*
+		 * DYING must not fail!
+		 */
+		WARN_ON_ONCE(ret);
+	}
 
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
@@ -876,11 +889,16 @@ void notify_cpu_starting(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+	int ret;
 
 	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
 	while (st->state < target) {
 		st->state++;
-		cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+		/*
+		 * STARTING must not fail!
+		 */
+		WARN_ON_ONCE(ret);
 	}
 }
 
-- 
cgit v1.2.3


From 5f4b55e10645b7371322c800a5ec745cab487a6c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Sep 2017 19:00:20 +0200
Subject: smp/hotplug: Differentiate the AP-work lockdep class between up and
 down

With lockdep-crossrelease we get deadlock reports that span cpu-up and
cpu-down chains. Such deadlocks cannot possibly happen because cpu-up
and cpu-down are globally serialized.

  CPU0                  CPU1                    CPU2
  cpuhp_up_callbacks:   takedown_cpu:           cpuhp_thread_fun:

  cpuhp_state
                        irq_lock_sparse()
    irq_lock_sparse()
                        wait_for_completion()
                                                cpuhp_state
                                                complete()

Now that we have consistent AP state, we can trivially separate the
AP-work class between up and down using st->bringup.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: max.byungchul.park@gmail.com
Cc: bigeasy@linutronix.de
Cc: efault@gmx.de
Cc: rostedt@goodmis.org
Link: https://lkml.kernel.org/r/20170920170546.922524234@infradead.org
---
 kernel/cpu.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index d6f1b8c36400..d5c09985fbb6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -68,9 +68,26 @@ struct cpuhp_cpu_state {
 static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
 
 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
-	STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static void inline cpuhp_lock_acquire(bool bringup)
+{
+	lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static void inline cpuhp_lock_release(bool bringup)
+{
+	lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static void inline cpuhp_lock_acquire(bool bringup) { }
+static void inline cpuhp_lock_release(bool bringup) { }
+
 #endif
 
 /**
@@ -486,7 +503,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
 	if (WARN_ON_ONCE(!st->should_run))
 		return;
 
-	lock_map_acquire(&cpuhp_state_lock_map);
+	cpuhp_lock_acquire(bringup);
 
 	if (st->single) {
 		state = st->cb_state;
@@ -537,7 +554,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
 	}
 
 next:
-	lock_map_release(&cpuhp_state_lock_map);
+	cpuhp_lock_release(bringup);
 
 	if (!st->should_run)
 		complete(&st->done);
@@ -554,8 +571,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 	if (!cpu_online(cpu))
 		return 0;
 
-	lock_map_acquire(&cpuhp_state_lock_map);
-	lock_map_release(&cpuhp_state_lock_map);
+	cpuhp_lock_acquire(false);
+	cpuhp_lock_release(false);
+
+	cpuhp_lock_acquire(true);
+	cpuhp_lock_release(true);
 
 	/*
 	 * If we are up and running, use the hotplug thread. For early calls
@@ -593,8 +613,11 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
 	enum cpuhp_state prev_state = st->state;
 	int ret;
 
-	lock_map_acquire(&cpuhp_state_lock_map);
-	lock_map_release(&cpuhp_state_lock_map);
+	cpuhp_lock_acquire(false);
+	cpuhp_lock_release(false);
+
+	cpuhp_lock_acquire(true);
+	cpuhp_lock_release(true);
 
 	trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
 	ret = cpuhp_kick_ap(st, st->target);
-- 
cgit v1.2.3


From 5ebe7742fff8be5f1359bc50f5d43fb6ff7bd060 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Sep 2017 19:00:19 +0200
Subject: smp/hotplug: Differentiate the AP completion between up and down

With lockdep-crossrelease we get deadlock reports that span cpu-up and
cpu-down chains. Such deadlocks cannot possibly happen because cpu-up
and cpu-down are globally serialized.

  takedown_cpu()
    irq_lock_sparse()
    wait_for_completion(&st->done)

                                cpuhp_thread_fun
                                  cpuhp_up_callback
                                    cpuhp_invoke_callback
                                      irq_affinity_online_cpu
                                        irq_local_spare()
                                        irq_unlock_sparse()
                                  complete(&st->done)

Now that we have consistent AP state, we can trivially separate the
AP completion between up and down using st->bringup.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: max.byungchul.park@gmail.com
Cc: bigeasy@linutronix.de
Cc: efault@gmx.de
Cc: rostedt@goodmis.org
Link: https://lkml.kernel.org/r/20170920170546.872472799@infradead.org
---
 kernel/cpu.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index d5c09985fbb6..6bbe261b851f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -46,7 +46,8 @@
  * @bringup:	Single callback bringup or teardown selector
  * @cb_state:	The state for a single callback (install/uninstall)
  * @result:	Result of the operation
- * @done:	Signal completion to the issuer of the task
+ * @done_up:	Signal completion to the issuer of the task for cpu-up
+ * @done_down:	Signal completion to the issuer of the task for cpu-down
  */
 struct cpuhp_cpu_state {
 	enum cpuhp_state	state;
@@ -61,7 +62,8 @@ struct cpuhp_cpu_state {
 	struct hlist_node	*last;
 	enum cpuhp_state	cb_state;
 	int			result;
-	struct completion	done;
+	struct completion	done_up;
+	struct completion	done_down;
 #endif
 };
 
@@ -130,14 +132,6 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state)
 	return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
 }
 
-/*
- * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
- */
-static bool cpuhp_is_atomic_state(enum cpuhp_state state)
-{
-	return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
-}
-
 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 {
 	struct cpuhp_step *sp;
@@ -232,6 +226,26 @@ err:
 }
 
 #ifdef CONFIG_SMP
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+	struct completion *done = bringup ? &st->done_up : &st->done_down;
+	wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+	struct completion *done = bringup ? &st->done_up : &st->done_down;
+	complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+	return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 bool cpuhp_tasks_frozen;
@@ -368,7 +382,7 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
 	smp_mb();
 	st->should_run = true;
 	wake_up_process(st->thread);
-	wait_for_completion(&st->done);
+	wait_for_ap_thread(st, st->bringup);
 }
 
 static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
@@ -391,7 +405,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
 	/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
-	wait_for_completion(&st->done);
+	wait_for_ap_thread(st, true);
 	if (WARN_ON_ONCE((!cpu_online(cpu))))
 		return -ECANCELED;
 
@@ -464,7 +478,8 @@ static void cpuhp_create(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
-	init_completion(&st->done);
+	init_completion(&st->done_up);
+	init_completion(&st->done_down);
 }
 
 static int cpuhp_should_run(unsigned int cpu)
@@ -557,7 +572,7 @@ next:
 	cpuhp_lock_release(bringup);
 
 	if (!st->should_run)
-		complete(&st->done);
+		complete_ap_thread(st, bringup);
 }
 
 /* Invoke a single callback on a remote cpu */
@@ -753,7 +768,7 @@ static int takedown_cpu(unsigned int cpu)
 	 *
 	 * Wait for the stop thread to go away.
 	 */
-	wait_for_completion(&st->done);
+	wait_for_ap_thread(st, false);
 	BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
 
 	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -772,7 +787,7 @@ static void cpuhp_complete_idle_dead(void *arg)
 {
 	struct cpuhp_cpu_state *st = arg;
 
-	complete(&st->done);
+	complete_ap_thread(st, false);
 }
 
 void cpuhp_report_idle_dead(void)
@@ -939,7 +954,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
 		return;
 
 	st->state = CPUHP_AP_ONLINE_IDLE;
-	complete(&st->done);
+	complete_ap_thread(st, true);
 }
 
 /* Requires cpu_add_remove_lock to be held */
-- 
cgit v1.2.3


From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Sep 2017 19:00:21 +0200
Subject: smp/hotplug: Hotplug state fail injection

Add a sysfs file to one-time fail a specific state. This can be used
to test the state rollback code paths.

Something like this (hotplug-up.sh):

  #!/bin/bash

  echo 0 > /debug/sched_debug
  echo 1 > /debug/tracing/events/cpuhp/enable

  ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1`
  STATES=${1:-$ALL_STATES}

  for state in $STATES
  do
	  echo 0 > /sys/devices/system/cpu/cpu1/online
	  echo 0 > /debug/tracing/trace
	  echo Fail state: $state
	  echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail
	  cat /sys/devices/system/cpu/cpu1/hotplug/fail
	  echo 1 > /sys/devices/system/cpu/cpu1/online

	  cat /debug/tracing/trace > hotfail-${state}.trace

	  sleep 1
  done

Can be used to test for all possible rollback (barring multi-instance)
scenarios on CPU-up, CPU-down is a trivial modification of the above.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: bigeasy@linutronix.de
Cc: efault@gmx.de
Cc: rostedt@goodmis.org
Cc: max.byungchul.park@gmail.com
Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org
---
 include/linux/cpuhotplug.h |  3 ++-
 kernel/cpu.c               | 60 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 477b2e6f60f7..6d508767e144 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -22,7 +22,8 @@
  */
 
 enum cpuhp_state {
-	CPUHP_OFFLINE,
+	CPUHP_INVALID = -1,
+	CPUHP_OFFLINE = 0,
 	CPUHP_CREATE_THREADS,
 	CPUHP_PERF_PREPARE,
 	CPUHP_PERF_X86_PREPARE,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6bbe261b851f..8de11a29e495 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -52,6 +52,7 @@
 struct cpuhp_cpu_state {
 	enum cpuhp_state	state;
 	enum cpuhp_state	target;
+	enum cpuhp_state	fail;
 #ifdef CONFIG_SMP
 	struct task_struct	*thread;
 	bool			should_run;
@@ -67,7 +68,9 @@ struct cpuhp_cpu_state {
 #endif
 };
 
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+	.fail = CPUHP_INVALID,
+};
 
 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
 static struct lockdep_map cpuhp_state_up_map =
@@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	int (*cb)(unsigned int cpu);
 	int ret, cnt;
 
+	if (st->fail == state) {
+		st->fail = CPUHP_INVALID;
+
+		if (!(bringup ? step->startup.single : step->teardown.single))
+			return 0;
+
+		return -EAGAIN;
+	}
+
 	if (!step->multi_instance) {
 		WARN_ON_ONCE(lastp && *lastp);
 		cb = bringup ? step->startup.single : step->teardown.single;
@@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
 }
 static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
 
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+	struct cpuhp_step *sp;
+	int fail, ret;
+
+	ret = kstrtoint(buf, 10, &fail);
+	if (ret)
+		return ret;
+
+	/*
+	 * Cannot fail STARTING/DYING callbacks.
+	 */
+	if (cpuhp_is_atomic_state(fail))
+		return -EINVAL;
+
+	/*
+	 * Cannot fail anything that doesn't have callbacks.
+	 */
+	mutex_lock(&cpuhp_state_mutex);
+	sp = cpuhp_get_step(fail);
+	if (!sp->startup.single && !sp->teardown.single)
+		ret = -EINVAL;
+	mutex_unlock(&cpuhp_state_mutex);
+	if (ret)
+		return ret;
+
+	st->fail = fail;
+
+	return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+	return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
 static struct attribute *cpuhp_cpu_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_target.attr,
+	&dev_attr_fail.attr,
 	NULL
 };
 
-- 
cgit v1.2.3


From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001
From: Jeffy Chen <jeffy.chen@rock-chips.com>
Date: Thu, 28 Sep 2017 12:37:31 +0800
Subject: irq/generic-chip: Don't replace domain's name

When generic irq chips are allocated for an irq domain the domain name is
set to the irq chip name. That was done to have named domains before the
recent changes which enforce domain naming were done.

Since then the overwrite causes a memory leak when the domain name is
dynamically allocated and even worse it would cause the domain free code to
free the wrong name pointer, which might point to a constant.

Remove the name assignment to prevent this.

Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only")
Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com
---
 kernel/irq/generic-chip.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..5270a54b9fa4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
 		/* Calc pointer to the next generic chip */
 		tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
 	}
-	d->name = name;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
-- 
cgit v1.2.3


From 77c01d11bbb2b5c005347061bf543ab94878314c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 26 Sep 2017 10:36:03 +0100
Subject: watchdog/hardlockup/perf: Fix spelling mistake: "permanetely" ->
 "permanently"

Trivial fix to spelling mistake in pr_info message

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Babu Moger <babu.moger@oracle.com>
Link: https://lkml.kernel.org/r/20170926093603.7756-1-colin.king@canonical.com
---
 kernel/watchdog_hld.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 204a8cadb717..71a62ceacdc8 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -280,7 +280,7 @@ int __init hardlockup_detector_perf_init(void)
 	int ret = hardlockup_detector_event_create();
 
 	if (ret) {
-		pr_info("Perf NMI watchdog permanetely disabled\n");
+		pr_info("Perf NMI watchdog permanently disabled\n");
 	} else {
 		perf_event_release_kernel(this_cpu_read(watchdog_ev));
 		this_cpu_write(watchdog_ev, NULL);
-- 
cgit v1.2.3


From 87cbde8d9081b91df86a21d0d743cd700e04890a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 28 Sep 2017 01:45:10 +0200
Subject: PM / s2idle: Invoke the ->wake() platform callback earlier

The role of the ->wake() platform callback for suspend-to-idle is to
deal with possible spurious wakeups, among other things.  The ACPI
implementation of it, acpi_s2idle_wake(), additionally checks the
conditions for entering the Low Power S0 Idle state by the platform
and reports the ones that have not been met.

However, the ->wake() platform callback is invoked after calling
dpm_noirq_resume_devices(), which means that the power states of some
devices may have changed since s2idle_enter() returned, so some unmet
Low Power S0 Idle conditions may be reported incorrectly as a result
of that.

To avoid these false positives, reorder the invocations of the
dpm_noirq_resume_devices() routine and the ->wake() platform callback
in s2idle_loop().

Fixes: 726fb6b4f2a8 (ACPI / PM: Check low power idle constraints for debug only)
Tested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/suspend.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3e2b4f519009..ccd2d20e6b06 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -120,22 +120,26 @@ static void s2idle_loop(void)
 		 * frozen processes + suspended devices + idle processors.
 		 * Thus s2idle_enter() should be called right after
 		 * all devices have been suspended.
+		 *
+		 * Wakeups during the noirq suspend of devices may be spurious,
+		 * so prevent them from terminating the loop right away.
 		 */
 		error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
 		if (!error)
 			s2idle_enter();
+		else if (error == -EBUSY && pm_wakeup_pending())
+			error = 0;
 
-		dpm_noirq_resume_devices(PMSG_RESUME);
-		if (error && (error != -EBUSY || !pm_wakeup_pending())) {
-			dpm_noirq_end();
-			break;
-		}
-
-		if (s2idle_ops && s2idle_ops->wake)
+		if (!error && s2idle_ops && s2idle_ops->wake)
 			s2idle_ops->wake();
 
+		dpm_noirq_resume_devices(PMSG_RESUME);
+
 		dpm_noirq_end();
 
+		if (error)
+			break;
+
 		if (s2idle_ops && s2idle_ops->sync)
 			s2idle_ops->sync();
 
-- 
cgit v1.2.3


From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 6 Sep 2017 19:08:11 +0300
Subject: perf/aux: Only update ->aux_wakeup in non-overwrite mode

The following commit:

  d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index")

changed the AUX wakeup position calculation to rounddown(), which causes
a division-by-zero in AUX overwrite mode (aka "snapshot mode").

The zero denominator results from the fact that perf record doesn't set
aux_watermark to anything, in which case the kernel will set it to half
the AUX buffer size, but only for non-overwrite mode. In the overwrite
mode aux_watermark stays zero.

The good news is that, AUX overwrite mode, wakeups don't happen and
related bookkeeping is not relevant, so we can simply forego the whole
wakeup updates.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..f684d8e5fa2b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -412,6 +412,19 @@ err:
 	return NULL;
 }
 
+static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
+{
+	if (rb->aux_overwrite)
+		return false;
+
+	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Commit the data written by hardware into the ring buffer by adjusting
  * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
@@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 	}
 
 	rb->user_page->aux_head = rb->aux_head;
-	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+	if (rb_need_aux_wakeup(rb))
 		wakeup = true;
-		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
-	}
 
 	if (wakeup) {
 		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 	rb->aux_head += size;
 
 	rb->user_page->aux_head = rb->aux_head;
-	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+	if (rb_need_aux_wakeup(rb)) {
 		perf_output_wakeup(handle);
-		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
 		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 	}
 
-- 
cgit v1.2.3


From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001
From: Prateek Sood <prsood@codeaurora.org>
Date: Thu, 7 Sep 2017 20:00:58 +0530
Subject: locking/rwsem-xadd: Fix missed wakeup due to reordering of load

If a spinner is present, there is a chance that the load of
rwsem_has_spinner() in rwsem_wake() can be reordered with
respect to decrement of rwsem count in __up_write() leading
to wakeup being missed:

 spinning writer                  up_write caller
 ---------------                  -----------------------
 [S] osq_unlock()                 [L] osq
  spin_lock(wait_lock)
  sem->count=0xFFFFFFFF00000001
            +0xFFFFFFFF00000000
  count=sem->count
  MB
                                   sem->count=0xFFFFFFFE00000001
                                             -0xFFFFFFFF00000001
                                   spin_trylock(wait_lock)
                                   return
 rwsem_try_write_lock(count)
 spin_unlock(wait_lock)
 schedule()

Reordering of atomic_long_sub_return_release() in __up_write()
and rwsem_has_spinner() in rwsem_wake() can cause missing of
wakeup in up_write() context. In spinning writer, sem->count
and local variable count is 0XFFFFFFFE00000001. It would result
in rwsem_try_write_lock() failing to acquire rwsem and spinning
writer going to sleep in rwsem_down_write_failed().

The smp_rmb() will make sure that the spinner state is
consulted after sem->count is updated in up_write context.

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: longman@redhat.com
Cc: parri.andrea@gmail.com
Cc: sramana@codeaurora.org
Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..1fefe6dcafd7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -612,6 +612,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 	unsigned long flags;
 	DEFINE_WAKE_Q(wake_q);
 
+	/*
+	* __rwsem_down_write_failed_common(sem)
+	*   rwsem_optimistic_spin(sem)
+	*     osq_unlock(sem->osq)
+	*   ...
+	*   atomic_long_add_return(&sem->count)
+	*
+	*      - VS -
+	*
+	*              __up_write()
+	*                if (atomic_long_sub_return_release(&sem->count) < 0)
+	*                  rwsem_wake(sem)
+	*                    osq_is_locked(&sem->osq)
+	*
+	* And __up_write() must observe !osq_is_locked() when it observes the
+	* atomic_long_add_return() in order to not miss a wakeup.
+	*
+	* This boils down to:
+	*
+	* [S.rel] X = 1                [RmW] r0 = (Y += 0)
+	*         MB                         RMB
+	* [RmW]   Y += 1               [L]   r1 = X
+	*
+	* exists (r0=1 /\ r1=0)
+	*/
+	smp_rmb();
+
 	/*
 	 * If a spinner is present, it is not necessary to do the wakeup.
 	 * Try to do wakeup only if the trylock succeeds to minimize
-- 
cgit v1.2.3


From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 29 Sep 2017 13:43:15 -0400
Subject: fix infoleak in waitid(2)

kernel_waitid() can return a PID, an error or 0.  rusage is filled in the first
case and waitid(2) rusage should've been copied out exactly in that case, *not*
whenever kernel_waitid() has not returned an error.  Compat variant shares that
braino; none of kernel_wait4() callers do, so the below ought to fix it.

Reported-and-tested-by: Alexander Potapenko <glider@google.com>
Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland")
Cc: stable@vger.kernel.org # v4.13
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..f2cd53e92147 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	struct waitid_info info = {.status = 0};
 	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
 	int signo = 0;
+
 	if (err > 0) {
 		signo = SIGCHLD;
 		err = 0;
-	}
-
-	if (!err) {
 		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
 			return -EFAULT;
 	}
@@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (err > 0) {
 		signo = SIGCHLD;
 		err = 0;
-	}
-
-	if (!err && uru) {
-		/* kernel_waitid() overwrites everything in ru */
-		if (COMPAT_USE_64BIT_TIME)
-			err = copy_to_user(uru, &ru, sizeof(ru));
-		else
-			err = put_compat_rusage(&ru, uru);
-		if (err)
-			return -EFAULT;
+		if (uru) {
+			/* kernel_waitid() overwrites everything in ru */
+			if (COMPAT_USE_64BIT_TIME)
+				err = copy_to_user(uru, &ru, sizeof(ru));
+			else
+				err = put_compat_rusage(&ru, uru);
+			if (err)
+				return -EFAULT;
+		}
 	}
 
 	if (!infop)
-- 
cgit v1.2.3


From 2b0b8499ae75df91455bbeb7491d45affc384fb0 Mon Sep 17 00:00:00 2001
From: Shu Wang <shuwang@redhat.com>
Date: Tue, 12 Sep 2017 10:14:54 +0800
Subject: ftrace: Fix kmemleak in unregister_ftrace_graph

The trampoline allocated by function tracer was overwriten by function_graph
tracer, and caused a memory leak. The save_global_trampoline should have
saved the previous trampoline in register_ftrace_graph() and restored it in
unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was
only used in unregister_ftrace_graph as default value 0, and it overwrote the
previous trampoline's value. Causing the previous allocated trampoline to be
lost.

kmmeleak backtrace:
    kmemleak_vmalloc+0x77/0xc0
    __vmalloc_node_range+0x1b5/0x2c0
    module_alloc+0x7c/0xd0
    arch_ftrace_update_trampoline+0xb5/0x290
    ftrace_startup+0x78/0x210
    register_ftrace_function+0x8b/0xd0
    function_trace_init+0x4f/0x80
    tracing_set_tracer+0xe6/0x170
    tracing_set_trace_write+0x90/0xd0
    __vfs_write+0x37/0x170
    vfs_write+0xb2/0x1b0
    SyS_write+0x55/0xc0
    do_syscall_64+0x67/0x180
    return_from_SYSCALL_64+0x0/0x6a

[
  Looking further into this, I found that this was left over from when the
  function and function graph tracers shared the same ftrace_ops. But in
  commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer
  together"), the two were separated, and the save_global_trampoline no
  longer was necessary (and it may have been broken back then too).
  -- Steven Rostedt
]

Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com

Cc: stable@vger.kernel.org
Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together")
Signed-off-by: Shu Wang <shuwang@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
 static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
 static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
 
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
 static int __init set_graph_function(char *str)
 {
 	strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void)
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-	/*
-	 * Function graph does not allocate the trampoline, but
-	 * other global_ops do. We need to reset the ALLOC_TRAMP flag
-	 * if one was used.
-	 */
-	global_ops.trampoline = save_global_trampoline;
-	if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
-		global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
  out:
 	mutex_unlock(&ftrace_lock);
 }
-- 
cgit v1.2.3


From f39b536ce9248e9799ff900358d6f073ab2e6c55 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 25 Sep 2017 20:19:02 -0700
Subject: rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}()

The read of ->dynticks_nmi_nesting in rcu_irq_enter() and rcu_irq_exit()
is currently protected with READ_ONCE().  However, this protection is
unnecessary because (1) ->dynticks_nmi_nesting is updated only by the
current CPU, (2) Although NMI handlers can update this field, they reset
it back to its old value before return, and (3) Interrupts are disabled,
so nothing else can modify it.  The value of ->dynticks_nmi_nesting is
thus effectively constant, and so no protection is required.

This commit therefore removes the READ_ONCE() protection from these
two accesses.

Link: http://lkml.kernel.org/r/20170926031902.GA2074@linux.vnet.ibm.com

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 63bee8e1b193..c03152f7e458 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -890,7 +890,7 @@ void rcu_irq_exit(void)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 
 	/* Page faults can happen in NMI handlers, so check... */
-	if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+	if (rdtp->dynticks_nmi_nesting)
 		return;
 
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -1027,7 +1027,7 @@ void rcu_irq_enter(void)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 
 	/* Page faults can happen in NMI handlers, so check... */
-	if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+	if (rdtp->dynticks_nmi_nesting)
 		return;
 
 	oldval = rdtp->dynticks_nesting;
-- 
cgit v1.2.3


From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 3 Oct 2017 15:37:20 -0700
Subject: bpf: fix bpf_tail_call() x64 JIT

- bpf prog_array just like all other types of bpf array accepts 32-bit index.
  Clarify that in the comment.
- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
- tighten corresponding check in the interpreter to stay consistent

The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
though JIT code is wrong it will check bounds correctly.
Hence two fixes tags. All other JITs don't have this problem.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 4 ++--
 include/uapi/linux/bpf.h    | 2 +-
 kernel/bpf/core.c           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c9573660d51..0554e8aef4d5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	/* if (index >= array->map.max_entries)
 	 *   goto out;
 	 */
-	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
+	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
 #define OFFSET1 43 /* number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..f90860d1f897 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -312,7 +312,7 @@ union bpf_attr {
  *     jump into another BPF program
  *     @ctx: context pointer passed to next program
  *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: index inside array that selects specific program to run
+ *     @index: 32-bit index inside array that selects specific program to run
  *     Return: 0 on success or negative error
  *
  * int bpf_clone_redirect(skb, ifindex, flags)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
 		struct bpf_prog *prog;
-		u64 index = BPF_R3;
+		u32 index = BPF_R3;
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-- 
cgit v1.2.3


From 630cc2b30a42c70628368a412beb4a5e5dd71abe Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:14:18 -0700
Subject: kernel/params.c: align add_sysfs_param documentation with code

This parameter is named kp, so the documentation should use that.

Fixes: 9b473de87209 ("param: Fix duplicate module prefixes")
Link: http://lkml.kernel.org/r/20170919142656.64aea59e@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..1cd8f1a895a8 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -600,7 +600,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
 /*
  * add_sysfs_param - add a parameter to sysfs
  * @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
  * @name: name of parameter
  *
  * Create a kobject if for a (per-module) parameter if mp NULL, and
-- 
cgit v1.2.3


From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001
From: Sherry Yang <sherryy@android.com>
Date: Tue, 3 Oct 2017 16:15:00 -0700
Subject: android: binder: drop lru lock in isolate callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the global lru lock in isolate callback before calling
zap_page_range which calls cond_resched, and re-acquire the global lru
lock before returning.  Also change return code to LRU_REMOVED_RETRY.

Use mmput_async when fail to acquire mmap sem in an atomic context.

Fix "BUG: sleeping function called from invalid context"
errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled.

Also restore mmput_async, which was initially introduced in commit
ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom
reaper context"), and was removed in commit 212925802454 ("mm: oom: let
oom_reap_task and exit_mmap run concurrently").

Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com
Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder")
Signed-off-by: Sherry Yang <sherryy@android.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reported-by: Kyle Yan <kyan@codeaurora.org>
Acked-by: Arve Hjønnevåg <arve@android.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Martijn Coenen <maco@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Riley Andrews <riandrews@android.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hoeun Ryu <hoeun.ryu@gmail.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/android/binder_alloc.c | 18 ++++++++++++------
 include/linux/sched/mm.h       |  6 ++++++
 kernel/fork.c                  | 18 ++++++++++++++++++
 3 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 8fe165844e47..064f5e31ec55 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 	struct binder_alloc *alloc;
 	uintptr_t page_addr;
 	size_t index;
+	struct vm_area_struct *vma;
 
 	alloc = page->alloc;
 	if (!mutex_trylock(&alloc->mutex))
@@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	index = page - alloc->pages;
 	page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE;
-	if (alloc->vma) {
+	vma = alloc->vma;
+	if (vma) {
 		mm = get_task_mm(alloc->tsk);
 		if (!mm)
 			goto err_get_task_mm_failed;
 		if (!down_write_trylock(&mm->mmap_sem))
 			goto err_down_write_mmap_sem_failed;
+	}
+
+	list_lru_isolate(lru, item);
+	spin_unlock(lock);
 
+	if (vma) {
 		trace_binder_unmap_user_start(alloc, index);
 
-		zap_page_range(alloc->vma,
+		zap_page_range(vma,
 			       page_addr + alloc->user_buffer_offset,
 			       PAGE_SIZE);
 
@@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	trace_binder_unmap_kernel_end(alloc, index);
 
-	list_lru_isolate(lru, item);
-
+	spin_lock(lock);
 	mutex_unlock(&alloc->mutex);
-	return LRU_REMOVED;
+	return LRU_REMOVED_RETRY;
 
 err_down_write_mmap_sem_failed:
-	mmput(mm);
+	mmput_async(mm);
 err_get_task_mm_failed:
 err_page_already_freed:
 	mutex_unlock(&alloc->mutex);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3a19c253bdb1..ae53e413fb13 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
+ * be called from the atomic context as well
+ */
+void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct,
+					    async_put_work);
+
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+#endif
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *
-- 
cgit v1.2.3


From 3181c38e4df257852a0c0a53552fd5c869402886 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Tue, 3 Oct 2017 16:16:07 -0700
Subject: kernel/sysctl.c: remove duplicate UINT_MAX check on
 do_proc_douintvec_conv()

do_proc_douintvec_conv() has two UINT_MAX checks, we can remove one.
This has no functional changes other than fixing a compiler warning:

  kernel/sysctl.c:2190]: (warning) Identical condition '*lvalp>UINT_MAX', second condition is always false

Fixes: 4f2fec00afa60 ("sysctl: simplify unsigned int support")
Link: http://lkml.kernel.org/r/20170919072918.12066-1-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Reported-by: David Binderman <dcb314@hotmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 423554ad3610..4da9e622471f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2186,8 +2186,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
 				  int write, void *data)
 {
 	if (write) {
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
 		if (*lvalp > UINT_MAX)
 			return -EINVAL;
 		*valp = *lvalp;
-- 
cgit v1.2.3


From 1fdcce6e16c54facc4f0688630d3b9ecfcaa411f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:16:23 -0700
Subject: memremap: add scheduling point to devm_memremap_pages

devm_memremap_pages is initializing struct pages in for_each_device_pfn
and that can take quite some time.  We have even seen a soft lockup
triggering on a non preemptive kernel

  NMI watchdog: BUG: soft lockup - CPU#61 stuck for 22s! [kworker/u641:11:1808]
  [...]
  RIP: 0010:[<ffffffff8118b6b7>]  [<ffffffff8118b6b7>] devm_memremap_pages+0x327/0x430
  [...]
  Call Trace:
    pmem_attach_disk+0x2fd/0x3f0 [nd_pmem]
    nvdimm_bus_probe+0x64/0x110 [libnvdimm]
    driver_probe_device+0x1f7/0x420
    bus_for_each_drv+0x52/0x80
    __device_attach+0xb0/0x130
    bus_probe_device+0x87/0xa0
    device_add+0x3fc/0x5f0
    nd_async_device_register+0xe/0x40 [libnvdimm]
    async_run_entry_fn+0x43/0x150
    process_one_work+0x14e/0x410
    worker_thread+0x116/0x490
    kthread+0xc7/0xe0
    ret_from_fork+0x3f/0x70

fix this by adding cond_resched every 1024 pages.

Link: http://lkml.kernel.org/r/20170918121410.24466-4-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	pgprot_t pgprot = PAGE_KERNEL;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
-	int error, nid, is_ram;
+	int error, nid, is_ram, i = 0;
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		list_del(&page->lru);
 		page->pgmap = pgmap;
 		percpu_ref_get(ref);
+		if (!(++i % 1024))
+			cond_resched();
 	}
 	devres_add(dev, page_map);
 	return __va(res->start);
-- 
cgit v1.2.3


From c9653850c90d6050197bc76ba672e00a7771aea5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 3 Oct 2017 16:16:26 -0700
Subject: kernel/kcmp.c: drop branch leftover typo

The else branch been left over and escaped the source code refresh.  Not
a problem but better clean it up.

Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files")
Link: http://lkml.kernel.org/r/20170917165838.GA1887@uranus.lan
Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
 	if (filp_epoll) {
 		filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
 		fput(filp_epoll);
-	} else
+	}
 
 	if (IS_ERR(filp_tgt))
 		return PTR_ERR(filp_tgt);
-- 
cgit v1.2.3


From 90ceb2a3ad868f800eb1c9f4ede650daddd94b77 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:16:35 -0700
Subject: kernel/params.c: fix the maximum length in param_get_string

The length parameter of strlcpy() is supposed to reflect the size of the
target buffer, not of the source string.  Harmless in this case as the
buffer is PAGE_SIZE long and the source string is always much shorter than
this, but conceptually wrong, so let's fix it.

Link: http://lkml.kernel.org/r/20170928162515.24846b4f@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 1cd8f1a895a8..8283ba045f4f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -507,7 +507,7 @@ EXPORT_SYMBOL(param_set_copystring);
 int param_get_string(char *buffer, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
-	return strlcpy(buffer, kps->string, kps->maxlen);
+	return strlcpy(buffer, kps->string, PAGE_SIZE);
 }
 EXPORT_SYMBOL(param_get_string);
 
-- 
cgit v1.2.3


From 96802e6b1dbf29d3012b39503c5dd6d9d8e82955 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:16:38 -0700
Subject: kernel/params.c: fix an overflow in param_attr_show

Function param_attr_show could overflow the buffer it is operating on.

The buffer size is PAGE_SIZE, and the string returned by
attribute->param->ops->get is generated by scnprintf(buffer, PAGE_SIZE,
...) so it could be PAGE_SIZE - 1 long, with the terminating '\0' at the
very end of the buffer.  Calling strcat(..., "\n") on this isn't safe, as
the '\0' will be replaced by '\n' (OK) and then another '\0' will be added
past the end of the buffer (not OK.)

Simply add the trailing '\n' when writing the attribute contents to the
buffer originally.  This is safe, and also faster.

Credits to Teradata for discovering this issue.

Link: http://lkml.kernel.org/r/20170928162602.60c379c7@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 8283ba045f4f..0cca488263dd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
 	}								\
 	int param_get_##name(char *buffer, const struct kernel_param *kp) \
 	{								\
-		return scnprintf(buffer, PAGE_SIZE, format,		\
+		return scnprintf(buffer, PAGE_SIZE, format "\n",	\
 				*((type *)kp->arg));			\
 	}								\
 	const struct kernel_param_ops param_ops_##name = {			\
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
 
 int param_get_charp(char *buffer, const struct kernel_param *kp)
 {
-	return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
 }
 EXPORT_SYMBOL(param_get_charp);
 
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
 int param_get_bool(char *buffer, const struct kernel_param *kp)
 {
 	/* Y and N chosen as being relatively non-coder friendly */
-	return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+	return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
 }
 EXPORT_SYMBOL(param_get_bool);
 
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
 
 int param_get_invbool(char *buffer, const struct kernel_param *kp)
 {
-	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+	return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
 }
 EXPORT_SYMBOL(param_get_invbool);
 
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 	struct kernel_param p = *kp;
 
 	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+		/* Replace \n with comma */
 		if (i)
-			buffer[off++] = ',';
+			buffer[off - 1] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
 		check_kparam_locked(p.mod);
 		ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
 int param_get_string(char *buffer, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
-	return strlcpy(buffer, kps->string, PAGE_SIZE);
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
 }
 EXPORT_SYMBOL(param_get_string);
 
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 	kernel_param_lock(mk->mod);
 	count = attribute->param->ops->get(buf, attribute->param);
 	kernel_param_unlock(mk->mod);
-	if (count > 0) {
-		strcat(buf, "\n");
-		++count;
-	}
 	return count;
 }
 
-- 
cgit v1.2.3


From e0596c80f442d1e1221c17dbb71b2aed43909221 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:16:41 -0700
Subject: kernel/params.c: improve STANDARD_PARAM_DEF readability

Align the parameters passed to STANDARD_PARAM_DEF for clarity.

Link: http://lkml.kernel.org/r/20170928162728.756143cc@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Suggested-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 0cca488263dd..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
 	EXPORT_SYMBOL(param_ops_##name)
 
 
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte,	unsigned char,		"%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short,	short,			"%hi",  kstrtos16);
+STANDARD_PARAM_DEF(ushort,	unsigned short,		"%hu",  kstrtou16);
+STANDARD_PARAM_DEF(int,		int,			"%i",   kstrtoint);
+STANDARD_PARAM_DEF(uint,	unsigned int,		"%u",   kstrtouint);
+STANDARD_PARAM_DEF(long,	long,			"%li",  kstrtol);
+STANDARD_PARAM_DEF(ulong,	unsigned long,		"%lu",  kstrtoul);
+STANDARD_PARAM_DEF(ullong,	unsigned long long,	"%llu", kstrtoull);
 
 int param_set_charp(const char *val, const struct kernel_param *kp)
 {
-- 
cgit v1.2.3


From 6b9dc4806b28214a4a260517e59439e0ac12a15e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 2 Oct 2017 12:34:50 +0200
Subject: watchdog/core, powerpc: Replace watchdog_nmi_reconfigure()

The recent cleanup of the watchdog code split watchdog_nmi_reconfigure()
into two stages. One to stop the NMI and one to restart it after
reconfiguration. That was done by adding a boolean 'run' argument to the
code, which is functionally correct but not necessarily a piece of art.

Replace it by two explicit functions: watchdog_nmi_stop() and
watchdog_nmi_start().

Fixes: 6592ad2fcc8f ("watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage")
Requested-by: Linus 'Nursing his pet-peeve' Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Thomas 'Mopping up garbage' Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710021957480.2114@nanos
---
 arch/powerpc/kernel/watchdog.c | 23 ++++++++++++++---------
 include/linux/nmi.h            |  3 ++-
 kernel/watchdog.c              | 33 ++++++++++++++++++---------------
 3 files changed, 34 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index dfb067764480..2673ec8bec00 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -355,19 +355,24 @@ static void watchdog_calc_timeouts(void)
 	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
 }
 
-void watchdog_nmi_reconfigure(bool run)
+void watchdog_nmi_stop(void)
 {
 	int cpu;
 
 	cpus_read_lock();
-	if (!run) {
-		for_each_cpu(cpu, &wd_cpus_enabled)
-			stop_wd_on_cpu(cpu);
-	} else {
-		watchdog_calc_timeouts();
-		for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
-			start_wd_on_cpu(cpu);
-	}
+	for_each_cpu(cpu, &wd_cpus_enabled)
+		stop_wd_on_cpu(cpu);
+	cpus_read_unlock();
+}
+
+void watchdog_nmi_start(void)
+{
+	int cpu;
+
+	cpus_read_lock();
+	watchdog_calc_timeouts();
+	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
+		start_wd_on_cpu(cpu);
 	cpus_read_unlock();
 }
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 89ba8b23c6fe..0c9ed49fb21a 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -109,7 +109,8 @@ static inline int hardlockup_detector_perf_init(void) { return 0; }
 # endif
 #endif
 
-void watchdog_nmi_reconfigure(bool run);
+void watchdog_nmi_stop(void);
+void watchdog_nmi_start(void);
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f6ef163b72cd..6ad6226535d0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -123,24 +123,27 @@ int __weak __init watchdog_nmi_probe(void)
 }
 
 /**
- * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs
- * @run:	If false stop the watchdogs on all enabled CPUs
- *		If true start the watchdogs on all enabled CPUs
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
  *
- * The core call order is:
- * watchdog_nmi_reconfigure(false);
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
  * update_variables();
- * watchdog_nmi_reconfigure(true);
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
  *
- * The second call which starts the watchdogs again guarantees that the
- * following variables are stable across the call.
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
  * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
- *
- * After the call the variables can be changed again.
  */
-void __weak watchdog_nmi_reconfigure(bool run) { }
+void __weak watchdog_nmi_start(void) { }
 
 /**
  * lockup_detector_update_enable - Update the sysctl enable bit
@@ -551,13 +554,13 @@ static void softlockup_unpark_threads(void)
 
 static void softlockup_reconfigure_threads(void)
 {
-	watchdog_nmi_reconfigure(false);
+	watchdog_nmi_stop();
 	softlockup_park_all_threads();
 	set_sample_period();
 	lockup_detector_update_enable();
 	if (watchdog_enabled && watchdog_thresh)
 		softlockup_unpark_threads();
-	watchdog_nmi_reconfigure(true);
+	watchdog_nmi_start();
 }
 
 /*
@@ -602,9 +605,9 @@ static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
 static void softlockup_reconfigure_threads(void)
 {
-	watchdog_nmi_reconfigure(false);
+	watchdog_nmi_stop();
 	lockup_detector_update_enable();
-	watchdog_nmi_reconfigure(true);
+	watchdog_nmi_start();
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-- 
cgit v1.2.3


From e31d6883f21c1cdfe5bc64e28411f8a92b783fde Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 3 Oct 2017 16:37:53 +0200
Subject: watchdog/core, powerpc: Lock cpus across reconfiguration

Instead of dropping the cpu hotplug lock after stopping NMI watchdog and
threads and reaquiring for restart, the code and the protection rules
become more obvious when holding cpu hotplug lock across the full
reconfiguration.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710022105570.2114@nanos
---
 arch/powerpc/kernel/watchdog.c |  4 ----
 kernel/smpboot.c               |  3 +--
 kernel/watchdog.c              | 10 +++++++++-
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2673ec8bec00..f9b4c6352d24 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -359,21 +359,17 @@ void watchdog_nmi_stop(void)
 {
 	int cpu;
 
-	cpus_read_lock();
 	for_each_cpu(cpu, &wd_cpus_enabled)
 		stop_wd_on_cpu(cpu);
-	cpus_read_unlock();
 }
 
 void watchdog_nmi_start(void)
 {
 	int cpu;
 
-	cpus_read_lock();
 	watchdog_calc_timeouts();
 	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
 		start_wd_on_cpu(cpu);
-	cpus_read_unlock();
 }
 
 /*
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index ed7507b69b48..5043e7433f4b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -351,7 +351,7 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread
 	static struct cpumask tmp;
 	unsigned int cpu;
 
-	get_online_cpus();
+	lockdep_assert_cpus_held();
 	mutex_lock(&smpboot_threads_lock);
 
 	/* Park threads that were exclusively enabled on the old mask. */
@@ -367,7 +367,6 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread
 	cpumask_copy(old, new);
 
 	mutex_unlock(&smpboot_threads_lock);
-	put_online_cpus();
 }
 
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6ad6226535d0..fff90fe10007 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -535,7 +535,6 @@ static void softlockup_update_smpboot_threads(void)
 
 	smpboot_update_cpumask_percpu_thread(&watchdog_threads,
 					     &watchdog_allowed_mask);
-	__lockup_detector_cleanup();
 }
 
 /* Temporarily park all watchdog threads */
@@ -554,6 +553,7 @@ static void softlockup_unpark_threads(void)
 
 static void softlockup_reconfigure_threads(void)
 {
+	cpus_read_lock();
 	watchdog_nmi_stop();
 	softlockup_park_all_threads();
 	set_sample_period();
@@ -561,6 +561,12 @@ static void softlockup_reconfigure_threads(void)
 	if (watchdog_enabled && watchdog_thresh)
 		softlockup_unpark_threads();
 	watchdog_nmi_start();
+	cpus_read_unlock();
+	/*
+	 * Must be called outside the cpus locked section to prevent
+	 * recursive locking in the perf code.
+	 */
+	__lockup_detector_cleanup();
 }
 
 /*
@@ -605,9 +611,11 @@ static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
 static void softlockup_reconfigure_threads(void)
 {
+	cpus_read_lock();
 	watchdog_nmi_stop();
 	lockup_detector_update_enable();
 	watchdog_nmi_start();
+	cpus_read_unlock();
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-- 
cgit v1.2.3


From 34ddaa3e5c0096fef52485186c7eb6cf56ddc686 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 3 Oct 2017 16:39:02 +0200
Subject: powerpc/watchdog: Make use of watchdog_nmi_probe()

The rework of the core hotplug code triggers the WARN_ON in start_wd_cpu()
on powerpc because it is called multiple times for the boot CPU.

The first call is via:

  start_wd_on_cpu+0x80/0x2f0
  watchdog_nmi_reconfigure+0x124/0x170
  softlockup_reconfigure_threads+0x110/0x130
  lockup_detector_init+0xbc/0xe0
  kernel_init_freeable+0x18c/0x37c
  kernel_init+0x2c/0x160
  ret_from_kernel_thread+0x5c/0xbc

And then again via the CPU hotplug registration:

  start_wd_on_cpu+0x80/0x2f0
  cpuhp_invoke_callback+0x194/0x620
  cpuhp_thread_fun+0x7c/0x1b0
  smpboot_thread_fn+0x290/0x2a0
  kthread+0x168/0x1b0
  ret_from_kernel_thread+0x5c/0xbc

This can be avoided by setting up the cpu hotplug state with nocalls and
move the initialization to the watchdog_nmi_probe() function. That
initializes the hotplug callbacks without invoking the callback and the
following core initialization function then configures the watchdog for the
online CPUs (in this case CPU0) via softlockup_reconfigure_threads().

Reported-and-tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/kernel/watchdog.c | 17 ++++++++---------
 include/linux/nmi.h            |  1 +
 kernel/watchdog.c              |  5 ++++-
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index f9b4c6352d24..c702a8981452 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -373,22 +373,21 @@ void watchdog_nmi_start(void)
 }
 
 /*
- * This runs after lockup_detector_init() which sets up watchdog_cpumask.
+ * Invoked from core watchdog init.
  */
-static int __init powerpc_watchdog_init(void)
+int __init watchdog_nmi_probe(void)
 {
 	int err;
 
-	watchdog_calc_timeouts();
-
-	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
-				start_wd_on_cpu, stop_wd_on_cpu);
-	if (err < 0)
+	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"powerpc/watchdog:online",
+					start_wd_on_cpu, stop_wd_on_cpu);
+	if (err < 0) {
 		pr_warn("Watchdog could not be initialized");
-
+		return err;
+	}
 	return 0;
 }
-arch_initcall(powerpc_watchdog_init);
 
 static void handle_backtrace_ipi(struct pt_regs *regs)
 {
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 0c9ed49fb21a..27e249ed7c5c 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -111,6 +111,7 @@ static inline int hardlockup_detector_perf_init(void) { return 0; }
 
 void watchdog_nmi_stop(void);
 void watchdog_nmi_start(void);
+int watchdog_nmi_probe(void);
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index fff90fe10007..5c6fb7cd9ae8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -608,7 +608,6 @@ static inline int watchdog_park_threads(void) { return 0; }
 static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
-static inline void softlockup_init_threads(void) { }
 static void softlockup_reconfigure_threads(void)
 {
 	cpus_read_lock();
@@ -617,6 +616,10 @@ static void softlockup_reconfigure_threads(void)
 	watchdog_nmi_start();
 	cpus_read_unlock();
 }
+static inline void softlockup_init_threads(void)
+{
+	softlockup_reconfigure_threads();
+}
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)
-- 
cgit v1.2.3


From 5587185ddb4b9f413299dfec0a022ad0212513e8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Oct 2017 10:03:04 +0200
Subject: watchdog/core: Rename some softlockup_* functions

The function names made sense up to the point where the watchdog
(re)configuration was unified to use softlockup_reconfigure_threads() for
all configuration purposes. But that includes scenarios which solely
configure the nmi watchdog.

Rename softlockup_reconfigure_threads() and softlockup_init_threads() so
the function names match the functionality.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Don Zickus <dzickus@redhat.com>
---
 kernel/watchdog.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5c6fb7cd9ae8..d241bd99cee1 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -551,7 +551,7 @@ static void softlockup_unpark_threads(void)
 	softlockup_update_smpboot_threads();
 }
 
-static void softlockup_reconfigure_threads(void)
+static void lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
@@ -570,13 +570,13 @@ static void softlockup_reconfigure_threads(void)
 }
 
 /*
- * Create the watchdog thread infrastructure.
+ * Create the watchdog thread infrastructure and configure the detector(s).
  *
  * The threads are not unparked as watchdog_allowed_mask is empty.  When
  * the threads are sucessfully initialized, take the proper locks and
  * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
  */
-static __init void softlockup_init_threads(void)
+static __init void lockup_detector_setup(void)
 {
 	int ret;
 
@@ -599,7 +599,7 @@ static __init void softlockup_init_threads(void)
 
 	mutex_lock(&watchdog_mutex);
 	softlockup_threads_initialized = true;
-	softlockup_reconfigure_threads();
+	lockup_detector_reconfigure();
 	mutex_unlock(&watchdog_mutex);
 }
 
@@ -608,7 +608,7 @@ static inline int watchdog_park_threads(void) { return 0; }
 static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
-static void softlockup_reconfigure_threads(void)
+static void lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
@@ -616,9 +616,9 @@ static void softlockup_reconfigure_threads(void)
 	watchdog_nmi_start();
 	cpus_read_unlock();
 }
-static inline void softlockup_init_threads(void)
+static inline void lockup_detector_setup(void)
 {
-	softlockup_reconfigure_threads();
+	lockup_detector_reconfigure();
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
@@ -658,7 +658,7 @@ static void proc_watchdog_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-	softlockup_reconfigure_threads();
+	lockup_detector_reconfigure();
 }
 
 /*
@@ -785,5 +785,5 @@ void __init lockup_detector_init(void)
 
 	if (!watchdog_nmi_probe())
 		nmi_watchdog_available = true;
-	softlockup_init_threads();
+	lockup_detector_setup();
 }
-- 
cgit v1.2.3


From 0b62bf862dc93a05fea97b6ca6ffca072e2f30c1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 2 Oct 2017 20:59:09 +0200
Subject: watchdog/core: Put softlockup_threads_initialized under ifdef guard

The variable is unused when the softlockup detector is disabled in Kconfig.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d241bd99cee1..6bcb854909c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,7 +47,6 @@ int __read_mostly watchdog_thresh = 10;
 int __read_mostly nmi_watchdog_available;
 
 struct cpumask watchdog_allowed_mask __read_mostly;
-static bool softlockup_threads_initialized __read_mostly;
 
 struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -168,6 +167,7 @@ static void lockup_detector_update_enable(void)
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
 
+static bool softlockup_threads_initialized __read_mostly;
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
-- 
cgit v1.2.3


From 8fe2d6ccd52b086268f2f36e5e2fc0fe3aeffa80 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 Oct 2017 16:20:56 -0700
Subject: bpf: fix liveness marking

while processing Rx = Ry instruction the verifier does
regs[insn->dst_reg] = regs[insn->src_reg]
which often clears write mark (when Ry doesn't have it)
that was just set by check_reg_arg(Rx) prior to the assignment.
That causes mark_reg_read() to keep marking Rx in this block as
REG_LIVE_READ (since the logic incorrectly misses that it's
screened by the write) and in many of its parents (until lucky
write into the same Rx or beginning of the program).
That causes is_state_visited() logic to miss many pruning opportunities.

Furthermore mark_reg_read() logic propagates the read mark
for BPF_REG_FP as well (though it's readonly) which causes
harmless but unnecssary work during is_state_visited().
Note that do_propagate_liveness() skips FP correctly,
so do the same in mark_reg_read() as well.
It saves 0.2 seconds for the test below

program               before  after
bpf_lb-DLB_L3.o       2604    2304
bpf_lb-DLB_L4.o       11159   3723
bpf_lb-DUNKNOWN.o     1116    1110
bpf_lxc-DDROP_ALL.o   34566   28004
bpf_lxc-DUNKNOWN.o    53267   39026
bpf_netdev.o          17843   16943
bpf_overlay.o         8672    7929
time                  ~11 sec  ~4 sec

Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Edward Cree <ecree@solarflare.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..8b8d6ba39e23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
 	struct bpf_verifier_state *parent = state->parent;
 
+	if (regno == BPF_REG_FP)
+		/* We don't need to worry about FP liveness because it's read-only */
+		return;
+
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
 		if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -2345,6 +2349,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				 * copy register state to dest reg
 				 */
 				regs[insn->dst_reg] = regs[insn->src_reg];
+				regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
-- 
cgit v1.2.3


From 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Mon, 9 Oct 2017 15:27:15 +0300
Subject: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
support for attaching an eBPF object by an fd, with the
'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
IPT_SO_SET_REPLACE call.

However this breaks subsequent iptables calls:

 # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
 # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
 iptables: Invalid argument. Run `dmesg' for more information.

That's because iptables works by loading existing rules using
IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with
the replacement set.

However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number
(from the initial "iptables -m bpf" invocation) - so when 2nd invocation
occurs, userspace passes a bogus fd number, which leads to
'bpf_mt_check_v1' to fail.

One suggested solution [1] was to hack iptables userspace, to perform a
"entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new,
process-local fd per every 'xt_bpf_info_v1' entry seen.

However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to
depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects.

This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given
'.fd' and instead perform an in-kernel lookup for the bpf object given
the provided '.path'.

It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named
XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is
expected to provide the path of the pinned object.

Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved.

References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
            [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2

Reported-by: Rafael Buchbinder <rafi@rbk.ms>
Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/bpf.h                   |  5 +++++
 include/uapi/linux/netfilter/xt_bpf.h |  1 +
 kernel/bpf/inode.c                    |  1 +
 net/netfilter/xt_bpf.c                | 22 ++++++++++++++++++++--
 4 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..f1af7d63d678 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -368,6 +368,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 						       u32 key)
 {
diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index b97725af2ac0..da161b56c79e 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -23,6 +23,7 @@ enum xt_bpf_modes {
 	XT_BPF_MODE_FD_PINNED,
 	XT_BPF_MODE_FD_ELF,
 };
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
 
 struct xt_bpf_info_v1 {
 	__u16 mode;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 38986a95216c..29123934887b 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/syscalls.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
 	return 0;
 }
 
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+	mm_segment_t oldfs = get_fs();
+	int retval, fd;
+
+	set_fs(KERNEL_DS);
+	fd = bpf_obj_get_user(path);
+	set_fs(oldfs);
+	if (fd < 0)
+		return fd;
+
+	retval = __bpf_mt_check_fd(fd, ret);
+	sys_close(fd);
+	return retval;
+}
+
 static int bpf_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_bpf_info *info = par->matchinfo;
@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
 		return __bpf_mt_check_bytecode(info->bpf_program,
 					       info->bpf_program_num_elem,
 					       &info->filter);
-	else if (info->mode == XT_BPF_MODE_FD_PINNED ||
-		 info->mode == XT_BPF_MODE_FD_ELF)
+	else if (info->mode == XT_BPF_MODE_FD_ELF)
 		return __bpf_mt_check_fd(info->fd, &info->filter);
+	else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+		return __bpf_mt_check_path(info->path, &info->filter);
 	else
 		return -EINVAL;
 }
-- 
cgit v1.2.3


From d153b153446f7d8832bb2ebd92309c8a6003b3bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 Sep 2017 11:35:30 +0200
Subject: sched/core: Fix wake_affine() performance regression

Eric reported a sysbench regression against commit:

  3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed
against his v3.10 enterprise kernel.

PRE (current tip/master):

 ivb-ep sysbench:

   2: [30 secs]     transactions:                        64110  (2136.94 per sec.)
   5: [30 secs]     transactions:                        143644 (4787.99 per sec.)
  10: [30 secs]     transactions:                        274298 (9142.93 per sec.)
  20: [30 secs]     transactions:                        418683 (13955.45 per sec.)
  40: [30 secs]     transactions:                        320731 (10690.15 per sec.)
  80: [30 secs]     transactions:                        355096 (11834.28 per sec.)

 hsw-ex NAS:

 OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds =                    18.01
 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds =                    17.89
 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds =                    17.93
 lu.C.x_threads_144_run_1.log: Time in seconds =                   434.68
 lu.C.x_threads_144_run_2.log: Time in seconds =                   405.36
 lu.C.x_threads_144_run_3.log: Time in seconds =                   433.83

POST (+patch):

 ivb-ep sysbench:

   2: [30 secs]     transactions:                        64494  (2149.75 per sec.)
   5: [30 secs]     transactions:                        145114 (4836.99 per sec.)
  10: [30 secs]     transactions:                        278311 (9276.69 per sec.)
  20: [30 secs]     transactions:                        437169 (14571.60 per sec.)
  40: [30 secs]     transactions:                        669837 (22326.73 per sec.)
  80: [30 secs]     transactions:                        631739 (21055.88 per sec.)

 hsw-ex NAS:

 lu.C.x_threads_144_run_1.log: Time in seconds =                    23.36
 lu.C.x_threads_144_run_2.log: Time in seconds =                    22.96
 lu.C.x_threads_144_run_3.log: Time in seconds =                    22.52

This patch takes out all the shiny wake_affine() stuff and goes back to
utter basics. Between the two CPUs involved with the wakeup (the CPU
doing the wakeup and the CPU we ran on previously) pick the CPU we can
run on _now_.

This restores much of the regressions against the older kernels,
but leaves some ground in the overloaded case. The default-enabled
WA_WEIGHT (which will be introduced in the next patch) is an attempt
to address the overloaded situation.

Reported-by: Eric Farman <farman@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jinpuwang@gmail.com
Cc: vcaputo@pengaru.com
Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h |   8 ---
 kernel/sched/fair.c            | 126 +++++------------------------------------
 kernel/sched/features.h        |   1 +
 3 files changed, 16 insertions(+), 119 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
-
-	/*
-	 * Some variables from the most recent sd_lb_stats for this domain,
-	 * used by wake_affine().
-	 */
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..28cabed85387 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
-
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-	return true;
-}
-
 /*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
  *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
  */
+
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
-	s64 this_eff_load, prev_eff_load;
-	unsigned long task_load;
-
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
-	if (sync) {
-		unsigned long current_load = task_h_load(current);
-
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
-			return true;
-
-		this_stats.load -= current_load;
-	}
-
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+	if (idle_cpu(this_cpu))
 		return true;
 
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
-	task_load = task_h_load(p);
-
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
-
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return this_eff_load <= prev_eff_load;
+	return false;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
-
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	bool affine = false;
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running,	sds->total_running);
-	WRITE_ONCE(shared->load,	sds->total_load);
-	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..0a519f8c224d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
-- 
cgit v1.2.3


From f2cdd9cc6c97e617b95f430f527a6e3165e1bee8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 6 Oct 2017 09:23:24 +0200
Subject: sched/core: Address more wake_affine() regressions

The trivial wake_affine_idle() implementation is very good for a
number of workloads, but it comes apart at the moment there are no
idle CPUs left, IOW. the overloaded case.

hackbench:

		NO_WA_WEIGHT		WA_WEIGHT

hackbench-20  : 7.362717561 seconds	6.450509391 seconds

(win)

netperf:

		  NO_WA_WEIGHT		WA_WEIGHT

TCP_SENDFILE-1	: Avg: 54524.6		Avg: 52224.3
TCP_SENDFILE-10	: Avg: 48185.2          Avg: 46504.3
TCP_SENDFILE-20	: Avg: 29031.2          Avg: 28610.3
TCP_SENDFILE-40	: Avg: 9819.72          Avg: 9253.12
TCP_SENDFILE-80	: Avg: 5355.3           Avg: 4687.4

TCP_STREAM-1	: Avg: 41448.3          Avg: 42254
TCP_STREAM-10	: Avg: 24123.2          Avg: 25847.9
TCP_STREAM-20	: Avg: 15834.5          Avg: 18374.4
TCP_STREAM-40	: Avg: 5583.91          Avg: 5599.57
TCP_STREAM-80	: Avg: 2329.66          Avg: 2726.41

TCP_RR-1	: Avg: 80473.5          Avg: 82638.8
TCP_RR-10	: Avg: 72660.5          Avg: 73265.1
TCP_RR-20	: Avg: 52607.1          Avg: 52634.5
TCP_RR-40	: Avg: 57199.2          Avg: 56302.3
TCP_RR-80	: Avg: 25330.3          Avg: 26867.9

UDP_RR-1	: Avg: 108266           Avg: 107844
UDP_RR-10	: Avg: 95480            Avg: 95245.2
UDP_RR-20	: Avg: 68770.8          Avg: 68673.7
UDP_RR-40	: Avg: 76231            Avg: 75419.1
UDP_RR-80	: Avg: 34578.3          Avg: 35639.1

UDP_STREAM-1	: Avg: 64684.3          Avg: 66606
UDP_STREAM-10	: Avg: 52701.2          Avg: 52959.5
UDP_STREAM-20	: Avg: 30376.4          Avg: 29704
UDP_STREAM-40	: Avg: 15685.8          Avg: 15266.5
UDP_STREAM-80	: Avg: 8415.13          Avg: 7388.97

(wins and losses)

sysbench:

		    NO_WA_WEIGHT		WA_WEIGHT

sysbench-mysql-2  :  2135.17 per sec.		 2142.51 per sec.
sysbench-mysql-5  :  4809.68 per sec.            4800.19 per sec.
sysbench-mysql-10 :  9158.59 per sec.            9157.05 per sec.
sysbench-mysql-20 : 14570.70 per sec.           14543.55 per sec.
sysbench-mysql-40 : 22130.56 per sec.           22184.82 per sec.
sysbench-mysql-80 : 20995.56 per sec.           21904.18 per sec.

sysbench-psql-2   :  1679.58 per sec.            1705.06 per sec.
sysbench-psql-5   :  3797.69 per sec.            3879.93 per sec.
sysbench-psql-10  :  7253.22 per sec.            7258.06 per sec.
sysbench-psql-20  : 11166.75 per sec.           11220.00 per sec.
sysbench-psql-40  : 17277.28 per sec.           17359.78 per sec.
sysbench-psql-80  : 17112.44 per sec.           17221.16 per sec.

(increase on the top end)

tbench:

NO_WA_WEIGHT

Throughput 685.211 MB/sec   2 clients   2 procs  max_latency=0.123 ms
Throughput 1596.64 MB/sec   5 clients   5 procs  max_latency=0.119 ms
Throughput 2985.47 MB/sec  10 clients  10 procs  max_latency=0.262 ms
Throughput 4521.15 MB/sec  20 clients  20 procs  max_latency=0.506 ms
Throughput 9438.1  MB/sec  40 clients  40 procs  max_latency=2.052 ms
Throughput 8210.5  MB/sec  80 clients  80 procs  max_latency=8.310 ms

WA_WEIGHT

Throughput 697.292 MB/sec   2 clients   2 procs  max_latency=0.127 ms
Throughput 1596.48 MB/sec   5 clients   5 procs  max_latency=0.080 ms
Throughput 2975.22 MB/sec  10 clients  10 procs  max_latency=0.254 ms
Throughput 4575.14 MB/sec  20 clients  20 procs  max_latency=0.502 ms
Throughput 9468.65 MB/sec  40 clients  40 procs  max_latency=2.069 ms
Throughput 8631.73 MB/sec  80 clients  80 procs  max_latency=8.605 ms

(increase on the top end)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/features.h |  2 ++
 2 files changed, 43 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28cabed85387..ed2ab474ec93 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5363,6 +5363,10 @@ static int wake_wide(struct task_struct *p)
  *
  * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
  *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
  */
 
 static bool
@@ -5378,6 +5382,40 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
 	return false;
 }
 
+static bool
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
+{
+	s64 this_eff_load, prev_eff_load;
+	unsigned long task_load;
+
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		if (current_load > this_eff_load)
+			return true;
+
+		this_eff_load -= current_load;
+	}
+
+	task_load = task_h_load(p);
+
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
+
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
+
+	return this_eff_load <= prev_eff_load;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
@@ -5387,6 +5425,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	if (sched_feat(WA_IDLE) && !affine)
 		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
 		schedstat_inc(sd->ttwu_move_affine);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 0a519f8c224d..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -82,3 +82,5 @@ SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
 SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
-- 
cgit v1.2.3


From 024c9d2faebdad3fb43fe49ad68e91a36190f1e2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 9 Oct 2017 10:36:53 +0200
Subject: sched/core: Ensure load_balance() respects the active_mask

While load_balance() masks the source CPUs against active_mask, it had
a hole against the destination CPU. Ensure the destination CPU is also
part of the 'domain-mask & active-mask' set.

Reported-by: Levin, Alexander (Sasha Levin) <alexander.levin@verizon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ed2ab474ec93..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8042,6 +8042,13 @@ static int should_we_balance(struct lb_env *env)
 	struct sched_group *sg = env->sd->groups;
 	int cpu, balance_cpu = -1;
 
+	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
 	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
-- 
cgit v1.2.3