From 90c018942c2babab73814648b37808fc3bf2ed1a Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 7 Nov 2019 11:37:38 -0800
Subject: timer: Use hlist_unhashed_lockless() in timer_pending()

The timer_pending() function is mostly used in lockless contexts, so,
without proper annotations, KCSAN might detect a data-race [1].

Using hlist_unhashed_lockless() instead of hand-coding it seems
appropriate (as suggested by Paul E. McKenney).

[1]

BUG: KCSAN: data-race in del_timer / detach_if_pending

write to 0xffff88808697d870 of 8 bytes by task 10 on cpu 0:
 __hlist_del include/linux/list.h:764 [inline]
 detach_timer kernel/time/timer.c:815 [inline]
 detach_if_pending+0xcd/0x2d0 kernel/time/timer.c:832
 try_to_del_timer_sync+0x60/0xb0 kernel/time/timer.c:1226
 del_timer_sync+0x6b/0xa0 kernel/time/timer.c:1365
 schedule_timeout+0x2d2/0x6e0 kernel/time/timer.c:1896
 rcu_gp_fqs_loop+0x37c/0x580 kernel/rcu/tree.c:1639
 rcu_gp_kthread+0x143/0x230 kernel/rcu/tree.c:1799
 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352

read to 0xffff88808697d870 of 8 bytes by task 12060 on cpu 1:
 del_timer+0x3b/0xb0 kernel/time/timer.c:1198
 sk_stop_timer+0x25/0x60 net/core/sock.c:2845
 inet_csk_clear_xmit_timers+0x69/0xa0 net/ipv4/inet_connection_sock.c:523
 tcp_clear_xmit_timers include/net/tcp.h:606 [inline]
 tcp_v4_destroy_sock+0xa3/0x3f0 net/ipv4/tcp_ipv4.c:2096
 inet_csk_destroy_sock+0xf4/0x250 net/ipv4/inet_connection_sock.c:836
 tcp_close+0x6f3/0x970 net/ipv4/tcp.c:2497
 inet_release+0x86/0x100 net/ipv4/af_inet.c:427
 __sock_release+0x85/0x160 net/socket.c:590
 sock_close+0x24/0x30 net/socket.c:1268
 __fput+0x1e1/0x520 fs/file_table.c:280
 ____fput+0x1f/0x30 fs/file_table.c:313
 task_work_run+0xf6/0x130 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:188 [inline]
 exit_to_usermode_loop+0x2b4/0x2c0 arch/x86/entry/common.c:163

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 12060 Comm: syz-executor.5 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine,

Signed-off-by: Eric Dumazet
Cc: Thomas Gleixner
[ paulmck: Pulled in Eric's later amendments. ]
Signed-off-by: Paul E. McKenney
---
 include/linux/timer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 1e6650ed066d..0dc19a8c39c9 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -164,7 +164,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { }
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->entry.pprev != NULL;
+	return !hlist_unhashed_lockless(&timer->entry);
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
-- cgit v1.2.3
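The point of the helper is that it wraps the racy pointer load in READ_ONCE(), so KCSAN treats it as an intentional lockless access rather than a data race. Below is a rough standalone sketch of the before/after pattern; it is not the kernel's actual headers, and the simplified READ_ONCE() and struct definitions are stand-ins for illustration only.

#include <stddef.h>

struct hlist_node {
	struct hlist_node *next, **pprev;
};

/* Simplified stand-in for the kernel's READ_ONCE() macro. */
#define READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))

/* Hand-coded check: a plain load that KCSAN can flag as a data race. */
static inline int pending_racy(struct hlist_node *entry)
{
	return entry->pprev != NULL;
}

/* Annotated check, in the spirit of !hlist_unhashed_lockless(entry). */
static inline int pending_lockless(struct hlist_node *entry)
{
	return READ_ONCE(entry->pprev) != NULL;
}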
From 4dfd5cd83dc4458049c7f6eb9c4f361acc4239ea Mon Sep 17 00:00:00 2001
From: Amol Grover
Date: Sat, 18 Jan 2020 22:24:18 +0530
Subject: rculist: Add brackets around cond argument in __list_check_rcu macro

Passing a complex lockdep condition to __list_check_rcu results in a
false-positive lockdep splat due to incorrect expression evaluation.
For example, a lockdep check condition `cond1 || cond2` is evaluated as
`!cond1 || cond2 && !rcu_read_lock_any_held()` which, according to
operator precedence, evaluates to
`!cond1 || (cond2 && !rcu_read_lock_any_held())`.
This would result in a lockdep splat when cond1 is false and cond2 is
true, which is logically incorrect.

Signed-off-by: Amol Grover
Acked-by: Joel Fernandes (Google)
Signed-off-by: Paul E. McKenney
---
 include/linux/rculist.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 9f313e4999fe..8214cdc715f2 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -60,9 +60,9 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
 #define __list_check_rcu(dummy, cond, extra...)				\
 	({								\
 	check_arg_count_one(extra);					\
-	RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(),		\
+	RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),		\
 		"RCU-list traversed in non-reader section!");		\
-	})
+	})
 #else
 #define __list_check_rcu(dummy, cond, extra...)				\
 	({ check_arg_count_one(extra); })
-- cgit v1.2.3
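The precedence problem is easy to reproduce outside the kernel. The sketch below uses made-up macro and function names (WARN_IF_UNPROTECTED_*, reader_held()) purely to illustrate why the macro argument must be parenthesized; it is not the rculist code itself.

#include <stdio.h>

static int reader_held(void) { return 0; }	/* pretend no RCU reader is held */

/* Unparenthesized argument: the macro's '!' binds only to the first term. */
#define WARN_IF_UNPROTECTED_BAD(cond) \
	do { if (!cond && !reader_held()) puts("splat"); } while (0)

/* Parenthesized argument: '!' applies to the whole condition. */
#define WARN_IF_UNPROTECTED_OK(cond) \
	do { if (!(cond) && !reader_held()) puts("splat"); } while (0)

int main(void)
{
	int cond1 = 0, cond2 = 1;	/* the second condition protects us */

	/* Expands to: !cond1 || cond2 && !reader_held()  -> splats anyway. */
	WARN_IF_UNPROTECTED_BAD(cond1 || cond2);

	/* Expands to: !(cond1 || cond2) && !reader_held() -> stays silent. */
	WARN_IF_UNPROTECTED_OK(cond1 || cond2);
	return 0;
}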
From 613707929b304737e6eb841588772f1994f6702b Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Mon, 20 Jan 2020 15:42:26 +0100
Subject: rcu: Add a trace event for kfree_rcu() use of kfree_bulk()

The event is given three parameters: the first is the name of the RCU
flavour, the second is the number of elements in the array to free, and
the last is the address of the array holding the pointers to be freed
by the kfree_bulk() function.

To enable the trace event, the kernel has to be built with
CONFIG_RCU_TRACE=y; after that it is possible to track the events using
the ftrace subsystem.

Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Paul E. McKenney
---
 include/trace/events/rcu.h | 28 ++++++++++++++++++++++++++++
 kernel/rcu/tree.c          |  3 +++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5e49b06e8104..49a49e68b916 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -623,6 +623,34 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_callback,
 		 __entry->rcuname, __entry->rhp, __entry->offset)
 );
 
+/*
+ * Tracepoint for the invocation of a single RCU callback of the special
+ * kfree_bulk() form. The first argument is the RCU flavor, the second
+ * argument is a number of elements in array to free, the third is an
+ * address of the array holding nr_records entries.
+ */
+TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback,
+
+	TP_PROTO(const char *rcuname, unsigned long nr_records, void **p),
+
+	TP_ARGS(rcuname, nr_records, p),
+
+	TP_STRUCT__entry(
+		__field(const char *, rcuname)
+		__field(unsigned long, nr_records)
+		__field(void **, p)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->nr_records = nr_records;
+		__entry->p = p;
+	),
+
+	TP_printk("%s bulk=0x%p nr_records=%lu",
+		__entry->rcuname, __entry->p, __entry->nr_records)
+);
+
 /*
  * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
  * invoked. The first argument is the name of the RCU flavor,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51a3aa884a7c..909f97efb1ed 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2792,6 +2792,9 @@ static void kfree_rcu_work(struct work_struct *work)
 			debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
 
 		rcu_lock_acquire(&rcu_callback_map);
+		trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
+			bhead->nr_records, bhead->records);
+
 		kfree_bulk(bhead->nr_records, bhead->records);
 		rcu_lock_release(&rcu_callback_map);
-- cgit v1.2.3
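As a rough illustration of the pattern being instrumented, consider the standalone userspace sketch below. The trace_bulk_free() hook and names are hypothetical, not the kernel's tracepoint machinery; the idea is simply to emit one event per batch, carrying the element count and the array address, immediately before the bulk free.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for trace_rcu_invoke_kfree_bulk_callback(). */
static void trace_bulk_free(const char *name, unsigned long nr, void **p)
{
	/* Mirrors the TP_printk() format: name, array address, record count. */
	printf("%s bulk=%p nr_records=%lu\n", name, (void *)p, nr);
}

static void bulk_free(unsigned long nr, void **p)
{
	for (unsigned long i = 0; i < nr; i++)
		free(p[i]);
}

int main(void)
{
	void *batch[3] = { malloc(8), malloc(16), malloc(32) };

	trace_bulk_free("rcu_sched", 3, batch);	/* one event per batch */
	bulk_free(3, batch);
	return 0;
}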
From 59ee0326ccf712f9a637d5df2465a16a784cbfb0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 28 Nov 2019 18:54:06 -0800
Subject: rcutorture: Suppress forward-progress complaints during early boot

Some larger systems can take in excess of 50 seconds to complete their
early boot initcalls prior to spawning init. This does not in any way
help the forward-progress judgments of built-in rcutorture (when
rcutorture is built as a module, the insmod or modprobe command normally
cannot happen until some time after boot completes). This commit
therefore suppresses such complaints until about the time that init is
spawned.

This also includes a fix to a stupid error located by kbuild test robot.

[ paulmck: Apply kbuild test robot feedback. ]
Signed-off-by: Paul E. McKenney
[ paulmck: Fix to nohz_full slow-expediting recovery logic, per bpetkov. ]
[ paulmck: Restrict splat to CONFIG_PREEMPT_RT=y kernels and simplify. ]
Tested-by: Borislav Petkov
---
 include/linux/rcutiny.h |  1 +
 include/linux/rcutree.h |  1 +
 kernel/rcu/rcutorture.c |  3 ++-
 kernel/rcu/tree_exp.h   |  7 ++++++-
 kernel/rcu/update.c     | 12 ++++++++++++
 5 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b2b2dc990da9..045c28b71f4f 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -83,6 +83,7 @@ void rcu_scheduler_starting(void);
 static inline void rcu_scheduler_starting(void) { }
 #endif /* #else #ifndef CONFIG_SRCU */
 static inline void rcu_end_inkernel_boot(void) { }
+static inline bool rcu_inkernel_boot_has_ended(void) { return true; }
 static inline bool rcu_is_watching(void) { return true; }
 static inline void rcu_momentary_dyntick_idle(void) { }
 static inline void kfree_rcu_scheduler_running(void) { }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 2f787b9029d1..45f3f66bb04d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -54,6 +54,7 @@ void exit_rcu(void);
 void rcu_scheduler_starting(void);
 extern int rcu_scheduler_active __read_mostly;
 void rcu_end_inkernel_boot(void);
+bool rcu_inkernel_boot_has_ended(void);
 bool rcu_is_watching(void);
 #ifndef CONFIG_PREEMPTION
 void rcu_all_qs(void);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 1aeecc165b21..9ba49788cb48 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1067,7 +1067,8 @@ rcu_torture_writer(void *arg)
 		if (stutter_wait("rcu_torture_writer") &&
 		    !READ_ONCE(rcu_fwd_cb_nodelay) &&
 		    !cur_ops->slow_gps &&
-		    !torture_must_stop())
+		    !torture_must_stop() &&
+		    rcu_inkernel_boot_has_ended())
 			for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
 				if (list_empty(&rcu_tortures[i].rtort_free) &&
 				    rcu_access_pointer(rcu_torture_current) !=
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dcbd75791f39..a72d16eb869e 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -485,6 +485,7 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit)
 static void synchronize_rcu_expedited_wait(void)
 {
 	int cpu;
+	unsigned long j;
 	unsigned long jiffies_stall;
 	unsigned long jiffies_start;
 	unsigned long mask;
@@ -496,7 +497,7 @@ static void synchronize_rcu_expedited_wait(void)
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
 	jiffies_stall = rcu_jiffies_till_stall_check();
 	jiffies_start = jiffies;
-	if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+	if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
 		if (synchronize_rcu_expedited_wait_once(1))
 			return;
 		rcu_for_each_leaf_node(rnp) {
@@ -508,6 +509,10 @@ static void synchronize_rcu_expedited_wait(void)
 				tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
 		}
 	}
+	j = READ_ONCE(jiffies_till_first_fqs);
+	if (synchronize_rcu_expedited_wait_once(j + HZ))
+		return;
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT));
 	for (;;) {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 6c4b862f57d6..feaaec5747a3 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -183,6 +183,8 @@ void rcu_unexpedite_gp(void)
 }
 EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
 
+static bool rcu_boot_ended __read_mostly;
+
 /*
  * Inform RCU of the end of the in-kernel boot sequence.
  */
@@ -191,7 +193,17 @@ void rcu_end_inkernel_boot(void)
 	rcu_unexpedite_gp();
 	if (rcu_normal_after_boot)
 		WRITE_ONCE(rcu_normal, 1);
+	rcu_boot_ended = 1;
+}
+
+/*
+ * Let rcutorture know when it is OK to turn it up to eleven.
+ */
+bool rcu_inkernel_boot_has_ended(void)
+{
+	return rcu_boot_ended;
 }
+EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended);
 
 #endif /* #ifndef CONFIG_TINY_RCU */
-- cgit v1.2.3
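The mechanism itself is a write-once flag queried through a helper (with the Tiny-RCU stub unconditionally returning true). A standalone sketch of that gating pattern, using hypothetical names rather than the kernel symbols, looks roughly like this:

#include <stdbool.h>
#include <stdio.h>

static bool boot_ended;		/* written once, read many times */

static void end_inkernel_boot(void)
{
	boot_ended = true;
}

static bool inkernel_boot_has_ended(void)
{
	return boot_ended;
}

static void forward_progress_check(void)
{
	if (!inkernel_boot_has_ended())
		return;		/* suppress complaints during early boot */
	puts("running forward-progress check");
}

int main(void)
{
	forward_progress_check();	/* silent: boot still in progress */
	end_inkernel_boot();
	forward_progress_check();	/* now actually checks */
	return 0;
}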
From 127e29815b4b2206c0a97ac1d83f92ffc0e25c34 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 11 Feb 2020 06:17:33 -0800
Subject: rcu: Make rcu_barrier() account for offline no-CBs CPUs

Currently, rcu_barrier() ignores offline CPUs. However, it is possible
for an offline no-CBs CPU to have callbacks queued, and rcu_barrier()
must wait for those callbacks. This commit therefore makes rcu_barrier()
directly invoke the rcu_barrier_func() with interrupts disabled for such
CPUs. This requires passing the CPU number into this function so that it
can entrain the rcu_barrier() callback onto the correct CPU's callback
list, given that the code must instead execute on the current CPU.

While in the area, this commit fixes a bug where the first CPU's
callback might have been invoked before rcu_segcblist_entrain()
returned, which would also result in an early wakeup.

Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")
Signed-off-by: Paul E. McKenney
[ paulmck: Apply optimization feedback from Boqun Feng. ]
Cc: # 5.5.x
---
 include/trace/events/rcu.h |  1 +
 kernel/rcu/tree.c          | 36 ++++++++++++++++++++++++------------
 2 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5e49b06e8104..d56d54c17497 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -712,6 +712,7 @@ TRACE_EVENT_RCU(rcu_torture_read,
  * "Begin": rcu_barrier() started.
  * "EarlyExit": rcu_barrier() piggybacked, thus early exit.
  * "Inc1": rcu_barrier() piggyback check counter incremented.
+ * "OfflineNoCBQ": rcu_barrier() found offline no-CBs CPU with callbacks.
  * "OnlineQ": rcu_barrier() found online CPU with callbacks.
  * "OnlineNQ": rcu_barrier() found online CPU, no callbacks.
  * "IRQ": An rcu_barrier_callback() callback posted on remote CPU.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 739788ff0f6e..35292c8ffd4a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
 /*
  * Called with preemption disabled, and from cross-cpu IRQ context.
  */
-static void rcu_barrier_func(void *unused)
+static void rcu_barrier_func(void *cpu_in)
 {
-	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
+	uintptr_t cpu = (uintptr_t)cpu_in;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 
 	rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
 	rdp->barrier_head.func = rcu_barrier_callback;
@@ -3126,7 +3127,7 @@ static void rcu_barrier_func(void *unused)
  */
 void rcu_barrier(void)
 {
-	int cpu;
+	uintptr_t cpu;
 	struct rcu_data *rdp;
 	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
 
@@ -3149,13 +3150,14 @@ void rcu_barrier(void)
 	rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
 
 	/*
-	 * Initialize the count to one rather than to zero in order to
-	 * avoid a too-soon return to zero in case of a short grace period
-	 * (or preemption of this task). Exclude CPU-hotplug operations
-	 * to ensure that no offline CPU has callbacks queued.
+	 * Initialize the count to two rather than to zero in order
+	 * to avoid a too-soon return to zero in case of an immediate
+	 * invocation of the just-enqueued callback (or preemption of
+	 * this task). Exclude CPU-hotplug operations to ensure that no
+	 * offline non-offloaded CPU has callbacks queued.
 	 */
 	init_completion(&rcu_state.barrier_completion);
-	atomic_set(&rcu_state.barrier_cpu_count, 1);
+	atomic_set(&rcu_state.barrier_cpu_count, 2);
 	get_online_cpus();
 
 	/*
@@ -3165,13 +3167,23 @@ void rcu_barrier(void)
 	 */
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
-		if (!cpu_online(cpu) &&
+		if (cpu_is_offline(cpu) &&
 		    !rcu_segcblist_is_offloaded(&rdp->cblist))
 			continue;
-		if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+		if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
 			rcu_barrier_trace(TPS("OnlineQ"), cpu,
 					  rcu_state.barrier_sequence);
-			smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
+			smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
+		} else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
+			   cpu_is_offline(cpu)) {
+			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
+					  rcu_state.barrier_sequence);
+			local_irq_disable();
+			rcu_barrier_func((void *)cpu);
+			local_irq_enable();
+		} else if (cpu_is_offline(cpu)) {
+			rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
+					  rcu_state.barrier_sequence);
 		} else {
 			rcu_barrier_trace(TPS("OnlineNQ"), cpu,
 					  rcu_state.barrier_sequence);
@@ -3183,7 +3195,7 @@ void rcu_barrier(void)
 	 * Now that we have an rcu_barrier_callback() callback on each
 	 * CPU, and thus each counted, remove the initial count.
 	 */
-	if (atomic_dec_and_test(&rcu_state.barrier_cpu_count))
+	if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
 		complete(&rcu_state.barrier_completion);
 
 	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
-- cgit v1.2.3
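The non-obvious part is the counter bias: the barrier count now starts at two so that a just-enqueued callback that is invoked immediately cannot drive the count to zero before rcu_barrier() has finished its loop, and the bias is removed with atomic_sub_and_test(2, ...) at the end. The following is a standalone C11-atomics sketch of that bias pattern with made-up names; it models only the counting, not the per-CPU callback machinery.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int barrier_cpu_count;
static atomic_bool barrier_done;

/* One invocation per posted callback; may run as soon as it is posted. */
static void barrier_callback(void)
{
	if (atomic_fetch_sub(&barrier_cpu_count, 1) == 1)
		atomic_store(&barrier_done, true);
}

static void barrier_post_all(int ncpus)
{
	atomic_store(&barrier_cpu_count, 2);	/* start with a bias, not 0 or 1 */

	for (int cpu = 0; cpu < ncpus; cpu++) {
		atomic_fetch_add(&barrier_cpu_count, 1);	/* count the callback */
		barrier_callback();	/* worst case: it fires immediately */
	}

	/* Drop the bias; only now may the count legitimately reach zero. */
	if (atomic_fetch_sub(&barrier_cpu_count, 2) == 2)
		atomic_store(&barrier_done, true);
}

int main(void)
{
	barrier_post_all(4);
	printf("barrier complete: %d\n", atomic_load(&barrier_done));
	return 0;
}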