From a3e8162105e8266b94bb25d3d7e48645da4b0c26 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 08:39:53 -0700 Subject: rcu: Split rcu_report_exp_cpu_mult() mask parameter and use for tracing This commit renames the rcu_report_exp_cpu_mult() function's "mask" parameter to "mask_in" and introduces a "mask" local variable to better support upcoming event-tracing additions. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Boqun Feng --- kernel/rcu/tree_exp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e..8d4895c854c5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp; raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); -- cgit v1.2.3 From 764f6a81103e83dc552237f77cc769dd550d8e01 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Sun, 10 Nov 2024 14:47:47 +0000 Subject: rcu: Remove READ_ONCE() for rdp->gpwrap access in __note_gp_changes() There is one access to the per-CPU rdp->gpwrap field in the __note_gp_changes() function that does not use READ_ONCE(), but all other accesses do use READ_ONCE(). When using the 8*TREE03 and CONFIG_NR_CPUS=8 configuration, KCSAN found no data races at that point. This is because all calls to __note_gp_changes() hold rnp->lock, which excludes writes to the rdp->gpwrap fields for all CPUs associated with that same leaf rcu_node structure. This commit therefore removes READ_ONCE() from rdp->gpwrap accesses within the __note_gp_changes() function. Signed-off-by: Zilin Guan Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..e4c0ce600b2b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1254,7 +1254,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1268,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1283,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. 
*/ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); -- cgit v1.2.3 From 4dca1af414fb1f27c3350a65820cb0b91178e8fe Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:53 -0800 Subject: rcu: rename PREEMPT_AUTO to PREEMPT_LAZY Replace mentions of PREEMPT_AUTO with PREEMPT_LAZY. Also, since PREEMPT_LAZY implies PREEMPTION, we can reduce the TASKS_RCU selection criteria from this: NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) to this: NEED_TASKS_RCU && PREEMPTION CC: Paul E. McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutiny.h | 2 +- kernel/rcu/Kconfig | 2 +- kernel/rcu/srcutiny.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274..31b59b4be2a7 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -64,7 +64,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); preempt_enable(); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..e2206f3a070c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -91,7 +91,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..f688bdad293e 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -98,7 +98,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +120,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +159,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. 
*/ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +172,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +199,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +261,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); -- cgit v1.2.3 From 2c00e1199c060880a215b0d2b495b7738e8c69d7 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:54 -0800 Subject: sched: update __cond_resched comment about RCU quiescent states Update the comment in __cond_resched() clarifying how urgently needed quiescent states are provided. Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 165c90ba64ea..d328707626e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7289,7 +7289,7 @@ int __sched __cond_resched(void) return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -7298,6 +7298,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); -- cgit v1.2.3 From fcf0e25ad4c8d14d2faab4d9a17040f31efce205 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:55 -0800 Subject: rcu: handle unstable rdp in rcu_read_unlock_strict() rcu_read_unlock_strict() can be called with preemption enabled, which can make for an unstable rdp and a racy norm value. Fix this by dropping the preempt-count in __rcu_read_unlock() after the call to rcu_read_unlock_strict(), adjusting the preempt-count check appropriately. Suggested-by: Frederic Weisbecker Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..257e9ae34414 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { - preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); + preempt_enable(); } static inline int rcu_preempt_depth(void) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3600152b858e..9573408a9800 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); -- cgit v1.2.3 From 83b28cfe796464ebbde1cf7916c126da6d572685 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:56 -0800 Subject: rcu: handle quiescent states for PREEMPT_RCU=n, PREEMPT_COUNT=y With PREEMPT_RCU=n, cond_resched() provides urgently needed quiescent states for read-side critical sections via rcu_all_qs(). One reason why this was needed: lacking preempt-count, the tick handler has no way of knowing whether it is executing in a read-side critical section or not. With (PREEMPT_LAZY=y, PREEMPT_DYNAMIC=n), we get (PREEMPT_COUNT=y, PREEMPT_RCU=n). In this configuration cond_resched() is a stub and does not provide quiescent states via rcu_all_qs(). (PREEMPT_RCU=y provides this information via rcu_read_unlock() and its nesting counter.) So, use the availability of preempt_count() to report quiescent states in rcu_flavor_sched_clock_irq(). Suggested-by: Paul E. McKenney Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree_plugin.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9573408a9800..3c0bbbbb686f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -984,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. 
* * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs -- cgit v1.2.3 From da2ac5623716cc3f32eeeafef9b7c08c73c2b20a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Jun 2024 08:43:02 -0700 Subject: srcu: Make Tiny SRCU able to operate in preemptible kernels Given that SRCU read-side critical sections are not just preemptible but also allow general blocking, there is not much reason to restrict Tiny SRCU to non-preemptible kernels. This commit therefore removes Tiny SRCU dependencies on non-preemptibility, primarily surrounding its interaction with rcutorture and early boot. Signed-off-by: Paul E. McKenney Cc: Ankur Arora Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 9 ++++++--- kernel/rcu/srcutiny.c | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..2909662c805f 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -611,8 +611,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +622,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +633,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..2a94f0e65606 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. -- cgit v1.2.3 From 5f9e1bc50a046578ddbfb05cda0f053d856bef98 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 18 Dec 2024 16:16:32 -0800 Subject: srcu: Use ->srcu_gp_seq for rcutorture reader batch This commit stops using ->srcu_idx for rcutorture's reader-batch consistency checking, using ->srcu_gp_seq instead. This is a first step towards a faster srcu_read_{,un}lock_lite() that avoids the array accesses that use ->srcu_idx. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 2 ++ kernel/rcu/srcutree.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..1d2de50fb5d6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -791,6 +791,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -834,6 +835,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0..e69ce9d59abf 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1675,7 +1675,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); -- cgit v1.2.3 From 56eb8be144c2bdb3a96a0d4365777fc64c65c5d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 11:13:51 -0800 Subject: srcu: Pull ->srcu_{un,}lock_count into a new srcu_ctr structure This commit prepares for array-index-free srcu_read_lock*() by moving the ->srcu_{un,}lock_count fields into a new srcu_ctr structure. This will permit ->srcu_index to be replaced by a per-CPU pointer to this structure. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 13 ++++-- kernel/rcu/srcutree.c | 115 +++++++++++++++++++++++------------------------ 2 files changed, 66 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index b17814c9d1c7..c794d599db5c 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -17,14 +17,19 @@ struct srcu_node; struct srcu_struct; +/* One element of the srcu_data srcu_ctrs array. */ +struct srcu_ctr { + atomic_long_t srcu_locks; /* Locks per CPU. */ + atomic_long_t srcu_unlocks; /* Unlocks per CPU. */ +}; + /* * Per-CPU structure feeding into leaf srcu_node, similar in function * to rcu_node. */ struct srcu_data { /* Read-side state. */ - atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ - atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ + struct srcu_ctr srcu_ctrs[2]; /* Locks and unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? 
*/ /* Values: SRCU_READ_FLAVOR_.* */ @@ -221,7 +226,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ return idx; } @@ -240,7 +245,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e69ce9d59abf..d7ee2f345e19 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -429,10 +428,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,7 +442,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } @@ -455,8 +454,8 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. 
*/ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +466,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -510,9 +509,9 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there + * ->srcu_idx and incrementing ->srcu_ctrs[idx].srcu_locks. And there * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks * in a system whose address space was fully populated with memory. * Call this quantity Nt. @@ -521,36 +520,36 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * code for a long time. That now-preempted updater has already * flipped ->srcu_idx (possibly during the preceding grace period), * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. + * period), and summed up the ->srcu_ctrs[idx].srcu_unlocks counters. * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] + * increment the old ->srcu_idx value's ->srcu_ctrs[idx].srcu_locks * counter, in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_idx and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + * as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_idx, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_idx in __srcu_read_lock() is before the smp_mb(). 
+ * Thus, that task might not see the new value of ->srcu_idx until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_idx twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +580,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -746,7 +745,7 @@ int __srcu_read_lock(struct srcu_struct *ssp) int idx; idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -760,7 +759,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -777,7 +776,7 @@ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -793,7 +792,7 @@ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1123,17 +1122,17 @@ static void srcu_flip(struct srcu_struct *ssp) /* * Because the flip of ->srcu_idx is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). 
This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_idx from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_idx. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of @@ -1914,8 +1913,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1922,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; -- cgit v1.2.3 From 795e7efec6ea7e9d597c3fced9f5307fae467cb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 16:08:54 -0800 Subject: srcu: Make SRCU readers use ->srcu_ctrs for counter selection This commit causes SRCU readers to use ->srcu_ctrs for counter selection instead of ->srcu_idx. This takes another step towards array-indexing-free SRCU readers. [ paulmck: Apply kernel test robot feedback. ] Co-developed-by: Z qiang Signed-off-by: Z qiang Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 +++++---- kernel/rcu/srcutree.c | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index c794d599db5c..1b01ced61a45 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -101,6 +101,7 @@ struct srcu_usage { */ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ + struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. 
*/ @@ -167,6 +168,7 @@ struct srcu_struct { #define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ + .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } @@ -222,13 +224,12 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); */ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return idx; + return scp - &ssp->sda->srcu_ctrs[0]; } /* diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d7ee2f345e19..7efde1a2344e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -253,8 +253,10 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -742,12 +744,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return scp - &ssp->sda->srcu_ctrs[0]; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -772,13 +773,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_locks); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return scpp - &ssp->sda->srcu_ctrs[0]; } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -1152,6 +1152,8 @@ static void srcu_flip(struct srcu_struct *ssp) smp_mb(); /* E */ /* Pairs with B and C. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -2000,6 +2002,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } -- cgit v1.2.3 From 821ca6fa15d864951da89233da8fd89e932d5215 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 16:32:12 -0800 Subject: srcu: Make Tree SRCU updates independent of ->srcu_idx This commit makes Tree SRCU updates independent of ->srcu_idx, then drop ->srcu_idx. Signed-off-by: Paul E. 
McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 1 - kernel/rcu/srcutree.c | 68 ++++++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1b01ced61a45..6b7eba59f384 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -100,7 +100,6 @@ struct srcu_usage { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7efde1a2344e..247bdf42fb54 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -246,7 +246,6 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); @@ -510,38 +509,39 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's + * the current ->srcu_ctrp but not yet have incremented its CPU's * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_ctrs[idx].srcu_locks. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_ctrs[idx].srcu_unlocks counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_ctrs[idx].srcu_locks - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? 
* * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that + * the old value of ->srcu_ctrp and is just about to use that * value to index its increment of ->srcu_ctrs[idx].srcu_locks. * But as soon as it leaves that SRCU read-side critical section, * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must - * follow the updater's above read from that same value. Thus, - * as soon the reading task does an smp_mb() and a later fetch from - * ->srcu_idx, that task will be guaranteed to get the new index. + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks * in __srcu_read_unlock() is after the smp_mb(), and the fetch - * from ->srcu_idx in __srcu_read_lock() is before the smp_mb(). - * Thus, that task might not see the new value of ->srcu_idx until + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until * the -second- __srcu_read_lock(), which in turn means that this * task might well increment ->srcu_ctrs[idx].srcu_locks for the - * old value of ->srcu_idx twice, not just once. + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any @@ -1095,7 +1095,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { @@ -1113,14 +1113,14 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums * matched and because that summing uses atomic_long_read(), @@ -1128,15 +1128,15 @@ static void srcu_flip(struct srcu_struct *ssp) * summing and the WRITE_ONCE() in this call to srcu_flip(). * This ordering ensures that if this updater saw a given reader's * increment from __srcu_read_lock(), that reader was using a value - * of ->srcu_idx from before the previous call to srcu_flip(), + * of ->srcu_ctrp from before the previous call to srcu_flip(), * which should be quite rare. This ordering thus helps forward * progress because the grace period could otherwise be delayed * by additional calls to __srcu_read_lock() using that old (soon - * to be new) value of ->srcu_idx. + * to be new) value of ->srcu_ctrp. 
* * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1151,7 +1151,6 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. WRITE_ONCE(ssp->srcu_ctrp, &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); @@ -1466,8 +1465,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1693,7 +1693,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1721,7 +1721,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1739,7 +1739,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1897,7 +1897,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", -- cgit v1.2.3 From d31e31365b5b6c0cdfc74d71be87234ced564395 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2025 17:04:49 -0800 Subject: srcu: Force synchronization for srcu_get_delay() Currently, srcu_get_delay() can be called concurrently, for example, by a CPU that is the first to request a new grace period and the CPU processing the current grace period. Although concurrent access is harmless, it unnecessarily expands the state space. Additionally, all calls to srcu_get_delay() are from slow paths. 
This commit therefore protects all calls to srcu_get_delay() with ssp->srcu_sup->lock, which is already held on the invocation from the srcu_funnel_gp_start() function. While in the area, this commit also adds a lockdep_assert_held() to srcu_get_delay() itself. Reported-by: syzbot+16a19b06125a2963eaee@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/srcutree.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 247bdf42fb54..121dd290cae1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -648,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -675,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -1101,7 +1106,9 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1850,7 +1857,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { -- cgit v1.2.3 From 4d86b1e7e1e98eb1f0e3c5a4635a5c37cbd22919 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 06:48:15 -0800 Subject: srcu: Add SRCU_READ_FLAVOR_SLOWGP to flag need for synchronize_rcu() This commit switches from a direct test of SRCU_READ_FLAVOR_LITE to a new SRCU_READ_FLAVOR_SLOWGP macro to check for substituting synchronize_rcu() for smp_mb() in SRCU grace periods. Right now, SRCU_READ_FLAVOR_SLOWGP is exactly SRCU_READ_FLAVOR_LITE, but the addition of the _fast() flavor of SRCU will change that. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 +++ kernel/rcu/srcutree.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ca00b9af7c23..505f5bdce444 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -49,6 +49,9 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_LITE) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE + // Flavors requiring synchronize_rcu() + // instead of smp_mb(). 
#ifdef CONFIG_TINY_SRCU #include diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 121dd290cae1..8b5c50bc98e5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -449,7 +449,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -1205,7 +1205,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); -- cgit v1.2.3 From f4bde41dd19db5e2ea9e0b4a19ac2573f7244d03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 14:31:27 -0800 Subject: srcu: Pull pointer-to-integer conversion into __srcu_ptr_to_ctr() This commit abstracts the srcu_read_lock*() pointer-to-integer conversion into a new __srcu_ptr_to_ctr(). This will be used in rcutorture for testing an srcu_read_lock_fast() that returns a pointer rather than an integer. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 ++++++++- kernel/rcu/srcutree.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index e29cc57eac81..f41bb3a55a04 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -211,6 +211,13 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that +// element's index. +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return scpp - &ssp->sda->srcu_ctrs[0]; +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -228,7 +235,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return scp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scp); } /* diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8b5c50bc98e5..a91651866485 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -753,7 +753,7 @@ int __srcu_read_lock(struct srcu_struct *ssp) this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. 
*/ - return scp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -783,7 +783,7 @@ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return scpp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); -- cgit v1.2.3 From 4937096b579a36cfa5764a229d1a89542e10cf5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 15:27:24 -0800 Subject: srcu: Pull integer-to-pointer conversion into __srcu_ctr_to_ptr() This commit abstracts the srcu_read_unlock*() integer-to-pointer conversion into a new __srcu_ctr_to_ptr(). This will be used in rcutorture for testing an srcu_read_unlock_fast() that avoids array-indexing overhead by taking a pointer rather than an integer. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 ++++++++- kernel/rcu/srcutree.c | 6 ++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f41bb3a55a04..55fa400624bb 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -218,6 +218,13 @@ static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __ return scpp - &ssp->sda->srcu_ctrs[0]; } +// Converts an integer to a per-CPU pointer to the corresponding +// ->srcu_ctrs[] array element. +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return &ssp->sda->srcu_ctrs[idx]; +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -252,7 +259,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); /* Z */ + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a91651866485..7a8ace83c98d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -765,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -794,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_unlocks); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); -- cgit v1.2.3 From 176d19eecb4821e541e68fdc57b2d7907b52cfd1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 9 Jan 2025 13:24:44 -0800 Subject: rcutorture: Add ability to test srcu_read_{,un}lock_fast() This commit permits rcutorture to test srcu_read_{,un}lock_fast(), which is specified by the rcutorture.reader_flavor=0x8 kernel boot parameter. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1d2de50fb5d6..1bd3eaa0b8e7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -677,6 +677,7 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { @@ -694,6 +695,12 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -719,6 +726,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) -- cgit v1.2.3 From 4c3fca0f5990af9a3c15a2944854ce08c5937630 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 16:32:31 -0800 Subject: refscale: Add srcu_read_lock_fast() support using "srcu-fast" This commit creates a new srcu-fast option for the refscale.scale_type module parameter that selects srcu_read_lock_fast() and srcu_read_unlock_fast(). Signed-off-by: Paul E. 
McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/refscale.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1b47376acdc4..f11a7c2af778 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -1163,7 +1193,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, -- cgit v1.2.3 From 623b52802bb0b33bee28716002249aae5d89c5ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2024 14:57:51 -0800 Subject: torture: Add get_torture_init_jiffies() for test-start time This commit adds a get_torture_init_jiffies() function that returns the value of the jiffies counter at the start of the test, that is, at the point where torture_init_begin() was invoked. This will be used to enable torture-test holdoffs for tests implemented using per-CPU kthreads, which are created and deleted by CPU-hotplug operations, and thus (unlike normal kthreads) don't automatically know when the test started. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/torture.h | 1 + kernel/torture.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 0134e7221cae..1b59056c3b18 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -104,6 +104,7 @@ int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. 
*/ bool torture_init_begin(char *ttype, int v); void torture_init_end(void); +unsigned long get_torture_init_jiffies(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); bool torture_must_stop(void); diff --git a/kernel/torture.c b/kernel/torture.c index dede150aef01..3a0a8cc60401 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static unsigned long torture_init_jiffies; + static void torture_print_module_parms(void) { @@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads. torture_print_module_parms(); return true; } @@ -836,6 +839,15 @@ void torture_init_end(void) } EXPORT_SYMBOL_GPL(torture_init_end); +/* + * Get the torture_init_begin()-time value of the jiffies counter. + */ +unsigned long get_torture_init_jiffies(void) +{ + return READ_ONCE(torture_init_jiffies); +} +EXPORT_SYMBOL_GPL(get_torture_init_jiffies); + /* * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from -- cgit v1.2.3 From b8726c5aa6e8e0f4e8abc27f0c81db9f294958dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2024 15:00:16 -0800 Subject: rcutorture: Add a test_boost_holdoff module parameter This commit adds a test_boost_holdoff module parameter that tells the RCU priority-boosting tests to wait for the specified number of seconds past the start of the rcutorture test. This can be useful when rcutorture is built into the kernel (as opposed to being modprobed), especially on large systems where early start of RCU priority boosting can delay the boot sequence, which adds a full CPU's worth of load onto the system. This can in turn result in pointless stall warnings. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ kernel/rcu/rcutorture.c | 19 ++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec8..ed1a0df03b18 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5758,6 +5758,11 @@ rcutorture.test_boost_duration= [KNL] Duration (s) of each individual boost test. + rcutorture.test_boost_holdoff= [KNL] + Holdoff time (s) from start of test to the start + of RCU priority-boost testing. Defaults to zero, + that is, no holdoff. + rcutorture.test_boost_interval= [KNL] Interval (s) between each boost test. 
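[ Illustration, not part of the patch: with rcutorture built into the kernel, the new holdoff is requested on the kernel command line using the usual modulename.parameter=value convention, for example (the 30-second value is chosen purely for illustration):

	rcutorture.test_boost_holdoff=30 rcutorture.test_boost=2

This defers RCU priority-boost testing until 30 seconds after torture_init_begin(), as measured by the get_torture_init_jiffies() timestamp added by the previous commit. ]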
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..fbf1d7fcf61d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -1148,8 +1149,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1225,6 +1237,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -2512,7 +2525,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2526,7 +2539,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, -- cgit v1.2.3 From 84ae91018af56184afabb1bc08b5c117a0634e5e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Nov 2024 13:55:32 -0800 Subject: rcutorture: Include grace-period sequence numbers in failure/close-call This commit includes the grace-period sequence numbers at the beginning and end of each segment in the "Failure/close-call rcutorture reader segments" list. These are in hexadecimal, and only the bottom byte. Currently, only RCU is supported, with its three sequence numbers (normal, expedited, and polled). Note that if all the grace-period sequence numbers remain the same across a given reader segment, only one copy of the number will be printed. Of course, if there is a change, both sets of values will be printed. 
Because the overhead of collecting this information can suppress heisenbugs, this information is collected and printed only in kernels built with CONFIG_RCU_TORTURE_TEST_LOG_GP=y. [ paulmck: Apply Nathan Chancellor feedback for IS_ENABLED(). ] [ paulmck: Apply feedback from kernel test robot. ] Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig.debug | 14 ++++++++++++++ kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 34 ++++++++++++++++++++++++++++++++++ kernel/rcu/tiny.c | 14 ++++++++++++++ kernel/rcu/tree.c | 20 ++++++++++++++++++++ 5 files changed, 84 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 6af90510a1ca..25a9dc2be0dc 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU Say Y here if you want CPU IDs logged. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. + config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..a6098997a14b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif +unsigned long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long seqs, char *cp); #ifdef CONFIG_TINY_SRCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fbf1d7fcf61d..2113583cae34 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -273,6 +273,8 @@ struct rt_read_seg { bool rt_preempted; int rt_cpu; int rt_end_cpu; + unsigned long rt_gp_seq; + unsigned long rt_gp_seq_end; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -407,6 +409,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); + unsigned long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long seqs, char *cp); long cbflood_max; int irq_capable; int can_boost; @@ -611,6 +615,8 @@ static struct rcu_torture_ops rcu_ops = { .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) ? 
has_rcu_reader_blocked : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -656,6 +662,8 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" @@ -1978,6 +1986,12 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); } } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -3566,6 +3580,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3661,6 +3676,25 @@ rcu_torture_cleanup(void) else pr_cont(" ..."); } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[16+1]; + char buf2[16+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, buf1); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, buf2); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..f9c4a24dc59c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -257,6 +257,20 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) EXPORT_SYMBOL_GPL(kvfree_call_rcu); #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long rcutorture_gather_gp_seqs(void) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xff; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +{ + snprintf(cp, 8, "g%02lx", seqs & 0xff); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..e40c4b5c3267 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xff) << 16) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xff) << 8) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xff); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. 
*/ +void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +{ + unsigned int egp = (seqs >> 8) & 0xff; + unsigned int ggp = (seqs >> 16) & 0xff; + unsigned int pgp = seqs & 0xff; + + snprintf(cp, 16, "g%02x:e%02x:p%02x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on -- cgit v1.2.3 From 2db7ab8c108669d0b7d87c617edf0a8e132bd1c7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Dec 2024 19:47:44 -0800 Subject: rcutorture: Expand failure/close-call grace-period output With only eight bits per grace-period sequence number, wrap can happen in 64 grace periods. This commit therefore increases this to sixteen bits for normal grace-period sequence numbers and the combined short-form polling sequence numbers, thus deferring wrap for at least 16,384 grace periods. Because expedited grace periods go faster, expand these to 24 bits, deferring wrap for at least 4,194,304 expedited grace periods. These longer wrap times makes it easier to correlate these numbers to trace-event output. Note that the low-order two bits are reserved for intra-grace-period state, hence the above wrap numbers being a factor of four smaller than you might expect. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 4 ++-- kernel/rcu/rcutorture.c | 12 ++++++------ kernel/rcu/tiny.c | 8 ++++---- kernel/rcu/tree.c | 18 +++++++++--------- 4 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a6098997a14b..705fcbe6f500 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -590,8 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif -unsigned long rcutorture_gather_gp_seqs(void); -void rcutorture_format_gp_seqs(unsigned long seqs, char *cp); +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp); #ifdef CONFIG_TINY_SRCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2113583cae34..fb1b80498ae0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -273,8 +273,8 @@ struct rt_read_seg { bool rt_preempted; int rt_cpu; int rt_end_cpu; - unsigned long rt_gp_seq; - unsigned long rt_gp_seq_end; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -409,8 +409,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); - unsigned long (*gather_gp_seqs)(void); - void (*format_gp_seqs)(unsigned long seqs, char *cp); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp); long cbflood_max; int irq_capable; int can_boost; @@ -3678,8 +3678,8 @@ rcu_torture_cleanup(void) } if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { - char buf1[16+1]; - char buf2[16+1]; + char buf1[20+1]; + char buf2[20+1]; char sepchar = '-'; cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, buf1); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f9c4a24dc59c..8cbec3401184 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -258,15 +258,15 @@ 
EXPORT_SYMBOL_GPL(kvfree_call_rcu); #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) -unsigned long rcutorture_gather_gp_seqs(void) +unsigned long long rcutorture_gather_gp_seqs(void) { - return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xff; + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; } EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); -void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) { - snprintf(cp, 8, "g%02lx", seqs & 0xff); + snprintf(cp, 8, "g%04llx", seqs & 0xffffULL); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e40c4b5c3267..83cba3d2cc48 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -539,22 +539,22 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* Gather grace-period sequence numbers for rcutorture diagnostics. */ -unsigned long rcutorture_gather_gp_seqs(void) +unsigned long long rcutorture_gather_gp_seqs(void) { - return ((READ_ONCE(rcu_state.gp_seq) & 0xff) << 16) | - ((READ_ONCE(rcu_state.expedited_sequence) & 0xff) << 8) | - (READ_ONCE(rcu_state.gp_seq_polled) & 0xff); + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); } EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); /* Format grace-period sequence numbers for rcutorture diagnostics. */ -void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) { - unsigned int egp = (seqs >> 8) & 0xff; - unsigned int ggp = (seqs >> 16) & 0xff; - unsigned int pgp = seqs & 0xff; + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; - snprintf(cp, 16, "g%02x:e%02x:p%02x", ggp, egp, pgp); + snprintf(cp, 20, "g%04x:e%06x:p%04x", ggp, egp, pgp); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -- cgit v1.2.3 From 65e6ff0f31184bd9ce01c7bfef28558e6b70f96a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2024 14:41:55 -0800 Subject: rcutorture: Add ftrace-compatible timestamp to GP# failure/close-call output This commit adds an ftrace-compatible microsecond-scale timestamp to the failure/close-call output, but only in kernels built with CONFIG_RCU_TORTURE_TEST_LOG_GP=y. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fb1b80498ae0..1fdadc1df9ad 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -275,6 +275,7 @@ struct rt_read_seg { int rt_end_cpu; unsigned long long rt_gp_seq; unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -1989,6 +1990,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, // Sample grace-period sequence number, as good a place as any. 
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); if (!first) rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; } @@ -3663,7 +3665,11 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); -- cgit v1.2.3 From 7acc2d90151fe6f5d8409df44e10cd24a0296e9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Dec 2024 10:23:06 -0800 Subject: rcutorture: Make cur_ops->format_gp_seqs take buffer length The Tree and Tiny implementations of rcutorture_format_gp_seqs() use hard-coded constants for the length of the buffer that they format into. This is of course an accident waiting to happen, so this commit therefore makes them take a length argument. The rcutorture calling code uses ARRAY_SIZE() to safely compute this new argument. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 2 +- kernel/rcu/rcutorture.c | 8 +++++--- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 705fcbe6f500..82d8b494cc30 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -591,7 +591,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void rcu_gp_set_torture_wait(int duration) { } #endif unsigned long long rcutorture_gather_gp_seqs(void); -void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1fdadc1df9ad..9c9a349b9c7f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -411,7 +411,7 @@ struct rcu_torture_ops { void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); unsigned long long (*gather_gp_seqs)(void); - void (*format_gp_seqs)(unsigned long long seqs, char *cp); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); long cbflood_max; int irq_capable; int can_boost; @@ -3688,8 +3688,10 @@ rcu_torture_cleanup(void) char buf2[20+1]; char sepchar = '-'; - cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, buf1); - cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, buf2); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { if (buf2[0]) { for (j = 0; buf2[j]; j++) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 8cbec3401184..8a52aca686a5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -264,9 +264,9 @@ unsigned long long rcutorture_gather_gp_seqs(void) } EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); -void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) { - snprintf(cp, 8, "g%04llx", seqs & 0xffffULL); + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } 
EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 83cba3d2cc48..bb061e5870c3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -548,13 +548,13 @@ unsigned long long rcutorture_gather_gp_seqs(void) EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); /* Format grace-period sequence numbers for rcutorture diagnostics. */ -void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) { unsigned int egp = (seqs >> 16) & 0xffffffULL; unsigned int ggp = (seqs >> 40) & 0xffffULL; unsigned int pgp = seqs & 0xffffULL; - snprintf(cp, 20, "g%04x:e%06x:p%04x", ggp, egp, pgp); + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -- cgit v1.2.3 From 5d45bdf292e62dda86b4f0a5d456287d22a0d2b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Dec 2024 09:48:06 -0800 Subject: rcutorture: Move RCU_TORTURE_TEST_{CHK_RDR_STATE,LOG_CPU} to bool The RCU_TORTURE_TEST_CHK_RDR_STATE and RCU_TORTURE_TEST_LOG_CPU Kconfig options are pointlessly defined as tristate. This commit therefore converts them to bool. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412241458.150d082b-lkp@intel.com Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 25a9dc2be0dc..12e4c64ebae1 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -54,7 +54,7 @@ config RCU_TORTURE_TEST Say N if you are unsure. config RCU_TORTURE_TEST_CHK_RDR_STATE - tristate "Check rcutorture reader state" + bool "Check rcutorture reader state" depends on RCU_TORTURE_TEST default n help @@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE Say N if you are unsure. config RCU_TORTURE_TEST_LOG_CPU - tristate "Log CPU for rcutorture failures" + bool "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST default n help -- cgit v1.2.3 From 38b43eca6665434225284c0b583a39340f4e1f37 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 10:14:53 -0800 Subject: rcutorture: Complain when invalid SRCU reader_flavor is specified Currently, rcutorture ignores reader_flavor bits that are not in the SRCU_READ_FLAVOR_ALL bitmask, which could confuse rcutorture users into believing buggy patches had been fully tested. This commit therefore produces a splat in this case. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9c9a349b9c7f..be4e3c6b912f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -689,6 +689,8 @@ static int srcu_torture_read_lock(void) int idx; int ret = 0; + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); -- cgit v1.2.3 From 536e8b9b80bc7a0a8e87af2d5fb7fe3e230669ca Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 10 Jan 2025 11:26:23 -0800 Subject: srcu: Add FORCE_NEED_SRCU_NMI_SAFE Kconfig for testing The srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe() functions map to __srcu_read_lock() and __srcu_read_unlock() on systems like x86 that have NMI-safe this_cpu_inc() operations. This makes the underlying __srcu_read_lock_nmisafe() and __srcu_read_unlock_nmisafe() functions difficult to test on (for example) x86 systems, allowing bugs to creep in. This commit therefore creates a FORCE_NEED_SRCU_NMI_SAFE Kconfig that forces those underlying functions to be used even on systems where they are not needed, thus providing better testing coverage. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..c8e540af3a35 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -65,6 +65,17 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU -- cgit v1.2.3 From 85aad7cc417877054c65bd490dc037b087ef21b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Dec 2024 14:15:07 -0800 Subject: rcu: Fix get_state_synchronize_rcu_full() GP-start detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full() functions use the root rcu_node structure's ->gp_seq field to detect the beginnings and ends of grace periods, respectively. This choice is necessary for the poll_state_synchronize_rcu_full() function because (give or take counter wrap), the following sequence is guaranteed not to trigger: get_state_synchronize_rcu_full(&rgos); synchronize_rcu(); WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&rgos)); The RCU callbacks that awaken synchronize_rcu() instances are guaranteed not to be invoked before the root rcu_node structure's ->gp_seq field is updated to indicate the end of the grace period. However, these callbacks might start being invoked immediately thereafter, in particular, before rcu_state.gp_seq has been updated. Therefore, poll_state_synchronize_rcu_full() must refer to the root rcu_node structure's ->gp_seq field. Because this field is updated under this structure's ->lock, any code following a call to poll_state_synchronize_rcu_full() will be fully ordered after the full grace-period computation, as is required by RCU's memory-ordering semantics. By symmetry, the get_state_synchronize_rcu_full() function should also use this same root rcu_node structure's ->gp_seq field. But it turns out that symmetry is profoundly (though extremely infrequently) destructive in this case. To see this, consider the following sequence of events: 1. CPU 0 starts a new grace period, and updates rcu_state.gp_seq accordingly. 2. As its first step of grace-period initialization, CPU 0 examines the current CPU hotplug state and decides that it need not wait for CPU 1, which is currently offline. 3. 
CPU 1 comes online, and updates its state. But this does not affect the current grace period, but rather the one after that. After all, CPU 1 was offline when the current grace period started, so all pre-existing RCU readers on CPU 1 must have completed or been preempted before it last went offline. The current grace period therefore has nothing it needs to wait for on CPU 1. 4. CPU 1 switches to an rcutorture kthread which is running rcutorture's rcu_torture_reader() function, which starts a new RCU reader. 5. CPU 2 is running rcutorture's rcu_torture_writer() function and collects a new polled grace-period "cookie" using get_state_synchronize_rcu_full(). Because the newly started grace period has not completed initialization, the root rcu_node structure's ->gp_seq field has not yet been updated to indicate that this new grace period has already started. This cookie is therefore set up for the end of the current grace period (rather than the end of the following grace period). 6. CPU 0 finishes grace-period initialization. 7. If CPU 1’s rcutorture reader is preempted, it will be added to the ->blkd_tasks list, but because CPU 1’s ->qsmask bit is not set in CPU 1's leaf rcu_node structure, the ->gp_tasks pointer will not be updated.  Thus, this grace period will not wait on it.  Which is only fair, given that the CPU did not come online until after the grace period officially started. 8. CPUs 0 and 2 then detect the new grace period and then report a quiescent state to the RCU core. 9. Because CPU 1 was offline at the start of the current grace period, CPUs 0 and 2 are the only CPUs that this grace period needs to wait on. So the grace period ends and post-grace-period cleanup starts. In particular, the root rcu_node structure's ->gp_seq field is updated to indicate that this grace period has now ended. 10. CPU 2 continues running rcu_torture_writer() and sees that, from the viewpoint of the root rcu_node structure consulted by the poll_state_synchronize_rcu_full() function, the grace period has ended.  It therefore updates state accordingly. 11. CPU 1 is still running the same RCU reader, which notices this update and thus complains about the too-short grace period. The fix is for the get_state_synchronize_rcu_full() function to use rcu_state.gp_seq instead of the root rcu_node structure's ->gp_seq field. With this change in place, if step 5's cookie indicates that the grace period has not yet started, then any prior code executed by CPU 2 must have happened before CPU 1 came online. This will in turn prevent CPU 1's code in steps 3 and 11 from spanning CPU 2's grace-period wait, thus preventing CPU 1 from being subjected to a too-short grace period. This commit therefore makes this change. Note that there is no change to the poll_state_synchronize_rcu_full() function, which as noted above, must continue to use the root rcu_node structure's ->gp_seq field. This is of course an asymmetry between these two functions, but is an asymmetry that is absolutely required for correct operation. It is a common human tendency to greatly value symmetry, and sometimes symmetry is a wonderful thing. Other times, symmetry results in poor performance. But in this case, symmetry is just plain wrong. Nevertheless, the asymmetry does require an additional adjustment. It is possible for get_state_synchronize_rcu_full() to see a given grace period as having started, but for an immediately following poll_state_synchronize_rcu_full() to see it as having not yet started. 
Given the current rcu_seq_done_exact() implementation, this will result in a false-positive indication that the grace period is done from poll_state_synchronize_rcu_full(). This is dealt with by making rcu_seq_done_exact() reach back three grace periods rather than just two of them. However, simply changing get_state_synchronize_rcu_full() function to use rcu_state.gp_seq instead of the root rcu_node structure's ->gp_seq field results in a theoretical bug in kernels booted with rcutree.rcu_normal_wake_from_gp=1 due to the following sequence of events: o The rcu_gp_init() function invokes rcu_seq_start() to officially start a new grace period. o A new RCU reader begins, referencing X from some RCU-protected list. The new grace period is not obligated to wait for this reader. o An updater removes X, then calls synchronize_rcu(), which queues a wait element. o The grace period ends, awakening the updater, which frees X while the reader is still referencing it. The reason that this is theoretical is that although the grace period has officially started, none of the CPUs are officially aware of this, and thus will have to assume that the RCU reader pre-dated the start of the grace period. Detailed explanation can be found at [2] and [3]. Except for kernels built with CONFIG_PROVE_RCU=y, which use the polled grace-period APIs, which can and do complain bitterly when this sequence of events occurs. Not only that, there might be some future RCU grace-period mechanism that pulls this sequence of events from theory into practice. This commit therefore also pulls the call to rcu_sr_normal_gp_init() to precede that to rcu_seq_start(). Although this fixes commit 91a967fd6934 ("rcu: Add full-sized polling for get_completed*() and poll_state*()"), it is not clear that it is worth backporting this commit. First, it took me many weeks to convince rcutorture to reproduce this more frequently than once per year. Second, this cannot be reproduced at all without frequent CPU-hotplug operations, as in waiting all of 50 milliseconds from the end of the previous operation until starting the next one. Third, the TREE03.boot settings cause multi-millisecond delays during RCU grace-period initialization, which greatly increase the probability of the above sequence of events. (Don't do this in production workloads!) Fourth, the TREE03 rcutorture scenario was modified to use four-CPU guest OSes, to have a single-rcu_node combining tree, no testing of RCU priority boosting, and no random preemption, and these modifications were necessary to reproduce this issue in a reasonable timeframe. Fifth, extremely heavy use of get_state_synchronize_rcu_full() and/or poll_state_synchronize_rcu_full() is required to reproduce this, and as of v6.12, only kfree_rcu() uses it, and even then not particularly heavily. [boqun: Apply the fix [1], and add the comment before the moved rcu_sr_normal_gp_init(). Additional links are added for explanation.] Signed-off-by: Paul E. 
McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Joel Fernandes (Google) Tested-by: Uladzislau Rezki (Sony) Link: https://lore.kernel.org/rcu/d90bd6d9-d15c-4b9b-8a69-95336e74e8f4@paulmck-laptop/ [1] Link: https://lore.kernel.org/rcu/20250303001507.GA3994772@joelnvbox/ [2] Link: https://lore.kernel.org/rcu/Z8bcUsZ9IpRi1QoP@pc636/ [3] Reviewed-by: Joel Fernandes Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 2 +- kernel/rcu/tree.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..f87c9d6d36fc 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e4c0ce600b2b..131fb463ba68 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1801,10 +1801,14 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -3357,14 +3361,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); -- cgit v1.2.3 From 23c22d91561dd555d07381fb1f567539769c2ea0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Dec 2024 13:09:30 -0800 Subject: rcu-tasks: Move RCU Tasks self-tests to core_initcall() The timer and hrtimer softirq processing has moved to dedicated threads for kernels built with CONFIG_IRQ_FORCED_THREADING=y. This results in timers not expiring until later in early boot, which in turn causes the RCU Tasks self-tests to hang in kernels built with CONFIG_PROVE_RCU=y, which further causes the entire kernel to hang. One fix would be to make timers work during this time, but there are no known users of RCU Tasks grace periods during that time, so no justification for the added complexity. Not yet, anyway. This commit therefore moves the call to rcu_init_tasks_generic() from kernel_init_freeable() to a core_initcall(). This works because the timer and hrtimer kthreads are created at early_initcall() time. 
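[ Illustration, assuming the standard initcall ordering and not part of the patch: do_pre_smp_initcalls() runs every early_initcall() -- including creation of the dedicated timer and hrtimer kthreads -- before do_initcalls() reaches the core_initcall() level, so converting the self-test setup to a core_initcall(), as the diff below does, guarantees that working timers are available when the self-tests run:

	static int __init rcu_init_tasks_generic(void)
	{
		/* spawn the RCU Tasks kthreads, then run the self-tests;
		 * timer and hrtimer kthreads already exist at this point */
		rcu_tasks_initiate_self_tests();
		return 0;
	}
	core_initcall(rcu_init_tasks_generic);
]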
Fixes: 49a17639508c3 ("softirq: Use a dedicated thread for timer wakeups on PREEMPT_RT.") Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Cc: Tested-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 6 ------ init/main.c | 1 - kernel/rcu/tasks.h | 5 ++++- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..36849a4ea141 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -121,12 +121,6 @@ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -#ifdef CONFIG_TASKS_RCU_GENERIC -void rcu_init_tasks_generic(void); -#else -static inline void rcu_init_tasks_generic(void) { } -#endif - #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/init/main.c b/init/main.c index 2a1757826397..7f0a2a3dbd29 100644 --- a/init/main.c +++ b/init/main.c @@ -1553,7 +1553,6 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); - rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..466668eb4fad 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -2256,7 +2256,7 @@ void __init tasks_cblist_init_generic(void) #endif } -void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} -- cgit v1.2.3 From 69381f38284f107e5e55bff7e51ecd1ef7e3ced8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 11:08:42 -0800 Subject: rcu/nocb: Print segment lengths in show_rcu_nocb_gp_state() Analysis of an rcutorture callback-based forward-progress test failure was hampered by the lack of ->cblist segment lengths. This commit therefore adds this information, so that what would have been ".W85620.N." (there are some callbacks waiting for grace period sequence number 85620 and some number more that have not yet been assigned to a grace period) now prints as ".W2(85620).N6." (there are 2 callbacks waiting for grace period 85620 and 6 not yet assigned to a grace period). Note that "D" (done), "N" (next and not yet assigned to a grace period, and "B" (bypass, also not yet assigned to a grace period) have just the number of callbacks without the parenthesized grace-period sequence number. In contrast, "W" (waiting for the current grace period) and "R" (ready to wait for the next grace period to start) both have parenthesized grace-period sequence numbers. Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree_nocb.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..5ff3bc56ff51 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, -- cgit v1.2.3 From 59bed79ffdbc26af3dfba3c6453a4356c9fd6b6f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Feb 2025 02:15:09 -0800 Subject: context_tracking: Make RCU watch ct_kernel_exit_state() warning The WARN_ON_ONCE() in ct_kernel_exit_state() follows the call to ct_state_inc(), which means that RCU is not watching this WARN_ON_ONCE(). This can (and does) result in extraneous lockdep warnings when this WARN_ON_ONCE() triggers. These extraneous warnings are the opposite of helpful. Therefore, invert the WARN_ON_ONCE() condition and move it before the call to ct_state_inc(). This does mean that the ct_state_inc() return value can no longer be used in the WARN_ON_ONCE() condition, so discard this return value and instead use a call to rcu_is_watching_curr_cpu(). This call is executed only in CONFIG_RCU_EQS_DEBUG=y kernels, so there is no added overhead in production use. [Boqun: Add the subsystem tag in the title] Reported-by: Breno Leitao Signed-off-by: Paul E. 
McKenney Reviewed-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/bd911cd9-1fe9-447c-85e0-ea811a1dc896@paulmck-laptop Signed-off-by: Boqun Feng --- kernel/context_tracking.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 938c48952d26..fb5be6e9b423 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void) */ static noinstr void ct_kernel_exit_state(int offset) { - int seq; - /* * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ rcu_task_trace_heavyweight_enter(); // Before CT state update! - seq = ct_state_inc(offset); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); + // RCU is still watching. Better not be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); + (void)ct_state_inc(offset); + // RCU is no longer watching. } /* -- cgit v1.2.3 From 6ea9a1781c70a8be1fcdc49134fc1bf4baba8bca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Feb 2025 10:33:28 -0800 Subject: Flush console log from kernel_power_off() Kernels built with CONFIG_PREEMPT_RT=y can lose significant console output and shutdown time, which hides shutdown-time RCU issues from rcutorture. Therefore, make pr_flush() public and invoke it after then last print in kernel_power_off(). [ paulmck: Apply John Ogness feedback. ] [ paulmck: Appy Sebastian Andrzej Siewior feedback. ] [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Reviewed-by: John Ogness Reviewed-by: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Link: https://lore.kernel.org/r/5f743488-dc2a-4f19-bdda-cf50b9314832@paulmck-laptop Signed-off-by: Boqun Feng --- include/linux/printk.h | 6 ++++++ kernel/printk/printk.c | 4 +--- kernel/reboot.c | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 4217a9f412b2..5b462029d03c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -207,6 +207,7 @@ void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); +bool pr_flush(int timeout_ms, bool reset_on_progress); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -315,6 +316,11 @@ static inline void nbcon_atomic_flush_unsafe(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + #endif bool this_cpu_in_panic(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 07668433644b..057db78876cd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2461,7 +2461,6 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
} EXPORT_SYMBOL(_printk); -static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2474,7 +2473,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre static u64 syslog_seq; -static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -4466,7 +4464,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ -static bool pr_flush(int timeout_ms, bool reset_on_progress) +bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } diff --git a/kernel/reboot.c b/kernel/reboot.c index b5a8569e5d81..41ab9e1ba357 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -704,6 +704,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); + pr_flush(1000, true); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } -- cgit v1.2.3 From 8d67c1558a71475e638fee09e588e12834043069 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 27 Feb 2025 14:16:11 +0100 Subject: rcutorture: Allow a negative value for nfakewriters Currently "nfakewriters" parameter can be set to any value but there is no possibility to adjust it automatically based on how many CPUs a system has where a test is run on. To address this, if the "nfakewriters" is set to negative it will be adjusted to num_online_cpus() during torture initialization. Reviewed-by: Paul E. 
McKenney Signed-off-by: Uladzislau Rezki (Sony) Link: https://lore.kernel.org/r/20250227131613.52683-1-urezki@gmail.com Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..726c2d63ab66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -147,6 +147,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -1728,7 +1729,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -2522,7 +2523,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " "preempt_duration=%d preempt_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, @@ -3597,7 +3598,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3994,6 +3995,14 @@ rcu_torture_init(void) rcu_torture_init_srcu_lockdep(); + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -4050,8 +4059,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -4060,7 +4070,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) -- cgit v1.2.3 From 5a562b8b3f5de4c50f4a9da92bfd3f0a6eebf081 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 27 Feb 2025 14:16:13 +0100 Subject: rcu: Use _full() API to debug synchronize_rcu() Switch for using of get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full() pair to debug a normal synchronize_rcu() call. Just using "not" full APIs to identify if a grace period is passed or not might lead to a false-positive kernel splat. It can happen, because get_state_synchronize_rcu() compresses both normal and expedited states into one single unsigned long value, so a poll_state_synchronize_rcu() can miss GP-completion when synchronize_rcu()/synchronize_rcu_expedited() concurrently run. 
From 5a562b8b3f5de4c50f4a9da92bfd3f0a6eebf081 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 27 Feb 2025 14:16:13 +0100
Subject: rcu: Use _full() API to debug synchronize_rcu()

Switch to the get_state_synchronize_rcu_full() and
poll_state_synchronize_rcu_full() pair to debug a normal
synchronize_rcu() call.

Using the non-_full() APIs to decide whether a grace period has elapsed
might lead to a false-positive kernel splat: get_state_synchronize_rcu()
compresses both the normal and the expedited state into one single
unsigned long value, so poll_state_synchronize_rcu() can miss a
grace-period completion when synchronize_rcu() and
synchronize_rcu_expedited() run concurrently.

To address this, switch to the poll_state_synchronize_rcu_full() and
get_state_synchronize_rcu_full() APIs, which use separate variables for
the expedited and normal states.

Reported-by: cheung wall
Closes: https://lore.kernel.org/lkml/Z5ikQeVmVdsWQrdD@pc636/T/
Fixes: 988f569ae041 ("rcu: Reduce synchronize_rcu() latency")
Signed-off-by: Uladzislau Rezki (Sony)
Reviewed-by: Paul E. McKenney
Link: https://lore.kernel.org/r/20250227131613.52683-3-urezki@gmail.com
Signed-off-by: Boqun Feng
---
include/linux/rcupdate_wait.h | 3 +++
kernel/rcu/tree.c | 8 +++-----
2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index f9bed3d3f78d..4c92d4291cce 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -16,6 +16,9 @@ struct rcu_synchronize {
struct rcu_head head;
struct completion completion;
+
+ /* This is for debugging. */
+ struct rcu_gp_oldstate oldstate;
};
void wakeme_after_rcu(struct rcu_head *head);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 131fb463ba68..fb98d4d9d9c9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1612,12 +1612,10 @@ static void rcu_sr_normal_complete(struct llist_node *node)
{
struct rcu_synchronize *rs = container_of(
(struct rcu_head *) node, struct rcu_synchronize, head);
- unsigned long oldstate = (unsigned long) rs->head.func;
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
- !poll_state_synchronize_rcu(oldstate),
- "A full grace period is not passed yet: %lu",
- rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+ !poll_state_synchronize_rcu_full(&rs->oldstate),
+ "A full grace period is not passed yet!\n");
/* Finally. */
complete(&rs->completion);
@@ -3218,7 +3216,7 @@ static void synchronize_rcu_normal(void)
* snapshot before adding a request.
*/
if (IS_ENABLED(CONFIG_PROVE_RCU))
- rs.head.func = (void *) get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&rs.oldstate);
rcu_sr_normal_add_req(&rs);
-- cgit v1.2.3
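As a rough illustration of the debugging idea (this is not code from the patch; the function is hypothetical), the full-state snapshot/poll pair can be used to assert that a grace period really elapsed across some blocking operation:

/* Hypothetical check that a grace period elapsed across a wait. */
static void example_check_grace_period(void)
{
	struct rcu_gp_oldstate snap;

	get_state_synchronize_rcu_full(&snap);	/* Record "now". */

	synchronize_rcu();	/* The operation being checked. */

	/*
	 * The full snapshot keeps the normal and expedited sequence
	 * state in separate fields, so this poll cannot miss a completed
	 * grace period the way the compressed unsigned long cookie could
	 * when normal and expedited grace periods run concurrently.
	 */
	WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&snap));
}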
From 9fd858cc5a21605c9e1e45dbe2fb9023ff3ecdc7 Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 12 Dec 2024 20:06:57 -0800
Subject: osnoise: provide quiescent states

To reduce RCU noise for nohz_full configurations, osnoise depends on
cond_resched() providing quiescent states for PREEMPT_RCU=n
configurations. For PREEMPT_RCU=y configurations -- where
cond_resched() is a stub -- we do this by directly calling
rcu_momentary_eqs().

With (PREEMPT_LAZY=y, PREEMPT_DYNAMIC=n), however, we have a
configuration with (PREEMPTION=y, PREEMPT_RCU=n) where neither of the
above can help.

Handle that by providing an explicit quiescent state here for all
configurations.

As mentioned above, this is not needed for non-stubbed cond_resched(),
but providing a quiescent state here just pulls in one that a future
cond_resched() would provide, so it does not cause any extra work for
this configuration.

Cc: Paul E. McKenney
Cc: Daniel Bristot de Oliveira
Cc: Steven Rostedt
Suggested-by: Paul E. McKenney
Acked-by: Daniel Bristot de Oliveira
Signed-off-by: Ankur Arora
Reviewed-by: Frederic Weisbecker
Acked-by: Steven Rostedt (Google)
Signed-off-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
kernel/trace/trace_osnoise.c | 32 +++++++++++++++-----------------
1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index f3a2722ee4c0..512034e365ad 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1542,27 +1542,25 @@ static int run_osnoise(void)
/*
* In some cases, notably when running on a nohz_full CPU with
- * a stopped tick PREEMPT_RCU has no way to account for QSs.
- * This will eventually cause unwarranted noise as PREEMPT_RCU
- * will force preemption as the means of ending the current
- * grace period. We avoid this problem by calling
- * rcu_momentary_eqs(), which performs a zero duration
- * EQS allowing PREEMPT_RCU to end the current grace period.
- * This call shouldn't be wrapped inside an RCU critical
- * section.
+ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to
+ * account for QSs. This will eventually cause unwarranted
+ * noise as RCU forces preemption as the means of ending the
+ * current grace period. We avoid this by calling
+ * rcu_momentary_eqs(), which performs a zero duration EQS
+ * allowing RCU to end the current grace period. This call
+ * shouldn't be wrapped inside an RCU critical section.
*
- * Note that in non PREEMPT_RCU kernels QSs are handled through
- * cond_resched()
+ * Normally QSs for other cases are handled through cond_resched().
+ * For simplicity, however, we call rcu_momentary_eqs() for all
+ * configurations here.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- if (!disable_irq)
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
- rcu_momentary_eqs();
+ rcu_momentary_eqs();
- if (!disable_irq)
- local_irq_enable();
- }
+ if (!disable_irq)
+ local_irq_enable();
/*
* For the non-preemptive kernel config: let threads runs, if
-- cgit v1.2.3
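For reference, a minimal sketch of the quiescent-state pattern the hunk above settles on; the wrapper function and its parameter name are hypothetical.

/* Hypothetical wrapper around the zero-duration EQS used by run_osnoise(). */
static void example_osnoise_quiescent_state(bool irqs_already_off)
{
	if (!irqs_already_off)
		local_irq_disable();

	/*
	 * Report a zero-duration extended quiescent state so that a long
	 * busy-polling measurement loop does not hold up RCU grace
	 * periods. Must not be called from within an RCU read-side
	 * critical section.
	 */
	rcu_momentary_eqs();

	if (!irqs_already_off)
		local_irq_enable();
}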
From 0be4b19edd74297f9843d756eb879a0960bd4860 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 19 Feb 2025 08:51:45 -0800
Subject: rcutorture: Update rcutorture_one_extend_check() for lazy preemption

The rcutorture_one_extend_check() function's last check assumes that if
cur_ops->readlock_nesting() returns greater than zero, either the
RCUTORTURE_RDR_RCU_1 or the RCUTORTURE_RDR_RCU_2 bit must be set, that
is, there must be at least one rcu_read_lock() in effect.

This works for preemptible RCU and for non-preemptible RCU running in a
non-preemptible kernel. But it fails for non-preemptible RCU running in
a preemptible kernel because then RCU's cur_ops->readlock_nesting()
function, which is rcu_torture_readlock_nesting(), will return the
PREEMPT_MASK bits from preempt_count(). The result will be greater than
zero if preemption is disabled, including by the RCUTORTURE_RDR_PREEMPT
and RCUTORTURE_RDR_SCHED bits.

This commit therefore adjusts this check to take into account the case
of non-preemptible RCU running in a preemptible kernel.

[boqun: Fix the if condition and add comment]

Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202502171415.8ec87c87-lkp@intel.com
Co-developed-by: Boqun Feng
Signed-off-by: Boqun Feng
Co-developed-by: Joel Fernandes
Signed-off-by: Joel Fernandes
Signed-off-by: Paul E. McKenney
Tested-by: kernel test robot
Signed-off-by: Boqun Feng
---
kernel/rcu/rcutorture.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d26fb1d33ed9..280bff706017 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1873,6 +1873,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
{
+ int mask;
+
if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
return;
@@ -1902,8 +1904,16 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->readlock_nesting &&
- !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses "preempt_count() &
+ * PREEMPT_MASK" as ->readlock_nesting().
+ */
+ mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+
+ WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
-- cgit v1.2.3
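To make the two behaviors of ->readlock_nesting() concrete, here is a hypothetical sketch (illustrative, not rcutorture's actual implementation) of what the check above has to tolerate:

/* Hypothetical sketch of what ->readlock_nesting() reports per flavor. */
static int example_readlock_nesting(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RCU))
		return rcu_preempt_depth();	/* Counts only rcu_read_lock(). */

	/*
	 * Non-preemptible RCU in a preemptible kernel: rcu_read_lock()
	 * maps to preempt_disable(), so any disabled preemption,
	 * including RCUTORTURE_RDR_PREEMPT or RCUTORTURE_RDR_SCHED
	 * segments, inflates this value too. That is why the mask in
	 * the check above is widened for !CONFIG_PREEMPT_RCU.
	 */
	return preempt_count() & PREEMPT_MASK;
}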
From a56ca5619f9ccce701789e2d231419ed50c0f033 Mon Sep 17 00:00:00 2001
From: Boqun Feng
Date: Sun, 23 Feb 2025 20:29:31 -0800
Subject: rcutorture: Update ->extendables check for lazy preemption

The rcutorture_one_extend_check() function's second-to-last check
assumes that "preempt_count() & PREEMPT_MASK" is non-zero only if the
RCUTORTURE_RDR_PREEMPT or RCUTORTURE_RDR_SCHED bit is set.

This works for preemptible RCU and for non-preemptible RCU running in a
non-preemptible kernel. But it fails for non-preemptible RCU running in
a preemptible kernel because then rcu_read_lock() is just
preempt_disable(), which increases the preempt count.

This commit therefore adjusts this check to take into account the case
of non-preemptible RCU running in a preemptible kernel.

Reviewed-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
kernel/rcu/rcutorture.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 280bff706017..4cae119dece8 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1901,8 +1901,16 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->extendables &&
- !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses preempt_disable()
+ * as rcu_read_lock().
+ */
+ mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+
+ WARN_ONCE(cur_ops->extendables && !(curstate & mask) &&
(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
/*
-- cgit v1.2.3

From 8437bb84bc554fed7a716408cbb2b0e3bd13356e Mon Sep 17 00:00:00 2001
From: Ankur Arora
Date: Thu, 12 Dec 2024 20:06:58 -0800
Subject: rcu: limit PREEMPT_RCU configurations

PREEMPT_LAZY can be enabled stand-alone or alongside PREEMPT_DYNAMIC,
which allows for dynamic switching of preemption models.

The choice of PREEMPT_RCU or not, however, is fixed at compile time.

Given that PREEMPT_RCU makes some trade-offs to optimize for latency as
opposed to throughput, configurations with limited preemption might
prefer the stronger forward-progress guarantees of PREEMPT_RCU=n.

Accordingly, explicitly limit PREEMPT_RCU=y to the latency-oriented
preemption models: PREEMPT, PREEMPT_RT, and the runtime-configurable
model PREEMPT_DYNAMIC.

This means that the throughput-oriented models, PREEMPT_NONE,
PREEMPT_VOLUNTARY, and PREEMPT_LAZY, will run with PREEMPT_RCU=n.

Cc: Paul E. McKenney
Cc: Peter Zijlstra
Reviewed-by: Frederic Weisbecker
Signed-off-by: Ankur Arora
Signed-off-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
kernel/rcu/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index e2206f3a070c..dd6251678e99 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC)
select TREE_RCU
help
This option selects the RCU implementation that is
-- cgit v1.2.3
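To restate the resulting mapping, a purely illustrative (and hypothetical) boot-time report of which RCU flavor the selected preemption model now implies:

/* Hypothetical illustration of the preemption-model to RCU-flavor mapping. */
static int __init example_report_rcu_flavor(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RCU))
		pr_info("Preemptible RCU (PREEMPT, PREEMPT_RT, or PREEMPT_DYNAMIC)\n");
	else
		pr_info("Non-preemptible RCU (PREEMPT_NONE, PREEMPT_VOLUNTARY, or PREEMPT_LAZY)\n");

	return 0;
}
late_initcall(example_report_rcu_flavor);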