From 38f7e14519d39cf524ddc02d4caee9b337dad703 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jul 2024 12:44:31 +0100 Subject: workqueue: Fix UBSAN 'subtraction overflow' error in shift_and_mask() UBSAN reports the following 'subtraction overflow' error when booting in a virtual machine on Android: | Internal error: UBSAN: integer subtraction overflow: 00000000f2005515 [#1] PREEMPT SMP | Modules linked in: | CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.10.0-00006-g3cbe9e5abd46-dirty #4 | Hardware name: linux,dummy-virt (DT) | pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : cancel_delayed_work+0x34/0x44 | lr : cancel_delayed_work+0x2c/0x44 | sp : ffff80008002ba60 | x29: ffff80008002ba60 x28: 0000000000000000 x27: 0000000000000000 | x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 | x23: 0000000000000000 x22: 0000000000000000 x21: ffff1f65014cd3c0 | x20: ffffc0e84c9d0da0 x19: ffffc0e84cab3558 x18: ffff800080009058 | x17: 00000000247ee1f8 x16: 00000000247ee1f8 x15: 00000000bdcb279d | x14: 0000000000000001 x13: 0000000000000075 x12: 00000a0000000000 | x11: ffff1f6501499018 x10: 00984901651fffff x9 : ffff5e7cc35af000 | x8 : 0000000000000001 x7 : 3d4d455453595342 x6 : 000000004e514553 | x5 : ffff1f6501499265 x4 : ffff1f650ff60b10 x3 : 0000000000000620 | x2 : ffff80008002ba78 x1 : 0000000000000000 x0 : 0000000000000000 | Call trace: | cancel_delayed_work+0x34/0x44 | deferred_probe_extend_timeout+0x20/0x70 | driver_register+0xa8/0x110 | __platform_driver_register+0x28/0x3c | syscon_init+0x24/0x38 | do_one_initcall+0xe4/0x338 | do_initcall_level+0xac/0x178 | do_initcalls+0x5c/0xa0 | do_basic_setup+0x20/0x30 | kernel_init_freeable+0x8c/0xf8 | kernel_init+0x28/0x1b4 | ret_from_fork+0x10/0x20 | Code: f9000fbf 97fffa2f 39400268 37100048 (d42aa2a0) | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: UBSAN: integer subtraction overflow: Fatal exception This is due to shift_and_mask() using a signed immediate to construct the mask and being called with a shift of 31 (WORK_OFFQ_POOL_SHIFT) so that it ends up decrementing from INT_MIN. Use an unsigned constant '1U' to generate the mask in shift_and_mask(). Cc: Tejun Heo Cc: Lai Jiangshan Fixes: 1211f3b21c2a ("workqueue: Preserve OFFQ bits in cancel[_sync] paths") Signed-off-by: Will Deacon Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1745ca788ede..b35f8ce80bc7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -897,7 +897,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { - return (v >> shift) & ((1 << bits) - 1); + return (v >> shift) & ((1U << bits) - 1); } static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) -- cgit v1.2.3 From 98cc1730c89467fc26e2dc2ceb2a014f332daa97 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 25 Jul 2024 09:04:37 +0800 Subject: workqueue: Remove incorrect "WARN_ON_ONCE(!list_empty(&worker->entry));" from dying worker The commit 68f83057b913 ("workqueue: Reap workers via kthread_stop() and remove detach_completion") changes the procedure of destroying workers; the dying workers are kept in the cull_list in wake_dying_workers() with the pool lock held and removed from the cull_list by the newly added reap_dying_workers() without the pool lock. This can cause a warning if the dying worker is wokenup earlier than reaped as reported by Marc: 2024/07/23 18:01:21 [M83LP63]: [ 157.267727] ------------[ cut here ]------------ 2024/07/23 18:01:21 [M83LP63]: [ 157.267735] WARNING: CPU: 21 PID: 725 at kernel/workqueue.c:3340 worker_thread+0x54e/0x558 2024/07/23 18:01:21 [M83LP63]: [ 157.267746] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables sunrpc dm_service_time s390_trng vfio_ccw mdev vfio_iommu_type1 vfio sch_fq_codel 2024/07/23 18:01:21 [M83LP63]: loop dm_multipath configfs nfnetlink lcs ctcm fsm zfcp scsi_transport_fc ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common scm_block eadm_sch scsi_dh_rdac scsi_dh_emc scsi_dh_alua pkey zcrypt rng_core autofs4 2024/07/23 18:01:21 [M83LP63]: [ 157.267792] CPU: 21 PID: 725 Comm: kworker/dying Not tainted 6.10.0-rc2-00239-g68f83057b913 #95 2024/07/23 18:01:21 [M83LP63]: [ 157.267796] Hardware name: IBM 3906 M04 704 (LPAR) 2024/07/23 18:01:21 [M83LP63]: [ 157.267802] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 2024/07/23 18:01:21 [M83LP63]: [ 157.267797] Krnl PSW : 0704d00180000000 000003d600fcd9fa (worker_thread+0x552/0x558) 2024/07/23 18:01:21 [M83LP63]: [ 157.267806] Krnl GPRS: 6479696e6700776f 000002c901b62780 000003d602493ec8 000002c914954600 2024/07/23 18:01:21 [M83LP63]: [ 157.267809] 0000000000000000 0000000000000008 000002c901a85400 000002c90719e840 2024/07/23 18:01:21 [M83LP63]: [ 157.267811] 000002c90719e880 000002c901a85420 000002c91127adf0 000002c901a85400 2024/07/23 18:01:21 [M83LP63]: [ 157.267813] 000002c914954600 0000000000000000 000003d600fcd772 000003560452bd98 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] Krnl Code: 000003d600fcd9ec: c0e500674262 brasl %r14,000003d601cb5eb0 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcd9f2: a7f4ffc8 brc 15,000003d600fcd982 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] #000003d600fcd9f6: af000000 mc 0,0 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] >000003d600fcd9fa: a7f4fec2 brc 15,000003d600fcd77e 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcd9fe: 0707 bcr 0,%r7 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcda00: c00400682e10 brcl 0,000003d601cd3620 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcda06: eb7ff0500024 stmg %r7,%r15,80(%r15) 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcda0c: b90400ef lgr %r14,%r15 2024/07/23 18:01:21 [M83LP63]: [ 157.267853] Call Trace: 2024/07/23 18:01:21 [M83LP63]: [ 157.267855] [<000003d600fcd9fa>] worker_thread+0x552/0x558 2024/07/23 18:01:21 [M83LP63]: [ 157.267859] ([<000003d600fcd772>] worker_thread+0x2ca/0x558) 2024/07/23 18:01:21 [M83LP63]: [ 157.267862] [<000003d600fd6c80>] kthread+0x120/0x128 2024/07/23 18:01:21 [M83LP63]: [ 157.267865] [<000003d600f5305c>] __ret_from_fork+0x3c/0x58 2024/07/23 18:01:21 [M83LP63]: [ 157.267868] [<000003d601cc746a>] ret_from_fork+0xa/0x30 2024/07/23 18:01:21 [M83LP63]: [ 157.267873] Last Breaking-Event-Address: 2024/07/23 18:01:21 [M83LP63]: [ 157.267874] [<000003d600fcd778>] worker_thread+0x2d0/0x558 Since the procedure of destroying workers is changed, the WARN_ON_ONCE() becomes incorrect and should be removed. Cc: Marc Hartmayer Link: https://lore.kernel.org/lkml/87le1sjd2e.fsf@linux.ibm.com/ Reported-by: Marc Hartmayer Fixes: 68f83057b913 ("workqueue: Reap workers via kthread_stop() and remove detach_completion") Cc: stable@vger.kernel.org # v6.11+ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b35f8ce80bc7..d56bd2277e58 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3351,7 +3351,6 @@ woke_up: set_pf_worker(false); ida_free(&pool->worker_ida, worker->id); - WARN_ON_ONCE(!list_empty(&worker->entry)); return 0; } -- cgit v1.2.3 From 8bc35475ef1a23b0e224f3242eb11c76cab0ea88 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Aug 2024 09:37:25 -1000 Subject: workqueue: Fix spruious data race in __flush_work() When flushing a work item for cancellation, __flush_work() knows that it exclusively owns the work item through its PENDING bit. 134874e2eee9 ("workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items") added a read of @work->data to determine whether to use busy wait for BH work items that are being canceled. While the read is safe when @from_cancel, @work->data was read before testing @from_cancel to simplify code structure: data = *work_data_bits(work); if (from_cancel && !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { While the read data was never used if !@from_cancel, this could trigger KCSAN data race detection spuriously: ================================================================== BUG: KCSAN: data-race in __flush_work / __flush_work write to 0xffff8881223aa3e8 of 8 bytes by task 3998 on cpu 0: instrument_write include/linux/instrumented.h:41 [inline] ___set_bit include/asm-generic/bitops/instrumented-non-atomic.h:28 [inline] insert_wq_barrier kernel/workqueue.c:3790 [inline] start_flush_work kernel/workqueue.c:4142 [inline] __flush_work+0x30b/0x570 kernel/workqueue.c:4178 flush_work kernel/workqueue.c:4229 [inline] ... read to 0xffff8881223aa3e8 of 8 bytes by task 50 on cpu 1: __flush_work+0x42a/0x570 kernel/workqueue.c:4188 flush_work kernel/workqueue.c:4229 [inline] flush_delayed_work+0x66/0x70 kernel/workqueue.c:4251 ... value changed: 0x0000000000400000 -> 0xffff88810006c00d Reorganize the code so that @from_cancel is tested before @work->data is accessed. The only problem is triggering KCSAN detection spuriously. This shouldn't need READ_ONCE() or other access qualifiers. No functional changes. Signed-off-by: Tejun Heo Reported-by: syzbot+b3e4f2f51ed645fd5df2@syzkaller.appspotmail.com Fixes: 134874e2eee9 ("workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items") Link: http://lkml.kernel.org/r/000000000000ae429e061eea2157@google.com Cc: Jens Axboe --- kernel/workqueue.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d56bd2277e58..ef174d8c1f63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4166,7 +4166,6 @@ already_gone: static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; - unsigned long data; if (WARN_ON(!wq_online)) return false; @@ -4184,29 +4183,35 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) * was queued on a BH workqueue, we also know that it was running in the * BH context and thus can be busy-waited. */ - data = *work_data_bits(work); - if (from_cancel && - !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { - /* - * On RT, prevent a live lock when %current preempted soft - * interrupt processing or prevents ksoftirqd from running by - * keeping flipping BH. If the BH work item runs on a different - * CPU then this has no effect other than doing the BH - * disable/enable dance for nothing. This is copied from - * kernel/softirq.c::tasklet_unlock_spin_wait(). - */ - while (!try_wait_for_completion(&barr.done)) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - local_bh_disable(); - local_bh_enable(); - } else { - cpu_relax(); + if (from_cancel) { + unsigned long data = *work_data_bits(work); + + if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && + (data & WORK_OFFQ_BH)) { + /* + * On RT, prevent a live lock when %current preempted + * soft interrupt processing or prevents ksoftirqd from + * running by keeping flipping BH. If the BH work item + * runs on a different CPU then this has no effect other + * than doing the BH disable/enable dance for nothing. + * This is copied from + * kernel/softirq.c::tasklet_unlock_spin_wait(). + */ + while (!try_wait_for_completion(&barr.done)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } } + goto out_destroy; } - } else { - wait_for_completion(&barr.done); } + wait_for_completion(&barr.done); + +out_destroy: destroy_work_on_stack(&barr.work); return true; } -- cgit v1.2.3 From c4c8f369b6a6d21ce27286de1501137771e01dc3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 5 Aug 2024 09:30:29 +0200 Subject: workqueue: Correct declaration of cpu_pwq in struct workqueue_struct cpu_pwq is used in various percpu functions that expect variable in __percpu address space. Correct the declaration of cpu_pwq to struct pool_workqueue __rcu * __percpu *cpu_pwq to declare the variable as __percpu pointer. The patch also fixes following sparse errors: workqueue.c:380:37: warning: duplicate [noderef] workqueue.c:380:37: error: multiple address spaces given: __rcu & __percpu workqueue.c:2271:15: error: incompatible types in comparison expression (different address spaces): workqueue.c:2271:15: struct pool_workqueue [noderef] __rcu * workqueue.c:2271:15: struct pool_workqueue [noderef] __percpu * and uncovers a couple of exisiting "incorrect type in assignment" warnings (from __rcu address space), which this patch does not address. Found by GCC's named address space checks. There were no changes in the resulting object files. Signed-off-by: Uros Bizjak Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef174d8c1f63..e7b005ff3750 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -377,7 +377,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ + struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; -- cgit v1.2.3 From 073107b39e553a5ef911f019d4e433fd5a8601b7 Mon Sep 17 00:00:00 2001 From: Sangmoon Kim Date: Tue, 6 Aug 2024 14:12:09 +0900 Subject: workqueue: add cmdline parameter workqueue.panic_on_stall When we want to debug the workqueue stall, we can immediately make a panic to get the information we want. In some systems, it may be necessary to quickly reboot the system to escape from a workqueue lockup situation. In this case, we can control the number of stall detections to generate panic. workqueue.panic_on_stall sets the number times of the stall to trigger panic. 0 disables the panic on stall. Signed-off-by: Sangmoon Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1745ca788ede..80cb0a923046 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7398,6 +7398,9 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; +static unsigned int wq_panic_on_stall; +module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); + /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. @@ -7449,6 +7452,16 @@ static void show_cpu_pools_hogs(void) rcu_read_unlock(); } +static void panic_on_wq_watchdog(void) +{ + static unsigned int wq_stall; + + if (wq_panic_on_stall) { + wq_stall++; + BUG_ON(wq_stall >= wq_panic_on_stall); + } +} + static void wq_watchdog_reset_touched(void) { int cpu; @@ -7521,6 +7534,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (cpu_pool_stall) show_cpu_pools_hogs(); + if (lockup_detected) + panic_on_wq_watchdog(); + wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } -- cgit v1.2.3 From b188c57af2b5c17a1e8f71a0358f330446a4f788 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 9 Aug 2024 15:28:23 -0700 Subject: workqueue: Split alloc_workqueue into internal function and lockdep init Will help enable user-defined lockdep maps for workqueues. Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo --- kernel/workqueue.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 80cb0a923046..3db4703f4652 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5612,9 +5612,9 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } __printf(1, 4) -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...) +static struct workqueue_struct *__alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) { va_list args; struct workqueue_struct *wq; @@ -5680,12 +5680,11 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); - wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (flags & WQ_UNBOUND) { if (alloc_node_nr_active(wq->node_nr_active) < 0) - goto err_unreg_lockdep; + goto err_free_wq; } /* @@ -5724,9 +5723,6 @@ err_unlock_free_node_nr_active: kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); } -err_unreg_lockdep: - wq_unregister_lockdep(wq); - wq_free_lockdep(wq); err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); @@ -5737,6 +5733,25 @@ err_destroy: destroy_workqueue(wq); return NULL; } + +__printf(1, 4) +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, max_active); + wq = __alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + wq_init_lockdep(wq); + + return wq; +} EXPORT_SYMBOL_GPL(alloc_workqueue); static bool pwq_busy(struct pool_workqueue *pwq) -- cgit v1.2.3 From 4f022f430e21e456893283036bc2ea78ac6bd2a1 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 9 Aug 2024 15:28:24 -0700 Subject: workqueue: Change workqueue lockdep map to pointer Will help enable user-defined lockdep maps for workqueues. Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo --- kernel/workqueue.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3db4703f4652..4c8ba5c4229f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -364,7 +364,8 @@ struct workqueue_struct { #ifdef CONFIG_LOCKDEP char *lock_name; struct lock_class_key key; - struct lockdep_map lockdep_map; + struct lockdep_map __lockdep_map; + struct lockdep_map *lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ @@ -3203,7 +3204,7 @@ __acquires(&pool->lock) lockdep_start_depth = lockdep_depth(current); /* see drain_dead_softirq_workfn() */ if (!bh_draining) - lock_map_acquire(&pwq->wq->lockdep_map); + lock_map_acquire(pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding @@ -3237,7 +3238,7 @@ __acquires(&pool->lock) pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); if (!bh_draining) - lock_map_release(&pwq->wq->lockdep_map); + lock_map_release(pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || @@ -3873,8 +3874,8 @@ static void touch_wq_lockdep_map(struct workqueue_struct *wq) if (wq->flags & WQ_BH) local_bh_disable(); - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); + lock_map_acquire(wq->lockdep_map); + lock_map_release(wq->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); @@ -3908,7 +3909,7 @@ void __flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)), }; int next_color; @@ -4768,7 +4769,8 @@ static void wq_init_lockdep(struct workqueue_struct *wq) lock_name = wq->name; wq->lock_name = lock_name; - lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); + wq->lockdep_map = &wq->__lockdep_map; + lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) -- cgit v1.2.3 From ec0a7d44b358afaaf52856d03c72e20587bc888b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 9 Aug 2024 15:28:25 -0700 Subject: workqueue: Add interface for user-defined workqueue lockdep map Add an interface for a user-defined workqueue lockdep map, which is helpful when multiple workqueues are created for the same purpose. This also helps avoid leaking lockdep maps on each workqueue creation. v2: - Add alloc_workqueue_lockdep_map (Tejun) v3: - Drop __WQ_USER_OWNED_LOCKDEP (Tejun) - static inline alloc_ordered_workqueue_lockdep_map (Tejun) Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/workqueue.c | 28 +++++++++++++++++++++++++ 2 files changed, 80 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4eb8f9563136..8ccbf510880b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -507,6 +507,58 @@ void workqueue_softirq_dead(unsigned int cpu); __printf(1, 4) struct workqueue_struct * alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); +#ifdef CONFIG_LOCKDEP +/** + * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @lockdep_map: user-defined lockdep_map + * @...: args for @fmt + * + * Same as alloc_workqueue but with the a user-define lockdep_map. Useful for + * workqueues created with the same purpose and to avoid leaking a lockdep_map + * on each workqueue creation. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(1, 5) struct workqueue_struct * +alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, + struct lockdep_map *lockdep_map, ...); + +/** + * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with + * user-defined lockdep_map + * + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @lockdep_map: user-defined lockdep_map + * @args: args for @fmt + * + * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. + * Useful for workqueues created with the same purpose and to avoid leaking a + * lockdep_map on each workqueue creation. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(1, 4) static inline struct workqueue_struct * +alloc_ordered_workqueue_lockdep_map(const char *fmt, unsigned int flags, + struct lockdep_map *lockdep_map, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, lockdep_map); + wq = alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | flags, + 1, lockdep_map, args); + va_end(args); + + return wq; +} +#endif + /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c8ba5c4229f..7468ba5e2417 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4775,11 +4775,17 @@ static void wq_init_lockdep(struct workqueue_struct *wq) static void wq_unregister_lockdep(struct workqueue_struct *wq) { + if (wq->lockdep_map != &wq->__lockdep_map) + return; + lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { + if (wq->lockdep_map != &wq->__lockdep_map) + return; + if (wq->lock_name != wq->name) kfree(wq->lock_name); } @@ -5756,6 +5762,28 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, } EXPORT_SYMBOL_GPL(alloc_workqueue); +#ifdef CONFIG_LOCKDEP +__printf(1, 5) +struct workqueue_struct * +alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, + int max_active, struct lockdep_map *lockdep_map, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, lockdep_map); + wq = __alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + wq->lockdep_map = lockdep_map; + + return wq; +} +EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); +#endif + static bool pwq_busy(struct pool_workqueue *pwq) { int i; -- cgit v1.2.3 From 9b59a85a84dc37ca4f2c54df5e06aff4c1eae5d3 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 20 Aug 2024 12:38:08 -0700 Subject: workqueue: Don't call va_start / va_end twice Calling va_start / va_end multiple times is undefined and causes problems with certain compiler / platforms. Change alloc_ordered_workqueue_lockdep_map to a macro and updated __alloc_workqueue to take a va_list argument. Cc: Sergey Senozhatsky Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 ++-------------- kernel/workqueue.c | 6 +----- 2 files changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 04dff07a9e2b..f3cc15940b4d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -543,20 +543,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -__printf(1, 4) static inline struct workqueue_struct * -alloc_ordered_workqueue_lockdep_map(const char *fmt, unsigned int flags, - struct lockdep_map *lockdep_map, ...) -{ - struct workqueue_struct *wq; - va_list args; - - va_start(args, lockdep_map); - wq = alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | flags, - 1, lockdep_map, args); - va_end(args); - - return wq; -} +#define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \ + alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, lockdep_map, ##args) #endif /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7468ba5e2417..7cc77adb18bb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5619,12 +5619,10 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } -__printf(1, 4) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, - int max_active, ...) + int max_active, va_list args) { - va_list args; struct workqueue_struct *wq; size_t wq_size; int name_len; @@ -5656,9 +5654,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, goto err_free_wq; } - va_start(args, max_active); name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); - va_end(args); if (name_len >= WQ_NAME_LEN) pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", -- cgit v1.2.3 From 84c425bef3409d69ddb171c89664f66030bbf9d9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 15 Aug 2024 16:02:58 +0900 Subject: workqueue: fix null-ptr-deref on __alloc_workqueue() error wq->lockdep_map is set only after __alloc_workqueue() successfully returns. However, on its error path __alloc_workqueue() may call destroy_workqueue() which expects wq->lockdep_map to be already set, which results in a null-ptr-deref in touch_wq_lockdep_map(). Add a simple NULL-check to touch_wq_lockdep_map(). Oops: general protection fault, probably for non-canonical address KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:__lock_acquire+0x81/0x7800 [..] Call Trace: ? __die_body+0x66/0xb0 ? die_addr+0xb2/0xe0 ? exc_general_protection+0x300/0x470 ? asm_exc_general_protection+0x22/0x30 ? __lock_acquire+0x81/0x7800 ? mark_lock+0x94/0x330 ? __lock_acquire+0x12fd/0x7800 ? __lock_acquire+0x3439/0x7800 lock_acquire+0x14c/0x3e0 ? __flush_workqueue+0x167/0x13a0 ? __init_swait_queue_head+0xaf/0x150 ? __flush_workqueue+0x167/0x13a0 __flush_workqueue+0x17d/0x13a0 ? __flush_workqueue+0x167/0x13a0 ? lock_release+0x50f/0x830 ? drain_workqueue+0x94/0x300 drain_workqueue+0xe3/0x300 destroy_workqueue+0xac/0xc40 ? workqueue_sysfs_register+0x159/0x2f0 __alloc_workqueue+0x1506/0x1760 alloc_workqueue+0x61/0x150 ... Signed-off-by: Sergey Senozhatsky Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7cc77adb18bb..03dc2c95d62e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3871,6 +3871,9 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, static void touch_wq_lockdep_map(struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP + if (unlikely(!wq->lockdep_map)) + return; + if (wq->flags & WQ_BH) local_bh_disable(); -- cgit v1.2.3 From b4722b8593b8815785bbadf87f13c88e89a0ebef Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 11 Sep 2024 13:07:28 +0800 Subject: kernel/workqueue.c: fix DEFINE_PER_CPU_SHARED_ALIGNED expansion Make tags always produces below annoying warnings: ctags: Warning: kernel/workqueue.c:470: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:474: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:478: null expansion of name pattern "\1" In commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions"), codes in places have been adjusted including cpu_worker_pools definition. I noticed in commit 4cb1ef64609f ("workqueue: Implement BH workqueues to eventually replace tasklets"), cpu_worker_pools definition was unfolded back. Not sure if it was intentionally done or ignored carelessly. Makes change to mute them specifically. Signed-off-by: Baoquan He Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 03dc2c95d62e..95a7372431f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -477,16 +477,13 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* to raise softirq for the BH worker pools on other CPUs */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], - bh_pool_irq_works); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works); /* the BH worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - bh_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ -- cgit v1.2.3 From 73613840a8896f4f859eea489cb4a7a656939e70 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 12 Sep 2024 11:23:29 +0800 Subject: workqueue: Clear worker->pool in the worker thread context Marc Hartmayer reported: [ 23.133876] Unable to handle kernel pointer dereference in virtual kernel address space [ 23.133950] Failing address: 0000000000000000 TEID: 0000000000000483 [ 23.133954] Fault in home space mode while using kernel ASCE. [ 23.133957] AS:000000001b8f0007 R3:0000000056cf4007 S:0000000056cf3800 P:000000000000003d [ 23.134207] Oops: 0004 ilc:2 [#1] SMP (snip) [ 23.134516] Call Trace: [ 23.134520] [<0000024e326caf28>] worker_thread+0x48/0x430 [ 23.134525] ([<0000024e326caf18>] worker_thread+0x38/0x430) [ 23.134528] [<0000024e326d3a3e>] kthread+0x11e/0x130 [ 23.134533] [<0000024e3264b0dc>] __ret_from_fork+0x3c/0x60 [ 23.134536] [<0000024e333fb37a>] ret_from_fork+0xa/0x38 [ 23.134552] Last Breaking-Event-Address: [ 23.134553] [<0000024e333f4c04>] mutex_unlock+0x24/0x30 [ 23.134562] Kernel panic - not syncing: Fatal exception: panic_on_oops With debuging and analysis, worker_thread() accesses to the nullified worker->pool when the newly created worker is destroyed before being waken-up, in which case worker_thread() can see the result detach_worker() reseting worker->pool to NULL at the begining. Move the code "worker->pool = NULL;" out from detach_worker() to fix the problem. worker->pool had been designed to be constant for regular workers and changeable for rescuer. To share attaching/detaching code for regular and rescuer workers and to avoid worker->pool being accessed inadvertently when the worker has been detached, worker->pool is reset to NULL when detached no matter the worker is rescuer or not. To maintain worker->pool being reset after detached, move the code "worker->pool = NULL;" in the worker thread context after detached. It is either be in the regular worker thread context after PF_WQ_WORKER is cleared or in rescuer worker thread context with wq_pool_attach_mutex held. So it is safe to do so. Cc: Marc Hartmayer Link: https://lore.kernel.org/lkml/87wmjj971b.fsf@linux.ibm.com/ Reported-by: Marc Hartmayer Fixes: f4b7b53c94af ("workqueue: Detach workers directly in idle_cull_fn()") Cc: stable@vger.kernel.org # v6.11+ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e7b005ff3750..6f2545037e57 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2709,7 +2709,6 @@ static void detach_worker(struct worker *worker) unbind_worker(worker); list_del(&worker->node); - worker->pool = NULL; } /** @@ -2729,6 +2728,7 @@ static void worker_detach_from_pool(struct worker *worker) mutex_lock(&wq_pool_attach_mutex); detach_worker(worker); + worker->pool = NULL; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ @@ -3349,7 +3349,11 @@ woke_up: if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); - + /* + * The worker is dead and PF_WQ_WORKER is cleared, worker->pool + * shouldn't be accessed, reset it to NULL in case otherwise. + */ + worker->pool = NULL; ida_free(&pool->worker_ida, worker->id); return 0; } -- cgit v1.2.3