From 8b62645b09f870d70c7910e7550289d444239a46 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Fri, 20 Sep 2024 16:06:59 -0300 Subject: bpf: Use raw_spinlock_t in ringbuf The function __bpf_ringbuf_reserve is invoked from a tracepoint, which disables preemption. Using spinlock_t in this context can lead to a "sleep in atomic" warning in the RT variant. This issue is illustrated in the example below: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 556208, name: test_progs preempt_count: 1, expected: 0 RCU nest depth: 1, expected: 1 INFO: lockdep is turned off. Preemption disabled at: [] migrate_enable+0xc0/0x39c CPU: 7 PID: 556208 Comm: test_progs Tainted: G Hardware name: Qualcomm SA8775P Ride (DT) Call trace: dump_backtrace+0xac/0x130 show_stack+0x1c/0x30 dump_stack_lvl+0xac/0xe8 dump_stack+0x18/0x30 __might_resched+0x3bc/0x4fc rt_spin_lock+0x8c/0x1a4 __bpf_ringbuf_reserve+0xc4/0x254 bpf_ringbuf_reserve_dynptr+0x5c/0xdc bpf_prog_ac3d15160d62622a_test_read_write+0x104/0x238 trace_call_bpf+0x238/0x774 perf_call_bpf_enter.isra.0+0x104/0x194 perf_syscall_enter+0x2f8/0x510 trace_sys_enter+0x39c/0x564 syscall_trace_enter+0x220/0x3c0 do_el0_svc+0x138/0x1dc el0_svc+0x54/0x130 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Switch the spinlock to raw_spinlock_t to avoid this error. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Brian Grech Signed-off-by: Wander Lairson Costa Signed-off-by: Wander Lairson Costa Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240920190700.617253-1-wander@redhat.com --- kernel/bpf/ringbuf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..de3b681d1d13 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } -- cgit v1.2.3 From 678379e1d4f7443b170939525d3312cfc37bf86b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Aug 2024 15:17:00 -0400 Subject: close_range(): fix the logics in descriptor table trimming Cloning a descriptor table picks the size that would cover all currently opened files. That's fine for clone() and unshare(), but for close_range() there's an additional twist - we clone before we close, and it would be a shame to have close_range(3, ~0U, CLOSE_RANGE_UNSHARE) leave us with a huge descriptor table when we are not going to keep anything past stderr, just because some large file descriptor used to be open before our call has taken it out. Unfortunately, it had been dealt with in an inherently racy way - sane_fdtable_size() gets a "don't copy anything past that" argument (passed via unshare_fd() and dup_fd()), close_range() decides how much should be trimmed and passes that to unshare_fd(). The problem is, a range that used to extend to the end of descriptor table back when close_range() had looked at it might very well have stuff grown after it by the time dup_fd() has allocated a new files_struct and started to figure out the capacity of fdtable to be attached to that. That leads to interesting pathological cases; at the very least it's a QoI issue, since unshare(CLONE_FILES) is atomic in a sense that it takes a snapshot of descriptor table one might have observed at some point. Since CLOSE_RANGE_UNSHARE close_range() is supposed to be a combination of unshare(CLONE_FILES) with plain close_range(), ending up with a weird state that would never occur with unshare(2) is confusing, to put it mildly. It's not hard to get rid of - all it takes is passing both ends of the range down to sane_fdtable_size(). There we are under ->files_lock, so the race is trivially avoided. So we do the following: * switch close_files() from calling unshare_fd() to calling dup_fd(). * undo the calling convention change done to unshare_fd() in 60997c3d45d9 "close_range: add CLOSE_RANGE_UNSHARE" * introduce struct fd_range, pass a pointer to that to dup_fd() and sane_fdtable_size() instead of "trim everything past that point" they are currently getting. NULL means "we are not going to be punching any holes"; NR_OPEN_MAX is gone. * make sane_fdtable_size() use find_last_bit() instead of open-coding it; it's easier to follow that way. * while we are at it, have dup_fd() report errors by returning ERR_PTR(), no need to use a separate int *errorp argument. Fixes: 60997c3d45d9 "close_range: add CLOSE_RANGE_UNSHARE" Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/file.c | 95 ++++++++++++++++++------------------------------- include/linux/fdtable.h | 8 ++--- kernel/fork.c | 32 ++++++++--------- 3 files changed, 52 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/fs/file.c b/fs/file.c index 5125607d040a..eb093e736972 100644 --- a/fs/file.c +++ b/fs/file.c @@ -272,59 +272,45 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) return test_bit(fd, fdt->open_fds); } -static unsigned int count_open_files(struct fdtable *fdt) -{ - unsigned int size = fdt->max_fds; - unsigned int i; - - /* Find the last open fd */ - for (i = size / BITS_PER_LONG; i > 0; ) { - if (fdt->open_fds[--i]) - break; - } - i = (i + 1) * BITS_PER_LONG; - return i; -} - /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * - * 'max_fds' will normally already be properly aligned, but it - * turns out that in the close_range() -> __close_range() -> - * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end - * up having a 'max_fds' value that isn't already aligned. - * - * Rather than make close_range() have to worry about this, - * just make that BITS_PER_LONG alignment be part of a sane - * fdtable size. Becuase that's really what it is. + * punch_hole is optional - when close_range() is asked to unshare + * and close, we don't need to copy descriptors in that range, so + * a smaller cloned descriptor table might suffice if the last + * currently opened descriptor falls into that range. */ -static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { - unsigned int count; - - count = count_open_files(fdt); - if (max_fds < NR_OPEN_DEFAULT) - max_fds = NR_OPEN_DEFAULT; - return ALIGN(min(count, max_fds), BITS_PER_LONG); + unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); + + if (last == fdt->max_fds) + return NR_OPEN_DEFAULT; + if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { + last = find_last_bit(fdt->open_fds, punch_hole->from); + if (last == punch_hole->from) + return NR_OPEN_DEFAULT; + } + return ALIGN(last + 1, BITS_PER_LONG); } /* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. + * Allocate a new descriptor table and copy contents from the passed in + * instance. Returns a pointer to cloned table on success, ERR_PTR() + * on failure. For 'punch_hole' see sane_fdtable_size(). */ -struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; + int error; - *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) - goto out; + return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); @@ -341,7 +327,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. @@ -354,14 +340,14 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { - *errorp = -ENOMEM; + error = -ENOMEM; goto out_release; } /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { __free_fdtable(new_fdt); - *errorp = -EMFILE; + error = -EMFILE; goto out_release; } @@ -372,7 +358,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); @@ -406,8 +392,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int out_release: kmem_cache_free(files_cachep, newf); -out: - return NULL; + return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) @@ -748,37 +733,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) if (fd > max_fd) return -EINVAL; - if (flags & CLOSE_RANGE_UNSHARE) { - int ret; - unsigned int max_unshare_fds = NR_OPEN_MAX; + if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { + struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ - if (!(flags & CLOSE_RANGE_CLOEXEC)) { - /* - * If the requested range is greater than the current - * maximum, we're closing everything so only copy all - * file descriptors beneath the lowest file descriptor. - */ - rcu_read_lock(); - if (max_fd >= last_fd(files_fdtable(cur_fds))) - max_unshare_fds = fd; - rcu_read_unlock(); - } - - ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); - if (ret) - return ret; + if (flags & CLOSE_RANGE_CLOEXEC) + punch_hole = NULL; + fds = dup_fd(cur_fds, punch_hole); + if (IS_ERR(fds)) + return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ - if (fds) - swap(cur_fds, fds); + swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 2944d4aa413b..b1c5722f2b3c 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -22,7 +22,6 @@ * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG -#define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; @@ -106,7 +105,10 @@ struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); -struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; +struct fd_range { + unsigned int from, to; +}; +struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), @@ -115,8 +117,6 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern struct file *file_close_fd(unsigned int fd); -extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; diff --git a/kernel/fork.c b/kernel/fork.c index 60c0b4868fd4..89ceb4a68af2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1756,33 +1756,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; - int error = 0; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) - goto out; + return 0; if (no_files) { tsk->files = NULL; - goto out; + return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf); tsk->files = newf; - error = 0; -out: - return error; + return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@ -3238,17 +3235,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; } return 0; @@ -3306,7 +3302,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3398,7 +3394,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); + error = unshare_fd(CLONE_FILES, ©); if (error || !copy) return error; -- cgit v1.2.3 From e9bd9c498cb0f5843996dbe5cbce7a1836a83c70 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 24 Sep 2024 14:08:43 -0700 Subject: bpf: sync_linked_regs() must preserve subreg_def Range propagation must not affect subreg_def marks, otherwise the following example is rewritten by verifier incorrectly when BPF_F_TEST_RND_HI32 flag is set: 0: call bpf_ktime_get_ns call bpf_ktime_get_ns 1: r0 &= 0x7fffffff after verifier r0 &= 0x7fffffff 2: w1 = w0 rewrites w1 = w0 3: if w0 < 10 goto +0 --------------> r11 = 0x2f5674a6 (r) 4: r1 >>= 32 r11 <<= 32 (r) 5: r0 = r1 r1 |= r11 (r) 6: exit; if w0 < 0xa goto pc+0 r1 >>= 32 r0 = r1 exit (or zero extension of w1 at (2) is missing for architectures that require zero extension for upper register half). The following happens w/o this patch: - r0 is marked as not a subreg at (0); - w1 is marked as subreg at (2); - w1 subreg_def is overridden at (3) by copy_register_state(); - w1 is read at (5) but mark_insn_zext() does not mark (2) for zero extension, because w1 subreg_def is not set; - because of BPF_F_TEST_RND_HI32 flag verifier inserts random value for hi32 bits of (2) (marked (r)); - this random value is read at (5). Fixes: 75748837b7e5 ("bpf: Propagate scalar ranges through register assignments.") Reported-by: Lonial Con Signed-off-by: Lonial Con Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Closes: https://lore.kernel.org/bpf/7e2aa30a62d740db182c170fdd8f81c596df280d.camel@gmail.com Link: https://lore.kernel.org/bpf/20240924210844.1758441-1-eddyz87@gmail.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a7ed527e47e..434de48cd24b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15326,8 +15326,12 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { + s32 saved_subreg_def = reg->subreg_def; + copy_register_state(reg, known_reg); + reg->subreg_def = saved_subreg_def; } else { + s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; fake_reg.type = SCALAR_VALUE; @@ -15340,6 +15344,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; + reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); -- cgit v1.2.3 From 3c5d61ae919cc377c71118ccc76fa6e8518023f8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 30 Sep 2024 13:37:10 +0200 Subject: rcu/kvfree: Refactor kvfree_rcu_queue_batch() Improve readability of kvfree_rcu_queue_batch() function in away that, after a first batch queuing, the loop is break and success value is returned to a caller. There is no reason to loop and check batches further as all outstanding objects have already been picked and attached to a certain batch to complete an offloading. Fixes: 2b55d6a42d14 ("rcu/kvfree: Add kvfree_rcu_barrier() API") Suggested-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/ZvWUt2oyXRsvJRNc@pc636/T/ Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Vlastimil Babka --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a60616e69b66..b1f883fcd918 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3607,11 +3607,12 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) } // One work is per one batch, so there are three - // "free channels", the batch can handle. It can - // be that the work is in the pending state when - // channels have been detached following by each - // other. + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; } } -- cgit v1.2.3 From 9b5ce1a37e904fac32d560668134965f4e937f6c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 1 Oct 2024 03:34:01 +0200 Subject: sched: Fix sched_delayed vs cfs_bandwidth Meeting an unfinished DELAY_DEQUEUE treated entity in unthrottle_cfs_rq() leads to a couple terminal scenarios. Finish it first, so ENQUEUE_WAKEUP can proceed as it would have sans DELAY_DEQUEUE treatment. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: Venkat Rao Bagalkote Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Tested-by: Venkat Rao Bagalkote Link: https://lore.kernel.org/r/7515d2e64c989b9e3b828a9e21bcd959b99df06a.camel@gmx.de --- kernel/sched/fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 225b31aaee55..b63a7ac31162 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6058,10 +6058,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) { - SCHED_WARN_ON(se->sched_delayed); + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) -- cgit v1.2.3 From d4ac164bde7a12ec0a238a7ead5aa26819bbb1c1 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 25 Sep 2024 16:54:40 +0800 Subject: sched/eevdf: Fix wakeup-preempt by checking cfs_rq->nr_running Commit 85e511df3cec ("sched/eevdf: Allow shorter slices to wakeup-preempt") introduced a mechanism that a wakee with shorter slice could preempt the current running task. It also lower the bar for the current task to be preempted, by checking the rq->nr_running instead of cfs_rq->nr_running when the current task has ran out of time slice. But there is a scenario that is problematic. Say, if there is 1 cfs task and 1 rt task, before 85e511df3cec, update_deadline() will not trigger a reschedule, and after 85e511df3cec, since rq->nr_running is 2 and resched is true, a resched_curr() would happen. Some workloads (like the hackbench reported by lkp) do not like over-scheduling. We can see that the preemption rate has been increased by 2.2%: 1.654e+08 +2.2% 1.69e+08 hackbench.time.involuntary_context_switches Restore its previous check criterion. Fixes: 85e511df3cec ("sched/eevdf: Allow shorter slices to wakeup-preempt") Closes: https://lore.kernel.org/oe-lkp/202409231416.9403c2e9-oliver.sang@intel.com Reported-by: kernel test robot Suggested-by: K Prateek Nayak Signed-off-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Honglei Wang Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20240925085440.358138-1-yu.c.chen@intel.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b63a7ac31162..ab497fafa7be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (rq->nr_running == 1) + if (cfs_rq->nr_running == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { -- cgit v1.2.3 From ca9984c5f0ab3690d98b13937b2485a978c8dd73 Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 11 Sep 2024 10:41:18 +0200 Subject: bpf: devmap: provide rxq after redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rxq contains a pointer to the device from where the redirect happened. Currently, the BPF program that was executed after a redirect via BPF_MAP_TYPE_DEVMAP* does not have it set. This is particularly bad since accessing ingress_ifindex, e.g. SEC("xdp") int prog(struct xdp_md *pkt) { return bpf_redirect_map(&dev_redirect_map, 0, 0); } SEC("xdp/devmap") int prog_after_redirect(struct xdp_md *pkt) { bpf_printk("ifindex %i", pkt->ingress_ifindex); return XDP_PASS; } depends on access to rxq, so a NULL pointer gets dereferenced: <1>[ 574.475170] BUG: kernel NULL pointer dereference, address: 0000000000000000 <1>[ 574.475188] #PF: supervisor read access in kernel mode <1>[ 574.475194] #PF: error_code(0x0000) - not-present page <6>[ 574.475199] PGD 0 P4D 0 <4>[ 574.475207] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI <4>[ 574.475217] CPU: 4 UID: 0 PID: 217 Comm: kworker/4:1 Not tainted 6.11.0-rc5-reduced-00859-g780801200300 #23 <4>[ 574.475226] Hardware name: Intel(R) Client Systems NUC13ANHi7/NUC13ANBi7, BIOS ANRPL357.0026.2023.0314.1458 03/14/2023 <4>[ 574.475231] Workqueue: mld mld_ifc_work <4>[ 574.475247] RIP: 0010:bpf_prog_5e13354d9cf5018a_prog_after_redirect+0x17/0x3c <4>[ 574.475257] Code: cc cc cc cc cc cc cc 80 00 00 00 cc cc cc cc cc cc cc cc f3 0f 1e fa 0f 1f 44 00 00 66 90 55 48 89 e5 f3 0f 1e fa 48 8b 57 20 <48> 8b 52 00 8b 92 e0 00 00 00 48 bf f8 a6 d5 c4 5d a0 ff ff be 0b <4>[ 574.475263] RSP: 0018:ffffa62440280c98 EFLAGS: 00010206 <4>[ 574.475269] RAX: ffffa62440280cd8 RBX: 0000000000000001 RCX: 0000000000000000 <4>[ 574.475274] RDX: 0000000000000000 RSI: ffffa62440549048 RDI: ffffa62440280ce0 <4>[ 574.475278] RBP: ffffa62440280c98 R08: 0000000000000002 R09: 0000000000000001 <4>[ 574.475281] R10: ffffa05dc8b98000 R11: ffffa05f577fca40 R12: ffffa05dcab24000 <4>[ 574.475285] R13: ffffa62440280ce0 R14: ffffa62440549048 R15: ffffa62440549000 <4>[ 574.475289] FS: 0000000000000000(0000) GS:ffffa05f4f700000(0000) knlGS:0000000000000000 <4>[ 574.475294] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 574.475298] CR2: 0000000000000000 CR3: 000000025522e000 CR4: 0000000000f50ef0 <4>[ 574.475303] PKRU: 55555554 <4>[ 574.475306] Call Trace: <4>[ 574.475313] <4>[ 574.475318] ? __die+0x23/0x70 <4>[ 574.475329] ? page_fault_oops+0x180/0x4c0 <4>[ 574.475339] ? skb_pp_cow_data+0x34c/0x490 <4>[ 574.475346] ? kmem_cache_free+0x257/0x280 <4>[ 574.475357] ? exc_page_fault+0x67/0x150 <4>[ 574.475368] ? asm_exc_page_fault+0x26/0x30 <4>[ 574.475381] ? bpf_prog_5e13354d9cf5018a_prog_after_redirect+0x17/0x3c <4>[ 574.475386] bq_xmit_all+0x158/0x420 <4>[ 574.475397] __dev_flush+0x30/0x90 <4>[ 574.475407] veth_poll+0x216/0x250 [veth] <4>[ 574.475421] __napi_poll+0x28/0x1c0 <4>[ 574.475430] net_rx_action+0x32d/0x3a0 <4>[ 574.475441] handle_softirqs+0xcb/0x2c0 <4>[ 574.475451] do_softirq+0x40/0x60 <4>[ 574.475458] <4>[ 574.475461] <4>[ 574.475464] __local_bh_enable_ip+0x66/0x70 <4>[ 574.475471] __dev_queue_xmit+0x268/0xe40 <4>[ 574.475480] ? selinux_ip_postroute+0x213/0x420 <4>[ 574.475491] ? alloc_skb_with_frags+0x4a/0x1d0 <4>[ 574.475502] ip6_finish_output2+0x2be/0x640 <4>[ 574.475512] ? nf_hook_slow+0x42/0xf0 <4>[ 574.475521] ip6_finish_output+0x194/0x300 <4>[ 574.475529] ? __pfx_ip6_finish_output+0x10/0x10 <4>[ 574.475538] mld_sendpack+0x17c/0x240 <4>[ 574.475548] mld_ifc_work+0x192/0x410 <4>[ 574.475557] process_one_work+0x15d/0x380 <4>[ 574.475566] worker_thread+0x29d/0x3a0 <4>[ 574.475573] ? __pfx_worker_thread+0x10/0x10 <4>[ 574.475580] ? __pfx_worker_thread+0x10/0x10 <4>[ 574.475587] kthread+0xcd/0x100 <4>[ 574.475597] ? __pfx_kthread+0x10/0x10 <4>[ 574.475606] ret_from_fork+0x31/0x50 <4>[ 574.475615] ? __pfx_kthread+0x10/0x10 <4>[ 574.475623] ret_from_fork_asm+0x1a/0x30 <4>[ 574.475635] <4>[ 574.475637] Modules linked in: veth br_netfilter bridge stp llc iwlmvm x86_pkg_temp_thermal iwlwifi efivarfs nvme nvme_core <4>[ 574.475662] CR2: 0000000000000000 <4>[ 574.475668] ---[ end trace 0000000000000000 ]--- Therefore, provide it to the program by setting rxq properly. Fixes: cb261b594b41 ("bpf: Run devmap xdp_prog on flush instead of bulk enqueue") Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Florian Kauer Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240911-devel-koalo-fix-ingress-ifindex-v4-1-5c643ae10258@linutronix.de Signed-off-by: Martin KaFai Lau --- kernel/bpf/devmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9e0e3b0a18e4..7878be18e9d2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -333,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, - struct net_device *dev) + struct net_device *tx_dev, + struct net_device *rx_dev) { - struct xdp_txq_info txq = { .dev = dev }; + struct xdp_txq_info txq = { .dev = tx_dev }; + struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; @@ -346,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; + xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -360,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); + trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); @@ -388,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) } if (bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } -- cgit v1.2.3 From 50a3242d84ee1625b0bfef29b95f935958dccfbe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 10:49:25 -0400 Subject: tracing: Fix trace_check_vprintf() when tp_printk is used When the tp_printk kernel command line is used, the trace events go directly to printk(). It is still checked via the trace_check_vprintf() function to make sure the pointers of the trace event are legit. The addition of reading buffers from previous boots required adding a delta between the addresses of the previous boot and the current boot so that the pointers in the old buffer can still be used. But this required adding a trace_array pointer to acquire the delta offsets. The tp_printk code does not provide a trace_array (tr) pointer, so when the offsets were examined, a NULL pointer dereference happened and the kernel crashed. If the trace_array does not exist, just default the delta offsets to zero, as that also means the trace event is not being read from a previous boot. Link: https://lore.kernel.org/all/Zv3z5UsG_jsO9_Tb@aschofie-mobl2.lan/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241003104925.4e1b1fd9@gandalf.local.home Fixes: 07714b4bb3f98 ("tracing: Handle old buffer mappings for event strings and functions") Reported-by: Alison Schofield Tested-by: Alison Schofield Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c01375adc471..1c69ca1f1088 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,8 +3697,8 @@ static void test_can_verify(void) void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { - long text_delta = iter->tr->text_delta; - long data_delta = iter->tr->data_delta; + long text_delta = 0; + long data_delta = 0; const char *p = fmt; const char *str; bool good; @@ -3710,6 +3710,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, if (static_branch_unlikely(&trace_no_verify)) goto print; + /* + * When the kernel is booted with the tp_printk command line + * parameter, trace events go directly through to printk(). + * It also is checked by this function, but it does not + * have an associated trace_array (tr) for it. + */ + if (iter->tr) { + text_delta = iter->tr->text_delta; + data_delta = iter->tr->data_delta; + } + /* Don't bother checking when doing a ftrace_dump() */ if (iter->fmt == static_fmt_buf) goto print; -- cgit v1.2.3 From 0bb0a5c12ecf36ad561542bbb95f96355e036a02 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:11 +0800 Subject: tracing/timerlat: Fix duplicated kthread creation due to CPU online/offline osnoise_hotplug_workfn() is the asynchronous online callback for "trace/osnoise:online". It may be congested when a CPU goes online and offline repeatedly and is invoked for multiple times after a certain online. This will lead to kthread leak and timer corruption. Add a check in start_kthread() to prevent this situation. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-2-liwei391@huawei.com Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 1439064f65d6..d1a539913a5f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2007,6 +2007,10 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; @@ -2061,11 +2065,10 @@ static int start_per_cpu_kthreads(void) if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (!WARN_ON(!kthread)) kthread_stop(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } for_each_cpu(cpu, current_mask) { -- cgit v1.2.3 From b484a02c9cedf8703eff8f0756f94618004bd165 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:12 +0800 Subject: tracing/timerlat: Drop interface_lock in stop_kthread() stop_kthread() is the offline callback for "trace/osnoise:online", since commit 5bfbcd1ee57b ("tracing/timerlat: Add interface_lock around clearing of kthread in stop_kthread()"), the following ABBA deadlock scenario is introduced: T1 | T2 [BP] | T3 [AP] osnoise_hotplug_workfn() | work_for_cpu_fn() | cpuhp_thread_fun() | _cpu_down() | osnoise_cpu_die() mutex_lock(&interface_lock) | | stop_kthread() | cpus_write_lock() | mutex_lock(&interface_lock) cpus_read_lock() | cpuhp_kick_ap() | As the interface_lock here in just for protecting the "kthread" field of the osn_var, use xchg() instead to fix this issue. Also use for_each_online_cpu() back in stop_per_cpu_kthreads() as it can take cpu_read_lock() again. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-3-liwei391@huawei.com Fixes: 5bfbcd1ee57b ("tracing/timerlat: Add interface_lock around clearing of kthread in stop_kthread()") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index d1a539913a5f..e22567174dd3 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1953,12 +1953,8 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; - mutex_lock(&interface_lock); - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (kthread) { - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; - mutex_unlock(&interface_lock); - if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); @@ -1972,7 +1968,6 @@ static void stop_kthread(unsigned int cpu) put_task_struct(kthread); } } else { - mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1994,8 +1989,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - for_each_possible_cpu(cpu) + cpus_read_lock(); + + for_each_online_cpu(cpu) stop_kthread(cpu); + + cpus_read_unlock(); } /* -- cgit v1.2.3 From 829e0c9f0855f26b3ae830d17b24aec103f7e915 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:13 +0800 Subject: tracing/timerlat: Fix a race during cpuhp processing There is another found exception that the "timerlat/1" thread was scheduled on CPU0, and lead to timer corruption finally: ``` ODEBUG: init active (active state 0) object: ffff888237c2e108 object type: hrtimer hint: timerlat_irq+0x0/0x220 WARNING: CPU: 0 PID: 426 at lib/debugobjects.c:518 debug_print_object+0x7d/0xb0 Modules linked in: CPU: 0 UID: 0 PID: 426 Comm: timerlat/1 Not tainted 6.11.0-rc7+ #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:debug_print_object+0x7d/0xb0 ... Call Trace: ? __warn+0x7c/0x110 ? debug_print_object+0x7d/0xb0 ? report_bug+0xf1/0x1d0 ? prb_read_valid+0x17/0x20 ? handle_bug+0x3f/0x70 ? exc_invalid_op+0x13/0x60 ? asm_exc_invalid_op+0x16/0x20 ? debug_print_object+0x7d/0xb0 ? debug_print_object+0x7d/0xb0 ? __pfx_timerlat_irq+0x10/0x10 __debug_object_init+0x110/0x150 hrtimer_init+0x1d/0x60 timerlat_main+0xab/0x2d0 ? __pfx_timerlat_main+0x10/0x10 kthread+0xb7/0xe0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2d/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 ``` After tracing the scheduling event, it was discovered that the migration of the "timerlat/1" thread was performed during thread creation. Further analysis confirmed that it is because the CPU online processing for osnoise is implemented through workers, which is asynchronous with the offline processing. When the worker was scheduled to create a thread, the CPU may has already been removed from the cpu_online_mask during the offline process, resulting in the inability to select the right CPU: T1 | T2 [CPUHP_ONLINE] | cpu_device_down() osnoise_hotplug_workfn() | | cpus_write_lock() | takedown_cpu(1) | cpus_write_unlock() [CPUHP_OFFLINE] | cpus_read_lock() | start_kthread(1) | cpus_read_unlock() | To fix this, skip online processing if the CPU is already offline. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-4-liwei391@huawei.com Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e22567174dd3..a50ed23bee77 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2097,6 +2097,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) mutex_lock(&interface_lock); cpus_read_lock(); + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; -- cgit v1.2.3 From 2a13ca2e8abb12ee43ada8a107dadca83f140937 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 24 Sep 2024 17:45:14 +0800 Subject: tracing/hwlat: Fix a race during cpuhp processing The cpuhp online/offline processing race also exists in percpu-mode hwlat tracer in theory, apply the fix too. That is: T1 | T2 [CPUHP_ONLINE] | cpu_device_down() hwlat_hotplug_workfn() | | cpus_write_lock() | takedown_cpu(1) | cpus_write_unlock() [CPUHP_OFFLINE] | cpus_read_lock() | start_kthread(1) | cpus_read_unlock() | Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240924094515.3561410-5-liwei391@huawei.com Fixes: ba998f7d9531 ("trace/hwlat: Support hotplug operations") Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_hwlat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b791524a6536..3bd6071441ad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -520,6 +520,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) goto out_unlock; -- cgit v1.2.3 From cc9877fb76771b7cbce6c9ec239f13a1d7759876 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Oct 2024 10:33:37 -1000 Subject: sched_ext: Improve error reporting during loading When the BPF scheduler fails, ops.exit() allows rich error reporting through scx_exit_info. Use scx.exit() path consistently for all failures which can be caused by the BPF scheduler: - scx_ops_error() is called after ops.init() and ops.cgroup_init() failure to record error information. - ops.init_task() failure now uses scx_ops_error() instead of pr_err(). - The err_disable path updated to automatically trigger scx_ops_error() to cover cases that the error message hasn't already been generated and always return 0 indicating init success so that the error is reported through ops.exit(). Signed-off-by: Tejun Heo Cc: David Vernet Cc: Daniel Hodges Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3cd7c50a51c5..76f04f3ace61 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -625,6 +625,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); @@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); goto err_disable; } } @@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_lock_irq(&scx_tasks_lock); scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", - ret, p->comm, p->pid); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } @@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_ops_bypass(false); - /* - * Returning an error code here would lose the recorded error - * information. Exit indicating success so that the error is notified - * through ops.exit() with all the details. - */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; goto err_disable; } @@ -5241,10 +5241,18 @@ err_disable_unlock_all: scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); - /* must be fully disabled before returning */ - scx_ops_disable(SCX_EXIT_ERROR); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. + */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); - return ret; + return 0; } -- cgit v1.2.3 From ec010333ce7cf3270ae7193a6724794d5a179625 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Oct 2024 10:34:38 -1000 Subject: sched_ext: scx_cgroup_exit() may be called without successful scx_cgroup_init() 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()") assumed that scx_cgroup_exit() is only called after scx_cgroup_init() finished successfully. This isn't true. scx_cgroup_exit() can be called without scx_cgroup_init() being called at all or after scx_cgroup_init() failed in the middle. As init state is tracked per cgroup, scx_cgroup_exit() can be used safely to clean up in all cases. Remove the incorrect WARN_ON_ONCE(). Signed-off-by: Tejun Heo Fixes: 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()") --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 76f04f3ace61..1df2082d1352 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4052,7 +4052,6 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); - WARN_ON_ONCE(!scx_cgroup_enabled); scx_cgroup_enabled = false; /* -- cgit v1.2.3 From b62933eee41e2909422c2c3d7fdb56217913faf9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:11 -1000 Subject: sched/core: Make select_task_rq() take the pointer to wake_flags instead of value This will be used to allow select_task_rq() to indicate whether ->select_task_rq() was called by modifying *wake_flags. This makes try_to_wake_up() call all functions that take wake_flags with WF_TTWU set. Previously, only select_task_rq() was. Using the same flags is more consistent, and, as the flag is only tested by ->select_task_rq() implementations, it doesn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43e453ab7e20..e70b57a5693e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3518,12 +3518,12 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -4120,6 +4120,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4252,7 +4254,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); @@ -4793,6 +4795,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); @@ -4807,7 +4810,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -4815,7 +4818,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - wakeup_preempt(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* -- cgit v1.2.3 From f207dc2dcdcf0e1e7d260b392784855ce8d84147 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:12 -1000 Subject: sched/core: Add ENQUEUE_RQ_SELECTED to indicate whether ->select_task_rq() was called During ttwu, ->select_task_rq() can be skipped if only one CPU is allowed or migration is disabled. sched_ext schedulers may perform operations such as direct dispatch from ->select_task_rq() path and it is useful for them to know whether ->select_task_rq() was skipped in the ->enqueue_task() path. Currently, sched_ext schedulers are using ENQUEUE_WAKEUP for this purpose and end up assuming incorrectly that ->select_task_rq() was called for tasks that are bound to a single CPU or migration disabled. Make select_task_rq() indicate whether ->select_task_rq() was called by setting WF_RQ_SELECTED in *wake_flags and make ttwu_do_activate() map that to ENQUEUE_RQ_SELECTED for ->enqueue_task(). This will be used by sched_ext to fix ->select_task_rq() skip detection. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/core.c | 8 ++++++-- kernel/sched/sched.h | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e70b57a5693e..aeb595514461 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3522,10 +3522,12 @@ int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); - else + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1c3588a8f00..6085ef50febf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ @@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) -- cgit v1.2.3 From 9b671793c7d95f020791415cbbcc82b9c007d19c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:13 -1000 Subject: sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED scx_qmap and other schedulers in the SCX repo are using SCX_ENQ_WAKEUP to tell whether ops.select_cpu() was called. This is incorrect as ops.select_cpu() can be skipped in the wakeup path and leads to e.g. incorrectly skipping direct dispatch for tasks that are bound to a single CPU. sched core has been updated to specify ENQUEUE_RQ_SELECTED if ->select_task_rq() was called. Map it to SCX_ENQ_CPU_SELECTED and update scx_qmap to test it instead of SCX_ENQ_WAKEUP. Signed-off-by: Tejun Heo Acked-by: David Vernet Cc: Daniel Hodges Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 1 + tools/sched_ext/scx_qmap.bpf.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1df2082d1352..410a4df8a121 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -696,6 +696,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 67e2a7968cc9..5d1f880d1149 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -230,8 +230,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_WAKEUP) && + /* if select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); -- cgit v1.2.3 From 45126b155e3b5201179cdc038504bf93a8ccd921 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2024 18:09:58 +0200 Subject: bpf: Fix memory leak in bpf_core_apply We need to free specs properly. Fixes: 3d2786d65aaa ("bpf: correctly handle malformed BPF_CORE_TYPE_ID_LOCAL relos") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20241007160958.607434-1-jolsa@kernel.org --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..a05da5f43547 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8961,6 +8961,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); + kfree(specs); return -EINVAL; } -- cgit v1.2.3 From b24d7f0da6ef5a23456a301eaf51b170f961d4ae Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 5 Oct 2024 02:06:28 +0200 Subject: bpf, lsm: Remove bpf_lsm_key_free hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The key_free LSM hook has been removed. Remove the corresponding BPF hook. Avoid warnings during the build: BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_lsm_key_free Fixes: 5f8d28f6d7d5 ("lsm: infrastructure management of the key security blob") Signed-off-by: Thomas Weißschuh Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241005-lsm-key_free-v1-1-42ea801dbd63@weissschuh.net --- kernel/bpf/bpf_lsm.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 6292ac5f9bd1..3bc61628ab25 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -339,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ -#ifdef CONFIG_KEYS -BTF_ID(func, bpf_lsm_key_free) -#endif /* CONFIG_KEYS */ - BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) -- cgit v1.2.3 From 912da2c384d510ce40c5af9c3adc316afa4ec547 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Oct 2024 14:32:42 -0400 Subject: ring-buffer: Do not have boot mapped buffers hook to CPU hotplug The boot mapped ring buffer has its buffer mapped at a fixed location found at boot up. It is not dynamic. It cannot grow or be expanded when new CPUs come online. Do not hook fixed memory mapped ring buffers to the CPU hotplug callback, otherwise it can cause a crash when it tries to add the buffer to the memory that is already fully occupied. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241008143242.25e20801@gandalf.local.home Fixes: be68d63a139bd ("ring-buffer: Add ring_buffer_alloc_range()") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 77dc0b25140e..fb04445f92c3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2337,9 +2337,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers[cpu]) goto fail_free_buffers; - ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); - if (ret < 0) - goto fail_free_buffers; + /* If already mapped, do not hook to CPU hotplug */ + if (!start) { + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; + } mutex_init(&buffer->mutex); -- cgit v1.2.3 From 214e01ad4ed7158cab66498810094fac5d09b218 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Sep 2024 23:46:34 +0200 Subject: kthread: unpark only parked kthread Calling into kthread unparking unconditionally is mostly harmless when the kthread is already unparked. The wake up is then simply ignored because the target is not in TASK_PARKED state. However if the kthread is per CPU, the wake up is preceded by a call to kthread_bind() which expects the task to be inactive and in TASK_PARKED state, which obviously isn't the case if it is unparked. As a result, calling kthread_stop() on an unparked per-cpu kthread triggers such a warning: WARNING: CPU: 0 PID: 11 at kernel/kthread.c:525 __kthread_bind_mask kernel/kthread.c:525 kthread_stop+0x17a/0x630 kernel/kthread.c:707 destroy_workqueue+0x136/0xc40 kernel/workqueue.c:5810 wg_destruct+0x1e2/0x2e0 drivers/net/wireguard/device.c:257 netdev_run_todo+0xe1a/0x1000 net/core/dev.c:10693 default_device_exit_batch+0xa14/0xa90 net/core/dev.c:11769 ops_exit_list net/core/net_namespace.c:178 [inline] cleanup_net+0x89d/0xcc0 net/core/net_namespace.c:640 process_one_work kernel/workqueue.c:3231 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312 worker_thread+0x86d/0xd70 kernel/workqueue.c:3393 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fix this with skipping unecessary unparking while stopping a kthread. Link: https://lkml.kernel.org/r/20240913214634.12557-1-frederic@kernel.org Fixes: 5c25b5ff89f0 ("workqueue: Tag bound workers with KTHREAD_IS_PER_CPU") Signed-off-by: Frederic Weisbecker Reported-by: syzbot+943d34fa3cf2191e3068@syzkaller.appspotmail.com Tested-by: syzbot+943d34fa3cf2191e3068@syzkaller.appspotmail.com Suggested-by: Thomas Gleixner Cc: Hillf Danton Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/kthread.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..9bb36897b6c6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -623,6 +623,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); + if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) + return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. -- cgit v1.2.3 From 0665d7a39bdf92c8ac3dc390501f303907c87f62 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 30 Sep 2024 15:06:11 +0800 Subject: resource, kunit: fix user-after-free in resource_test_region_intersects() In resource_test_insert_resource(), the pointer is used in error message after kfree(). This is user-after-free. To fix this, we need to call kunit_add_action_or_reset() to schedule memory freeing after usage. But kunit_add_action_or_reset() itself may fail and free the memory. So, its return value should be checked and abort the test for failure. Then, we found that other usage of kunit_add_action_or_reset() in resource_test_region_intersects() needs to be fixed too. We fix all these user-after-free bugs in this patch. Link: https://lkml.kernel.org/r/20240930070611.353338-1-ying.huang@intel.com Fixes: 99185c10d5d9 ("resource, kunit: add test case for region_intersects()") Signed-off-by: "Huang, Ying" Reported-by: Kees Bakker Closes: https://lore.kernel.org/lkml/87ldzaotcg.fsf@yhuang6-desk2.ccr.corp.intel.com/ Cc: Dan Williams Cc: David Hildenbrand Cc: Bjorn Helgaas Signed-off-by: Andrew Morton --- kernel/resource_kunit.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 42d2d8d20f5d..b8ef75b99eb2 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -169,6 +169,8 @@ static void resource_test_intersection(struct kunit *test) #define RES_TEST_RAM3_SIZE SZ_1M #define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) +KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *); + static void remove_free_resource(void *ctx) { struct resource *res = (struct resource *)ctx; @@ -177,6 +179,14 @@ static void remove_free_resource(void *ctx) kfree(res); } +static void resource_test_add_action_or_abort( + struct kunit *test, void (*action)(void *), void *ctx) +{ + KUNIT_ASSERT_EQ_MSG(test, 0, + kunit_add_action_or_reset(test, action, ctx), + "Fail to add action"); +} + static void resource_test_request_region(struct kunit *test, struct resource *parent, resource_size_t start, resource_size_t size, const char *name, unsigned long flags) @@ -185,7 +195,7 @@ static void resource_test_request_region(struct kunit *test, struct resource *pa res = __request_region(parent, start, size, name, flags); KUNIT_ASSERT_NOT_NULL(test, res); - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_insert_resource(struct kunit *test, struct resource *parent, @@ -202,11 +212,11 @@ static void resource_test_insert_resource(struct kunit *test, struct resource *p res->end = start + size - 1; res->flags = flags; if (insert_resource(parent, res)) { - kfree(res); + resource_test_add_action_or_abort(test, kfree_wrapper, res); KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); } - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_region_intersects(struct kunit *test) @@ -220,7 +230,7 @@ static void resource_test_region_intersects(struct kunit *test) "test resources"); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); start = parent->start; - kunit_add_action_or_reset(test, remove_free_resource, parent); + resource_test_add_action_or_abort(test, remove_free_resource, parent); resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); -- cgit v1.2.3 From 797d73ee232dd1833dec4824bc53a22032e97c1c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 8 Oct 2024 15:11:13 +0800 Subject: bpf: Check the remaining info_cnt before repeating btf fields When trying to repeat the btf fields for array of nested struct, it doesn't check the remaining info_cnt. The following splat will be reported when the value of ret * nelems is greater than BTF_FIELDS_MAX: ------------[ cut here ]------------ UBSAN: array-index-out-of-bounds in ../kernel/bpf/btf.c:3951:49 index 11 is out of range for type 'btf_field_info [11]' CPU: 6 UID: 0 PID: 411 Comm: test_progs ...... 6.11.0-rc4+ #1 Tainted: [O]=OOT_MODULE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ... Call Trace: dump_stack_lvl+0x57/0x70 dump_stack+0x10/0x20 ubsan_epilogue+0x9/0x40 __ubsan_handle_out_of_bounds+0x6f/0x80 ? kallsyms_lookup_name+0x48/0xb0 btf_parse_fields+0x992/0xce0 map_create+0x591/0x770 __sys_bpf+0x229/0x2410 __x64_sys_bpf+0x1f/0x30 x64_sys_call+0x199/0x9f0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fea56f2cc5d ...... ---[ end trace ]--- Fix it by checking the remaining info_cnt in btf_repeat_fields() before repeating the btf fields. Fixes: 64e8ee814819 ("bpf: look into the types of the fields of a struct type recursively.") Signed-off-by: Hou Tao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241008071114.3718177-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a05da5f43547..5cd1c7a23848 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3523,7 +3523,7 @@ end: * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ -static int btf_repeat_fields(struct btf_field_info *info, +static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; @@ -3543,6 +3543,12 @@ static int btf_repeat_fields(struct btf_field_info *info, } } + /* The type of struct size or variable size is u32, + * so the multiplication will not overflow. + */ + if (field_cnt * (repeat_cnt + 1) > info_cnt) + return -E2BIG; + cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); @@ -3587,7 +3593,7 @@ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type * info[i].off += off; if (nelems > 1) { - err = btf_repeat_fields(info, ret, nelems - 1, t->size); + err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else @@ -3681,10 +3687,10 @@ static int btf_find_field_one(const struct btf *btf, if (ret == BTF_FIELD_IGNORE) return 0; - if (nelems > info_cnt) + if (!info_cnt) return -E2BIG; if (nelems > 1) { - ret = btf_repeat_fields(info, 1, nelems - 1, sz); + ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } -- cgit v1.2.3 From 434247637c66e1be2bc71a9987d4c3f0d8672387 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 8 Oct 2024 17:07:35 -0400 Subject: bpf: use kvzmalloc to allocate BPF verifier environment The kzmalloc call in bpf_check can fail when memory is very fragmented, which in turn can lead to an OOM kill. Use kvzmalloc to fall back to vmalloc when memory is too fragmented to allocate an order 3 sized bpf verifier environment. Admittedly this is not a very common case, and only happens on systems where memory has already been squeezed close to the limit, but this does not seem like much of a hot path, and it's a simple enough fix. Signed-off-by: Rik van Riel Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20241008170735.16766766@imladris.surriel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 434de48cd24b..633fd6da40c2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22315,7 +22315,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -22551,6 +22551,6 @@ err_unlock: mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: - kfree(env); + kvfree(env); return ret; } -- cgit v1.2.3 From 4deecdd29cf29844c7bd164d72dc38d2e672f64e Mon Sep 17 00:00:00 2001 From: Tyrone Wu Date: Tue, 8 Oct 2024 16:43:11 +0000 Subject: bpf: fix unpopulated name_len field in perf_event link info Previously when retrieving `bpf_link_info.perf_event` for kprobe/uprobe/tracepoint, the `name_len` field was not populated by the kernel, leaving it to reflect the value initially set by the user. This behavior was inconsistent with how other input/output string buffer fields function (e.g. `raw_tracepoint.tp_name_len`). This patch fills `name_len` with the actual size of the string name. Fixes: 1b715e1b0ec5 ("bpf: Support ->fill_link_info for perf_event") Signed-off-by: Tyrone Wu Acked-by: Jiri Olsa Acked-by: Yafang Shao Link: https://lore.kernel.org/r/20241008164312.46269-1-wudevelops@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..8cfa7183d2ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3565,15 +3565,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3581,10 +3582,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; @@ -3609,7 +3617,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3617,7 +3625,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3639,7 +3647,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; @@ -3648,6 +3656,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; @@ -3673,12 +3682,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, -- cgit v1.2.3 From c425180d888e7d346d3b574a62a91932bca8797f Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Thu, 10 Oct 2024 11:34:05 +0800 Subject: sched_ext: use correct function name in pick_task_scx() warning message pick_next_task_scx() was turned into pick_task_scx() since commit 753e2836d139 ("sched_ext: Unify regular and core-sched pick task paths"). Update the outdated message. Signed-off-by: Honglei Wang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 410a4df8a121..c2596e7581fb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2958,8 +2958,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; -- cgit v1.2.3 From f7345ccc62a4b880cf76458db5f320725f28e400 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 10 Oct 2024 18:36:09 +0200 Subject: rcu/nocb: Fix rcuog wake-up from offline softirq After a CPU has set itself offline and before it eventually calls rcutree_report_cpu_dead(), there are still opportunities for callbacks to be enqueued, for example from a softirq. When that happens on NOCB, the rcuog wake-up is deferred through an IPI to an online CPU in order not to call into the scheduler and risk arming the RT-bandwidth after hrtimers have been migrated out and disabled. But performing a synchronized IPI from a softirq is buggy as reported in the following scenario: WARNING: CPU: 1 PID: 26 at kernel/smp.c:633 smp_call_function_single Modules linked in: rcutorture torture CPU: 1 UID: 0 PID: 26 Comm: migration/1 Not tainted 6.11.0-rc1-00012-g9139f93209d1 #1 Stopper: multi_cpu_stop+0x0/0x320 <- __stop_cpus+0xd0/0x120 RIP: 0010:smp_call_function_single swake_up_one_online __call_rcu_nocb_wake __call_rcu_common ? rcu_torture_one_read call_timer_fn __run_timers run_timer_softirq handle_softirqs irq_exit_rcu ? tick_handle_periodic sysvec_apic_timer_interrupt Fix this with forcing deferred rcuog wake up through the NOCB timer when the CPU is offline. The actual wake up will happen from rcutree_report_cpu_dead(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409231644.4c55582d-lkp@intel.com Fixes: 9139f93209d1 ("rcu/nocb: Fix RT throttling hrtimer armed from offline CPU") Reviewed-by: "Joel Fernandes (Google)" Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 97b99cd06923..16865475120b 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -554,13 +554,19 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags)) { + } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { + /* + * Don't do the wake-up upfront on fragile paths. + * Also offline CPUs can't call swake_up_one_online() from + * (soft-)IRQs. Rely on the final deferred wake-up from + * rcutree_report_cpu_dead() + */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); -- cgit v1.2.3 From 6cb86a0fdece87e126323ec1bb19deb16a52aedf Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 10 Oct 2024 15:27:07 +0200 Subject: bpf: fix kfunc btf caching for modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verifier contains a cache for looking up module BTF objects when calling kfuncs defined in modules. This cache uses a 'struct bpf_kfunc_btf_tab', which contains a sorted list of BTF objects that were already seen in the current verifier run, and the BTF objects are looked up by the offset stored in the relocated call instruction using bsearch(). The first time a given offset is seen, the module BTF is loaded from the file descriptor passed in by libbpf, and stored into the cache. However, there's a bug in the code storing the new entry: it stores a pointer to the new cache entry, then calls sort() to keep the cache sorted for the next lookup using bsearch(), and then returns the entry that was just stored through the stored pointer. However, because sort() modifies the list of entries in place *by value*, the stored pointer may no longer point to the right entry, in which case the wrong BTF object will be returned. The end result of this is an intermittent bug where, if a BPF program calls two functions with the same signature in two different modules, the function from the wrong module may sometimes end up being called. Whether this happens depends on the order of the calls in the BPF program (as that affects whether sort() reorders the array of BTF objects), making it especially hard to track down. Simon, credited as reporter below, spent significant effort analysing and creating a reproducer for this issue. The reproducer is added as a selftest in a subsequent patch. The fix is straight forward: simply don't use the stored pointer after calling sort(). Since we already have an on-stack pointer to the BTF object itself at the point where the function return, just use that, and populate it from the cache entry in the branch where the lookup succeeds. Fixes: 2357672c54c3 ("bpf: Introduce BPF support for kernel module function calls") Reported-by: Simon Sundberg Acked-by: Jiri Olsa Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20241010-fix-kfunc-btf-caching-for-modules-v2-1-745af6c1af98@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 633fd6da40c2..bf9996ea34fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2750,10 +2750,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, b->module = mod; b->offset = offset; + /* sort() reorders entries by value, so b may no longer point + * to the right entry after this + */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); + } else { + btf = b->btf; } - return b->btf; + + return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) -- cgit v1.2.3 From 54baa7ac0cebe53a03ba3083905021f92d2420db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Oct 2024 11:41:44 -1000 Subject: Revert "sched_ext: Use shorter slice while bypassing" This reverts commit 6f34d8d382d64e7d8e77f5a9ddfd06f4c04937b0. Slice length is ignored while bypassing and tasks are switched on every tick and thus the patch does not make any difference. The perceived difference was from test noise. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c2596e7581fb..2cb304b37014 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,7 +9,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { - SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, @@ -1949,7 +1948,6 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; @@ -1967,7 +1965,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (bypassing) + if (scx_rq_bypassing(rq)) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2022,7 +2020,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; + p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } -- cgit v1.2.3 From 3fdb9ebcec10a91e7825b95840c5a627dabcbca7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Oct 2024 11:41:44 -1000 Subject: sched_ext: Start schedulers with consistent p->scx.slice values The disable path caps p->scx.slice to SCX_SLICE_DFL. As the field is already being ignored at this stage during disable, the only effect this has is that when the next BPF scheduler is loaded, it won't see unreasonable left-over slices. Ultimately, this shouldn't matter but it's better to start in a known state. Drop p->scx.slice capping from the disable path and instead reset it to SCX_SLICE_DFL in the enable path. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2cb304b37014..4e56230e6e4a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4473,7 +4473,6 @@ static void scx_ops_disable_workfn(struct kthread_work *work) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); @@ -5190,6 +5189,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + p->scx.slice = SCX_SLICE_DFL; __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); -- cgit v1.2.3 From cc3e1caca998e445a030253d2dc42a0db6f5af30 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Oct 2024 11:41:44 -1000 Subject: sched_ext: Move scx_buildin_idle_enabled check to scx_bpf_select_cpu_dfl() Move the sanity check from the inner function scx_select_cpu_dfl() to the exported kfunc scx_bpf_select_cpu_dfl(). This doesn't cause behavior differences and will allow using scx_select_cpu_dfl() in bypass mode regardless of scx_builtin_idle_enabled. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4e56230e6e4a..957acb6ef81b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3062,11 +3062,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return prev_cpu; - } - /* * If WAKE_SYNC, the waker's local DSQ is empty, and the system is * under utilized, wake up @p to the local DSQ of the waker. Checking @@ -5870,16 +5865,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + goto prev_cpu; } + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#else +#endif + +prev_cpu: *is_idle = false; return prev_cpu; -#endif } __bpf_kfunc_end_defs(); -- cgit v1.2.3 From aebe7ae4cb50551874fdfaf88e4127884298caad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Oct 2024 11:41:44 -1000 Subject: sched_ext: bypass mode shouldn't depend on ops.select_cpu() Bypass mode was depending on ops.select_cpu() which can't be trusted as with the rest of the BPF scheduler. Always enable and use scx_select_cpu_dfl() in bypass mode. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 957acb6ef81b..9b0319421393 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3126,7 +3126,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3191,7 +3191,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; @@ -4254,21 +4254,23 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * - * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. - * %SCX_OPS_ENQ_LAST is also ignored. + * - ops.select_cpu() is ignored and the default select_cpu() is used. * - * b. ops.dispatch() is ignored. + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * - * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice - * can't be trusted. Whenever a tick triggers, the running task is rotated to - * the tail of the queue with core_sched_at touched. + * - ops.dispatch() is ignored. * - * d. pick_next_task() suppresses zero slice warning. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * - * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM - * operations. + * - pick_next_task() suppresses zero slice warning. * - * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { @@ -4338,7 +4340,7 @@ static void scx_ops_bypass(bool bypass) rq_unlock_irqrestore(rq, &rf); - /* kick to restore ticks */ + /* resched to restore ticks and idle state */ resched_cpu(cpu); } } -- cgit v1.2.3 From 967da578325d8539d42245d98f126f47abcc0569 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Oct 2024 11:41:44 -1000 Subject: sched_ext: Move scx_tasks_lock handling into scx_task_iter helpers Iterating with scx_task_iter involves scx_tasks_lock and optionally the rq lock of the task being iterated. Both locks can be released during iteration and the iteration can be continued after re-grabbing scx_tasks_lock. Currently, all lock handling is pushed to the caller which is a bit cumbersome and makes it difficult to add lock-aware behaviors. Make the scx_task_iter helpers handle scx_tasks_lock. - scx_task_iter_init/scx_taks_iter_exit() now grabs and releases scx_task_lock, respectively. Renamed to scx_task_iter_start/scx_task_iter_stop() to more clearly indicate that there are non-trivial side-effects. - Add __ prefix to scx_task_iter_rq_unlock() to indicate that the function is internal. - Add scx_task_iter_unlock/relock(). The former drops both rq lock (if held) and scx_tasks_lock and the latter re-locks only scx_tasks_lock. This doesn't cause behavior changes and will be used to implement stall avoidance. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 110 +++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9b0319421393..fc94048c9a98 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1276,76 +1276,86 @@ struct scx_task_iter { }; /** - * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * - * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, - * @iter must eventually be exited with scx_task_iter_exit(). + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). * - * scx_tasks_lock may be released between this and the first next() call or - * between any two next() calls. If scx_tasks_lock is released between two - * next() calls, the caller is responsible for ensuring that the task being - * iterated remains accessible either through RCU read lock or obtaining a - * reference count. + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ -static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + spin_lock_irq(&scx_tasks_lock); + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; } +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } +} + /** - * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator - * @iter: iterator to unlock rq for + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of - * the task currently being visited. Unlock the rq if so. This function can be - * safely called anytime during an iteration. + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock * - * Returns %true if the rq @iter was locking is unlocked. %false if @iter was - * not locking an rq. + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. */ -static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) +static void scx_task_iter_relock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; - return true; - } else { - return false; - } + spin_lock_irq(&scx_tasks_lock); } /** - * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * - * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. - * If the iterator holds a task's rq lock, that rq lock is released. See - * scx_task_iter_init() for details. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. */ -static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - - scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { @@ -1373,14 +1383,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify - * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; - scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* @@ -4462,8 +4472,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_ops_init_task_enabled = false; - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; @@ -4478,8 +4487,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ @@ -5130,8 +5138,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5141,15 +5148,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; - scx_task_iter_rq_unlock(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); scx_ops_error("ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); goto err_disable_unlock_all; @@ -5158,10 +5163,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -5178,8 +5182,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * scx_tasks_lock. */ percpu_down_write(&scx_fork_rwsem); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; @@ -5194,8 +5197,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); -- cgit v1.2.3 From b07996c7abac0fe3f70bf74b0b3f76eb7852ef5a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Oct 2024 11:41:44 -1000 Subject: sched_ext: Don't hold scx_tasks_lock for too long While enabling and disabling a BPF scheduler, every task is iterated a couple times by walking scx_tasks. Except for one, all iterations keep holding scx_tasks_lock. On multi-socket systems under heavy rq lock contention and high number of threads, this can can lead to RCU and other stalls. The following is triggered on a 2 x AMD EPYC 7642 system (192 logical CPUs) running `stress-ng --workload 150 --workload-threads 10` with >400k idle threads and RCU stall period reduced to 5s: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: 91-...!: (10 ticks this GP) idle=0754/1/0x4000000000000000 softirq=18204/18206 fqs=17 rcu: 186-...!: (17 ticks this GP) idle=ec54/1/0x4000000000000000 softirq=25863/25866 fqs=17 rcu: (detected by 80, t=10042 jiffies, g=89305, q=33 ncpus=192) Sending NMI from CPU 80 to CPUs 91: NMI backtrace for cpu 91 CPU: 91 UID: 0 PID: 284038 Comm: sched_ext_ops_h Kdump: loaded Not tainted 6.12.0-rc2-work-g6bf5681f7ee2-dirty #471 Hardware name: Supermicro Super Server/H11DSi, BIOS 2.8 12/14/2023 Sched_ext: simple (disabling+all) RIP: 0010:queued_spin_lock_slowpath+0x17b/0x2f0 Code: 02 c0 10 03 00 83 79 08 00 75 08 f3 90 83 79 08 00 74 f8 48 8b 11 48 85 d2 74 09 0f 0d 0a eb 0a 31 d2 eb 06 31 d2 eb 02 f3 90 <8b> 07 66 85 c0 75 f7 39 d8 75 0d be 01 00 00 00 89 d8 f0 0f b1 37 RSP: 0018:ffffc9000fadfcb8 EFLAGS: 00000002 RAX: 0000000001700001 RBX: 0000000001700000 RCX: ffff88bfcaaf10c0 RDX: 0000000000000000 RSI: 0000000000000101 RDI: ffff88bfca8f0080 RBP: 0000000001700000 R08: 0000000000000090 R09: ffffffffffffffff R10: ffff88a74761b268 R11: 0000000000000000 R12: ffff88a6b6765460 R13: ffffc9000fadfd60 R14: ffff88bfca8f0080 R15: ffff88bfcaac0000 FS: 0000000000000000(0000) GS:ffff88bfcaac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5c55f526a0 CR3: 0000000afd474000 CR4: 0000000000350eb0 Call Trace: do_raw_spin_lock+0x9c/0xb0 task_rq_lock+0x50/0x190 scx_task_iter_next_locked+0x157/0x170 scx_ops_disable_workfn+0x2c2/0xbf0 kthread_worker_fn+0x108/0x2a0 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 Sending NMI from CPU 80 to CPUs 186: NMI backtrace for cpu 186 CPU: 186 UID: 0 PID: 51248 Comm: fish Kdump: loaded Not tainted 6.12.0-rc2-work-g6bf5681f7ee2-dirty #471 scx_task_iter can safely drop locks while iterating. Make scx_task_iter_next() drop scx_tasks_lock every 32 iterations to avoid stalls. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fc94048c9a98..cb1ab668e965 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -18,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -1273,6 +1279,7 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; + u32 cnt; }; /** @@ -1301,6 +1308,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter) iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; + iter->cnt = 0; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) @@ -1355,14 +1363,21 @@ static void scx_task_iter_stop(struct scx_task_iter *iter) * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_start() for details. + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cpu_relax(); + cond_resched(); + scx_task_iter_relock(iter); + } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) -- cgit v1.2.3 From ad6b5b6ea9b764018249285a4fe0a2226bef4caa Mon Sep 17 00:00:00 2001 From: Tyrone Wu Date: Fri, 11 Oct 2024 00:08:02 +0000 Subject: bpf: Fix unpopulated path_size when uprobe_multi fields unset Previously when retrieving `bpf_link_info.uprobe_multi` with `path` and `path_size` fields unset, the `path_size` field is not populated (remains 0). This behavior was inconsistent with how other input/output string buffer fields work, as the field should be populated in cases when: - both buffer and length are set (currently works as expected) - both buffer and length are unset (not working as expected) This patch now fills the `path_size` field when `path` and `path_size` are unset. Fixes: e56fdbfb06e2 ("bpf: Add link_info support for uprobe multi link") Signed-off-by: Tyrone Wu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241011000803.681190-1-wudevelops@gmail.com --- kernel/trace/bpf_trace.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a582cd25ca87..3bd402fa62a4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3133,7 +3133,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; - long left; + char *p, *buf; + long left = 0; if (!upath ^ !upath_size) return -EINVAL; @@ -3147,26 +3148,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; - if (upath) { - char *p, *buf; - - upath_size = min_t(u32, upath_size, PATH_MAX); - - buf = kmalloc(upath_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - p = d_path(&umulti_link->path, buf, upath_size); - if (IS_ERR(p)) { - kfree(buf); - return PTR_ERR(p); - } - upath_size = buf + upath_size - p; - left = copy_to_user(upath, p, upath_size); + upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX; + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { kfree(buf); - if (left) - return -EFAULT; - info->uprobe_multi.path_size = upath_size; + return PTR_ERR(p); } + upath_size = buf + upath_size - p; + + if (upath) + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; -- cgit v1.2.3 From d16b7eb6f523eeac3cff13001ef2a59cd462aa73 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Fri, 4 Oct 2024 08:37:29 -0400 Subject: sched/deadline: Use hrtick_enabled_dl() before start_hrtick_dl() The deadline server code moved one of the start_hrtick_dl() calls but dropped the dl specific hrtick_enabled check. This causes hrticks to get armed even when sched_feat(HRTICK_DL) is false. Fix it. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20241004123729.460668-1-pauld@redhat.com --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ce93d0bf452..be1b917dc8ce 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, &p->dl); } -- cgit v1.2.3 From 73ab05aa46b02d96509cb029a8d04fca7bbde8c7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 9 Oct 2024 21:44:32 -0400 Subject: sched/core: Disable page allocation in task_tick_mm_cid() With KASAN and PREEMPT_RT enabled, calling task_work_add() in task_tick_mm_cid() may cause the following splat. [ 63.696416] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [ 63.696416] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 610, name: modprobe [ 63.696416] preempt_count: 10001, expected: 0 [ 63.696416] RCU nest depth: 1, expected: 1 This problem is caused by the following call trace. sched_tick() [ acquire rq->__lock ] -> task_tick_mm_cid() -> task_work_add() -> __kasan_record_aux_stack() -> kasan_save_stack() -> stack_depot_save_flags() -> alloc_pages_mpol_noprof() -> __alloc_pages_noprof() -> get_page_from_freelist() -> rmqueue() -> rmqueue_pcplist() -> __rmqueue_pcplist() -> rmqueue_bulk() -> rt_spin_lock() The rq lock is a raw_spinlock_t. We can't sleep while holding it. IOW, we can't call alloc_pages() in stack_depot_save_flags(). The task_tick_mm_cid() function with its task_work_add() call was introduced by commit 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") in v6.4 kernel. Fortunately, there is a kasan_record_aux_stack_noalloc() variant that calls stack_depot_save_flags() while not allowing it to allocate new pages. To allow task_tick_mm_cid() to use task_work without page allocation, a new TWAF_NO_ALLOC flag is added to enable calling kasan_record_aux_stack_noalloc() instead of kasan_record_aux_stack() if set. The task_tick_mm_cid() function is modified to add this new flag. The possible downside is the missing stack trace in a KASAN report due to new page allocation required when task_work_add_noallloc() is called which should be rare. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241010014432.194742-1-longman@redhat.com --- include/linux/task_work.h | 5 ++++- kernel/sched/core.c | 4 +++- kernel/task_work.c | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index cf5e7e891a77..2964171856e0 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func) } enum task_work_notify_mode { - TWA_NONE, + TWA_NONE = 0, TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, + + TWA_FLAGS = 0xff00, + TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43e453ab7e20..0259301e572e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10458,7 +10458,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5d14d639ac71..c969f1f26be5 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; + int flags = notify & TWA_FLAGS; + notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + /* + * Record the work call stack in order to print it in KASAN + * reports. + * + * Note that stack allocation can fail if TWAF_NO_ALLOC flag + * is set and new page is needed to expand the stack buffer. + */ + if (flags & TWAF_NO_ALLOC) + kasan_record_aux_stack_noalloc(work); + else + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); -- cgit v1.2.3 From 98442f0ccd828ac42e89281a815e9e7a97533822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Oct 2024 11:54:38 +0200 Subject: sched: Fix delayed_dequeue vs switched_from_fair() Commit 2e0199df252a ("sched/fair: Prepare exit/cleanup paths for delayed_dequeue") and its follow up fixes try to deal with a rather unfortunate situation where is task is enqueued in a new class, even though it shouldn't have been. Mostly because the existing ->switched_to/from() hooks are in the wrong place for this case. This all led to Paul being able to trigger failures at something like once per 10k CPU hours of RCU torture. For now, do the ugly thing and move the code to the right place by ignoring the switch hooks. Note: Clean up the whole sched_class::switch*_{to,from}() thing. Fixes: 2e0199df252a ("sched/fair: Prepare exit/cleanup paths for delayed_dequeue") Reported-by: Paul E. McKenney Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241003185037.GA5594@noisy.programming.kicks-ass.net --- kernel/sched/core.c | 29 ++++++++++++++++++----------- kernel/sched/ext.c | 4 ++-- kernel/sched/fair.c | 16 ---------------- kernel/sched/sched.h | 2 +- kernel/sched/syscalls.c | 13 +++++++++---- 5 files changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0259301e572e..a860996622a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7010,20 +7010,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(struct task_struct *p, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_should_scx(p)) - p->sched_class = &ext_sched_class; + if (task_should_scx(p)) + return &ext_sched_class; #endif - else - p->sched_class = &fair_sched_class; - p->prio = prio; + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7069,7 +7069,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7127,6 +7127,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7164,7 +7169,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + check_class_changing(rq, p, prev_class); if (queued) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3cd7c50a51c5..6f9de573ee93 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4471,7 +4471,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); - __setscheduler_prio(p, p->prio); + p->sched_class = __setscheduler_class(p, p->prio); check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -5186,7 +5186,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - __setscheduler_prio(p, p->prio); + p->sched_class = __setscheduler_class(p, p->prio); check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab497fafa7be..c157d4860a3b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13177,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); - /* - * Since this is called after changing class, this is a little weird - * and we cannot use DEQUEUE_DELAYED. - */ - if (p->se.sched_delayed) { - /* First, dequeue it from its new class' structures */ - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); - /* - * Now, clean up the fair_sched_class side of things - * related to sched_delayed being true and that wasn't done - * due to the generic dequeue not using DEQUEUE_DELAYED. - */ - finish_delayed_dequeue_entity(&p->se); - p->se.rel_deadline = 0; - __block_task(rq, p); - } } static void switched_to_fair(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1c3588a8f00..fba524c81c63 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3797,7 +3797,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern void __setscheduler_prio(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index aa70beee9895..0470bcc3d204 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -706,6 +706,12 @@ change: queue_flags &= ~DEQUEUE_MOVE; } + prev_class = p->sched_class; + next_class = __setscheduler_class(p, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -713,11 +719,10 @@ change: if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); + p->sched_class = next_class; + p->prio = newprio; } __setscheduler_uclamp(p, attr); check_class_changing(rq, p, prev_class); -- cgit v1.2.3 From f5aaff7bfa11fb0b2ee6b8fd7bbc16cfceea2ad3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Oct 2024 08:28:36 +0000 Subject: sched/core: Dequeue PSI signals for blocked tasks that are delayed psi_dequeue() in for blocked task expects psi_sched_switch() to clear the TSK_.*RUNNING PSI flags and set the TSK_IOWAIT flags however psi_sched_switch() uses "!task_on_rq_queued(prev)" to detect if the task is blocked or still runnable which is no longer true with DELAY_DEQUEUE since a blocking task can be left queued on the runqueue. This can lead to PSI splats similar to: psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=0 set=4 when the task is requeued since the TSK_RUNNING flag was not cleared when the task was blocked. Explicitly communicate that the task was blocked to psi_sched_switch() even if it was delayed and is still on the runqueue. [ prateek: Broke off the relevant part from [1], commit message ] Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/ Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/ Signed-off-by: Peter Zijlstra (Intel) Not-yet-signed-off-by: Peter Zijlstra Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Johannes Weiner Link: https://lore.kernel.org/lkml/20241004123506.GR18071@noisy.programming.kicks-ass.net/ [1] --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a860996622a6..9e09140ccb34 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6537,6 +6537,7 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6622,6 +6623,7 @@ static void __sched notrace __schedule(int sched_mode) * After this, schedule() must not care about p->state any more. */ block_task(rq, prev, flags); + block = true; } switch_count = &prev->nvcsw; } @@ -6667,7 +6669,7 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_sched_switch(prev, next, block); trace_sched_switch(preempt, prev, next, prev_state); -- cgit v1.2.3 From c6508124193d42bbc3224571eb75bfa4c1821fbb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 11 Oct 2024 10:49:33 +0200 Subject: sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug Since sched_delayed tasks remain queued even after blocking, the load balancer can migrate them between runqueues while PSI considers them to be asleep. As a result, it misreads the migration requeue followed by a wakeup as a double queue: psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4 First, call psi_enqueue() after p->sched_class->enqueue_task(). A wakeup will clear p->se.sched_delayed while a migration will not, so psi can use that flag to tell them apart. Then teach psi to migrate any "sleep" state when delayed-dequeue tasks are being migrated. Delayed-dequeue tasks can be revived by ttwu_runnable(), which will call down with a new ENQUEUE_DELAYED. Instead of further complicating the wakeup conditional in enqueue_task(), identify migration contexts instead and default to wakeup handling for all other cases. It's not just the warning in dmesg, the task state corruption causes a permanent CPU pressure indication, which messes with workload/machine health monitoring. Debugged-by-and-original-fix-by: K Prateek Nayak Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/ Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/ Signed-off-by: Johannes Weiner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20241010193712.GC181795@cmpxchg.org --- kernel/sched/core.c | 12 ++++++------ kernel/sched/stats.h | 48 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e09140ccb34..71232f8f9b96 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2012,11 +2012,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } - p->sched_class->enqueue_task(rq, p, flags); /* * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear @@ -2024,6 +2019,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); + if (!(flags & ENQUEUE_RESTORE)) { + sched_info_enqueue(rq, p); + psi_enqueue(p, flags & ENQUEUE_MIGRATED); + } + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } @@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); + psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); } /* diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 237780aa3c53..767e098a3bd1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, bool migrate) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; - - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!migrate); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (migrate) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, bool migrate) { if (static_branch_likely(&psi_disabled)) return; + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ + if (migrate) + psi_task_change(p, p->psi_flags, 0); + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) - return; - - psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, -- cgit v1.2.3 From cd9626e9ebc77edec33023fe95dab4b04ffc819d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Oct 2024 11:38:10 +0200 Subject: sched/fair: Fix external p->on_rq users Sean noted that ever since commit 152e11f6df29 ("sched/fair: Implement delayed dequeue") KVM's preemption notifiers have started mis-classifying preemption vs blocking. Notably p->on_rq is no longer sufficient to determine if a task is runnable or blocked -- the aforementioned commit introduces tasks that remain on the runqueue even through they will not run again, and should be considered blocked for many cases. Add the task_is_runnable() helper to classify things and audit all external users of the p->on_rq state. Also add a few comments. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241010091843.GK33184@noisy.programming.kicks-ass.net --- include/linux/sched.h | 5 +++++ kernel/events/core.c | 2 +- kernel/freezer.c | 7 ++++++- kernel/rcu/tasks.h | 9 +++++++++ kernel/sched/core.c | 12 +++++++++--- kernel/time/tick-sched.c | 6 ++++++ kernel/trace/trace_selftest.c | 2 +- virt/kvm/kvm_main.c | 2 +- 8 files changed, 38 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..8a9517e6640c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline bool task_is_runnable(struct task_struct *p) +{ + return p->on_rq && !p->se.sched_delayed; +} + extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); diff --git a/kernel/events/core.c b/kernel/events/core.c index e3589c4287cb..cdd09769e6c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->on_rq) { + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 44bbd7dbd2c8..8d530d0949ff 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); - if (p->on_rq) + /* + * Allow freezing the sched_delayed tasks; they will not execute until + * ttwu() fixes them up, so it is safe to swap their state now, instead + * of waiting for them to get fully dequeued. + */ + if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6333f4ccf024..4d7ee95df06e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -985,6 +985,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t) if (!READ_ONCE(t->on_rq)) return false; + /* + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but + * since it is a spurious state (it will transition into the + * traditional blocked state or get woken up without outside + * dependencies), not considering it such should only affect timing. + * + * Be conservative for now and not include it. + */ + /* * Idle tasks (or idle injection) within the idle loop are RCU-tasks * quiescent states. But CPU boot code performed by the idle task diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 71232f8f9b96..7db711ba6d12 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -4317,9 +4322,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753a184c7090..f203f000da1a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk) * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask + * + * XXX given a task picks up the dependency on schedule(), should we + * only care about tasks that are currently on the CPU instead of all + * that are on the runqueue? + * + * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c4ad7cd7e778..1469dd8075fa 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tr->max_latency = 0; - while (p->on_rq) { + while (task_is_runnable(p)) { /* * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 05cbb2548d99..0c666f1870af 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6387,7 +6387,7 @@ static void kvm_sched_out(struct preempt_notifier *pn, WRITE_ONCE(vcpu->scheduled_out, true); - if (current->on_rq && vcpu->wants_to_run) { + if (task_is_runnable(current) && vcpu->wants_to_run) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } -- cgit v1.2.3 From 2cf9733891a460a16a209fcc20fbd138605b13b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Oct 2024 16:52:24 -0400 Subject: ring-buffer: Fix refcount setting of boot mapped buffers A ring buffer which has its buffered mapped at boot up to fixed memory should not be freed. Other buffers can be. The ref counting setup was wrong for both. It made the not mapped buffers ref count have zero, and the boot mapped buffer a ref count of 1. But an normally allocated buffer should be 1, where it can be removed. Keep the ref count of a normal boot buffer with its setup ref count (do not decrement it), and increment the fixed memory boot mapped buffer's ref count. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241011165224.33dd2624@gandalf.local.home Fixes: e645535a954ad ("tracing: Add option to use memmapped memory for trace boot instance") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c69ca1f1088..a8f52b6527ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10621,10 +10621,10 @@ __init static void enable_instances(void) * cannot be deleted by user space, so keep the reference * to it. */ - if (start) + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; - else - trace_array_put(tr); + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); -- cgit v1.2.3 From 60e339be100d7d49e13616bd8b4b1b864f0a64a0 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 14 Oct 2024 12:58:30 -0500 Subject: sched_ext: Remove unnecessary cpu_relax() As described in commit b07996c7abac ("sched_ext: Don't hold scx_tasks_lock for too long"), we're doing a cond_resched() every 32 calls to scx_task_iter_next() to avoid RCU and other stalls. That commit also added a cpu_relax() to the codepath where we drop and reacquire the lock, but as Waiman described in [0], cpu_relax() should only be necessary in busy loops to avoid pounding on a cacheline (or to allow a hypertwin to more fully utilize a core). Let's remove the unnecessary cpu_relax(). [0]: https://lore.kernel.org/all/35b3889b-904a-4d26-981f-c8aa1557a7c7@redhat.com/ Cc: Waiman Long Signed-off-by: David Vernet Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cb1ab668e965..6eae3b69bf6e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1374,7 +1374,6 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); - cpu_relax(); cond_resched(); scx_task_iter_relock(iter); } -- cgit v1.2.3 From d8794ac20a299b647ba9958f6d657051fc51a540 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 9 Oct 2024 15:23:01 +0800 Subject: posix-clock: Fix missing timespec64 check in pc_clock_settime() As Andrew pointed out, it will make sense that the PTP core checked timespec64 struct's tv_sec and tv_nsec range before calling ptp->info->settime64(). As the man manual of clock_settime() said, if tp.tv_sec is negative or tp.tv_nsec is outside the range [0..999,999,999], it should return EINVAL, which include dynamic clocks which handles PTP clock, and the condition is consistent with timespec64_valid(). As Thomas suggested, timespec64_valid() only check the timespec is valid, but not ensure that the time is in a valid range, so check it ahead using timespec64_valid_strict() in pc_clock_settime() and return -EINVAL if not valid. There are some drivers that use tp->tv_sec and tp->tv_nsec directly to write registers without validity checks and assume that the higher layer has checked it, which is dangerous and will benefit from this, such as hclge_ptp_settime(), igb_ptp_settime_i210(), _rcar_gen4_ptp_settime(), and some drivers can remove the checks of itself. Cc: stable@vger.kernel.org Fixes: 0606f422b453 ("posix clocks: Introduce dynamic clocks") Acked-by: Richard Cochran Suggested-by: Andrew Lunn Suggested-by: Thomas Gleixner Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241009072302.1754567-2-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski --- kernel/time/posix-clock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index c2f3d0c490d5..316a4e8c97d3 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -318,6 +318,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) goto out; } + if (!timespec64_valid_strict(ts)) + return -EINVAL; + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else -- cgit v1.2.3 From 09661f75e75cb6c1d2d8326a70c311d46729235f Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 15 Oct 2024 13:24:29 +0200 Subject: ring-buffer: Fix reader locking when changing the sub buffer order The function ring_buffer_subbuf_order_set() updates each ring_buffer_per_cpu and installs new sub buffers that match the requested page order. This operation may be invoked concurrently with readers that rely on some of the modified data, such as the head bit (RB_PAGE_HEAD), or the ring_buffer_per_cpu.pages and reader_page pointers. However, no exclusive access is acquired by ring_buffer_subbuf_order_set(). Modifying the mentioned data while a reader also operates on them can then result in incorrect memory access and various crashes. Fix the problem by taking the reader_lock when updating a specific ring_buffer_per_cpu in ring_buffer_subbuf_order_set(). Link: https://lore.kernel.org/linux-trace-kernel/20240715145141.5528-1-petr.pavlu@suse.com/ Link: https://lore.kernel.org/linux-trace-kernel/20241010195849.2f77cc3f@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20241011112850.17212b25@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241015112440.26987-1-petr.pavlu@suse.com Fixes: 8e7b58c27b3c ("ring-buffer: Just update the subbuffers when changing their allocation order") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fb04445f92c3..3ea4f7bb1837 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6728,39 +6728,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers */ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. + */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -6769,11 +6768,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } -- cgit v1.2.3 From ae67b9fb8c4e981e929a665dcaa070f4b05ebdb4 Mon Sep 17 00:00:00 2001 From: Dimitar Kanaliev Date: Mon, 14 Oct 2024 15:11:53 +0300 Subject: bpf: Fix truncation bug in coerce_reg_to_size_sx() coerce_reg_to_size_sx() updates the register state after a sign-extension operation. However, there's a bug in the assignment order of the unsigned min/max values, leading to incorrect truncation: 0: (85) call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: (57) r0 &= 1 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1,var_off=(0x0; 0x1)) 2: (07) r0 += 254 ; R0_w=scalar(smin=umin=smin32=umin32=254,smax=umax=smax32=umax32=255,var_off=(0xfe; 0x1)) 3: (bf) r0 = (s8)r0 ; R0_w=scalar(smin=smin32=-2,smax=smax32=-1,umin=umin32=0xfffffffe,umax=0xffffffff,var_off=(0xfffffffffffffffe; 0x1)) In the current implementation, the unsigned 32-bit min/max values (u32_min_value and u32_max_value) are assigned directly from the 64-bit signed min/max values (s64_min and s64_max): reg->umin_value = reg->u32_min_value = s64_min; reg->umax_value = reg->u32_max_value = s64_max; Due to the chain assigmnent, this is equivalent to: reg->u32_min_value = s64_min; // Unintended truncation reg->umin_value = reg->u32_min_value; reg->u32_max_value = s64_max; // Unintended truncation reg->umax_value = reg->u32_max_value; Fixes: 1f9a1ea821ff ("bpf: Support new sign-extension load insns") Reported-by: Shung-Hsi Yu Reported-by: Zac Ecob Signed-off-by: Dimitar Kanaliev Acked-by: Yonghong Song Reviewed-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20241014121155.92887-2-dimitar.kanaliev@siteground.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf9996ea34fe..a8a0b6e4110e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6339,10 +6339,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->smin_value = reg->s32_min_value = s64_min; - reg->smax_value = reg->s32_max_value = s64_max; - reg->umin_value = reg->u32_min_value = s64_min; - reg->umax_value = reg->u32_max_value = s64_max; + reg->s32_min_value = reg->smin_value = s64_min; + reg->s32_max_value = reg->smax_value = s64_max; + reg->u32_min_value = reg->umin_value = s64_min; + reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } -- cgit v1.2.3 From 9495a5b731fcaf580448a3438d63601c88367661 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 16 Oct 2024 14:00:47 -0700 Subject: bpf: Fix iter/task tid filtering In userspace, you can add a tid filter by setting the "task.tid" field for "bpf_iter_link_info". However, `get_pid_task` when called for the `BPF_TASK_ITER_TID` type should have been using `PIDTYPE_PID` (tid) instead of `PIDTYPE_TGID` (pid). Fixes: f0d74c4da1f0 ("bpf: Parameterize task iterators.") Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241016210048.1213935-1-linux@jordanrome.com --- kernel/bpf/task_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 02aa9db8d796..5af9e130e500 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -99,7 +99,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co rcu_read_lock(); pid = find_pid_ns(common->pid, common->ns); if (pid) { - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, PIDTYPE_PID); *tid = common->pid; } rcu_read_unlock(); -- cgit v1.2.3 From 3878ae04e9fc24dacb77a1d32bd87e7d8108599e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Oct 2024 15:49:11 +0200 Subject: bpf: Fix incorrect delta propagation between linked registers Nathaniel reported a bug in the linked scalar delta tracking, which can lead to accepting a program with OOB access. The specific code is related to the sync_linked_regs() function and the BPF_ADD_CONST flag, which signifies a constant offset between two scalar registers tracked by the same register id. The verifier attempts to track "similar" scalars in order to propagate bounds information learned about one scalar to others. For instance, if r1 and r2 are known to contain the same value, then upon encountering 'if (r1 != 0x1234) goto xyz', not only does it know that r1 is equal to 0x1234 on the path where that conditional jump is not taken, it also knows that r2 is. Additionally, with env->bpf_capable set, the verifier will track scalars which should be a constant delta apart (if r1 is known to be one greater than r2, then if r1 is known to be equal to 0x1234, r2 must be equal to 0x1233.) The code path for the latter in adjust_reg_min_max_vals() is reached when processing both 32 and 64-bit addition operations. While adjust_reg_min_max_vals() knows whether dst_reg was produced by a 32 or a 64-bit addition (based on the alu32 bool), the only information saved in dst_reg is the id of the source register (reg->id, or'ed by BPF_ADD_CONST) and the value of the constant offset (reg->off). Later, the function sync_linked_regs() will attempt to use this information to propagate bounds information from one register (known_reg) to others, meaning, for all R in linked_regs, it copies known_reg range (and possibly adjusting delta) into R for the case of R->id == known_reg->id. For the delta adjustment, meaning, matching reg->id with BPF_ADD_CONST, the verifier adjusts the register as reg = known_reg; reg += delta where delta is computed as (s32)reg->off - (s32)known_reg->off and placed as a scalar into a fake_reg to then simulate the addition of reg += fake_reg. This is only correct, however, if the value in reg was created by a 64-bit addition. When reg contains the result of a 32-bit addition operation, its upper 32 bits will always be zero. sync_linked_regs() on the other hand, may cause the verifier to believe that the addition between fake_reg and reg overflows into those upper bits. For example, if reg was generated by adding the constant 1 to known_reg using a 32-bit alu operation, then reg->off is 1 and known_reg->off is 0. If known_reg is known to be the constant 0xFFFFFFFF, sync_linked_regs() will tell the verifier that reg is equal to the constant 0x100000000. This is incorrect as the actual value of reg will be 0, as the 32-bit addition will wrap around. Example: 0: (b7) r0 = 0; R0_w=0 1: (18) r1 = 0x80000001; R1_w=0x80000001 3: (37) r1 /= 1; R1_w=scalar() 4: (bf) r2 = r1; R1_w=scalar(id=1) R2_w=scalar(id=1) 5: (bf) r4 = r1; R1_w=scalar(id=1) R4_w=scalar(id=1) 6: (04) w2 += 2147483647; R2_w=scalar(id=1+2147483647,smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 7: (04) w4 += 0 ; R4_w=scalar(id=1+0,smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 8: (15) if r2 == 0x0 goto pc+1 10: R0=0 R1=0xffffffff80000001 R2=0x7fffffff R4=0xffffffff80000001 R10=fp0 What can be seen here is that r1 is copied to r2 and r4, such that {r1,r2,r4}.id are all the same which later lets sync_linked_regs() to be invoked. Then, in a next step constants are added with alu32 to r2 and r4, setting their ->off, as well as id |= BPF_ADD_CONST. Next, the conditional will bind r2 and propagate ranges to its linked registers. The verifier now believes the upper 32 bits of r4 are r4=0xffffffff80000001, while actually r4=r1=0x80000001. One approach for a simple fix suitable also for stable is to limit the constant delta tracking to only 64-bit alu addition. If necessary at some later point, BPF_ADD_CONST could be split into BPF_ADD_CONST64 and BPF_ADD_CONST32 to avoid mixing the two under the tradeoff to further complicate sync_linked_regs(). However, none of the added tests from dedf56d775c0 ("selftests/bpf: Add tests for add_const") make this necessary at this point, meaning, BPF CI also passes with just limiting tracking to 64-bit alu addition. Fixes: 98d7ca374ba4 ("bpf: Track delta between "linked" registers.") Reported-by: Nathaniel Theis Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20241016134913.32249-1-daniel@iogearbox.net --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8a0b6e4110e..411ab1b57af4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14270,12 +14270,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So remember constant delta between r2 and r1 and update r1 after - * 'if' condition. + * So for 64-bit alu remember constant delta between r2 and r1 and + * update r1 after 'if' condition. */ - if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && - dst_reg->id && is_reg_const(src_reg, alu32)) { - u64 val = reg_const_value(src_reg, alu32); + if (env->bpf_capable && + BPF_OP(insn->code) == BPF_ADD && !alu32 && + dst_reg->id && is_reg_const(src_reg, false)) { + u64 val = reg_const_value(src_reg, false); if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */ -- cgit v1.2.3 From 3e9e708757ca3b7eb65a820031d62fea1a265709 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Oct 2024 15:49:12 +0200 Subject: bpf: Fix print_reg_state's constant scalar dump print_reg_state() should not consider adding reg->off to reg->var_off.value when dumping scalars. Scalars can be produced with reg->off != 0 through BPF_ADD_CONST, and thus as-is this can skew the register log dump. Fixes: 98d7ca374ba4 ("bpf: Track delta between "linked" registers.") Reported-by: Nathaniel Theis Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241016134913.32249-2-daniel@iogearbox.net --- kernel/bpf/log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 5aebfc3051e3..4a858fdb6476 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -688,8 +688,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value); return; } -- cgit v1.2.3 From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Oct 2024 21:43:00 -0400 Subject: fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks The function graph infrastructure allocates a shadow stack for every task when enabled. This includes the idle tasks. The first time the function graph is invoked, the shadow stacks are created and never freed until the task exits. This includes the idle tasks. Only the idle tasks that were for online CPUs had their shadow stacks created when function graph tracing started. If function graph tracing is enabled and a CPU comes online, the idle task representing that CPU will not have its shadow stack created, and all function graph tracing for that idle task will be silently dropped. Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks. This will include idle tasks for CPUs that come online during tracing. This issue can be reproduced by: # cd /sys/kernel/tracing # echo 0 > /sys/devices/system/cpu/cpu1/online # echo 0 > set_ftrace_pid # echo function_graph > current_tracer # echo 1 > options/funcgraph-proc # echo 1 > /sys/devices/system/cpu/cpu1 # grep '' per_cpu/cpu1/trace | head Before, nothing would show up. After: 1) -0 | 0.811 us | __enqueue_entity(); 1) -0 | 5.626 us | } /* enqueue_entity */ 1) -0 | | dl_server_update_idle_time() { 1) -0 | | dl_scaled_delta_exec() { 1) -0 | 0.450 us | arch_scale_cpu_capacity(); 1) -0 | 1.242 us | } 1) -0 | 1.908 us | } 1) -0 | | dl_server_start() { 1) -0 | | enqueue_dl_entity() { 1) -0 | | task_contending() { Note, if tracing stops and restarts, the old way would then initialize the onlined CPUs. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Thomas Gleixner Link: https://lore.kernel.org/20241018214300.6df82178@rorschach Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d7d4fb403f6f..43f4e3f57438 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret, cpu; + int ret; ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); @@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch) fgraph_direct_gops = &fgraph_stub; } +/* The cpu_boot init_task->ret_stack will never be freed */ +static int fgraph_cpu_init(unsigned int cpu) +{ + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + return 0; +} + int register_ftrace_graph(struct fgraph_ops *gops) { + static bool fgraph_initialized; int command = 0; int ret = 0; int i = -1; mutex_lock(&ftrace_lock); + if (!fgraph_initialized) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init", + fgraph_cpu_init, NULL); + if (ret < 0) { + pr_warn("fgraph: Error to init cpu hotplug support\n"); + return ret; + } + fgraph_initialized = true; + ret = 0; + } + if (!fgraph_array[0]) { /* The array must always have real data on it */ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) -- cgit v1.2.3 From fae4078c289a2f24229c0de652249948b1cd6bdb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Oct 2024 21:52:12 -0400 Subject: fgraph: Allocate ret_stack_list with proper size The ret_stack_list is an array of ret_stack shadow stacks for the function graph usage. When the first function graph is enabled, all tasks in the system get a shadow stack. The ret_stack_list is a 32 element array of pointers to these shadow stacks. It allocates the shadow stack in batches (32 stacks at a time), assigns them to running tasks, and continues until all tasks are covered. When the function graph shadow stack changed from an array of ftrace_ret_stack structures to an array of longs, the allocation of ret_stack_list went from allocating an array of 32 elements to just a block defined by SHADOW_STACK_SIZE. Luckily, that's defined as PAGE_SIZE and is much more than enough to hold 32 pointers. But it is way overkill for the amount needed to allocate. Change the allocation of ret_stack_list back to a kcalloc() of FTRACE_RETSTACK_ALLOC_SIZE pointers. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241018215212.23f13f40@rorschach Fixes: 42675b723b484 ("function_graph: Convert ret_stack to a series of longs") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 43f4e3f57438..41e7a15dcb50 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1162,7 +1162,8 @@ static int start_graph_tracing(void) unsigned long **ret_stack_list; int ret; - ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(*ret_stack_list), GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; -- cgit v1.2.3 From 373b9338c9722a368925d83bc622c596896b328e Mon Sep 17 00:00:00 2001 From: Qiao Ma Date: Tue, 15 Oct 2024 14:01:48 +0800 Subject: uprobe: avoid out-of-bounds memory access of fetching args Uprobe needs to fetch args into a percpu buffer, and then copy to ring buffer to avoid non-atomic context problem. Sometimes user-space strings, arrays can be very large, but the size of percpu buffer is only page size. And store_trace_args() won't check whether these data exceeds a single page or not, caused out-of-bounds memory access. It could be reproduced by following steps: 1. build kernel with CONFIG_KASAN enabled 2. save follow program as test.c ``` \#include \#include \#include // If string length large than MAX_STRING_SIZE, the fetch_store_strlen() // will return 0, cause __get_data_size() return shorter size, and // store_trace_args() will not trigger out-of-bounds access. // So make string length less than 4096. \#define STRLEN 4093 void generate_string(char *str, int n) { int i; for (i = 0; i < n; ++i) { char c = i % 26 + 'a'; str[i] = c; } str[n-1] = '\0'; } void print_string(char *str) { printf("%s\n", str); } int main() { char tmp[STRLEN]; generate_string(tmp, STRLEN); print_string(tmp); return 0; } ``` 3. compile program `gcc -o test test.c` 4. get the offset of `print_string()` ``` objdump -t test | grep -w print_string 0000000000401199 g F .text 000000000000001b print_string ``` 5. configure uprobe with offset 0x1199 ``` off=0x1199 cd /sys/kernel/debug/tracing/ echo "p /root/test:${off} arg1=+0(%di):ustring arg2=\$comm arg3=+0(%di):ustring" > uprobe_events echo 1 > events/uprobes/enable echo 1 > tracing_on ``` 6. run `test`, and kasan will report error. ================================================================== BUG: KASAN: use-after-free in strncpy_from_user+0x1d6/0x1f0 Write of size 8 at addr ffff88812311c004 by task test/499CPU: 0 UID: 0 PID: 499 Comm: test Not tainted 6.12.0-rc3+ #18 Hardware name: Red Hat KVM, BIOS 1.16.0-4.al8 04/01/2014 Call Trace: dump_stack_lvl+0x55/0x70 print_address_description.constprop.0+0x27/0x310 kasan_report+0x10f/0x120 ? strncpy_from_user+0x1d6/0x1f0 strncpy_from_user+0x1d6/0x1f0 ? rmqueue.constprop.0+0x70d/0x2ad0 process_fetch_insn+0xb26/0x1470 ? __pfx_process_fetch_insn+0x10/0x10 ? _raw_spin_lock+0x85/0xe0 ? __pfx__raw_spin_lock+0x10/0x10 ? __pte_offset_map+0x1f/0x2d0 ? unwind_next_frame+0xc5f/0x1f80 ? arch_stack_walk+0x68/0xf0 ? is_bpf_text_address+0x23/0x30 ? kernel_text_address.part.0+0xbb/0xd0 ? __kernel_text_address+0x66/0xb0 ? unwind_get_return_address+0x5e/0xa0 ? __pfx_stack_trace_consume_entry+0x10/0x10 ? arch_stack_walk+0xa2/0xf0 ? _raw_spin_lock_irqsave+0x8b/0xf0 ? __pfx__raw_spin_lock_irqsave+0x10/0x10 ? depot_alloc_stack+0x4c/0x1f0 ? _raw_spin_unlock_irqrestore+0xe/0x30 ? stack_depot_save_flags+0x35d/0x4f0 ? kasan_save_stack+0x34/0x50 ? kasan_save_stack+0x24/0x50 ? mutex_lock+0x91/0xe0 ? __pfx_mutex_lock+0x10/0x10 prepare_uprobe_buffer.part.0+0x2cd/0x500 uprobe_dispatcher+0x2c3/0x6a0 ? __pfx_uprobe_dispatcher+0x10/0x10 ? __kasan_slab_alloc+0x4d/0x90 handler_chain+0xdd/0x3e0 handle_swbp+0x26e/0x3d0 ? __pfx_handle_swbp+0x10/0x10 ? uprobe_pre_sstep_notifier+0x151/0x1b0 irqentry_exit_to_user_mode+0xe2/0x1b0 asm_exc_int3+0x39/0x40 RIP: 0033:0x401199 Code: 01 c2 0f b6 45 fb 88 02 83 45 fc 01 8b 45 fc 3b 45 e4 7c b7 8b 45 e4 48 98 48 8d 50 ff 48 8b 45 e8 48 01 d0 ce RSP: 002b:00007ffdf00576a8 EFLAGS: 00000206 RAX: 00007ffdf00576b0 RBX: 0000000000000000 RCX: 0000000000000ff2 RDX: 0000000000000ffc RSI: 0000000000000ffd RDI: 00007ffdf00576b0 RBP: 00007ffdf00586b0 R08: 00007feb2f9c0d20 R09: 00007feb2f9c0d20 R10: 0000000000000001 R11: 0000000000000202 R12: 0000000000401040 R13: 00007ffdf0058780 R14: 0000000000000000 R15: 0000000000000000 This commit enforces the buffer's maxlen less than a page-size to avoid store_trace_args() out-of-memory access. Link: https://lore.kernel.org/all/20241015060148.1108331-1-mqaio@linux.alibaba.com/ Fixes: dcad1a204f72 ("tracing/uprobes: Fetch args before reserving a ring buffer") Signed-off-by: Qiao Ma Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c40531d2cbad..13f9270ed5ab 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -875,6 +875,7 @@ struct uprobe_cpu_buffer { }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; +#define MAX_UCB_BUFFER_SIZE PAGE_SIZE static int uprobe_buffer_init(void) { @@ -979,6 +980,11 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, ucb = uprobe_buffer_get(); ucb->dsize = tu->tp.size + dsize; + if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { + ucb->dsize = MAX_UCB_BUFFER_SIZE; + dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; + } + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); *ucbp = ucb; @@ -998,9 +1004,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) - return; - if (trace_trigger_soft_disabled(trace_file)) return; -- cgit v1.2.3 From 1f97c03f43fadc407de5b5cb01c07755053e1c22 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 22 Oct 2024 21:01:33 +0800 Subject: bpf: Preserve param->string when parsing mount options In bpf_parse_param(), keep the value of param->string intact so it can be freed later. Otherwise, the kmalloc area pointed to by param->string will be leaked as shown below: unreferenced object 0xffff888118c46d20 (size 8): comm "new_name", pid 12109, jiffies 4295580214 hex dump (first 8 bytes): 61 6e 79 00 38 c9 5c 7e any.8.\~ backtrace (crc e1b7f876): [<00000000c6848ac7>] kmemleak_alloc+0x4b/0x80 [<00000000de9f7d00>] __kmalloc_node_track_caller_noprof+0x36e/0x4a0 [<000000003e29b886>] memdup_user+0x32/0xa0 [<0000000007248326>] strndup_user+0x46/0x60 [<0000000035b3dd29>] __x64_sys_fsconfig+0x368/0x3d0 [<0000000018657927>] x64_sys_call+0xff/0x9f0 [<00000000c0cabc95>] do_syscall_64+0x3b/0xc0 [<000000002f331597>] entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 6c1752e0b6ca ("bpf: Support symbolic BPF FS delegation mount options") Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241022130133.3798232-1-houtao@huaweicloud.com --- kernel/bpf/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d8fc5eba529d..9aaf5124648b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; - char *p; + char *p, *str; int val; /* ignore errors, fallback to hex */ @@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - while ((p = strsep(¶m->string, ":"))) { + str = param->string; + while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { -- cgit v1.2.3 From 6fad274f06f038c29660aa53fbad14241c9fd976 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Oct 2024 17:28:05 +0200 Subject: bpf: Add MEM_WRITE attribute Add a MEM_WRITE attribute for BPF helper functions which can be used in bpf_func_proto to annotate an argument type in order to let the verifier know that the helper writes into the memory passed as an argument. In the past MEM_UNINIT has been (ab)used for this function, but the latter merely tells the verifier that the passed memory can be uninitialized. There have been bugs with overloading the latter but aside from that there are also cases where the passed memory is read + written which currently cannot be expressed, see also 4b3786a6c539 ("bpf: Zero former ARG_PTR_TO_{LONG,INT} args in case of error"). Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241021152809.33343-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++++++--- kernel/bpf/helpers.c | 10 +++++----- kernel/bpf/ringbuf.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 4 ++-- net/core/filter.c | 4 ++-- 6 files changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..bdadb0bb6cec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -635,6 +635,7 @@ enum bpf_type_flag { */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), + /* MEM can be uninitialized. */ MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ @@ -700,6 +701,13 @@ enum bpf_type_flag { */ MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), + /* MEM is being written to, often combined with MEM_UNINIT. Non-presence + * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the + * MEM_UNINIT means that memory needs to be initialized since it is also + * read. + */ + MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -758,10 +766,10 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, - /* pointer to memory does not need to be initialized, helper function must fill - * all bytes or clear them in error case. + /* Pointer to memory does not need to be initialized, since helper function + * fills all bytes or clears them in error case. */ - ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, + ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..ca3f0a2e5ed5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) @@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; @@ -566,7 +566,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; @@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index de3b681d1d13..e1cfe890e0be 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8cfa7183d2ef..67179529080b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5892,7 +5892,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3bd402fa62a4..95b6b3b16bac 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1202,7 +1202,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u64), }; @@ -1219,7 +1219,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg2_size = sizeof(u64), }; diff --git a/net/core/filter.c b/net/core/filter.c index cb272b35d484..16b324d519da 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6368,7 +6368,7 @@ static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6380,7 +6380,7 @@ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, -- cgit v1.2.3 From 8ea607330a39184f51737c6ae706db7fdca7628e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Oct 2024 17:28:06 +0200 Subject: bpf: Fix overloading of MEM_UNINIT's meaning Lonial reported an issue in the BPF verifier where check_mem_size_reg() has the following code: if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to * initialize all the memory that the helper could * just partially fill up. */ meta = NULL; This means that writes are not checked when the register containing the size of the passed buffer has not a fixed size. Through this bug, a BPF program can write to a map which is marked as read-only, for example, .rodata global maps. The problem is that MEM_UNINIT's initial meaning that "the passed buffer to the BPF helper does not need to be initialized" which was added back in commit 435faee1aae9 ("bpf, verifier: add ARG_PTR_TO_RAW_STACK type") got overloaded over time with "the passed buffer is being written to". The problem however is that checks such as the above which were added later via 06c1c049721a ("bpf: allow helpers access to variable memory") set meta to NULL in order force the user to always initialize the passed buffer to the helper. Due to the current double meaning of MEM_UNINIT, this bypasses verifier write checks to the memory (not boundary checks though) and only assumes the latter memory is read instead. Fix this by reverting MEM_UNINIT back to its original meaning, and having MEM_WRITE as an annotation to BPF helpers in order to then trigger the BPF verifier checks for writing to memory. Some notes: check_arg_pair_ok() ensures that for ARG_CONST_SIZE{,_OR_ZERO} we can access fn->arg_type[arg - 1] since it must contain a preceding ARG_PTR_TO_MEM. For check_mem_reg() the meta argument can be removed altogether since we do check both BPF_READ and BPF_WRITE. Same for the equivalent check_kfunc_mem_size_reg(). Fixes: 7b3552d3f9f6 ("bpf: Reject writes for PTR_TO_MAP_KEY in check_helper_mem_access") Fixes: 97e6d7dab1ca ("bpf: Check PTR_TO_MEM | MEM_RDONLY in check_helper_mem_access") Fixes: 15baa55ff5b0 ("bpf/verifier: allow all functions to read user provided context") Reported-by: Lonial Con Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241021152809.33343-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 73 ++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 411ab1b57af4..f9e2f1cd4975 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7438,7 +7438,8 @@ mark: } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, + int access_size, enum bpf_access_type access_type, + bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -7450,7 +7451,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7458,15 +7459,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, - meta && meta->raw_mode ? BPF_WRITE : - BPF_READ)) + if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7477,7 +7476,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7505,7 +7504,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, * Dynamically check it now. */ if (!env->ops->convert_ctx_access) { - enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; int offset = access_size - 1; /* Allow zero-byte read from PTR_TO_CTX */ @@ -7513,7 +7511,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false, false); + access_type, -1, false, false); } fallthrough; @@ -7538,6 +7536,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, + enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7553,15 +7552,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, */ meta->msize_max_value = reg->umax_value; - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. + /* The register is SCALAR_VALUE; the access check happens using + * its boundaries. For unprivileged variable accesses, disable + * raw mode so that the program is required to initialize all + * the memory that the helper could just partially fill up. */ if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ meta = NULL; if (reg->smin_value < 0) { @@ -7581,9 +7577,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); + err = check_helper_mem_access(env, regno - 1, reg->umax_value, + access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; @@ -7594,13 +7589,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; - struct bpf_call_arg_meta meta; int err; if (register_is_null(reg)) return 0; - memset(&meta, 0, sizeof(meta)); /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -7610,10 +7603,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7639,13 +7630,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; + return err; } @@ -8948,9 +8938,8 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); + err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, + BPF_READ, false, NULL); break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -8965,9 +8954,9 @@ skip_type_check: return -EACCES; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); + err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { @@ -9009,7 +8998,9 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) @@ -9017,10 +9008,16 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, false, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, true, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); -- cgit v1.2.3 From 73f35080477e893aa6f4c8d388352b871b288fbc Mon Sep 17 00:00:00 2001 From: Mikel Rychliski Date: Mon, 30 Sep 2024 16:26:54 -0400 Subject: tracing/probes: Fix MAX_TRACE_ARGS limit handling When creating a trace_probe we would set nr_args prior to truncating the arguments to MAX_TRACE_ARGS. However, we would only initialize arguments up to the limit. This caused invalid memory access when attempting to set up probes with more than 128 fetchargs. BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 UID: 0 PID: 1769 Comm: cat Not tainted 6.11.0-rc7+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:__set_print_fmt+0x134/0x330 Resolve the issue by applying the MAX_TRACE_ARGS limit earlier. Return an error when there are too many arguments instead of silently truncating. Link: https://lore.kernel.org/all/20240930202656.292869-1-mikel@mikelr.com/ Fixes: 035ba76014c0 ("tracing/probes: cleanup: Set trace_probe::nr_args at trace_probe_init") Signed-off-by: Mikel Rychliski Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 7 ++++++- kernel/trace/trace_fprobe.c | 6 +++++- kernel/trace/trace_kprobe.c | 6 +++++- kernel/trace/trace_uprobe.c | 4 +++- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b0e0ec85912e..ebda68ee9abf 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -912,6 +912,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } } + if (argc - 2 > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto error; + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) argc -= 2; argv += 2; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ret = trace_eprobe_tp_update_arg(ep, argv, i); if (ret) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index a079abd8955b..c62d1629cffe 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1187,6 +1187,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto out; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1203,7 +1207,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 61a6da808203..263fac44d3ca 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1013,6 +1013,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto out; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1029,7 +1033,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 13f9270ed5ab..b30fc8fcd095 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -565,6 +565,8 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; + if (argc - 2 > MAX_TRACE_ARGS) + return -E2BIG; if (argv[0][1] == ':') event = &argv[0][2]; @@ -690,7 +692,7 @@ static int __trace_uprobe_create(int argc, const char **argv) tu->filename = filename; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { struct traceprobe_parse_context ctx = { .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, }; -- cgit v1.2.3 From 0b6e2e22cb23105fcb171ab92f0f7516c69c8471 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 7 Oct 2024 15:47:24 +0100 Subject: tracing: Consider the NULL character when validating the event length strlen() returns a string length excluding the null byte. If the string length equals to the maximum buffer length, the buffer will have no space for the NULL terminating character. This commit checks this condition and returns failure for it. Link: https://lore.kernel.org/all/20241007144724.920954-1-leo.yan@arm.com/ Fixes: dec65d79fd26 ("tracing/probe: Check event name length correctly") Signed-off-by: Leo Yan Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 39877c80d6cb..16a5e368e7b7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -276,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; - } else if (len > MAX_EVENT_NAME_LEN) { + } else if (len >= MAX_EVENT_NAME_LEN) { trace_probe_log_err(offset, EVENT_TOO_LONG); return -EINVAL; } -- cgit v1.2.3 From 6e62807c7fbb3c758d233018caf94dfea9c65dbd Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 18 Oct 2024 18:07:48 +0800 Subject: posix-clock: posix-clock: Fix unbalanced locking in pc_clock_settime() If get_clock_desc() succeeds, it calls fget() for the clockid's fd, and get the clk->rwsem read lock, so the error path should release the lock to make the lock balance and fput the clockid's fd to make the refcount balance and release the fd related resource. However the below commit left the error path locked behind resulting in unbalanced locking. Check timespec64_valid_strict() before get_clock_desc() to fix it, because the "ts" is not changed after that. Fixes: d8794ac20a29 ("posix-clock: Fix missing timespec64 check in pc_clock_settime()") Acked-by: Richard Cochran Signed-off-by: Jinjie Ruan Acked-by: Anna-Maria Behnsen [pabeni@redhat.com: fixed commit message typo] Signed-off-by: Paolo Abeni --- kernel/time/posix-clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 316a4e8c97d3..1af0bb2cc45c 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -309,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) struct posix_clock_desc cd; int err; + if (!timespec64_valid_strict(ts)) + return -EINVAL; + err = get_clock_desc(id, &cd); if (err) return err; @@ -318,9 +321,6 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) goto out; } - if (!timespec64_valid_strict(ts)) - return -EINVAL; - if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else -- cgit v1.2.3 From 0ee288e69d033850bc87abe0f9cc3ada24763d7f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Oct 2024 22:03:52 +0200 Subject: bpf,perf: Fix perf_event_detach_bpf_prog error handling Peter reported that perf_event_detach_bpf_prog might skip to release the bpf program for -ENOENT error from bpf_prog_array_copy. This can't happen because bpf program is stored in perf event and is detached and released only when perf event is freed. Let's drop the -ENOENT check and make sure the bpf program is released in any case. Fixes: 170a7e3ea070 ("bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not found") Reported-by: Peter Zijlstra Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241023200352.3488610-1-jolsa@kernel.org Closes: https://lore.kernel.org/lkml/20241022111638.GC16066@noisy.programming.kicks-ass.net/ --- kernel/trace/bpf_trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 95b6b3b16bac..630b763e5240 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2216,8 +2216,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); - if (ret == -ENOENT) - goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { -- cgit v1.2.3 From 9806f283140ef3e4d259b7646bd8c66026bbaac5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 23 Oct 2024 09:19:16 -0700 Subject: bpf: fix do_misc_fixups() for bpf_get_branch_snapshot() We need `goto next_insn;` at the end of patching instead of `continue;`. It currently works by accident by making verifier re-process patched instructions. Reported-by: Shung-Hsi Yu Fixes: 314a53623cd4 ("bpf: inline bpf_get_branch_snapshot() helper") Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20241023161916.2896274-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f9e2f1cd4975..587a6c76e564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21210,7 +21210,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_kptr_xchg inline */ -- cgit v1.2.3 From 8421d4c8762bd022cb491f2f0f7019ef51b4f0a7 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Oct 2024 09:35:58 +0800 Subject: bpf: Check validity of link->type in bpf_link_show_fdinfo() If a newly-added link type doesn't invoke BPF_LINK_TYPE(), accessing bpf_link_type_strs[link->type] may result in an out-of-bounds access. To spot such missed invocations early in the future, checking the validity of link->type in bpf_link_show_fdinfo() and emitting a warning when such invocations are missed. Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241024013558.1135167-3-houtao@huaweicloud.com --- kernel/bpf/syscall.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 67179529080b..c5aa127ed4cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,13 +3069,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, -- cgit v1.2.3