From 243911982aa9faf4361aa952f879331ad66933fe Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Mon, 7 Apr 2025 11:57:51 +0800 Subject: bpf: Check link_create.flags parameter for multi_kprobe The link_create.flags are currently not used for multi-kprobes, so return -EINVAL if it is set, same as for other attach APIs. We allow target_fd, on the other hand, to have an arbitrary value for multi-kprobe, as there are existing users (libbpf) relying on this. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250407035752.1108927-1-chen.dylane@linux.dev --- kernel/trace/bpf_trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187dc37d61d4..ec19942321e6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2987,6 +2987,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; -- cgit v1.2.3 From a76116f422c442ab691b4dcabb25613486d34360 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Mon, 7 Apr 2025 11:57:52 +0800 Subject: bpf: Check link_create.flags parameter for multi_uprobe The link_create.flags are currently not used for multi-uprobes, so return -EINVAL if it is set, same as for other attach APIs. We allow target_fd to have an arbitrary value for multi-uprobe, though, as there are existing users (libbpf) relying on this. Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250407035752.1108927-2-chen.dylane@linux.dev --- kernel/trace/bpf_trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ec19942321e6..0f5906f43d7c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3379,6 +3379,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_uprobe_multi(prog)) return -EINVAL; -- cgit v1.2.3 From ba2b31b0f39fca12abbd21c53a92838bbc026023 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:45 +0800 Subject: bpf: Factor out htab_elem_value helper() All hash maps store map key and map value together. The relative offset of the map value compared to the map key is round_up(key_size, 8). Therefore, factor out a common helper htab_elem_value() to calculate the address of the map value instead of duplicating the logic. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 64 ++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5a5adc66b8e2..0bebc919bbf7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,20 +175,25 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + roundup(key_size, 8)); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -215,10 +220,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -245,7 +250,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -670,7 +676,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -709,7 +715,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -763,7 +769,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -1039,11 +1045,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1106,7 +1110,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1134,7 +1138,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1220,8 +1224,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); ret = htab_lock_bucket(b, &flags); if (ret) @@ -1500,10 +1503,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1615,15 +1618,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1680,12 +1680,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1720,7 +1720,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1812,7 +1811,7 @@ again_nocopy: off += size; } } else { - value = l->key + roundup_key_size; + value = htab_elem_value(l, key_size); if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { struct bpf_map **inner_map = value; @@ -2063,11 +2062,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2077,10 +2076,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); @@ -2165,7 +2163,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2180,7 +2177,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). @@ -2196,7 +2192,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, -- cgit v1.2.3 From 5771e306b6cd8ce5b9935d006765f887f145e6d5 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:46 +0800 Subject: bpf: Rename __htab_percpu_map_update_elem to htab_map_update_elem_in_place Rename __htab_percpu_map_update_elem to htab_map_update_elem_in_place, and add a new percpu argument for the helper to support in-place update for both per-cpu htab and htab of maps. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0bebc919bbf7..9778e9871d86 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1258,12 +1258,12 @@ err_lock_bucket: return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; @@ -1295,19 +1295,18 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ + /* Update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: htab_unlock_bucket(b, flags); return ret; @@ -1386,7 +1385,7 @@ err_lock_bucket: static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -2407,8 +2406,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; -- cgit v1.2.3 From 2c304172e03193bd02363ee8969444261f7b7a57 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:47 +0800 Subject: bpf: Support atomic update for htab of maps As reported by Cody Haas [1], when there is concurrent map lookup and map update operation in an existing element for htab of maps, the map lookup procedure may return -ENOENT unexpectedly. The root cause is twofold: 1) the update of existing element involves two separated list operation In htab_map_update_elem(), it first inserts the new element at the head of list, then it deletes the old element. Therefore, it is possible a lookup operation has already iterated to the middle of the list when a concurrent update operation begins, and the lookup operation will fail to find the target element. 2) the immediate reuse of htab element. It is more subtle. Even through the lookup operation finds the old element, it is possible that the target element has been removed by a concurrent update operation, and the element has been reused immediately by other update operation which runs on the same CPU as the previous update operation, and the element is inserted into the same bucket list. After these steps above, when the lookup operation tries to compare the key in the old element with the expected key, the match will fail because the key in the old element have been overwritten by other update operation. The two-step update process is relatively straightforward to address. The more challenging aspect is the immediate reuse. As Alexei pointed out: So since 2022 both prealloc and no_prealloc reuse elements. We can consider a new flag for the hash map like F_REUSE_AFTER_RCU_GP that will use _rcu() flavor of freeing into bpf_ma, but it has to have a strong reason. Given that htab of maps doesn't support special field in value and directly stores the inner map pointer in htab_element, just do in-place update for htab of maps instead of attempting to address the immediate reuse issue. [1]: https://lore.kernel.org/xdp-newbies/CAH7f-ULFTwKdoH_t2SFc5rWCVYLEg-14d1fBYWH2eekudsnTRg@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9778e9871d86..4879c79dd677 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1076,10 +1076,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1160,24 +1159,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock. + * its special fields before it is available for reuse. */ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } } htab_unlock_bucket(b, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); - } + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: htab_unlock_bucket(b, flags); @@ -1265,6 +1254,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1296,8 +1286,15 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, if (l_old) { /* Update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, percpu, onallcpus, NULL); @@ -1309,6 +1306,8 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } err: htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -2531,24 +2530,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). + * the WARN_ON_ONCE in htab_map_update_elem_in_place(). */ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); -- cgit v1.2.3 From e8a65856c75d518d0bb15f38c90a4fd264ba1d3a Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:48 +0800 Subject: bpf: Add is_fd_htab() helper Add is_fd_htab() helper to check whether the map is htab of maps. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4879c79dd677..097992efef05 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -175,6 +175,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) { return l->key + round_up(key_size, 8); @@ -974,8 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1810,7 +1814,7 @@ again_nocopy: } } else { value = htab_elem_value(l, key_size); - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ -- cgit v1.2.3 From 6704b1e8cfc5eed264065735fe00a1dd8a0bffef Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 1 Apr 2025 14:22:49 +0800 Subject: bpf: Don't allocate per-cpu extra_elems for fd htab The update of element in fd htab is in-place now, therefore, there is no need to allocate per-cpu extra_elems, just remove it. Acked-by: Andrii Nakryiko Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250401062250.543403-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 097992efef05..2e18d7e50d9b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -206,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) @@ -464,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -560,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; -- cgit v1.2.3 From 53ebef53a657d7957d35dc2b953db64f1bb28065 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Fri, 18 Apr 2025 15:49:43 +0800 Subject: bpf: Use proper type to calculate bpf_raw_tp_null_args.mask index The calculation of the index used to access the mask field in 'struct bpf_raw_tp_null_args' is done with 'int' type, which could overflow when the tracepoint being attached has more than 8 arguments. While none of the tracepoints mentioned in raw_tp_null_args[] currently have more than 8 arguments, there do exist tracepoints that had more than 8 arguments (e.g. iocost_iocg_forgive_debt), so use the correct type for calculation and avoid Smatch static checker warning. Reported-by: Dan Carpenter Signed-off-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20250418074946.35569-1-shung-hsi.yu@suse.com Closes: https://lore.kernel.org/r/843a3b94-d53d-42db-93d4-be10a4090146@stanley.mountain/ --- kernel/bpf/btf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfa..656ee11aff67 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6829,10 +6829,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; - if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ - if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } -- cgit v1.2.3 From 6aca583f90b0eb159cfd79c1b7f28d7c0108aed6 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Wed, 23 Apr 2025 15:31:51 +0800 Subject: bpf: Streamline allowed helpers between tracing and base sets Many conditional checks in switch-case are redundant with bpf_base_func_proto and should be removed. Regarding the permission checks bpf_base_func_proto: The permission checks in bpf_prog_load (as outlined below) ensure that the trace has both CAP_BPF and CAP_PERFMON capabilities, thus enabling the use of corresponding prototypes in bpf_base_func_proto without adverse effects. bpf_prog_load ...... bpf_cap = bpf_token_capable(token, CAP_BPF); ...... if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !bpf_cap) goto put_token; ...... if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; ...... Signed-off-by: Feng Yang Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20250423073151.297103-1-yangfeng59949@163.com --- kernel/trace/bpf_trace.c | 72 ------------------------------------------------ 1 file changed, 72 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0f5906f43d7c..52c432a44aeb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1430,56 +1430,14 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto *func_proto; switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_map_lookup_percpu_elem: - return &bpf_map_lookup_percpu_elem_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - case BPF_FUNC_ktime_get_boot_ns: - return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_get_current_task: - return &bpf_get_current_task_proto; - case BPF_FUNC_get_current_task_btf: - return &bpf_get_current_task_btf_proto; - case BPF_FUNC_task_pt_regs: - return &bpf_task_pt_regs_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_read_user: - return &bpf_probe_read_user_proto; - case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read_user_str: - return &bpf_probe_read_user_str_proto; - case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1489,10 +1447,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS - case BPF_FUNC_cgrp_storage_get: - return &bpf_cgrp_storage_get_proto; - case BPF_FUNC_cgrp_storage_delete: - return &bpf_cgrp_storage_delete_proto; case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; #endif @@ -1500,20 +1454,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; - case BPF_FUNC_ringbuf_output: - return &bpf_ringbuf_output_proto; - case BPF_FUNC_ringbuf_reserve: - return &bpf_ringbuf_reserve_proto; - case BPF_FUNC_ringbuf_submit: - return &bpf_ringbuf_submit_proto; - case BPF_FUNC_ringbuf_discard: - return &bpf_ringbuf_discard_proto; - case BPF_FUNC_ringbuf_query: - return &bpf_ringbuf_query_proto; - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return prog->sleepable ? &bpf_get_task_stack_sleepable_proto : &bpf_get_task_stack_proto; @@ -1521,12 +1461,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: return &bpf_copy_from_user_task_proto; - case BPF_FUNC_snprintf_btf: - return &bpf_snprintf_btf_proto; - case BPF_FUNC_per_cpu_ptr: - return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_this_cpu_ptr: - return &bpf_this_cpu_ptr_proto; case BPF_FUNC_task_storage_get: if (bpf_prog_check_recur(prog)) return &bpf_task_storage_get_recur_proto; @@ -1535,18 +1469,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (bpf_prog_check_recur(prog)) return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_snprintf: - return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; case BPF_FUNC_find_vma: return &bpf_find_vma_proto; - case BPF_FUNC_trace_vprintk: - return bpf_get_trace_vprintk_proto(); default: break; } -- cgit v1.2.3 From 1271a40eeafa8e9b5b76c4d02e2b3812cbc3c280 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Wed, 23 Apr 2025 20:13:28 +0800 Subject: bpf: Allow access to const void pointer arguments in tracing programs Adding support to access arguments with const void pointer arguments in tracing programs. Currently we allow tracing programs to access void pointers. If we try to access argument which is pointer to const void like 2nd argument in kfree, verifier will fail to load the program with; 0: R1=ctx() R10=fp0 ; asm volatile ("r2 = *(u64 *)(r1 + 8); "); 0: (79) r2 = *(u64 *)(r1 +8) func 'kfree' arg1 type UNKNOWN is not a struct Changing the is_int_ptr to void and generic integer check and renaming it to is_void_or_int_ptr. Signed-off-by: KaFai Wan Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250423121329.3163461-2-mannkafai@gmail.com --- kernel/bpf/btf.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 656ee11aff67..a91822bae043 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6383,12 +6383,11 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_int_ptr(struct btf *btf, const struct btf_type *t) +static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); - - return btf_type_is_int(t); + return btf_type_is_void(t) || btf_type_is_int(t); } static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, @@ -6776,14 +6775,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } - if (t->type == 0) - /* This is a pointer to void. - * It is the same as scalar from the verifier safety pov. - * No further pointer walking is allowed. - */ - return true; - - if (is_int_ptr(btf, t)) + /* + * If it's a pointer to void, it's the same as scalar from the verifier + * safety POV. Either way, no futher pointer walking is allowed. + */ + if (is_void_or_int_ptr(btf, t)) return true; /* this is a pointer to another type */ -- cgit v1.2.3 From 7b05f43155cb128aa06a226afdbc3daa8d75b358 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 28 Apr 2025 23:06:39 +0200 Subject: bpf: Replace offsetof() with struct_size() Compared to offsetof(), struct_size() provides additional compile-time checks for structs with flexible arrays (e.g., __must_be_array()). No functional changes intended. Signed-off-by: Thorsten Blum Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250428210638.30219-2-thorsten.blum@linux.dev --- kernel/bpf/syscall.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64c3393e8270..df33d19c5c3b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -693,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) if (IS_ERR_OR_NULL(rec)) return NULL; - size = offsetof(struct btf_record, fields[rec->cnt]); + size = struct_size(rec, fields, rec->cnt); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); @@ -748,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return false; if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct btf_record, fields[rec_a->cnt]); + size = struct_size(rec_a, fields, rec_a->cnt); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. -- cgit v1.2.3 From 714070c4cb7a10ff57450a618a936775f3036245 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 28 Apr 2025 17:44:02 +0200 Subject: bpf: Allow XDP dev-bound programs to perform XDP_REDIRECT into maps In the current implementation if the program is dev-bound to a specific device, it will not be possible to perform XDP_REDIRECT into a DEVMAP or CPUMAP even if the program is running in the driver NAPI context and it is not attached to any map entry. This seems in contrast with the explanation available in bpf_prog_map_compatible routine. Fix the issue introducing __bpf_prog_map_compatible utility routine in order to avoid bpf_prog_is_dev_bound() check running bpf_check_tail_call() at program load time (bpf_prog_select_runtime()). Continue forbidding to attach a dev-bound program to XDP maps (BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_DEVMAP and BPF_MAP_TYPE_CPUMAP). Fixes: 3d76a4d3d4e59 ("bpf: XDP metadata RX kfuncs") Signed-off-by: Lorenzo Bianconi Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev --- kernel/bpf/core.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba6b6118cf50..a3e571688421 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2358,8 +2358,8 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, return 0; } -bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; @@ -2368,14 +2368,6 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; - /* XDP programs inserted into maps are not guaranteed to run on - * a particular netdev (and can run outside driver context entirely - * in the case of devmap and cpumap). Until device checks - * are implemented, prohibit adding dev-bound programs to program maps. - */ - if (bpf_prog_is_dev_bound(aux)) - return false; - spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2409,6 +2401,19 @@ bool bpf_prog_map_compatible(struct bpf_map *map, return ret; } +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) +{ + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2421,7 +2426,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) if (!map_type_contains_progs(map)) continue; - if (!bpf_prog_map_compatible(map, fp)) { + if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 41948afcf503b5667637a0b3ab279a061f559bec Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 3 May 2025 17:15:13 +0200 Subject: bpf: Replace offsetof() with struct_size() Compared to offsetof(), struct_size() provides additional compile-time checks for structs with flexible arrays (e.g., __must_be_array()). No functional changes intended. Signed-off-by: Thorsten Blum Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250503151513.343931-2-thorsten.blum@linux.dev --- kernel/bpf/btf.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a91822bae043..6b21ca67070c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -3957,7 +3958,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ - rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); @@ -5583,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (id < 0) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5610,7 +5611,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (ret != BTF_FIELD_FOUND) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5647,7 +5648,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) continue; parse: tab_cnt = tab ? tab->cnt : 0; - new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; @@ -8559,7 +8560,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, /* Grow set */ set = krealloc(tab->sets[hook], - offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), + struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -8845,7 +8846,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c } tab = krealloc(btf->dtor_kfunc_tab, - offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + struct_size(tab, dtors, tab_cnt + add_cnt), GFP_KERNEL | __GFP_NOWARN); if (!tab) { ret = -ENOMEM; @@ -9403,8 +9404,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, tab = btf->struct_ops_tab; if (!tab) { - tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), - GFP_KERNEL); + tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL); if (!tab) return -ENOMEM; tab->capacity = 4; @@ -9417,8 +9417,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, - offsetof(struct btf_struct_ops_tab, - ops[tab->capacity * 2]), + struct_size(tab, ops, tab->capacity * 2), GFP_KERNEL); if (!new_tab) return -ENOMEM; -- cgit v1.2.3 From b183c0123d9ba16e147c990c02a9e6f37cac5df4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 May 2025 18:58:48 -0700 Subject: bpf: Check KF_bpf_rbtree_add_impl for the "case KF_ARG_PTR_TO_RB_NODE" In a later patch, two new kfuncs will take the bpf_rb_node pointer arg. struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node); struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node); In the check_kfunc_call, there is a "case KF_ARG_PTR_TO_RB_NODE" to check if the reg->type should be an allocated pointer or should be a non_owning_ref. The later patch will need to ensure that the bpf_rb_node pointer passing to the new bpf_rbtree_{left,right} must be a non_owning_ref. This should be the same requirement as the existing bpf_rbtree_remove. This patch swaps the current "if else" statement. Instead of checking the bpf_rbtree_remove, it checks the bpf_rbtree_add. Then the new bpf_rbtree_{left,right} will fall into the "else" case to make the later patch simpler. bpf_rbtree_add should be the only one that needs an allocated pointer. This should be a no-op change considering there are only two kfunc(s) taking bpf_rb_node pointer arg, rbtree_add and rbtree_remove. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250506015857.817950-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 54c6953a8b84..2e1ce7debc16 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13200,22 +13200,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } - if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { + verbose(env, "rbtree_remove node input must be non-owning ref\n"); return -EINVAL; } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "rbtree_remove not allowed in rbtree cb\n"); return -EINVAL; } } -- cgit v1.2.3 From 7faccdf4b47d2c7674692aecdb5847da0f84dbd4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 May 2025 18:58:49 -0700 Subject: bpf: Simplify reg0 marking for the rbtree kfuncs that return a bpf_rb_node pointer The current rbtree kfunc, bpf_rbtree_{first, remove}, returns the bpf_rb_node pointer. The check_kfunc_call currently checks the kfunc btf_id instead of its return pointer type to decide if it needs to do mark_reg_graph_node(reg0) and ref_set_non_owning(reg0). The later patch will add bpf_rbtree_{root,left,right} that will also return a bpf_rb_node pointer. Instead of adding more kfunc btf_id checks to the "if" case, this patch changes the test to check the kfunc's return type. is_rbtree_node_type() function is added to test if a pointer type is a bpf_rb_node. The callers have already skipped the modifiers of the pointer type. A note on the ref_set_non_owning(), although bpf_rbtree_remove() also returns a bpf_rb_node pointer, the bpf_rbtree_remove() has the KF_ACQUIRE flag. Thus, its reg0 will not become non-owning. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250506015857.817950-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e1ce7debc16..bf14da00f09a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11987,6 +11987,11 @@ static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_p return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); } +static bool is_rbtree_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -13750,8 +13755,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct btf_field *field = meta.arg_list_head.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type)) { struct btf_field *field = meta.arg_rbtree_root.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); @@ -13881,7 +13885,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } -- cgit v1.2.3 From 9e3e66c553f705de51707c7ddc7f35ce159a8ef1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 May 2025 18:58:50 -0700 Subject: bpf: Add bpf_rbtree_{root,left,right} kfunc In a bpf fq implementation that is much closer to the kernel fq, it will need to traverse the rbtree: https://lore.kernel.org/bpf/20250418224652.105998-13-martin.lau@linux.dev/ The much simplified logic that uses the bpf_rbtree_{root,left,right} to traverse the rbtree is like: struct fq_flow { struct bpf_rb_node fq_node; struct bpf_rb_node rate_node; struct bpf_refcount refcount; unsigned long sk_long; }; struct fq_flow_root { struct bpf_spin_lock lock; struct bpf_rb_root root __contains(fq_flow, fq_node); }; struct fq_flow *fq_classify(...) { struct bpf_rb_node *tofree[FQ_GC_MAX]; struct fq_flow_root *root; struct fq_flow *gc_f, *f; struct bpf_rb_node *p; int i, fcnt = 0; /* ... */ f = NULL; bpf_spin_lock(&root->lock); p = bpf_rbtree_root(&root->root); while (can_loop) { if (!p) break; gc_f = bpf_rb_entry(p, struct fq_flow, fq_node); if (gc_f->sk_long == sk_long) { f = bpf_refcount_acquire(gc_f); break; } /* To be removed from the rbtree */ if (fcnt < FQ_GC_MAX && fq_gc_candidate(gc_f, jiffies_now)) tofree[fcnt++] = p; if (gc_f->sk_long > sk_long) p = bpf_rbtree_left(&root->root, p); else p = bpf_rbtree_right(&root->root, p); } /* remove from the rbtree */ for (i = 0; i < fcnt; i++) { p = tofree[i]; tofree[i] = bpf_rbtree_remove(&root->root, p); } bpf_spin_unlock(&root->lock); /* bpf_obj_drop the fq_flow(s) that have just been removed * from the rbtree. */ for (i = 0; i < fcnt; i++) { p = tofree[i]; if (p) { gc_f = bpf_rb_entry(p, struct fq_flow, fq_node); bpf_obj_drop(gc_f); } } return f; } The above simplified code needs to traverse the rbtree for two purposes, 1) find the flow with the desired sk_long value 2) while searching for the sk_long, collect flows that are the fq_gc_candidate. They will be removed from the rbtree. This patch adds the bpf_rbtree_{root,left,right} kfunc to enable the rbtree traversal. The returned bpf_rb_node pointer will be a non-owning reference which is the same as the returned pointer of the exisiting bpf_rbtree_first kfunc. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250506015857.817950-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 ++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 22 ++++++++++++++++++---- 2 files changed, 48 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e3a2662f4e33..36150d340c16 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2366,6 +2366,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) return (struct bpf_rb_node *)rb_first_cached(r); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)r->rb_root.rb_node; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_left; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_right; +} + /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -3214,6 +3241,9 @@ BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf14da00f09a..51a17e64a0a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12081,6 +12081,9 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, KF_bpf_rbtree_first, + KF_bpf_rbtree_root, + KF_bpf_rbtree_left, + KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_slice, @@ -12121,6 +12124,9 @@ BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -12156,6 +12162,9 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -12591,7 +12600,10 @@ static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - btf_id == special_kfunc_list[KF_bpf_rbtree_first]; + btf_id == special_kfunc_list[KF_bpf_rbtree_first] || + btf_id == special_kfunc_list[KF_bpf_rbtree_root] || + btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) @@ -12691,7 +12703,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, break; case BPF_RB_NODE: ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -13216,11 +13230,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } else { if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + verbose(env, "%s node input must be non-owning ref\n", func_name); return -EINVAL; } if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } -- cgit v1.2.3 From 2ddef1783c43ba81a05dec0a5781ebbc61a3c089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 May 2025 18:58:51 -0700 Subject: bpf: Allow refcounted bpf_rb_node used in bpf_rbtree_{remove,left,right} The bpf_rbtree_{remove,left,right} requires the root's lock to be held. They also check the node_internal->owner is still owned by that root before proceeding, so it is safe to allow refcounted bpf_rb_node pointer to be used in these kfuncs. In a bpf fq implementation which is much closer to the kernel fq, https://lore.kernel.org/bpf/20250418224652.105998-13-martin.lau@linux.dev/, a networking flow (allocated by bpf_obj_new) can be added to two different rbtrees. There are cases that the flow is searched from one rbtree, held the refcount of the flow, and then removed from another rbtree: struct fq_flow { struct bpf_rb_node fq_node; struct bpf_rb_node rate_node; struct bpf_refcount refcount; unsigned long sk_long; }; int bpf_fq_enqueue(...) { /* ... */ bpf_spin_lock(&root->lock); while (can_loop) { /* ... */ if (!p) break; gc_f = bpf_rb_entry(p, struct fq_flow, fq_node); if (gc_f->sk_long == sk_long) { f = bpf_refcount_acquire(gc_f); break; } /* ... */ } bpf_spin_unlock(&root->lock); if (f) { bpf_spin_lock(&q->lock); bpf_rbtree_remove(&q->delayed, &f->rate_node); bpf_spin_unlock(&q->lock); } } bpf_rbtree_{left,right} do not need this change but are relaxed together with bpf_rbtree_remove instead of adding extra verifier logic to exclude these kfuncs. To avoid bi-sect failure, this patch also changes the selftests together. The "rbtree_api_remove_unadded_node" is not expecting verifier's error. The test now expects bpf_rbtree_remove(&groot, &m->node) to return NULL. The test uses __retval(0) to ensure this NULL return value. Some of the "only take non-owning..." failure messages are changed also. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250506015857.817950-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- tools/testing/selftests/bpf/progs/rbtree_fail.c | 29 +++++++++++++------------ 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51a17e64a0a9..9093a351b0b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13229,8 +13229,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } else { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "%s node input must be non-owning ref\n", func_name); + if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } if (in_rbtree_lock_required_cb(env)) { diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index dbd5eee8e25e..4acb6af2dfe3 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -69,11 +69,11 @@ long rbtree_api_nolock_first(void *ctx) } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__retval(0) long rbtree_api_remove_unadded_node(void *ctx) { struct node_data *n, *m; - struct bpf_rb_node *res; + struct bpf_rb_node *res_n, *res_m; n = bpf_obj_new(typeof(*n)); if (!n) @@ -88,19 +88,20 @@ long rbtree_api_remove_unadded_node(void *ctx) bpf_spin_lock(&glock); bpf_rbtree_add(&groot, &n->node, less); - /* This remove should pass verifier */ - res = bpf_rbtree_remove(&groot, &n->node); - n = container_of(res, struct node_data, node); + res_n = bpf_rbtree_remove(&groot, &n->node); - /* This remove shouldn't, m isn't in an rbtree */ - res = bpf_rbtree_remove(&groot, &m->node); - m = container_of(res, struct node_data, node); + res_m = bpf_rbtree_remove(&groot, &m->node); bpf_spin_unlock(&glock); - if (n) - bpf_obj_drop(n); - if (m) - bpf_obj_drop(m); + bpf_obj_drop(m); + if (res_n) + bpf_obj_drop(container_of(res_n, struct node_data, node)); + if (res_m) { + bpf_obj_drop(container_of(res_m, struct node_data, node)); + /* m was not added to the rbtree */ + return 2; + } + return 0; } @@ -178,7 +179,7 @@ err_out: } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__failure __msg("bpf_rbtree_remove can only take non-owning or refcounted bpf_rb_node pointer") long rbtree_api_add_release_unlock_escape(void *ctx) { struct node_data *n; @@ -202,7 +203,7 @@ long rbtree_api_add_release_unlock_escape(void *ctx) } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__failure __msg("bpf_rbtree_remove can only take non-owning or refcounted bpf_rb_node pointer") long rbtree_api_first_release_unlock_escape(void *ctx) { struct bpf_rb_node *res; -- cgit v1.2.3 From 3fab84f00d3274e1fd19054a409a9c804261e4b9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 May 2025 18:58:53 -0700 Subject: bpf: Simplify reg0 marking for the list kfuncs that return a bpf_list_node pointer The next patch will add bpf_list_{front,back} kfuncs to peek the head and tail of a list. Both of them will return a 'struct bpf_list_node *'. Follow the earlier change for rbtree, this patch checks the return btf type is a 'struct bpf_list_node' pointer instead of checking each kfuncs individually to decide if mark_reg_graph_node should be called. This will make the bpf_list_{front,back} kfunc addition easier in the later patch. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250506015857.817950-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9093a351b0b3..acb2f44316cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11992,6 +11992,11 @@ static bool is_rbtree_node_type(const struct btf_type *t) return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); } +static bool is_list_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -13764,8 +13769,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { + } else if (is_list_node_type(ptr_type)) { struct btf_field *field = meta.arg_list_head.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); -- cgit v1.2.3 From fb5b480205bad3936b054b86f7c9d2bd7835caac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 May 2025 18:58:54 -0700 Subject: bpf: Add bpf_list_{front,back} kfunc In the kernel fq qdisc implementation, it only needs to look at the fields of the first node in a list but does not always need to remove it from the list. It is more convenient to have a peek kfunc for the list. It works similar to the bpf_rbtree_first(). This patch adds bpf_list_{front,back} kfunc. The verifier is changed such that the kfunc returning "struct bpf_list_node *" will be marked as non-owning. The exception is the KF_ACQUIRE kfunc. The net effect is only the new bpf_list_{front,back} kfuncs will have its return pointer marked as non-owning. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250506015857.817950-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 22 ++++++++++++++++++++++ kernel/bpf/verifier.c | 12 ++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 36150d340c16..78cefb41266a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2293,6 +2293,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->next; +} + +__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->prev; +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -3236,6 +3256,8 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index acb2f44316cc..99aa2c890e7b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12079,6 +12079,8 @@ enum special_kfunc_type { KF_bpf_list_push_back_impl, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_front, + KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -12124,6 +12126,8 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) @@ -12160,6 +12164,8 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -12598,7 +12604,9 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || - btf_id == special_kfunc_list[KF_bpf_list_pop_back]; + btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_front] || + btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) @@ -13903,7 +13911,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (is_rbtree_node_type(ptr_type)) { + } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } -- cgit v1.2.3 From fce7bd8e385a6c282d70bed39d82ce805eeafbee Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Wed, 7 May 2025 03:42:45 +0000 Subject: bpf/verifier: Handle BPF_LOAD_ACQ instructions in insn_def_regno() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for supporting BPF load-acquire and store-release instructions for architectures where bpf_jit_needs_zext() returns true (e.g. riscv64), make insn_def_regno() handle load-acquires properly. Acked-by: Björn Töpel Tested-by: Björn Töpel # QEMU/RVA23 Signed-off-by: Peilin Ye Reviewed-by: Pu Lehui Link: https://lore.kernel.org/r/09cb2aec979aaed9d16db41f0f5b364de39377c0.1746588351.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99aa2c890e7b..28f5a7899bd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3649,16 +3649,16 @@ static int insn_def_regno(const struct bpf_insn *insn) case BPF_ST: return -1; case BPF_STX: - if ((BPF_MODE(insn->code) == BPF_ATOMIC || - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) && - (insn->imm & BPF_FETCH)) { + if (BPF_MODE(insn->code) == BPF_ATOMIC || + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { if (insn->imm == BPF_CMPXCHG) return BPF_REG_0; - else + else if (insn->imm == BPF_LOAD_ACQ) + return insn->dst_reg; + else if (insn->imm & BPF_FETCH) return insn->src_reg; - } else { - return -1; } + return -1; default: return insn->dst_reg; } -- cgit v1.2.3 From ee971630f20fd421fffcdc4543731ebcb54ed6d0 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Tue, 6 May 2025 14:14:33 +0800 Subject: bpf: Allow some trace helpers for all prog types if it works under NMI and doesn't use any context-dependent things, should be fine for any program type. The detailed discussion is in [1]. [1] https://lore.kernel.org/all/CAEf4Bza6gK3dsrTosk6k3oZgtHesNDSrDd8sdeQ-GiS6oJixQg@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Feng Yang Signed-off-by: Andrii Nakryiko Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20250506061434.94277-2-yangfeng59949@163.com --- include/linux/bpf-cgroup.h | 8 -------- kernel/bpf/cgroup.c | 32 -------------------------------- kernel/bpf/helpers.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 41 ++++------------------------------------- net/core/filter.c | 14 -------------- 5 files changed, 46 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9de7adb68294..4847dcade917 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -427,8 +427,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -465,12 +463,6 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } -static inline const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return NULL; -} - static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 84f58f3d028a..62a1d8deb3dc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1653,10 +1653,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; @@ -2204,10 +2200,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; @@ -2351,10 +2343,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2601,23 +2589,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } - -/* Common helpers for cgroup hooks with valid process context. */ -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; - default: - return NULL; - } -} diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 78cefb41266a..fed53da75025 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -1912,6 +1913,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; +const struct bpf_func_proto bpf_perf_event_read_proto __weak; +const struct bpf_func_proto bpf_send_signal_proto __weak; +const struct bpf_func_proto bpf_send_signal_thread_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_proto __weak; +const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1965,6 +1972,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; default: break; } @@ -2022,7 +2031,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; + return &bpf_task_storage_delete_proto; default: break; } @@ -2037,6 +2060,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: @@ -2047,6 +2072,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_copy_from_user: + return &bpf_copy_from_user_proto; + case BPF_FUNC_copy_from_user_task: + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: @@ -2057,6 +2086,19 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_trace_vprintk_proto(); case BPF_FUNC_perf_event_read_value: return bpf_get_perf_event_read_value_proto(); + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; + case BPF_FUNC_get_task_stack: + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; + case BPF_FUNC_get_branch_snapshot: + return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 52c432a44aeb..868920994517 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -572,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) return value; } -static const struct bpf_func_proto bpf_perf_event_read_proto = { +const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, @@ -882,7 +882,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_proto = { +const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, @@ -894,7 +894,7 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_thread_proto = { +const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1185,7 +1185,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return entry_cnt * br_entry_size; } -static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { +const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, @@ -1430,14 +1430,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto *func_proto; switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_perf_event_read: - return &bpf_perf_event_read_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1446,35 +1440,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif -#ifdef CONFIG_CGROUPS - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; -#endif - case BPF_FUNC_send_signal: - return &bpf_send_signal_proto; - case BPF_FUNC_send_signal_thread: - return &bpf_send_signal_thread_proto; - case BPF_FUNC_get_task_stack: - return prog->sleepable ? &bpf_get_task_stack_sleepable_proto - : &bpf_get_task_stack_proto; - case BPF_FUNC_copy_from_user: - return &bpf_copy_from_user_proto; - case BPF_FUNC_copy_from_user_task: - return &bpf_copy_from_user_task_proto; - case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; - return &bpf_task_storage_delete_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; - case BPF_FUNC_get_branch_snapshot: - return &bpf_get_branch_snapshot_proto; - case BPF_FUNC_find_vma: - return &bpf_find_vma_proto; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 79cab4d78dc3..30e7d3679088 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8022,10 +8022,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; @@ -8051,10 +8047,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { @@ -8488,18 +8480,12 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif default: return bpf_sk_base_func_proto(func_id, prog); } -- cgit v1.2.3 From 8c112a428b94eabb6efde39715349a0b8dec880e Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Tue, 6 May 2025 14:14:34 +0800 Subject: sched_ext: Remove bpf_scx_get_func_proto task_storage_{get,delete} has been moved to bpf_base_func_proto. Suggested-by: Andrii Nakryiko Signed-off-by: Feng Yang Signed-off-by: Andrii Nakryiko Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20250506061434.94277-3-yangfeng59949@163.com --- kernel/sched/ext.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fdbf249d1c68..cc628b009e11 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5586,21 +5586,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } -static const struct bpf_func_proto * -bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; - default: - return bpf_base_func_proto(func_id, prog); - } -} - static const struct bpf_verifier_ops bpf_scx_verifier_ops = { - .get_func_proto = bpf_scx_get_func_proto, + .get_func_proto = bpf_base_func_proto, .is_valid_access = bpf_scx_is_valid_access, .btf_struct_access = bpf_scx_btf_struct_access, }; -- cgit v1.2.3 From 823153334042746604fdb416ea358a90940c1d83 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 9 May 2025 17:35:37 +0200 Subject: bpf: Add support to retrieve ref_ctr_offset for uprobe perf link Adding support to retrieve ref_ctr_offset for uprobe perf link, which got somehow omitted from the initial uprobe link info changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yafang Shao Link: https://lore.kernel.org/bpf/20250509153539.779599-2-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 5 +++-- kernel/trace/trace_uprobe.c | 2 +- tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 71d5ac83cf5d..16e95398c91c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6724,6 +6724,7 @@ struct bpf_link_info { __u32 name_len; __u32 offset; /* offset from file_name */ __u64 cookie; + __u64 ref_ctr_offset; } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */ struct { __aligned_u64 func_name; /* in/out */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index df33d19c5c3b..4b5f29168618 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3800,14 +3800,14 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { + u64 ref_ctr_offset, offset; char __user *uname; - u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, &type, NULL); if (err) return err; @@ -3819,6 +3819,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; + info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3386439ec9f6..d9cf6ed2c106 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1489,7 +1489,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; - *probe_addr = 0; + *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 71d5ac83cf5d..16e95398c91c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6724,6 +6724,7 @@ struct bpf_link_info { __u32 name_len; __u32 offset; /* offset from file_name */ __u64 cookie; + __u64 ref_ctr_offset; } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */ struct { __aligned_u64 func_name; /* in/out */ -- cgit v1.2.3 From d060b6aab031b6113f78cd3d1585115f13386eec Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 12 May 2025 21:53:46 +0100 Subject: helpers: make few bpf helpers public Make bpf_dynptr_slice_rdwr, bpf_dynptr_check_off_len and __bpf_dynptr_write available outside of the helpers.c by adding their prototypes into linux/include/bpf.h. bpf_dynptr_check_off_len() implementation is moved to header and made inline explicitly, as small function should typically be inlined. These functions are going to be used from bpf_trace.c in the next patch of this series. Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20250512205348.191079-2-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 ++++++++++++++ kernel/bpf/helpers.c | 14 ++------------ 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f0cc89c0622..83c56f40842b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1349,6 +1349,20 @@ u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, + void *src, u32 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, + void *buffer__opt, u32 buffer__szk); + +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = __bpf_dynptr_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fed53da75025..c96cc6aeae99 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1714,16 +1714,6 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) -{ - u32 size = __bpf_dynptr_size(ptr); - - if (len > size || offset > size - len) - return -E2BIG; - - return 0; -} - BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1810,8 +1800,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; -- cgit v1.2.3 From a498ee7576de24b4b0916ce56cf2686e261a29f7 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 12 May 2025 21:53:47 +0100 Subject: bpf: Implement dynptr copy kfuncs This patch introduces a new set of kfuncs for working with dynptrs in BPF programs, enabling reading variable-length user or kernel data into dynptr directly. To enable memory-safety, verifier allows only constant-sized reads via existing bpf_probe_read_{user|kernel} etc. kfuncs, dynptr-based kfuncs allow dynamically-sized reads without memory safety shortcomings. The following kfuncs are introduced: * `bpf_probe_read_kernel_dynptr()`: probes kernel-space data into a dynptr * `bpf_probe_read_user_dynptr()`: probes user-space data into a dynptr * `bpf_probe_read_kernel_str_dynptr()`: probes kernel-space string into a dynptr * `bpf_probe_read_user_str_dynptr()`: probes user-space string into a dynptr * `bpf_copy_from_user_dynptr()`: sleepable, copies user-space data into a dynptr for the current task * `bpf_copy_from_user_str_dynptr()`: sleepable, copies user-space string into a dynptr for the current task * `bpf_copy_from_user_task_dynptr()`: sleepable, copies user-space data of the task into a dynptr * `bpf_copy_from_user_task_str_dynptr()`: sleepable, copies user-space string of the task into a dynptr The implementation is built on two generic functions: * __bpf_dynptr_copy * __bpf_dynptr_copy_str These functions take function pointers as arguments, enabling the copying of data from various sources, including both kernel and user space. Use __always_inline for generic functions and callbacks to make sure the compiler doesn't generate indirect calls into callbacks, which is more expensive, especially on some kernel configurations. Inlining allows compiler to put direct calls into all the specific callback implementations (copy_user_data_sleepable, copy_user_data_nofault, and so on). Reviewed-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20250512205348.191079-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 8 ++ kernel/trace/bpf_trace.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c96cc6aeae99..ebb604e39eb1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3378,6 +3378,14 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) +BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) +BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 868920994517..985ce3854af8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3466,6 +3466,142 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) late_initcall(bpf_kprobe_multi_kfuncs_init); +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); + +/* + * The __always_inline is to make sure the compiler doesn't + * generate indirect calls into callbacks, which is expensive, + * on some kernel configurations. This allows compiler to put + * direct calls into all the specific callback implementations + * (copy_user_data_sleepable, copy_user_data_nofault, and so on) + */ +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, + const void *unsafe_src, + copy_fn_t str_copy_fn, + struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + u32 chunk_sz, off; + void *dst_slice; + int cnt, err; + char buf[256]; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return str_copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz - 1) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + /* Expect str_copy_fn to return count of copied bytes, including + * zero terminator. Next iteration increment off by chunk_sz - 1 to + * overwrite NUL. + */ + cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (cnt < 0) + return cnt; + err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); + if (err) + return err; + if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ + return off + cnt; + } + return off; +} + +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, + u32 size, const void *unsafe_src, + copy_fn_t copy_fn, struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + void *dst_slice; + char buf[256]; + u32 off, chunk_sz; + int err; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (err) + return err; + err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); + if (err) + return err; + } + return 0; +} + +static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (!tsk) /* Read from the current task */ + return copy_from_user(dst, (const void __user *)unsafe_src, size); + + ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); + if (ret != size) + return -EFAULT; + return 0; +} + +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_kernel_nofault(dst, unsafe_src, size); +} + +static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (unlikely(size == 0)) + return 0; + + if (tsk) { + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); + } else { + ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); + /* strncpy_from_user does not guarantee NUL termination */ + if (ret >= 0) + ((char *)dst)[ret] = '\0'; + } + + if (ret < 0) + return ret; + return ret + 1; +} + +static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_kernel_nofault(dst, unsafe_src, size); +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, @@ -3477,4 +3613,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, + copy_kernel_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, + copy_kernel_str_nofault, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, tsk); +} + +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, tsk); +} + __bpf_kfunc_end_defs(); -- cgit v1.2.3 From 3880cdbed1c4607e378f58fa924c5d6df900d1d3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 13 May 2025 12:27:47 +0800 Subject: bpf: Fix WARN() in get_bpf_raw_tp_regs syzkaller reported an issue: WARNING: CPU: 3 PID: 5971 at kernel/trace/bpf_trace.c:1861 get_bpf_raw_tp_regs+0xa4/0x100 kernel/trace/bpf_trace.c:1861 Modules linked in: CPU: 3 UID: 0 PID: 5971 Comm: syz-executor205 Not tainted 6.15.0-rc5-syzkaller-00038-g707df3375124 #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:get_bpf_raw_tp_regs+0xa4/0x100 kernel/trace/bpf_trace.c:1861 RSP: 0018:ffffc90003636fa8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000003 RCX: ffffffff81c6bc4c RDX: ffff888032efc880 RSI: ffffffff81c6bc83 RDI: 0000000000000005 RBP: ffff88806a730860 R08: 0000000000000005 R09: 0000000000000003 R10: 0000000000000004 R11: 0000000000000000 R12: 0000000000000004 R13: 0000000000000001 R14: ffffc90003637008 R15: 0000000000000900 FS: 0000000000000000(0000) GS:ffff8880d6cdf000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7baee09130 CR3: 0000000029f5a000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ____bpf_get_stack_raw_tp kernel/trace/bpf_trace.c:1934 [inline] bpf_get_stack_raw_tp+0x24/0x160 kernel/trace/bpf_trace.c:1931 bpf_prog_ec3b2eefa702d8d3+0x43/0x47 bpf_dispatcher_nop_func include/linux/bpf.h:1316 [inline] __bpf_prog_run include/linux/filter.h:718 [inline] bpf_prog_run include/linux/filter.h:725 [inline] __bpf_trace_run kernel/trace/bpf_trace.c:2363 [inline] bpf_trace_run3+0x23f/0x5a0 kernel/trace/bpf_trace.c:2405 __bpf_trace_mmap_lock_acquire_returned+0xfc/0x140 include/trace/events/mmap_lock.h:47 __traceiter_mmap_lock_acquire_returned+0x79/0xc0 include/trace/events/mmap_lock.h:47 __do_trace_mmap_lock_acquire_returned include/trace/events/mmap_lock.h:47 [inline] trace_mmap_lock_acquire_returned include/trace/events/mmap_lock.h:47 [inline] __mmap_lock_do_trace_acquire_returned+0x138/0x1f0 mm/mmap_lock.c:35 __mmap_lock_trace_acquire_returned include/linux/mmap_lock.h:36 [inline] mmap_read_trylock include/linux/mmap_lock.h:204 [inline] stack_map_get_build_id_offset+0x535/0x6f0 kernel/bpf/stackmap.c:157 __bpf_get_stack+0x307/0xa10 kernel/bpf/stackmap.c:483 ____bpf_get_stack kernel/bpf/stackmap.c:499 [inline] bpf_get_stack+0x32/0x40 kernel/bpf/stackmap.c:496 ____bpf_get_stack_raw_tp kernel/trace/bpf_trace.c:1941 [inline] bpf_get_stack_raw_tp+0x124/0x160 kernel/trace/bpf_trace.c:1931 bpf_prog_ec3b2eefa702d8d3+0x43/0x47 Tracepoint like trace_mmap_lock_acquire_returned may cause nested call as the corner case show above, which will be resolved with more general method in the future. As a result, WARN_ON_ONCE will be triggered. As Alexei suggested, remove the WARN_ON_ONCE first. Fixes: 9594dc3c7e71 ("bpf: fix nested bpf tracepoints with per-cpu data") Reported-by: syzbot+45b0c89a0fc7ae8dbadc@syzkaller.appspotmail.com Suggested-by: Alexei Starovoitov Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250513042747.757042-1-chen.dylane@linux.dev Closes: https://lore.kernel.org/bpf/8bc2554d-1052-4922-8832-e0078a033e1d@gmail.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 985ce3854af8..b0eb721fcfb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1753,7 +1753,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } -- cgit v1.2.3 From bc049387b41f41bee61e8cc338a5e99ca9798a09 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 13 May 2025 07:28:12 -0700 Subject: bpf: Add support for __prog argument suffix to pass in prog->aux Instead of hardcoding the list of kfuncs that need prog->aux passed to them with a combination of fixup_kfunc_call adjustment + __ign suffix, combine both in __prog suffix, which ignores the argument passed in, and fixes it up to the prog->aux. This allows kfuncs to have the prog->aux passed into them without having to touch the verifier. Cc: Tejun Heo Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250513142812.1021591-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 17 +++++++++++++++++ include/linux/bpf_verifier.h | 1 + kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 33 +++++++++++++++++++++++++++------ 4 files changed, 47 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index a8f5782bd833..ae468b781d31 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -160,6 +160,23 @@ Or:: ... } +2.2.6 __prog Annotation +--------------------------- +This annotation is used to indicate that the argument needs to be fixed up to +the bpf_prog_aux of the caller BPF program. Any value passed into this argument +is ignored, and rewritten by the verifier. + +An example is given below:: + + __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + void *aux__prog) + { + struct bpf_prog_aux *aux = aux__prog; + ... + } + .. _BPF_kfunc_nodef: 2.3 Using an existing kernel function diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9734544b6957..cedd66867ecf 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -591,6 +591,7 @@ struct bpf_insn_aux_data { * bpf_fastcall pattern. */ u8 fastcall_spills_num:3; + u8 arg_prog:4; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ebb604e39eb1..c1113b74e1e2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3002,9 +3002,9 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, - void *aux__ign) + void *aux__prog) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28f5a7899bd6..f6d3655b3a7a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -322,6 +322,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *arg_btf; u32 arg_btf_id; bool arg_owning_ref; + bool arg_prog; struct { struct btf_field *field; @@ -11897,6 +11898,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__prog"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -12938,6 +12944,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; + if (is_kfunc_arg_prog(btf, &args[i])) { + /* Used to reject repeated use of __prog. */ + if (meta->arg_prog) { + verbose(env, "Only 1 prog->aux argument supported per-kfunc\n"); + return -EFAULT; + } + meta->arg_prog = true; + cur_aux(env)->arg_prog = regno; + continue; + } + if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -21517,13 +21534,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; - } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + } - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - *cnt = 3; + if (env->insn_aux_data[insn_idx].arg_prog) { + u32 regno = env->insn_aux_data[insn_idx].arg_prog; + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; + int idx = *cnt; + + insn_buf[idx++] = ld_addrs[0]; + insn_buf[idx++] = ld_addrs[1]; + insn_buf[idx++] = *insn; + *cnt = idx; } return 0; } -- cgit v1.2.3 From 94bde253d3ae5d8a01cb958663b12daef1d06574 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 12 May 2025 22:57:30 +0200 Subject: bpf: Pass the same orig_call value to trampoline functions There is currently some confusion in the s390x JIT regarding whether orig_call can be NULL and what that means. Originally the NULL value was used to distinguish the struct_ops case, but this was superseded by BPF_TRAMP_F_INDIRECT (see commit 0c970ed2f87c ("s390/bpf: Fix indirect trampoline generation"). The remaining reason to have this check is that NULL can actually be passed to the arch_bpf_trampoline_size() call - but not to the respective arch_prepare_bpf_trampoline()! call - by bpf_struct_ops_prepare_trampoline(). Remove this asymmetry by passing stub_func to both functions, so that JITs may rely on orig_call never being NULL. Signed-off-by: Ilya Leoshkevich Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250512221911.61314-2-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index db13ee70d94d..96113633e391 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -601,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? : -EFAULT; -- cgit v1.2.3 From 1cb0f56d96185cb20e63e191fc291191823e6f52 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 19 May 2025 15:43:57 +0200 Subject: bpf: WARN_ONCE on verifier bugs Throughout the verifier's logic, there are multiple checks for inconsistent states that should never happen and would indicate a verifier bug. These bugs are typically logged in the verifier logs and sometimes preceded by a WARN_ONCE. This patch reworks these checks to consistently emit a verifier log AND a warning when CONFIG_DEBUG_KERNEL is enabled. The consistent use of WARN_ONCE should help fuzzers (ex. syzkaller) expose any situation where they are actually able to reach one of those buggy verifier states. Acked-by: Andrii Nakryiko Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/aCs1nYvNNMq8dAWP@mail.gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++ include/linux/bpf_verifier.h | 11 ++++ kernel/bpf/btf.c | 4 +- kernel/bpf/verifier.c | 141 ++++++++++++++++++------------------------- 4 files changed, 79 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c56f40842b..5b25d278409b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -346,6 +346,12 @@ static inline const char *btf_field_type_name(enum btf_field_type type) } } +#if IS_ENABLED(CONFIG_DEBUG_KERNEL) +#define BPF_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) +#else +#define BPF_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#endif + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cedd66867ecf..78c97e12ea4e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -839,6 +839,17 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...); +#define verifier_bug_if(cond, env, fmt, args...) \ + ({ \ + bool __cond = (cond); \ + if (unlikely(__cond)) { \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ + } \ + (__cond); \ + }) +#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b21ca67070c..0f7828380895 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7659,7 +7659,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return 0; if (!prog->aux->func_info) { - bpf_log(log, "Verifier bug\n"); + verifier_bug(env, "func_info undefined"); return -EFAULT; } @@ -7683,7 +7683,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { - bpf_log(log, "Verifier bug in function %s()\n", tname); + verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6d3655b3a7a..d5807d2efc92 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1924,11 +1924,8 @@ static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, u32 steps = 0; while (topmost && topmost->loop_entry) { - if (steps++ > st->dfs_depth) { - WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n"); - verbose(env, "verifier bug: infinite loop in get_loop_entry()\n"); + if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop")) return ERR_PTR(-EFAULT); - } topmost = topmost->loop_entry; } return topmost; @@ -3460,12 +3457,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; - if (parent->live & REG_LIVE_DONE) { - verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off); + if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, + "type %s var_off %lld off %d", + reg_type_str(env, parent->type), + parent->var_off.value, parent->off)) return -EFAULT; - } /* The first condition is more likely to be true than the * second, checked it first. */ @@ -3858,14 +3854,14 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s /* atomic instructions push insn_flags twice, for READ and * WRITE sides, but they should agree on stack slot */ - WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && - (env->cur_hist_ent->flags & insn_flags) != insn_flags, - "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", - env->insn_idx, env->cur_hist_ent->flags, insn_flags); + verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + env, "insn history: insn_idx %d cur flags %x new flags %x", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; - WARN_ONCE(env->cur_hist_ent->linked_regs != 0, - "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", - env->insn_idx, env->cur_hist_ent->linked_regs); + verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, + "insn history: insn_idx %d linked_regs: %#llx", + env->insn_idx, env->cur_hist_ent->linked_regs); env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3988,8 +3984,7 @@ static inline u32 bt_empty(struct backtrack_state *bt) static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { - verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); return -EFAULT; } bt->frame++; @@ -3999,8 +3994,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt) static inline int bt_subprog_exit(struct backtrack_state *bt) { if (bt->frame == 0) { - verbose(bt->env, "BUG subprog exit from frame 0\n"); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog exit from frame 0"); return -EFAULT; } bt->frame--; @@ -4278,14 +4272,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * should be literally next instruction in * caller program */ - WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); + verifier_bug_if(idx + 1 != subseq_idx, env, + "extra insn from subprog"); /* r1-r5 are invalidated after subprog call, * so for global func call it shouldn't be set * anymore */ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "global subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* global subprog always sets R0 */ @@ -4299,16 +4294,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * the current frame should be zero by now */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "static subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* we are now tracking register spills correctly, * so any instance of leftover slots is a bug */ if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + verifier_bug(env, + "static subprog leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* propagate r1-r5 to the caller */ @@ -4331,13 +4327,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "callback unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + verifier_bug(env, "callback leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* clear r1-r5 in callback subprog's mask */ @@ -4356,11 +4352,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* regular helper call sets R0 */ bt_clear_reg(bt, BPF_REG_0); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 + /* if backtracking was looking for registers R1-R5 * they should have been found already. */ - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking call unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } } else if (opcode == BPF_EXIT) { @@ -4378,8 +4374,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking exit unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } @@ -4720,9 +4716,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } - verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); return -EFAULT; } @@ -4758,8 +4753,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * It means the backtracking missed the spot where * particular register was initialized with a constant. */ - verbose(env, "BUG backtracking idx %d\n", i); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking idx %d", i); return -EFAULT; } } @@ -4784,12 +4778,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", - i, func->allocated_stack / BPF_REG_SIZE); - WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, + env, "stack slot %d, total slots %d", + i, func->allocated_stack / BPF_REG_SIZE)) return -EFAULT; - } if (!is_spilled_scalar_reg(&func->stack[i])) { bt_clear_frame_slot(bt, fr, i); @@ -6562,21 +6554,18 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; sidx = find_subprog(env, next_insn); - if (sidx < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - next_insn); + if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; - } if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) { - verbose(env, "verifier bug. subprog has tail_call and async cb\n"); + verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT; } /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) { - verbose(env, "insn %d cannot call exception cb directly\n", i); + verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL; } } @@ -6676,11 +6665,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, int start = idx + insn->imm + 1, subprog; subprog = find_subprog(env, start); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - start); + if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) return -EFAULT; - } return env->subprog_info[subprog].stack_depth; } #endif @@ -7985,7 +7971,7 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { - verbose(env, "verifier bug: allocated_stack too small\n"); + verbose(env, "allocated_stack too small\n"); return -EFAULT; } @@ -8414,7 +8400,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (meta->map_ptr) { - verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } meta->map_uid = reg->map_uid; @@ -10286,8 +10272,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } if (state->frame[state->curframe + 1]) { - verbose(env, "verifier bug. Frame %d already allocated\n", - state->curframe + 1); + verifier_bug(env, "Frame %d already allocated", state->curframe + 1); return -EFAULT; } @@ -10401,8 +10386,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (err) return err; } else { - bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", - i, arg->arg_type); + verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT; } } @@ -10465,13 +10449,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "kfunc %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } else if (!bpf_pseudo_kfunc_call(insn) && !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "helper %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -10523,10 +10507,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, target_insn = *insn_idx + insn->imm + 1; subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", + target_insn)) return -EFAULT; - } caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); @@ -11125,7 +11108,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { - verbose(env, "verifier bug\n"); + verbose(env, "failed to retrieve map value address\n"); return -EFAULT; } fmt = (char *)(long)fmt_addr + fmt_map_off; @@ -19706,10 +19689,9 @@ process_bpf_exit: return err; break; } else { - if (WARN_ON_ONCE(env->cur_state->loop_entry)) { - verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n"); + if (verifier_bug_if(env->cur_state->loop_entry, env, + "broken loop detection")) return -EFAULT; - } do_print_state = true; continue; } @@ -20767,10 +20749,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (bpf_pseudo_kfunc_call(&insn)) continue; - if (WARN_ON(load_reg == -1)) { - verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); + if (verifier_bug_if(load_reg == -1, env, + "zext_dst is set, but no reg is defined")) return -EFAULT; - } zext_patch[0] = insn; zext_patch[1].dst_reg = load_reg; @@ -21087,11 +21068,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) * propagated in any case. */ subprog = find_subprog(env, i + insn->imm + 1); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i + insn->imm + 1); + if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d", + i + insn->imm + 1)) return -EFAULT; - } /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ @@ -22454,7 +22433,7 @@ next_insn: continue; /* We need two slots in case timed may_goto is supported. */ if (stack_slots > slots) { - verbose(env, "verifier bug: stack_slots supports may_goto only\n"); + verifier_bug(env, "stack_slots supports may_goto only"); return -EFAULT; } -- cgit v1.2.3 From 4e2e6841ff761cc15a54e8bebcf35d7325ec78a2 Mon Sep 17 00:00:00 2001 From: Di Shen Date: Tue, 20 May 2025 13:49:43 +0800 Subject: bpf: Revert "bpf: remove unnecessary rcu_read_{lock,unlock}() in multi-uprobe attach logic" This reverts commit 4a8f635a6054. Althought get_pid_task() internally already calls rcu_read_lock() and rcu_read_unlock(), the find_vpid() was not. The documentation for find_vpid() clearly states: "Must be called with the tasklist_lock or rcu_read_lock() held." Add proper rcu_read_lock/unlock() to protect the find_vpid(). Fixes: 4a8f635a6054 ("bpf: remove unnecessary rcu_read_{lock,unlock}() in multi-uprobe attach logic") Reported-by: Xuewen Yan Signed-off-by: Di Shen Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250520054943.5002-1-xuewen.yan@unisoc.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0eb721fcfb5..289ff0caa83e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3318,7 +3318,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; -- cgit v1.2.3 From a539e2a6d51d1c12d89eec149ccc72ec561639bc Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Tue, 20 May 2025 14:01:17 +0100 Subject: btf: Allow mmap of vmlinux btf User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmaping the sysfs file so that processes can directly access the memory allocated for it in the kernel. remap_pfn_range is used instead of vm_insert_page due to aarch64 compatibility issues. Signed-off-by: Lorenz Bauer Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/bpf/20250520-vmlinux-mmap-v5-1-e8c941acc414@isovalent.com --- include/asm-generic/vmlinux.lds.h | 3 ++- kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 58a635a6d5bd..1750390735fa 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \ + . = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \ - . = ALIGN(4); \ + . = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ } diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a..941d0d2427e3 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include #include #include +#include +#include +#include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[]; +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; + size_t vm_size = vma->vm_end - vma->vm_start; + phys_addr_t addr = virt_to_phys(__start_BTF); + unsigned long pfn = addr >> PAGE_SHIFT; + + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) + return -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) + return -EACCES; + + if (pfn + pages < pfn) + return -EINVAL; + + if ((vm_size >> PAGE_SHIFT) > pages) + return -EINVAL; + + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot); +} + static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read, + .mmap = btf_sysfs_vmlinux_mmap, }; struct kobject *btf_kobj; -- cgit v1.2.3 From 079e5c56a5c41d285068939ff7b0041ab10386fa Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 23 May 2025 19:17:05 +0100 Subject: bpf: Fix error return value in bpf_copy_from_user_dynptr On error, copy_from_user returns number of bytes not copied to destination, but current implementation of copy_user_data_sleepable does not handle that correctly and returns it as error value, which may confuse user, expecting meaningful negative error value. Fixes: a498ee7576de ("bpf: Implement dynptr copy kfuncs") Reported-by: Dan Carpenter Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250523181705.261585-1-mykyta.yatsenko5@gmail.com --- kernel/trace/bpf_trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 289ff0caa83e..132c8be6f635 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3555,8 +3555,12 @@ static __always_inline int copy_user_data_sleepable(void *dst, const void *unsaf { int ret; - if (!tsk) /* Read from the current task */ - return copy_from_user(dst, (const void __user *)unsafe_src, size); + if (!tsk) { /* Read from the current task */ + ret = copy_from_user(dst, (const void __user *)unsafe_src, size); + if (ret) + return -EFAULT; + return 0; + } ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); if (ret != size) -- cgit v1.2.3 From 76ea95534995adde1aa3cb1aa97ef33f50a617a9 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Thu, 22 May 2025 23:04:26 +0000 Subject: bpf: Add dmabuf iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dmabuf iterator traverses the list of all DMA buffers. DMA buffers are refcounted through their associated struct file. A reference is taken on each buffer as the list is iterated to ensure each buffer persists for the duration of the bpf program execution without holding the list mutex. Signed-off-by: T.J. Mercier Reviewed-by: Christian König Acked-by: Song Liu Link: https://lore.kernel.org/r/20250522230429.941193-3-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- drivers/dma-buf/dma-buf.c | 68 +++++++++++++++++++++++++++++++ include/linux/dma-buf.h | 2 + kernel/bpf/Makefile | 3 ++ kernel/bpf/dmabuf_iter.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 175 insertions(+) create mode 100644 kernel/bpf/dmabuf_iter.c (limited to 'kernel') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 8d151784e302..6b59506a1b94 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -55,6 +57,72 @@ static void __dma_buf_list_del(struct dma_buf *dmabuf) mutex_unlock(&dmabuf_list_mutex); } +/** + * dma_buf_iter_begin - begin iteration through global list of all DMA buffers + * + * Returns the first buffer in the global list of DMA-bufs that's not in the + * process of being destroyed. Increments that buffer's reference count to + * prevent buffer destruction. Callers must release the reference, either by + * continuing iteration with dma_buf_iter_next(), or with dma_buf_put(). + * + * Return: + * * First buffer from global list, with refcount elevated + * * NULL if no active buffers are present + */ +struct dma_buf *dma_buf_iter_begin(void) +{ + struct dma_buf *ret = NULL, *dmabuf; + + /* + * The list mutex does not protect a dmabuf's refcount, so it can be + * zeroed while we are iterating. We cannot call get_dma_buf() since the + * caller may not already own a reference to the buffer. + */ + mutex_lock(&dmabuf_list_mutex); + list_for_each_entry(dmabuf, &dmabuf_list, list_node) { + if (file_ref_get(&dmabuf->file->f_ref)) { + ret = dmabuf; + break; + } + } + mutex_unlock(&dmabuf_list_mutex); + return ret; +} + +/** + * dma_buf_iter_next - continue iteration through global list of all DMA buffers + * @dmabuf: [in] pointer to dma_buf + * + * Decrements the reference count on the provided buffer. Returns the next + * buffer from the remainder of the global list of DMA-bufs with its reference + * count incremented. Callers must release the reference, either by continuing + * iteration with dma_buf_iter_next(), or with dma_buf_put(). + * + * Return: + * * Next buffer from global list, with refcount elevated + * * NULL if no additional active buffers are present + */ +struct dma_buf *dma_buf_iter_next(struct dma_buf *dmabuf) +{ + struct dma_buf *ret = NULL; + + /* + * The list mutex does not protect a dmabuf's refcount, so it can be + * zeroed while we are iterating. We cannot call get_dma_buf() since the + * caller may not already own a reference to the buffer. + */ + mutex_lock(&dmabuf_list_mutex); + dma_buf_put(dmabuf); + list_for_each_entry_continue(dmabuf, &dmabuf_list, list_node) { + if (file_ref_get(&dmabuf->file->f_ref)) { + ret = dmabuf; + break; + } + } + mutex_unlock(&dmabuf_list_mutex); + return ret; +} + static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) { struct dma_buf *dmabuf; diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 8ff4add71f88..7af2ea839f58 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -634,4 +634,6 @@ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map); int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); +struct dma_buf *dma_buf_iter_begin(void); +struct dma_buf *dma_buf_iter_next(struct dma_buf *dmbuf); #endif /* __DMA_BUF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 70502f038b92..3a335c50e6e3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -53,6 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o +ifeq ($(CONFIG_DMA_SHARED_BUFFER),y) +obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o +endif CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c new file mode 100644 index 000000000000..83ef54d78b62 --- /dev/null +++ b/kernel/bpf/dmabuf_iter.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Google LLC */ +#include +#include +#include +#include +#include + +static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos) + return NULL; + + return dma_buf_iter_begin(); +} + +static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dma_buf *dmabuf = v; + + ++*pos; + + return dma_buf_iter_next(dmabuf); +} + +struct bpf_iter__dmabuf { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct dma_buf *, dmabuf); +}; + +static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter_meta meta = { + .seq = seq, + }; + struct bpf_iter__dmabuf ctx = { + .meta = &meta, + .dmabuf = v, + }; + struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop); + + if (prog) + return bpf_iter_run_prog(prog, &ctx); + + return 0; +} + +static int dmabuf_iter_seq_show(struct seq_file *seq, void *v) +{ + return __dmabuf_seq_show(seq, v, false); +} + +static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct dma_buf *dmabuf = v; + + if (dmabuf) + dma_buf_put(dmabuf); +} + +static const struct seq_operations dmabuf_iter_seq_ops = { + .start = dmabuf_iter_seq_start, + .next = dmabuf_iter_seq_next, + .stop = dmabuf_iter_seq_stop, + .show = dmabuf_iter_seq_show, +}; + +static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "dmabuf iter\n"); +} + +static const struct bpf_iter_seq_info dmabuf_iter_seq_info = { + .seq_ops = &dmabuf_iter_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = 0, +}; + +static struct bpf_iter_reg bpf_dmabuf_reg_info = { + .target = "dmabuf", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_dmabuf_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__dmabuf, dmabuf), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &dmabuf_iter_seq_info, +}; + +DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf) +BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf) + +static int __init dmabuf_iter_init(void) +{ + bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0]; + return bpf_iter_reg_target(&bpf_dmabuf_reg_info); +} + +late_initcall(dmabuf_iter_init); -- cgit v1.2.3 From 6eab7ac7c5eea7628b92cd5f9427bbd963a954ec Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Thu, 22 May 2025 23:04:27 +0000 Subject: bpf: Add open coded dmabuf iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This open coded iterator allows for more flexibility when creating BPF programs. It can support output in formats other than text. With an open coded iterator, a single BPF program can traverse multiple kernel data structures (now including dmabufs), allowing for more efficient analysis of kernel data compared to multiple reads from procfs, sysfs, or multiple traditional BPF iterator invocations. Signed-off-by: T.J. Mercier Acked-by: Christian König Acked-by: Song Liu Link: https://lore.kernel.org/r/20250522230429.941193-4-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/dmabuf_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 5 +++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c index 83ef54d78b62..4dd7ef7c145c 100644 --- a/kernel/bpf/dmabuf_iter.c +++ b/kernel/bpf/dmabuf_iter.c @@ -100,3 +100,51 @@ static int __init dmabuf_iter_init(void) } late_initcall(dmabuf_iter_init); + +struct bpf_iter_dmabuf { + /* + * opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __aligned(8); + +/* Non-opaque version of bpf_iter_dmabuf */ +struct bpf_iter_dmabuf_kern { + struct dma_buf *dmabuf; +} __aligned(8); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->dmabuf = NULL; + return 0; +} + +__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + kit->dmabuf = dma_buf_iter_next(kit->dmabuf); + else + kit->dmabuf = dma_buf_iter_begin(); + + return kit->dmabuf; +} + +__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + dma_buf_put(kit->dmabuf); +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c1113b74e1e2..bd17ed5bfc4b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3386,6 +3386,11 @@ BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +#ifdef CONFIG_DMA_SHARED_BUFFER +BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) +#endif BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From d848bba68034847e39b82694be7fabed0e9d0d1e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 23 May 2025 13:53:21 -0700 Subject: bpf: Remove special_kfunc_set from verifier Currently, the verifier has both special_kfunc_set and special_kfunc_list. When adding a new kfunc usage to the verifier, it is often confusing about whether special_kfunc_set or special_kfunc_list or both should add that kfunc. For example, some kfuncs, e.g., bpf_dynptr_from_skb, bpf_dynptr_clone, bpf_wq_set_callback_impl, does not need to be in special_kfunc_set. To avoid potential future confusion, special_kfunc_set is deleted and btf_id_set_contains(&special_kfunc_set, ...) is removed. The code is refactored with a new func check_special_kfunc(), which contains all codes covered by original branch meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id) There is no functionality change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250523205321.1291431-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 374 ++++++++++++++++++++++++-------------------------- 1 file changed, 177 insertions(+), 197 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d5807d2efc92..639e9bb94af2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12107,44 +12107,6 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock_irqrestore, }; -BTF_SET_START(special_kfunc_set) -BTF_ID(func, bpf_obj_new_impl) -BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front_impl) -BTF_ID(func, bpf_list_push_back_impl) -BTF_ID(func, bpf_list_pop_front) -BTF_ID(func, bpf_list_pop_back) -BTF_ID(func, bpf_list_front) -BTF_ID(func, bpf_list_back) -BTF_ID(func, bpf_cast_to_kern_ctx) -BTF_ID(func, bpf_rdonly_cast) -BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add_impl) -BTF_ID(func, bpf_rbtree_first) -BTF_ID(func, bpf_rbtree_root) -BTF_ID(func, bpf_rbtree_left) -BTF_ID(func, bpf_rbtree_right) -#ifdef CONFIG_NET -BTF_ID(func, bpf_dynptr_from_skb) -BTF_ID(func, bpf_dynptr_from_xdp) -#endif -BTF_ID(func, bpf_dynptr_slice) -BTF_ID(func, bpf_dynptr_slice_rdwr) -BTF_ID(func, bpf_dynptr_clone) -BTF_ID(func, bpf_percpu_obj_new_impl) -BTF_ID(func, bpf_percpu_obj_drop_impl) -BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) -#ifdef CONFIG_CGROUPS -BTF_ID(func, bpf_iter_css_task_new) -#endif -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_set_dentry_xattr) -BTF_ID(func, bpf_remove_dentry_xattr) -#endif -BTF_SET_END(special_kfunc_set) - BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) @@ -13452,6 +13414,178 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +/* check special kfuncs and return: + * 1 - not fall-through to 'else' branch, continue verification + * 0 - fall-through to 'else' branch + * < 0 - not fall-through to 'else' branch, return error + */ +static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux, + const struct btf_type *ptr_type, struct btf *desc_btf) +{ + const struct btf_type *ret_t; + int err = 0; + + if (meta->btf != btf_vmlinux) + return 0; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; + struct btf *ret_btf; + u32 ret_btf_id; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + return -ENOMEM; + + if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta->arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. + */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; + + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = struct_meta; + } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = meta->arg_btf; + regs[BPF_REG_0].btf_id = meta->arg_btf_id; + + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta->arg_btf, + meta->arg_btf_id); + } else if (is_list_node_type(ptr_type)) { + struct btf_field *field = meta->arg_list_head.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (is_rbtree_node_type(ptr_type)) { + struct btf_field *field = meta->arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->ret_btf_id; + } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); + if (!ret_t || !btf_type_is_struct(ret_t)) { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta->arg_constant.found) { + verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta->arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta->initialized_dynptr.id) { + verbose(env, "verifier internal error: no dynptr id\n"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are not refcounted (see + * dynptr_type_refcounted) + */ + } else { + return 0; + } + + return 1; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -13466,7 +13600,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; - const struct btf_type *ret_t; struct btf *desc_btf; /* skip for now, but return error when we find this in fixup_kfunc_call */ @@ -13686,163 +13819,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); - - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *struct_meta; - struct btf *ret_btf; - u32 ret_btf_id; - - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) - return -ENOMEM; - - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { - verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); - return -EINVAL; - } - - ret_btf = env->prog->aux->btf; - ret_btf_id = meta.arg_constant.value; - - /* This may be NULL due to user not supplying a BTF */ - if (!ret_btf) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); - return -EINVAL; - } - - ret_t = btf_type_by_id(ret_btf, ret_btf_id); - if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); - return -EINVAL; - } - - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { - verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", - ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); - return -EINVAL; - } - - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - /* Charge memory allocated with bpf_global_percpu_ma to - * root memcg. The obj_cgroup for root memcg is NULL. - */ - err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - mutex_lock(&bpf_percpu_ma_lock); - err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { - verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); - return -EINVAL; - } - - if (struct_meta) { - verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); - return -EINVAL; - } - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = ret_btf; - regs[BPF_REG_0].btf_id = ret_btf_id; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) - regs[BPF_REG_0].type |= MEM_PERCPU; - - insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = struct_meta; - } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_btf; - regs[BPF_REG_0].btf_id = meta.arg_btf_id; - - insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_btf, - meta.arg_btf_id); - } else if (is_list_node_type(ptr_type)) { - struct btf_field *field = meta.arg_list_head.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (is_rbtree_node_type(ptr_type)) { - struct btf_field *field = meta.arg_rbtree_root.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.ret_btf_id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { - verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); - return -EINVAL; - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.arg_constant.value; - } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || - meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); - - mark_reg_known_zero(env, regs, BPF_REG_0); - - if (!meta.arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); - return -EFAULT; - } - - regs[BPF_REG_0].mem_size = meta.arg_constant.value; - - /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ - regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; - - if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { - regs[BPF_REG_0].type |= MEM_RDONLY; - } else { - /* this will set env->seen_direct_write to true */ - if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { - verbose(env, "the prog does not allow writes to packet data\n"); - return -EINVAL; - } - } - - if (!meta.initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); - return -EFAULT; - } - regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ - } else { - verbose(env, "kernel function %s unhandled dynamic return type\n", - meta.func_name); - return -EFAULT; - } + err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); + if (err) { + if (err < 0) + return err; } else if (btf_type_is_void(ptr_type)) { /* kfunc returning 'void *' is equivalent to returning scalar */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -13918,7 +13898,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.btf == btf_vmlinux) { if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = -- cgit v1.2.3 From f95695f2c46592b4260032736a9bfc6e2a01c77c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 23 May 2025 13:53:26 -0700 Subject: bpf: Warn with __bpf_trap() kfunc maybe due to uninitialized variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marc Suñé (Isovalent, part of Cisco) reported an issue where an uninitialized variable caused generating bpf prog binary code not working as expected. The reproducer is in [1] where the flags “-Wall -Werror” are enabled, but there is no warning as the compiler takes advantage of uninitialized variable to do aggressive optimization. The optimized code looks like below: ; { 0: bf 16 00 00 00 00 00 00 r6 = r1 ; bpf_printk("Start"); 1: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 0000000000000008: R_BPF_64_64 .rodata 3: b4 02 00 00 06 00 00 00 w2 = 0x6 4: 85 00 00 00 06 00 00 00 call 0x6 ; DEFINE_FUNC_CTX_POINTER(data) 5: 61 61 4c 00 00 00 00 00 w1 = *(u32 *)(r6 + 0x4c) ; bpf_printk("pre ipv6_hdrlen_offset"); 6: 18 01 00 00 06 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x6 ll 0000000000000030: R_BPF_64_64 .rodata 8: b4 02 00 00 17 00 00 00 w2 = 0x17 9: 85 00 00 00 06 00 00 00 call 0x6 The verifier will report the following failure: 9: (85) call bpf_trace_printk#6 last insn is not an exit or jmp The above verifier log does not give a clear hint about how to fix the problem and user may take quite some time to figure out that the issue is due to compiler taking advantage of uninitialized variable. In llvm internals, uninitialized variable usage may generate 'unreachable' IR insn and these 'unreachable' IR insns may indicate uninitialized variable impact on code optimization. So far, llvm BPF backend ignores 'unreachable' IR hence the above code is generated. With clang21 patch [2], those 'unreachable' IR insn are converted to func __bpf_trap(). In order to maintain proper control flow graph for bpf progs, [2] also adds an 'exit' insn after bpf_trap() if __bpf_trap() is the last insn in the function. The new code looks like: ; { 0: bf 16 00 00 00 00 00 00 r6 = r1 ; bpf_printk("Start"); 1: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 0000000000000008: R_BPF_64_64 .rodata 3: b4 02 00 00 06 00 00 00 w2 = 0x6 4: 85 00 00 00 06 00 00 00 call 0x6 ; DEFINE_FUNC_CTX_POINTER(data) 5: 61 61 4c 00 00 00 00 00 w1 = *(u32 *)(r6 + 0x4c) ; bpf_printk("pre ipv6_hdrlen_offset"); 6: 18 01 00 00 06 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x6 ll 0000000000000030: R_BPF_64_64 .rodata 8: b4 02 00 00 17 00 00 00 w2 = 0x17 9: 85 00 00 00 06 00 00 00 call 0x6 10: 85 10 00 00 ff ff ff ff call -0x1 0000000000000050: R_BPF_64_32 __bpf_trap 11: 95 00 00 00 00 00 00 00 exit In kernel, a new kfunc __bpf_trap() is added. During insn verification, any hit with __bpf_trap() will result in verification failure. The kernel is able to provide better log message for debugging. With llvm patch [2] and without this patch (no __bpf_trap() kfunc for existing kernel), e.g., for old kernels, the verifier outputs 10: kfunc '__bpf_trap' is referenced but wasn't resolved Basically, kernel does not support __bpf_trap() kfunc. This still didn't give clear signals about possible reason. With llvm patch [2] and with this patch, the verifier outputs 10: (85) call __bpf_trap#74479 unexpected __bpf_trap() due to uninitialized variable? It gives much better hints for verification failure. [1] https://github.com/msune/clang_bpf/blob/main/Makefile#L3 [2] https://github.com/llvm/llvm-project/pull/131731 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250523205326.1291640-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 5 +++++ kernel/bpf/verifier.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bd17ed5bfc4b..376403707a85 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3273,6 +3273,10 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) local_irq_restore(*flags__irq_flag); } +__bpf_kfunc void __bpf_trap(void) +{ +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3391,6 +3395,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif +BTF_ID_FLAGS(func, __bpf_trap) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 639e9bb94af2..99582e5a8c69 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12105,6 +12105,7 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF___bpf_trap, }; BTF_ID_LIST(special_kfunc_list) @@ -12170,6 +12171,7 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, __bpf_trap) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -13641,6 +13643,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) { + verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n"); + return -EFAULT; } if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { -- cgit v1.2.3 From 86bc9c742426a16b52a10ef61f5b721aecca2344 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Mon, 26 May 2025 21:33:58 +0800 Subject: bpf: Avoid __bpf_prog_ret0_warn when jit fails syzkaller reported an issue: WARNING: CPU: 3 PID: 217 at kernel/bpf/core.c:2357 __bpf_prog_ret0_warn+0xa/0x20 kernel/bpf/core.c:2357 Modules linked in: CPU: 3 UID: 0 PID: 217 Comm: kworker/u32:6 Not tainted 6.15.0-rc4-syzkaller-00040-g8bac8898fe39 RIP: 0010:__bpf_prog_ret0_warn+0xa/0x20 kernel/bpf/core.c:2357 Call Trace: bpf_dispatcher_nop_func include/linux/bpf.h:1316 [inline] __bpf_prog_run include/linux/filter.h:718 [inline] bpf_prog_run include/linux/filter.h:725 [inline] cls_bpf_classify+0x74a/0x1110 net/sched/cls_bpf.c:105 ... When creating bpf program, 'fp->jit_requested' depends on bpf_jit_enable. This issue is triggered because of CONFIG_BPF_JIT_ALWAYS_ON is not set and bpf_jit_enable is set to 1, causing the arch to attempt JIT the prog, but jit failed due to FAULT_INJECTION. As a result, incorrectly treats the program as valid, when the program runs it calls `__bpf_prog_ret0_warn` and triggers the WARN_ON_ONCE(1). Reported-by: syzbot+0903f6d7f285e41cdf10@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/6816e34e.a70a0220.254cdc.002c.GAE@google.com Fixes: fa9dd599b4da ("bpf: get rid of pure_initcall dependency to enable jits") Signed-off-by: KaFai Wan Link: https://lore.kernel.org/r/20250526133358.2594176-1-mannkafai@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a3e571688421..c20babbf998f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2474,7 +2474,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = false; + bool jit_needed = fp->jit_requested; if (fp->bpf_func) goto finalize; -- cgit v1.2.3 From d4965578267e2e81f67c86e2608481e77e9c8569 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 26 May 2025 14:25:34 +0800 Subject: bpf: Check rcu_read_lock_trace_held() in bpf_map_lookup_percpu_elem() bpf_map_lookup_percpu_elem() helper is also available for sleepable bpf program. When BPF JIT is disabled or under 32-bit host, bpf_map_lookup_percpu_elem() will not be inlined. Using it in a sleepable bpf program will trigger the warning in bpf_map_lookup_percpu_elem(), because the bpf program only holds rcu_read_lock_trace lock. Therefore, add the missed check. Reported-by: syzbot+dce5aae19ae4d6399986@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000176a130617420310@google.com/ Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20250526062534.1105938-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 376403707a85..b71e428ad936 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -130,7 +130,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } -- cgit v1.2.3 From e2d2115e56c4a02377189bfc3a9a7933552a7b0f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 23 May 2025 21:13:35 -0700 Subject: bpf: Do not include stack ptr register in precision backtracking bookkeeping Yi Lai reported an issue ([1]) where the following warning appears in kernel dmesg: [ 60.643604] verifier backtracking bug [ 60.643635] WARNING: CPU: 10 PID: 2315 at kernel/bpf/verifier.c:4302 __mark_chain_precision+0x3a6c/0x3e10 [ 60.648428] Modules linked in: bpf_testmod(OE) [ 60.650471] CPU: 10 UID: 0 PID: 2315 Comm: test_progs Tainted: G OE 6.15.0-rc4-gef11287f8289-dirty #327 PREEMPT(full) [ 60.654385] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 60.656682] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 60.660475] RIP: 0010:__mark_chain_precision+0x3a6c/0x3e10 [ 60.662814] Code: 5a 30 84 89 ea e8 c4 d9 01 00 80 3d 3e 7d d8 04 00 0f 85 60 fa ff ff c6 05 31 7d d8 04 01 48 c7 c7 00 58 30 84 e8 c4 06 a5 ff <0f> 0b e9 46 fa ff ff 48 ... [ 60.668720] RSP: 0018:ffff888116cc7298 EFLAGS: 00010246 [ 60.671075] RAX: 54d70e82dfd31900 RBX: ffff888115b65e20 RCX: 0000000000000000 [ 60.673659] RDX: 0000000000000001 RSI: 0000000000000004 RDI: 00000000ffffffff [ 60.676241] RBP: 0000000000000400 R08: ffff8881f6f23bd3 R09: 1ffff1103ede477a [ 60.678787] R10: dffffc0000000000 R11: ffffed103ede477b R12: ffff888115b60ae8 [ 60.681420] R13: 1ffff11022b6cbc4 R14: 00000000fffffff2 R15: 0000000000000001 [ 60.684030] FS: 00007fc2aedd80c0(0000) GS:ffff88826fa8a000(0000) knlGS:0000000000000000 [ 60.686837] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 60.689027] CR2: 000056325369e000 CR3: 000000011088b002 CR4: 0000000000370ef0 [ 60.691623] Call Trace: [ 60.692821] [ 60.693960] ? __pfx_verbose+0x10/0x10 [ 60.695656] ? __pfx_disasm_kfunc_name+0x10/0x10 [ 60.697495] check_cond_jmp_op+0x16f7/0x39b0 [ 60.699237] do_check+0x58fa/0xab10 ... Further analysis shows the warning is at line 4302 as below: 4294 /* static subprog call instruction, which 4295 * means that we are exiting current subprog, 4296 * so only r1-r5 could be still requested as 4297 * precise, r0 and r6-r10 or any stack slot in 4298 * the current frame should be zero by now 4299 */ 4300 if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { 4301 verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); 4302 WARN_ONCE(1, "verifier backtracking bug"); 4303 return -EFAULT; 4304 } With the below test (also in the next patch): __used __naked static void __bpf_jmp_r10(void) { asm volatile ( "r2 = 2314885393468386424 ll;" "goto +0;" "if r2 <= r10 goto +3;" "if r1 >= -1835016 goto +0;" "if r2 <= 8 goto +0;" "if r3 <= 0 goto +0;" "exit;" ::: __clobber_all); } SEC("?raw_tp") __naked void bpf_jmp_r10(void) { asm volatile ( "r3 = 0 ll;" "call __bpf_jmp_r10;" "r0 = 0;" "exit;" ::: __clobber_all); } The following is the verifier failure log: 0: (18) r3 = 0x0 ; R3_w=0 2: (85) call pc+2 caller: R10=fp0 callee: frame1: R1=ctx() R3_w=0 R10=fp0 5: frame1: R1=ctx() R3_w=0 R10=fp0 ; asm volatile (" \ @ verifier_precision.c:184 5: (18) r2 = 0x20202000256c6c78 ; frame1: R2_w=0x20202000256c6c78 7: (05) goto pc+0 8: (bd) if r2 <= r10 goto pc+3 ; frame1: R2_w=0x20202000256c6c78 R10=fp0 9: (35) if r1 >= 0xffe3fff8 goto pc+0 ; frame1: R1=ctx() 10: (b5) if r2 <= 0x8 goto pc+0 mark_precise: frame1: last_idx 10 first_idx 0 subseq_idx -1 mark_precise: frame1: regs=r2 stack= before 9: (35) if r1 >= 0xffe3fff8 goto pc+0 mark_precise: frame1: regs=r2 stack= before 8: (bd) if r2 <= r10 goto pc+3 mark_precise: frame1: regs=r2,r10 stack= before 7: (05) goto pc+0 mark_precise: frame1: regs=r2,r10 stack= before 5: (18) r2 = 0x20202000256c6c78 mark_precise: frame1: regs=r10 stack= before 2: (85) call pc+2 BUG regs 400 The main failure reason is due to r10 in precision backtracking bookkeeping. Actually r10 is always precise and there is no need to add it for the precision backtracking bookkeeping. One way to fix the issue is to prevent bt_set_reg() if any src/dst reg is r10. Andrii suggested to go with push_insn_history() approach to avoid explicitly checking r10 in backtrack_insn(). This patch added push_insn_history() support for cond_jmp like 'rX rY' operations. In check_cond_jmp_op(), if any of rX or rY is a stack pointer, push_insn_history() will record such information, and later backtrack_insn() will do bt_set_reg() properly for those register(s). [1] https://lore.kernel.org/bpf/Z%2F8q3xzpU59CIYQE@ly-workstation/ Reported by: Yi Lai Fixes: 407958a0e980 ("bpf: encapsulate precision backtracking bookkeeping") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250524041335.4046126-1-yonghong.song@linux.dev --- include/linux/bpf_verifier.h | 12 ++++++++---- kernel/bpf/verifier.c | 18 ++++++++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 78c97e12ea4e..256274acb1d8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -356,7 +356,11 @@ enum { INSN_F_SPI_MASK = 0x3f, /* 6 bits */ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ - INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ + INSN_F_STACK_ACCESS = BIT(9), + + INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ + INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ + /* total 12 bits are used now. */ }; static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); @@ -365,9 +369,9 @@ static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); struct bpf_insn_hist_entry { u32 idx; /* insn idx can't be bigger than 1 million */ - u32 prev_idx : 22; - /* special flags, e.g., whether insn is doing register stack spill/load */ - u32 flags : 10; + u32 prev_idx : 20; + /* special INSN_F_xxx flags */ + u32 flags : 12; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99582e5a8c69..98c52829936e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4410,8 +4410,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * before it would be equally necessary to * propagate it to dreg. */ - bt_set_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) + bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) + bt_set_reg(bt, dreg); } else if (BPF_SRC(insn->code) == BPF_K) { /* dreg K * Only dreg still needs precision before @@ -16392,6 +16394,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); + int insn_flags = 0; bool is_jmp32; int pred = -1; int err; @@ -16450,6 +16453,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + + if (src_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_SRC_REG_STACK; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -16461,6 +16467,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, __mark_reg_known(src_reg, insn->imm); } + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; + if (insn_flags) { + err = push_insn_history(env, this_branch, insn_flags, 0); + if (err) + return err; + } + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { -- cgit v1.2.3 From 5ffb537e416ee22dbfb3d552102e50da33fec7f6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 23 May 2025 21:13:40 -0700 Subject: selftests/bpf: Add tests with stack ptr register in conditional jmp Add two tests: - one test has 'rX r10' where rX is not r10, and - another test has 'rX rY' where rX and rY are not r10 but there is an early insn 'rX = r10'. Without previous verifier change, both tests will fail. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250524041340.4046304-1-yonghong.song@linux.dev --- kernel/bpf/verifier.c | 7 ++- .../selftests/bpf/progs/verifier_precision.c | 53 ++++++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98c52829936e..a7d6e0c5928b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16456,6 +16456,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (src_reg->type == PTR_TO_STACK) insn_flags |= INSN_F_SRC_REG_STACK; + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -16465,10 +16467,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); + + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } - if (dst_reg->type == PTR_TO_STACK) - insn_flags |= INSN_F_DST_REG_STACK; if (insn_flags) { err = push_insn_history(env, this_branch, insn_flags, 0); if (err) diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 2dd0d15c2678..9fe5d255ee37 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -178,4 +178,57 @@ __naked int state_loop_first_last_equal(void) ); } +__used __naked static void __bpf_cond_op_r10(void) +{ + asm volatile ( + "r2 = 2314885393468386424 ll;" + "goto +0;" + "if r2 <= r10 goto +3;" + "if r1 >= -1835016 goto +0;" + "if r2 <= 8 goto +0;" + "if r3 <= 0 goto +0;" + "exit;" + ::: __clobber_all); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("8: (bd) if r2 <= r10 goto pc+3") +__msg("9: (35) if r1 >= 0xffe3fff8 goto pc+0") +__msg("10: (b5) if r2 <= 0x8 goto pc+0") +__msg("mark_precise: frame1: last_idx 10 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame1: regs=r2 stack= before 9: (35) if r1 >= 0xffe3fff8 goto pc+0") +__msg("mark_precise: frame1: regs=r2 stack= before 8: (bd) if r2 <= r10 goto pc+3") +__msg("mark_precise: frame1: regs=r2 stack= before 7: (05) goto pc+0") +__naked void bpf_cond_op_r10(void) +{ + asm volatile ( + "r3 = 0 ll;" + "call __bpf_cond_op_r10;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("3: (bf) r3 = r10") +__msg("4: (bd) if r3 <= r2 goto pc+1") +__msg("5: (b5) if r2 <= 0x8 goto pc+2") +__msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r2 stack= before 4: (bd) if r3 <= r2 goto pc+1") +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r3 = r10") +__naked void bpf_cond_op_not_r10(void) +{ + asm volatile ( + "r0 = 0;" + "r2 = 2314885393468386424 ll;" + "r3 = r10;" + "if r3 <= r2 goto +1;" + "if r2 <= 8 goto +2;" + "r0 = 2 ll;" + "exit;" + ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3