From 41a5db8d8161457b121a03fde999ff6e00090ee2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:27:34 -0700 Subject: bpf: Add support for non-fix-size percpu mem allocation This is needed for later percpu mem allocation when the allocation is done by bpf program. For such cases, a global bpf_global_percpu_ma is added where a flexible allocation size is needed. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152734.1995725-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 8 +++++--- kernel/bpf/memalloc.c | 14 ++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0f8f036d8bd1..95599df82ee4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -64,8 +64,8 @@ #define OFF insn->off #define IMM insn->imm -struct bpf_mem_alloc bpf_global_ma; -bool bpf_global_ma_set; +struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; +bool bpf_global_ma_set, bpf_global_percpu_ma_set; /* No hurry in this branch * @@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void) ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; - return ret; + ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + bpf_global_percpu_ma_set = !ret; + return !bpf_global_ma_set || !bpf_global_percpu_ma_set; } late_initcall(bpf_global_ma_init); #endif diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9c49ae53deaf..cb60445de98a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + /* room for llist_node and per-cpu pointer */ + if (percpu) + percpu_size = LLIST_NODE_SZ + sizeof(void *); + if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) - /* room for llist_node and per-cpu pointer */ - percpu_size = LLIST_NODE_SZ + sizeof(void *); - else + if (!percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; @@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } - /* size == 0 && percpu is an invalid combination */ - if (WARN_ON_ONCE(percpu)) - return -EINVAL; - pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; @@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->percpu_size = percpu_size; c->tgt = c; prefill_mem_cache(c, cpu); } -- cgit v1.2.3 From 55db92f42fe4a4ef7b4c2b4960c6212c8512dd53 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:27:39 -0700 Subject: bpf: Add BPF_KPTR_PERCPU as a field type BPF_KPTR_PERCPU represents a percpu field type like below struct val_t { ... fields ... }; struct t { ... struct val_t __percpu_kptr *percpu_data_ptr; ... }; where #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) While BPF_KPTR_REF points to a trusted kernel object or a trusted local object, BPF_KPTR_PERCPU points to a trusted local percpu object. This patch added basic support for BPF_KPTR_PERCPU related to percpu_kptr field parsing, recording and free operations. BPF_KPTR_PERCPU also supports the same map types as BPF_KPTR_REF does. Note that unlike a local kptr, it is possible that a BPF_KTPR_PERCPU struct may not contain any special fields like other kptr, bpf_spin_lock, bpf_list_head, etc. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152739.1996391-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 18 ++++++++++++------ kernel/bpf/btf.c | 5 +++++ kernel/bpf/syscall.c | 4 ++++ 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 440dd1f59a1c..87eeb3a46a1d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -180,14 +180,15 @@ enum btf_field_type { BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), - BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, - BPF_LIST_HEAD = (1 << 4), - BPF_LIST_NODE = (1 << 5), - BPF_RB_ROOT = (1 << 6), - BPF_RB_NODE = (1 << 7), + BPF_KPTR_PERCPU = (1 << 4), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU, + BPF_LIST_HEAD = (1 << 5), + BPF_LIST_NODE = (1 << 6), + BPF_RB_ROOT = (1 << 7), + BPF_RB_NODE = (1 << 8), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, - BPF_REFCOUNT = (1 << 8), + BPF_REFCOUNT = (1 << 9), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -300,6 +301,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_KPTR_PERCPU: + return "percpu_kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: @@ -325,6 +328,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); @@ -351,6 +355,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); @@ -389,6 +394,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_TIMER: case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: break; default: WARN_ON_ONCE(1); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1095bbe29859..187b57276fec 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_UNREF; else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; + else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_PERCPU; else return -EINVAL; @@ -3457,6 +3459,7 @@ static int btf_find_struct_field(const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3523,6 +3526,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3783,6 +3787,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eb01c31ed591..6a692f3bea15 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -514,6 +514,7 @@ void btf_record_free(struct btf_record *rec) switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); @@ -560,6 +561,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; @@ -650,6 +652,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; @@ -1045,6 +1048,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && -- cgit v1.2.3 From 36d8bdf75a93190e5669b9d1d95994e13e15ba1d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:27:44 -0700 Subject: bpf: Add alloc/xchg/direct_access support for local percpu kptr Add two new kfunc's, bpf_percpu_obj_new_impl() and bpf_percpu_obj_drop_impl(), to allocate a percpu obj. Two functions are very similar to bpf_obj_new_impl() and bpf_obj_drop_impl(). The major difference is related to percpu handling. bpf_rcu_read_lock() struct val_t __percpu_kptr *v = map_val->percpu_data; ... bpf_rcu_read_unlock() For a percpu data map_val like above 'v', the reg->type is set as PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU if inside rcu critical section. MEM_RCU marking here is similar to NON_OWN_REF as 'v' is not a owning reference. But NON_OWN_REF is trusted and typically inside the spinlock while MEM_RCU is under rcu read lock. RCU is preferred here since percpu data structures mean potential concurrent access into its contents. Also, bpf_percpu_obj_new_impl() is restricted such that no pointers or special fields are allowed. Therefore, the bpf_list_head and bpf_rb_root will not be supported in this patch set to avoid potential memory leak issue due to racing between bpf_obj_free_fields() and another bpf_kptr_xchg() moving an allocated object to bpf_list_head and bpf_rb_root. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152744.1996739-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 16 ++++++++ kernel/bpf/verifier.c | 112 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 106 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8bd3812fb8df..b0a9834f1051 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1902,6 +1902,14 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + u64 size = local_type_id__k; + + /* The verifier has ensured that meta__ign must be NULL */ + return bpf_mem_alloc(&bpf_global_percpu_ma, size); +} + /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) { @@ -1930,6 +1938,12 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) __bpf_obj_drop_impl(p, meta ? meta->record : NULL); } +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + /* The verifier has ensured that meta__ign must be NULL */ + bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); +} + __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; @@ -2442,7 +2456,9 @@ BTF_SET8_START(generic_btf_ids) BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb78212fa5b2..6c886ead18f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta { /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic - * bpf_obj_drop + * bpf_obj_drop/bpf_percpu_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) * Record the local kptr type to be refcount_incr'd and use @@ -5001,6 +5001,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, perm_flags |= PTR_UNTRUSTED; } else { perm_flags = PTR_MAYBE_NULL | MEM_ALLOC; + if (kptr_field->type == BPF_KPTR_PERCPU) + perm_flags |= MEM_PERCPU; } if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -5044,7 +5046,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, kptr_field->kptr.btf, kptr_field->kptr.btf_id, - kptr_field->type == BPF_KPTR_REF)) + kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; return 0; bad_type: @@ -5088,7 +5090,18 @@ static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; - return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); + return field->type == BPF_KPTR_PERCPU || + (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); +} + +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) +{ + if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) { + if (kptr_field->type != BPF_KPTR_PERCPU) + return PTR_MAYBE_NULL | MEM_RCU; + return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; + } + return PTR_MAYBE_NULL | PTR_UNTRUSTED; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, @@ -5114,7 +5127,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { + if (class != BPF_LDX && + (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -5125,10 +5139,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, - rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? - PTR_MAYBE_NULL | MEM_RCU : - PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -5182,6 +5193,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -7320,7 +7332,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (kptr_field->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } @@ -7831,8 +7843,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) { type &= ~MEM_ALLOC; + type &= ~MEM_PERCPU; + } for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; @@ -7915,6 +7929,7 @@ found: break; } case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); @@ -9882,8 +9897,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; - if (!btf_is_kernel(ret_btf)) + if (!btf_is_kernel(ret_btf)) { regs[BPF_REG_0].type |= MEM_ALLOC; + if (meta.kptr_field->type == BPF_KPTR_PERCPU) + regs[BPF_REG_0].type |= MEM_PERCPU; + } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -10268,6 +10286,8 @@ enum special_kfunc_type { KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, + KF_bpf_percpu_obj_new_impl, + KF_bpf_percpu_obj_drop_impl, }; BTF_SET_START(special_kfunc_set) @@ -10288,6 +10308,8 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10310,6 +10332,8 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11004,7 +11028,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) { + verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i); + return -EINVAL; + } + } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { + if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { + verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i); + return -EINVAL; + } + } else { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } @@ -11012,8 +11046,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - if (meta->btf == btf_vmlinux && - meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta->btf == btf_vmlinux) { meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; } @@ -11413,6 +11446,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux || (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && + meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] && meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; @@ -11426,11 +11460,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; struct btf *ret_btf; u32 ret_btf_id; - if (unlikely(!bpf_global_ma_set)) + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + return -ENOMEM; + + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) return -ENOMEM; if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { @@ -11443,24 +11482,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* This may be NULL due to user not supplying a BTF */ if (!ret_btf) { - verbose(env, "bpf_obj_new requires prog BTF\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); return -EINVAL; } ret_t = btf_type_by_id(ret_btf, ret_btf_id); if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); return -EINVAL; } + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = - btf_find_struct_meta(ret_btf, ret_btf_id); + insn_aux->kptr_struct_meta = struct_meta; } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; @@ -11597,7 +11650,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); @@ -18266,21 +18320,35 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) return 0; - if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { + verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); insn_buf[1] = addr[0]; insn_buf[2] = addr[1]; insn_buf[3] = *insn; *cnt = 4; } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] || desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { + verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && !kptr_struct_meta) { verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", -- cgit v1.2.3 From 01cc55af93884f1ff5a883426e1924378dfcc62a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:27:49 -0700 Subject: bpf: Add bpf_this_cpu_ptr/bpf_per_cpu_ptr support for allocated percpu obj The bpf helpers bpf_this_cpu_ptr() and bpf_per_cpu_ptr() are re-purposed for allocated percpu objects. For an allocated percpu obj, the reg type is 'PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU'. The return type for these two re-purposed helpera is 'PTR_TO_MEM | MEM_RCU | MEM_ALLOC'. The MEM_ALLOC allows that the per-cpu data can be read and written. Since the memory allocator bpf_mem_alloc() returns a ptr to a percpu ptr for percpu data, the first argument of bpf_this_cpu_ptr() and bpf_per_cpu_ptr() is patched with a dereference before passing to the helper func. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152749.1997202-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b6e58dab8e27..a3236651ec64 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -480,6 +480,7 @@ struct bpf_insn_aux_data { bool zext_dst; /* this insn zero extends dst reg */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ bool is_iter_next; /* bpf_iter__next() kfunc call */ + bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c886ead18f6..6b7e7ca611f3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6221,7 +6221,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !reg->ref_obj_id) { + !(reg->type & MEM_RCU) && !reg->ref_obj_id) { verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); return -EFAULT; } @@ -7765,6 +7765,7 @@ static const struct bpf_reg_types btf_ptr_types = { static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU, PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, } }; @@ -7941,6 +7942,7 @@ found: } break; case PTR_TO_BTF_ID | MEM_PERCPU: + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: /* Handled by helper specific checks */ break; @@ -9547,6 +9549,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int *insn_idx_p) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + bool returns_cpu_specific_alloc_ptr = false; const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; @@ -9785,6 +9788,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; } + case BPF_FUNC_per_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: + { + struct bpf_reg_state *reg = ®s[BPF_REG_1]; + const struct btf_type *type; + + if (reg->type & MEM_RCU) { + type = btf_type_by_id(reg->btf, reg->btf_id); + if (!type || !btf_type_is_struct(type)) { + verbose(env, "Helper has invalid btf/btf_id in R1\n"); + return -EFAULT; + } + returns_cpu_specific_alloc_ptr = true; + env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true; + } + break; + } case BPF_FUNC_user_ringbuf_drain: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_user_ringbuf_callback_state); @@ -9874,14 +9894,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - /* MEM_RDONLY may be carried from ret_flag, but it - * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise - * it will confuse the check of PTR_TO_BTF_ID in - * check_mem_access(). - */ - ret_flag &= ~MEM_RDONLY; + if (returns_cpu_specific_alloc_ptr) { + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU; + } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; + } - regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -18676,6 +18700,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ + if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { + /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, + * bpf_mem_alloc() returns a ptr to the percpu data ptr. + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. -- cgit v1.2.3 From 5b221ecb3a9e48013d7b4ad7960af3adba23d1d1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:28:16 -0700 Subject: bpf: Mark OBJ_RELEASE argument as MEM_RCU when possible In previous selftests/bpf patch, we have p = bpf_percpu_obj_new(struct val_t); if (!p) goto out; p1 = bpf_kptr_xchg(&e->pc, p); if (p1) { /* race condition */ bpf_percpu_obj_drop(p1); } p = e->pc; if (!p) goto out; After bpf_kptr_xchg(), we need to re-read e->pc into 'p'. This is due to that the second argument of bpf_kptr_xchg() is marked OBJ_RELEASE and it will be marked as invalid after the call. So after bpf_kptr_xchg(), 'p' is an unknown scalar, and the bpf program needs to reread from the map value. This patch checks if the 'p' has type MEM_ALLOC and MEM_PERCPU, and if 'p' is RCU protected. If this is the case, 'p' can be marked as MEM_RCU. MEM_ALLOC needs to be removed since 'p' is not an owning reference any more. Such a change makes re-read from the map value unnecessary. Note that re-reading 'e->pc' after bpf_kptr_xchg() might get a different value from 'p' if immediately before 'p = e->pc', another cpu may do another bpf_kptr_xchg() and swap in another value into 'e->pc'. If this is the case, then 'p = e->pc' may get either 'p' or another value, and race condition already exists. So removing direct re-reading seems fine too. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152816.2000760-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b7e7ca611f3..dbba2b806017 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9660,6 +9660,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { + u32 ref_obj_id = meta.ref_obj_id; + bool in_rcu = in_rcu_cs(env); + struct bpf_func_state *state; + struct bpf_reg_state *reg; + + err = release_reference_state(cur_func(env), ref_obj_id); + if (!err) { + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) { + if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->ref_obj_id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } else { + mark_reg_invalid(env, reg); + } + } + })); + } } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); } else if (register_is_null(®s[meta.release_regno])) { -- cgit v1.2.3 From 566f6de3cea3482d75d836a2398792a8be32ec26 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 1 Sep 2023 19:19:52 +0800 Subject: bpf: Enable IRQ after irq_work_raise() completes in unit_alloc() When doing stress test for qp-trie, bpf_mem_alloc() returned NULL unexpectedly because all qp-trie operations were initiated from bpf syscalls and there was still available free memory. bpf_obj_new() has the same problem as shown by the following selftest. The failure is due to the preemption. irq_work_raise() will invoke irq_work_claim() first to mark the irq work as pending and then inovke __irq_work_queue_local() to raise an IPI. So when the current task which is invoking irq_work_raise() is preempted by other task, unit_alloc() may return NULL for preemption task as shown below: task A task B unit_alloc() // low_watermark = 32 // free_cnt = 31 after alloc irq_work_raise() // mark irq work as IRQ_WORK_PENDING irq_work_claim() // task B preempts task A unit_alloc() // free_cnt = 30 after alloc // irq work is already PENDING, // so just return irq_work_raise() // does unit_alloc() 30-times ...... unit_alloc() // free_cnt = 0 before alloc return NULL Fix it by enabling IRQ after irq_work_raise() completes. An alternative fix is using preempt_{disable|enable}_notrace() pair, but it may have extra overhead. Another feasible fix is to only disable preemption or IRQ before invoking irq_work_queue() and enable preemption or IRQ after the invocation completes, but it can't handle the case when c->low_watermark is 1. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20230901111954.1804721-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index cb60445de98a..c5d822d7cfaa 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) } } local_dec(&c->active); - local_irq_restore(flags); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); + /* Enable IRQ after the enqueue of irq work completes, so irq work + * will run after IRQ is enabled and free_llist may be refilled by + * irq work before other task preempts current task. + */ + local_irq_restore(flags); + return llnode; } -- cgit v1.2.3 From 62cf51cb0ebe997a9903208e546755b63eb7ff9d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 1 Sep 2023 19:19:53 +0800 Subject: bpf: Enable IRQ after irq_work_raise() completes in unit_free{_rcu}() Both unit_free() and unit_free_rcu() invoke irq_work_raise() to free freed objects back to slab and the invocation may also be preempted by unit_alloc() and unit_alloc() may return NULL unexpectedly as shown in the following case: task A task B unit_free() // high_watermark = 48 // free_cnt = 49 after free irq_work_raise() // mark irq work as IRQ_WORK_PENDING irq_work_claim() // task B preempts task A unit_alloc() // free_cnt = 48 after alloc // does unit_alloc() 32-times ...... // free_cnt = 16 unit_alloc() // free_cnt = 15 after alloc // irq work is already PENDING, // so just return irq_work_raise() // does unit_alloc() 15-times ...... // free_cnt = 0 unit_alloc() // free_cnt = 0 before alloc return NULL Fix it by enabling IRQ after irq_work_raise() completes. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20230901111954.1804721-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index c5d822d7cfaa..961df89d45f1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -778,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); - local_irq_restore(flags); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); + /* Enable IRQ after irq_work_raise() completes, otherwise when current + * task is preempted by task which does unit_alloc(), unit_alloc() may + * return NULL unexpectedly because irq work is already pending but can + * not been triggered and free_llist can not be refilled timely. + */ + local_irq_restore(flags); } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) @@ -800,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra_rcu); } local_dec(&c->active); - local_irq_restore(flags); if (!atomic_read(&c->call_rcu_in_progress)) irq_work_raise(c); + local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. -- cgit v1.2.3 From 1a00ef57d9f120b711b6b1193d12ba3789d47ec2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Sep 2023 17:46:46 +0200 Subject: bpf: task_group_seq_get_next: cleanup the usage of next_thread() 1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we can safely iterate the task->thread_group list. Even if this task exits right after get_pid_task() (or goto retry) and pid_alive() returns 0. Kill the unnecessary pid_alive() check. 2. next_thread() simply can't return NULL, kill the bogus "if (!next_task)" check. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230905154646.GA24928@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c4ab9d6cdbe9..4d1125108014 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -75,15 +75,8 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return NULL; retry: - if (!pid_alive(task)) { - put_task_struct(task); - return NULL; - } - next_task = next_thread(task); put_task_struct(task); - if (!next_task) - return NULL; saved_tid = *tid; *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); -- cgit v1.2.3 From 4981921350452a7639fac9ac8f19be4d25febdca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Sep 2023 17:46:49 +0200 Subject: bpf: task_group_seq_get_next: cleanup the usage of get/put_task_struct get_pid_task() makes no sense, the code does put_task_struct() soon after. Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill put_task_struct(), this allows to do get_task_struct() only once before return. While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)" block, this matches the next usage of find_pid_ns() + get_pid_task() in this function. Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230905154649.GA24935@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4d1125108014..1589ec3faded 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -42,9 +42,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm if (!*tid) { /* The first time, the iterator calls this function. */ pid = find_pid_ns(common->pid, common->ns); - if (!pid) - return NULL; - task = get_pid_task(pid, PIDTYPE_TGID); if (!task) return NULL; @@ -66,17 +63,12 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return task; } - pid = find_pid_ns(common->pid_visiting, common->ns); - if (!pid) - return NULL; - - task = get_pid_task(pid, PIDTYPE_PID); + task = find_task_by_pid_ns(common->pid_visiting, common->ns); if (!task) return NULL; retry: next_task = next_thread(task); - put_task_struct(task); saved_tid = *tid; *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); @@ -88,7 +80,6 @@ retry: return NULL; } - get_task_struct(next_task); common->pid_visiting = *tid; if (skip_if_dup_files && task->files == task->group_leader->files) { @@ -96,6 +87,7 @@ retry: goto retry; } + get_task_struct(next_task); return next_task; } -- cgit v1.2.3 From 87abbf7a54f6c9c51374b0701cd7ab47534516ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Sep 2023 17:46:51 +0200 Subject: bpf: task_group_seq_get_next: fix the skip_if_dup_files check Unless I am notally confused it is wrong. We are going to return or skip next_task so we need to check next_task-files, not task->files. Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230905154651.GA24940@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 1589ec3faded..2264870ae3fc 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -82,7 +82,7 @@ retry: common->pid_visiting = *tid; - if (skip_if_dup_files && task->files == task->group_leader->files) { + if (skip_if_dup_files && next_task->files == next_task->group_leader->files) { task = next_task; goto retry; } -- cgit v1.2.3 From 0ee9808b0a211ba1e572073c6afe5897f8300b9c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Sep 2023 17:46:54 +0200 Subject: bpf: task_group_seq_get_next: kill next_task It only adds the unnecessary confusion and compicates the "retry" code. Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230905154654.GA24945@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 2264870ae3fc..f51f476ec679 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -35,7 +35,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm u32 *tid, bool skip_if_dup_files) { - struct task_struct *task, *next_task; + struct task_struct *task; struct pid *pid; u32 saved_tid; @@ -68,10 +68,10 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return NULL; retry: - next_task = next_thread(task); + task = next_thread(task); saved_tid = *tid; - *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + *tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); if (!*tid || *tid == common->pid) { /* Run out of tasks of a process. The tasks of a * thread_group are linked as circular linked list. @@ -82,13 +82,11 @@ retry: common->pid_visiting = *tid; - if (skip_if_dup_files && next_task->files == next_task->group_leader->files) { - task = next_task; + if (skip_if_dup_files && task->files == task->group_leader->files) goto retry; - } - get_task_struct(next_task); - return next_task; + get_task_struct(task); + return task; } static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, -- cgit v1.2.3 From 780aa8dfcb73f4703b1c4be11c21c8dca36502ad Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Sep 2023 17:46:56 +0200 Subject: bpf: task_group_seq_get_next: simplify the "next tid" logic Kill saved_tid. It looks ugly to update *tid and then restore the previous value if __task_pid_nr_ns() returns 0. Change this code to update *tid and common->pid_visiting once before return. Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230905154656.GA24950@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index f51f476ec679..7473068ed313 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -37,7 +37,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm { struct task_struct *task; struct pid *pid; - u32 saved_tid; + u32 next_tid; if (!*tid) { /* The first time, the iterator calls this function. */ @@ -70,21 +70,18 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm retry: task = next_thread(task); - saved_tid = *tid; - *tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); - if (!*tid || *tid == common->pid) { + next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); + if (!next_tid || next_tid == common->pid) { /* Run out of tasks of a process. The tasks of a * thread_group are linked as circular linked list. */ - *tid = saved_tid; return NULL; } - common->pid_visiting = *tid; - if (skip_if_dup_files && task->files == task->group_leader->files) goto retry; + *tid = common->pid_visiting = next_tid; get_task_struct(task); return task; } -- cgit v1.2.3 From 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 Sep 2023 23:04:41 +0800 Subject: bpf, x64: Fix tailcall infinite loop From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT"), the tailcall on x64 works better than before. From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms for x64 JIT"), tailcall is able to run in BPF subprograms on x64. From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program to other BPF programs"), BPF program is able to trace other BPF programs. How about combining them all together? 1. FENTRY/FEXIT on a BPF subprogram. 2. A tailcall runs in the BPF subprogram. 3. The tailcall calls the subprogram's caller. As a result, a tailcall infinite loop comes up. And the loop would halt the machine. As we know, in tail call context, the tail_call_cnt propagates by stack and rax register between BPF subprograms. So do in trampolines. Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") Reviewed-by: Maciej Fijalkowski Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------ include/linux/bpf.h | 5 +++++ kernel/bpf/trampoline.c | 4 ++-- kernel/bpf/verifier.c | 3 +++ 4 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bcca1c9b9a02..2846c21d75bf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1022,6 +1022,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) +/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ +#define RESTORE_TAIL_CALL_CNT(stack) \ + EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -1627,9 +1631,7 @@ st: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { - /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ - EMIT3_off32(0x48, 0x8B, 0x85, - -round_up(bpf_prog->aux->stack_depth, 8) - 8); + RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); if (!imm32) return -EINVAL; offs = 7 + x86_call_depth_emit_accounting(&prog, func); @@ -2404,6 +2406,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * [ ... ] * [ stack_arg2 ] * RBP - arg_stack_off [ stack_arg1 ] + * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX */ /* room for return value of orig_call or fentry prog */ @@ -2468,6 +2471,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i else /* sub rsp, stack_size */ EMIT4(0x48, 0x83, 0xEC, stack_size); + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + EMIT1(0x50); /* push rax */ /* mov QWORD PTR [rbp - rbx_off], rbx */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); @@ -2520,9 +2525,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i restore_regs(m, &prog, regs_off); save_args(m, &prog, arg_stack_off, true); + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before calling the original function, restore the + * tail_call_cnt from stack to rax. + */ + RESTORE_TAIL_CALL_CNT(stack_size); + if (flags & BPF_TRAMP_F_ORIG_STACK) { - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); - EMIT2(0xff, 0xd0); /* call *rax */ + emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8); + EMIT2(0xff, 0xd3); /* call *rbx */ } else { /* call original function */ if (emit_rsb_call(&prog, orig_call, prog)) { @@ -2573,7 +2584,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - } + } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before running the original function, restore the + * tail_call_cnt from stack to rax. + */ + RESTORE_TAIL_CALL_CNT(stack_size); + /* restore return value of orig_call or fentry prog back into RAX */ if (save_ret) emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 87eeb3a46a1d..b9e573159432 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1035,6 +1035,11 @@ struct btf_func_model { */ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) +/* Indicate that current trampoline is in a tail call context. Then, it has to + * cache and restore tail_call_cnt to avoid infinite tail call loop. + */ +#define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 53ff50cac61e..e97aeda3a86b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -415,8 +415,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut goto out; } - /* clear all bits except SHARE_IPMODIFY */ - tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ + tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbba2b806017..18e673c0ac15 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19774,6 +19774,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!tr) return -ENOMEM; + if (tgt_prog && tgt_prog->aux->tail_call_reachable) + tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX; + prog->aux->dst_trampoline = tr; return 0; } -- cgit v1.2.3 From 5c04433daf9ed8b28d4900112be1fd19e1786b25 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 14 Sep 2023 15:25:42 -0700 Subject: bpf: Charge modmem for struct_ops trampoline Current code charges modmem for regular trampoline, but not for struct_ops trampoline. Add bpf_jit_[charge|uncharge]_modmem() to struct_ops so the trampoline is charged in both cases. Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230914222542.2986059-1-song@kernel.org Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_struct_ops.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index fdc3e8705a3c..db6176fb64dc 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -615,7 +615,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); - bpf_jit_free_exec(st_map->image); + if (st_map->image) { + bpf_jit_free_exec(st_map->image); + bpf_jit_uncharge_modmem(PAGE_SIZE); + } bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } @@ -657,6 +660,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; struct bpf_map *map; + int ret; st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); if (!st_ops) @@ -681,12 +685,27 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->st_ops = st_ops; map = &st_map->map; + ret = bpf_jit_charge_modmem(PAGE_SIZE); + if (ret) { + __bpf_struct_ops_map_free(map); + return ERR_PTR(ret); + } + + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->image) { + /* __bpf_struct_ops_map_free() uses st_map->image as flag + * for "charged or not". In this case, we need to unchange + * here. + */ + bpf_jit_uncharge_modmem(PAGE_SIZE); + __bpf_struct_ops_map_free(map); + return ERR_PTR(-ENOMEM); + } st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->links = bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); - st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->links || !st_map->image) { + if (!st_map->uvalue || !st_map->links) { __bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -907,4 +926,3 @@ err_out: kfree(link); return err; } - -- cgit v1.2.3 From fc45c5b642dbcac3bb10f4f904e4b863233e5369 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 13 Sep 2023 10:13:48 -0700 Subject: bpf: make it easier to add new metadata kfunc No functional changes. Instead of having hand-crafted code in bpf_dev_bound_resolve_kfunc, move kfunc <> xmo handler relationship into XDP_METADATA_KFUNC_xxx. This way, any time new kfunc is added, we don't have to touch bpf_dev_bound_resolve_kfunc. Also document XDP_METADATA_KFUNC_xxx arguments since we now have more than two and it might be confusing what is what. Cc: netdev@vger.kernel.org Cc: Willem de Bruijn Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230913171350.369987-2-sdf@google.com Signed-off-by: Martin KaFai Lau --- include/net/xdp.h | 16 ++++++++++++---- kernel/bpf/offload.c | 9 +++++---- net/core/xdp.c | 4 ++-- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/net/xdp.h b/include/net/xdp.h index de08c8e0d134..d59e12f8f311 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -383,14 +383,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE +/* Define the relationship between xdp-rx-metadata kfunc and + * various other entities: + * - xdp_rx_metadata enum + * - kfunc name + * - xdp_metadata_ops field + */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ - bpf_xdp_metadata_rx_timestamp) \ + bpf_xdp_metadata_rx_timestamp, \ + xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ - bpf_xdp_metadata_rx_hash) \ + bpf_xdp_metadata_rx_hash, \ + xmo_rx_hash) \ -enum { -#define XDP_METADATA_KFUNC(name, _) name, +enum xdp_rx_metadata { +#define XDP_METADATA_KFUNC(name, _, __) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3e4f2ec1af06..6aa6de8d715d 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -845,10 +845,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) if (!ops) goto out; - if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) - p = ops->xmo_rx_timestamp; - else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) - p = ops->xmo_rx_hash; +#define XDP_METADATA_KFUNC(name, _, xmo) \ + if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; + XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + out: up_read(&bpf_devs_lock); diff --git a/net/core/xdp.c b/net/core/xdp.c index a70670fe9a2d..bab563b2f812 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, __diag_pop(); BTF_SET8_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) @@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { }; BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) -#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str) +#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC -- cgit v1.2.3 From a9c2a608549bb1a2363d289d63907640afcf22af Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 13 Sep 2023 10:13:49 -0700 Subject: bpf: expose information about supported xdp metadata kfunc Add new xdp-rx-metadata-features member to netdev netlink which exports a bitmask of supported kfuncs. Most of the patch is autogenerated (headers), the only relevant part is netdev.yaml and the changes in netdev-genl.c to marshal into netlink. Example output on veth: $ ip link add veth0 type veth peer name veth1 # ifndex == 12 $ ./tools/net/ynl/samples/netdev 12 Select ifc ($ifindex; or 0 = dump; or -2 ntf check): 12 veth1[12] xdp-features (23): basic redirect rx-sg xdp-rx-metadata-features (3): timestamp hash xdp-zc-max-segs=0 Cc: netdev@vger.kernel.org Cc: Willem de Bruijn Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230913171350.369987-3-sdf@google.com Signed-off-by: Martin KaFai Lau --- Documentation/netlink/specs/netdev.yaml | 21 +++++++++++++++++++++ Documentation/networking/xdp-rx-metadata.rst | 7 +++++++ include/net/xdp.h | 5 ++++- include/uapi/linux/netdev.h | 16 ++++++++++++++++ kernel/bpf/offload.c | 2 +- net/core/netdev-genl.c | 12 +++++++++++- net/core/xdp.c | 4 ++-- tools/include/uapi/linux/netdev.h | 16 ++++++++++++++++ 8 files changed, 78 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 1c7284fd535b..c46fcc78fc04 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -42,6 +42,19 @@ definitions: doc: This feature informs if netdev implements non-linear XDP buffer support in ndo_xdp_xmit callback. + - + type: flags + name: xdp-rx-metadata + render-max: true + entries: + - + name: timestamp + doc: + Device is capable of exposing receive HW timestamp via bpf_xdp_metadata_rx_timestamp(). + - + name: hash + doc: + Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash(). attribute-sets: - @@ -68,6 +81,13 @@ attribute-sets: type: u32 checks: min: 1 + - + name: xdp-rx-metadata-features + doc: Bitmask of supported XDP receive metadata features. + See Documentation/networking/xdp-rx-metadata.rst for more details. + type: u64 + enum: xdp-rx-metadata + enum-as-flags: true operations: list: @@ -84,6 +104,7 @@ operations: - ifindex - xdp-features - xdp-zc-max-segs + - xdp-rx-metadata-features dump: reply: *dev-all - diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst index 25ce72af81c2..205696780b78 100644 --- a/Documentation/networking/xdp-rx-metadata.rst +++ b/Documentation/networking/xdp-rx-metadata.rst @@ -105,6 +105,13 @@ bpf_tail_call Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY`` is currently not supported. +Supported Devices +================= + +It is possible to query which kfunc the particular netdev implements via +netlink. See ``xdp-rx-metadata-features`` attribute set in +``Documentation/netlink/specs/netdev.yaml``. + Example ======= diff --git a/include/net/xdp.h b/include/net/xdp.h index d59e12f8f311..349c36fb5fd8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -386,19 +386,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum + * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ + NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ + NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ enum xdp_rx_metadata { -#define XDP_METADATA_KFUNC(name, _, __) name, +#define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index c1634b95c223..2943a151d4f1 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -38,11 +38,27 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_MASK = 127, }; +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6aa6de8d715d..e7a1752b5a09 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -845,7 +845,7 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) if (!ops) goto out; -#define XDP_METADATA_KFUNC(name, _, xmo) \ +#define XDP_METADATA_KFUNC(name, _, __, xmo) \ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index c1aea8b756b6..fe61f85bcf33 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "netdev-genl-gen.h" @@ -12,15 +13,24 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; +#define XDP_METADATA_KFUNC(_, flag, __, xmo) \ + if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ + xdp_rx_meta |= flag; +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, - netdev->xdp_features, NETDEV_A_DEV_PAD)) { + netdev->xdp_features, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + xdp_rx_meta, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } diff --git a/net/core/xdp.c b/net/core/xdp.c index bab563b2f812..df4789ab512d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, __diag_pop(); BTF_SET8_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) @@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { }; BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) -#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str) +#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index c1634b95c223..2943a151d4f1 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -38,11 +38,27 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_MASK = 127, }; +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) -- cgit v1.2.3 From 9b2b86332a9b9932d9022a0c004251d5d6437020 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 15 Sep 2023 10:39:10 +0200 Subject: bpf: Allow to use kfunc XDP hints and frags together There is no fundamental reason, why multi-buffer XDP and XDP kfunc RX hints cannot coexist in a single program. Allow those features to be used together by modifying the flags condition for dev-bound-only programs, segments are still prohibited for fully offloaded programs, hence additional check. Suggested-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/CAKH8qBuzgtJj=OKMdsxEkyML36VsAuZpcrsXcyqjdKXSJCBq=Q@mail.gmail.com/ Reviewed-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230915083914.65538-1-larysa.zaremba@intel.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/offload.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e7a1752b5a09..92c9df46134a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -232,7 +232,14 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) attr->prog_type != BPF_PROG_TYPE_XDP) return -EINVAL; - if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) + if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS)) + return -EINVAL; + + /* Frags are allowed only if program is dev-bound-only, but not + * if it is requesting bpf offload. + */ + if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS && + !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && -- cgit v1.2.3 From fd5d27b70188379bb441d404c29a0afb111e1753 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:31:59 +0200 Subject: arch/x86: Implement arch_bpf_stack_walk The plumbing for offline unwinding when we throw an exception in programs would require walking the stack, hence introduce a new arch_bpf_stack_walk function. This is provided when the JIT supports exceptions, i.e. bpf_jit_supports_exceptions is true. The arch-specific code is really minimal, hence it should be straightforward to extend this support to other architectures as well, as it reuses the logic of arch_stack_walk, but allowing access to unwind_state data. Once the stack pointer and frame pointer are known for the main subprog during the unwinding, we know the stack layout and location of any callee-saved registers which must be restored before we return back to the kernel. This handling will be added in the subsequent patches. Note that while we primarily unwind through BPF frames, which are effectively CONFIG_UNWINDER_FRAME_POINTER, we still need one of this or CONFIG_UNWINDER_ORC to be able to unwind through the bpf_throw frame from which we begin walking the stack. We also require both sp and bp (stack and frame pointers) from the unwind_state structure, which are only available when one of these two options are enabled. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++ include/linux/filter.h | 2 ++ kernel/bpf/core.c | 9 +++++++++ 3 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a0d03503b3cb..d0c24b5a6abb 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -16,6 +16,7 @@ #include #include #include +#include static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -2933,3 +2934,30 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_prog_unlock_free(prog); } + +bool bpf_jit_supports_exceptions(void) +{ + /* We unwind through both kernel frames (starting from within bpf_throw + * call) and BPF frames. Therefore we require one of ORC or FP unwinder + * to be enabled to walk kernel frames and reach BPF frames in the stack + * trace. + */ + return IS_ENABLED(CONFIG_UNWINDER_ORC) || IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER); +} + +void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) +{ +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) + struct unwind_state state; + unsigned long addr; + + for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp)) + break; + } + return; +#endif + WARN(1, "verification of programs using bpf_throw should have failed\n"); +} diff --git a/include/linux/filter.h b/include/linux/filter.h index 0138832ad571..88874de974cb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -954,6 +954,8 @@ bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); +bool bpf_jit_supports_exceptions(void); +void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95599df82ee4..c4ac084f2767 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2914,6 +2914,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len) return -ENOTSUPP; } +bool __weak bpf_jit_supports_exceptions(void) +{ + return false; +} + +void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) +{ +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { -- cgit v1.2.3 From 335d1c5b545284d75ef96ee42e461eacefe865bb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:00 +0200 Subject: bpf: Implement support for adding hidden subprogs Introduce support in the verifier for generating a subprogram and include it as part of a BPF program dynamically after the do_check phase is complete. The first user will be the next patch which generates default exception callbacks if none are set for the program. The phase of invocation will be do_misc_fixups. Note that this is an internal verifier function, and should be used with instruction blocks which uphold the invariants stated in check_subprogs. Since these subprogs are always appended to the end of the instruction sequence of the program, it becomes relatively inexpensive to do the related adjustments to the subprog_info of the program. Only the fake exit subprogram is shifted forward, making room for our new subprog. This is useful to insert a new subprogram, get it JITed, and obtain its function pointer. The next patch will use this functionality to insert a default exception callback which will be invoked after unwinding the stack. Note that these added subprograms are invisible to userspace, and never reported in BPF_OBJ_GET_INFO_BY_ID etc. For now, only a single subprogram is supported, but more can be easily supported in the future. To this end, two function counts are introduced now, the existing func_cnt, and real_func_cnt, the latter including hidden programs. This allows us to conver the JIT code to use the real_func_cnt for management of resources while syscall path continues working with existing func_cnt. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 3 ++- kernel/bpf/core.c | 12 ++++++------ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++++--- 5 files changed, 43 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9171b0b6a590..c3667e95af59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1389,6 +1389,7 @@ struct bpf_prog_aux { u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ + u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 ctx_arg_info_size; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a3236651ec64..3c2a8636ab29 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -588,6 +588,7 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ + u32 hidden_subprog_cnt; /* number of hidden subprogs */ bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; @@ -598,7 +599,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; - struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; + struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */ union { struct bpf_idmap idmap_scratch; struct bpf_idset idset_scratch; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c4ac084f2767..840ba952702d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -212,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const struct bpf_line_info *linfo; void **jited_linfo; - if (!prog->aux->jited_linfo) + if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; @@ -539,7 +539,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; - for (i = 0; i < fp->aux->func_cnt; i++) + for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } @@ -589,7 +589,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog) sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ - if (prog->aux->func_info_cnt) { + if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); @@ -1208,7 +1208,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, if (!extra_pass) addr = NULL; else if (prog->aux->func && - off >= 0 && off < prog->aux->func_cnt) + off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; @@ -2721,7 +2721,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) { + for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. @@ -2729,7 +2729,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } - if (aux->func_cnt) { + if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6a692f3bea15..85c1d908f70f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2749,7 +2749,7 @@ free_used_maps: * period before we can tear down JIT memory since symbols * are already exposed under kallsyms. */ - __bpf_prog_put_noref(prog, prog->aux->func_cnt); + __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; free_prog_sec: free_uid(prog->aux->user); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 18e673c0ac15..39548e326d53 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15210,7 +15210,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env) if (!aux->func_info) return; - for (i = 0; i < env->subprog_cnt; i++) + /* func_info is not available for hidden subprogs */ + for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++) aux->func_info[i].insn_off = env->subprog_info[i].start; } @@ -18151,7 +18152,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) * the call instruction, as an index for this list */ func[i]->aux->func = func; - func[i]->aux->func_cnt = env->subprog_cnt; + func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; + func[i]->aux->real_func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; @@ -18197,7 +18199,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->extable = func[0]->aux->extable; prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt; + prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; + prog->aux->real_func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); return 0; out_free: @@ -18433,6 +18436,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ +static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) +{ + struct bpf_subprog_info *info = env->subprog_info; + int cnt = env->subprog_cnt; + struct bpf_prog *prog; + + /* We only reserve one slot for hidden subprogs in subprog_info. */ + if (env->hidden_subprog_cnt) { + verbose(env, "verifier internal error: only one hidden subprog supported\n"); + return -EFAULT; + } + /* We're not patching any existing instruction, just appending the new + * ones for the hidden subprog. Hence all of the adjustment operations + * in bpf_patch_insn_data are no-ops. + */ + prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len); + if (!prog) + return -ENOMEM; + env->prog = prog; + info[cnt + 1].start = info[cnt].start; + info[cnt].start = prog->len - len + 1; + env->subprog_cnt++; + env->hidden_subprog_cnt++; + return 0; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ -- cgit v1.2.3 From f18b03fabaa9b7c80e80b72a621f481f0d706ae0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:01 +0200 Subject: bpf: Implement BPF exceptions This patch implements BPF exceptions, and introduces a bpf_throw kfunc to allow programs to throw exceptions during their execution at runtime. A bpf_throw invocation is treated as an immediate termination of the program, returning back to its caller within the kernel, unwinding all stack frames. This allows the program to simplify its implementation, by testing for runtime conditions which the verifier has no visibility into, and assert that they are true. In case they are not, the program can simply throw an exception from the other branch. BPF exceptions are explicitly *NOT* an unlikely slowpath error handling primitive, and this objective has guided design choices of the implementation of the them within the kernel (with the bulk of the cost for unwinding the stack offloaded to the bpf_throw kfunc). The implementation of this mechanism requires use of add_hidden_subprog mechanism introduced in the previous patch, which generates a couple of instructions to move R1 to R0 and exit. The JIT then rewrites the prologue of this subprog to take the stack pointer and frame pointer as inputs and reset the stack frame, popping all callee-saved registers saved by the main subprog. The bpf_throw function then walks the stack at runtime, and invokes this exception subprog with the stack and frame pointers as parameters. Reviewers must take note that currently the main program is made to save all callee-saved registers on x86_64 during entry into the program. This is because we must do an equivalent of a lightweight context switch when unwinding the stack, therefore we need the callee-saved registers of the caller of the BPF program to be able to return with a sane state. Note that we have to additionally handle r12, even though it is not used by the program, because when throwing the exception the program makes an entry into the kernel which could clobber r12 after saving it on the stack. To be able to preserve the value we received on program entry, we push r12 and restore it from the generated subprogram when unwinding the stack. For now, bpf_throw invocation fails when lingering resources or locks exist in that path of the program. In a future followup, bpf_throw will be extended to perform frame-by-frame unwinding to release lingering resources for each stack frame, removing this limitation. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 89 ++++++++++++++++--- include/linux/bpf.h | 3 + include/linux/bpf_verifier.h | 4 + include/linux/filter.h | 6 ++ kernel/bpf/core.c | 2 +- kernel/bpf/helpers.c | 38 ++++++++ kernel/bpf/verifier.c | 116 ++++++++++++++++++++++--- tools/testing/selftests/bpf/bpf_experimental.h | 16 ++++ 8 files changed, 247 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d0c24b5a6abb..84005f2114e0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -18,6 +18,8 @@ #include #include +static bool all_callee_regs_used[4] = {true, true, true, true}; + static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { if (len == 1) @@ -256,6 +258,14 @@ struct jit_context { /* Number of bytes that will be skipped on tailcall */ #define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE) +static void push_r12(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT2(0x41, 0x54); /* push r12 */ + *pprog = prog; +} + static void push_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; @@ -271,6 +281,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used) *pprog = prog; } +static void pop_r12(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT2(0x41, 0x5C); /* pop r12 */ + *pprog = prog; +} + static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; @@ -292,7 +310,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) * while jumping to another program */ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, - bool tail_call_reachable, bool is_subprog) + bool tail_call_reachable, bool is_subprog, + bool is_exception_cb) { u8 *prog = *pprog; @@ -312,8 +331,22 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, /* Keep the same instruction layout. */ EMIT2(0x66, 0x90); /* nop2 */ } - EMIT1(0x55); /* push rbp */ - EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + /* Exception callback receives FP as third parameter */ + if (is_exception_cb) { + EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */ + EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */ + /* The main frame must have exception_boundary as true, so we + * first restore those callee-saved regs from stack, before + * reusing the stack frame. + */ + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + /* Reset the stack frame. */ + EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */ + } else { + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + } /* X86_TAIL_CALL_OFFSET is here */ EMIT_ENDBR(); @@ -472,7 +505,8 @@ static void emit_return(u8 **pprog, u8 *ip) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, +static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, + u8 **pprog, bool *callee_regs_used, u32 stack_depth, u8 *ip, struct jit_context *ctx) { @@ -522,7 +556,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, offset = ctx->tail_call_indirect_label - (prog + 2 - start); EMIT2(X86_JE, offset); /* je out */ - pop_callee_regs(&prog, callee_regs_used); + if (bpf_prog->aux->exception_boundary) { + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + } else { + pop_callee_regs(&prog, callee_regs_used); + } EMIT1(0x58); /* pop rax */ if (stack_depth) @@ -546,7 +585,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, *pprog = prog; } -static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, +static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, + struct bpf_jit_poke_descriptor *poke, u8 **pprog, u8 *ip, bool *callee_regs_used, u32 stack_depth, struct jit_context *ctx) @@ -575,7 +615,13 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); - pop_callee_regs(&prog, callee_regs_used); + if (bpf_prog->aux->exception_boundary) { + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + } else { + pop_callee_regs(&prog, callee_regs_used); + } + EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); @@ -1050,8 +1096,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image emit_prologue(&prog, bpf_prog->aux->stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, - bpf_is_subprog(bpf_prog)); - push_callee_regs(&prog, callee_regs_used); + bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); + /* Exception callback will clobber callee regs for its own use, and + * restore the original callee regs from main prog's stack frame. + */ + if (bpf_prog->aux->exception_boundary) { + /* We also need to save r12, which is not mapped to any BPF + * register, as we throw after entry into the kernel, which may + * overwrite r12. + */ + push_r12(&prog); + push_callee_regs(&prog, all_callee_regs_used); + } else { + push_callee_regs(&prog, callee_regs_used); + } ilen = prog - temp; if (rw_image) @@ -1648,13 +1706,15 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_TAIL_CALL: if (imm32) - emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + emit_bpf_tail_call_direct(bpf_prog, + &bpf_prog->aux->poke_tab[imm32 - 1], &prog, image + addrs[i - 1], callee_regs_used, bpf_prog->aux->stack_depth, ctx); else - emit_bpf_tail_call_indirect(&prog, + emit_bpf_tail_call_indirect(bpf_prog, + &prog, callee_regs_used, bpf_prog->aux->stack_depth, image + addrs[i - 1], @@ -1907,7 +1967,12 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - pop_callee_regs(&prog, callee_regs_used); + if (bpf_prog->aux->exception_boundary) { + pop_callee_regs(&prog, all_callee_regs_used); + pop_r12(&prog); + } else { + pop_callee_regs(&prog, callee_regs_used); + } EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3667e95af59..16740ee82082 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1410,6 +1410,8 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; + bool exception_cb; + bool exception_boundary; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -1432,6 +1434,7 @@ struct bpf_prog_aux { int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; + unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); #ifdef CONFIG_SECURITY void *security; #endif diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3c2a8636ab29..da21a3ec5027 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -541,7 +541,9 @@ struct bpf_subprog_info { bool has_tail_call; bool tail_call_reachable; bool has_ld_abs; + bool is_cb; bool is_async_cb; + bool is_exception_cb; }; struct bpf_verifier_env; @@ -589,6 +591,7 @@ struct bpf_verifier_env { u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ + int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; @@ -596,6 +599,7 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; + bool seen_exception; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; diff --git a/include/linux/filter.h b/include/linux/filter.h index 88874de974cb..27406aee2d40 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1171,6 +1171,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); +struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, @@ -1238,6 +1239,11 @@ static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, return -ERANGE; } +static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +{ + return NULL; +} + static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 840ba952702d..7849b9cca749 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -733,7 +733,7 @@ bool is_bpf_text_address(unsigned long addr) return ret; } -static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym = bpf_ksym_find(addr); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b0a9834f1051..78e8f4de6750 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2449,6 +2449,43 @@ __bpf_kfunc void bpf_rcu_read_unlock(void) rcu_read_unlock(); } +struct bpf_throw_ctx { + struct bpf_prog_aux *aux; + u64 sp; + u64 bp; + int cnt; +}; + +static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct bpf_throw_ctx *ctx = cookie; + struct bpf_prog *prog; + + if (!is_bpf_text_address(ip)) + return !ctx->cnt; + prog = bpf_prog_ksym_find(ip); + ctx->cnt++; + if (bpf_is_subprog(prog)) + return true; + ctx->aux = prog->aux; + ctx->sp = sp; + ctx->bp = bp; + return false; +} + +__bpf_kfunc void bpf_throw(u64 cookie) +{ + struct bpf_throw_ctx ctx = {}; + + arch_bpf_stack_walk(bpf_stack_walker, &ctx); + WARN_ON_ONCE(!ctx.aux); + if (ctx.aux) + WARN_ON_ONCE(!ctx.aux->exception_boundary); + WARN_ON_ONCE(!ctx.bp); + WARN_ON_ONCE(!ctx.cnt); + ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); +} + __diag_pop(); BTF_SET8_START(generic_btf_ids) @@ -2478,6 +2515,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_throw) BTF_SET8_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 39548e326d53..9baa6f187b38 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) } static bool is_callback_calling_kfunc(u32 btf_id); +static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_callback_calling_function(enum bpf_func_id func_id) { @@ -1748,7 +1749,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them */ + /* if dst has more stack frames then src frame, free them, this is also + * necessary in case of exceptional exits using bpf_throw. + */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; @@ -2868,7 +2871,7 @@ next: if (i == subprog_end - 1) { /* to avoid fall-through from one subprog into another * the last insn of the subprog should be either exit - * or unconditional jump back + * or unconditional jump back or bpf_throw call */ if (code != (BPF_JMP | BPF_EXIT) && code != (BPF_JMP32 | BPF_JA) && @@ -5661,6 +5664,27 @@ continue_func: for (; i < subprog_end; i++) { int next_insn, sidx; + if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { + bool err = false; + + if (!is_bpf_throw_kfunc(insn + i)) + continue; + if (subprog[idx].is_cb) + err = true; + for (int c = 0; c < frame && !err; c++) { + if (subprog[ret_prog[c]].is_cb) { + err = true; + break; + } + } + if (!err) + continue; + verbose(env, + "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n", + i, idx); + return -EINVAL; + } + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ @@ -8919,6 +8943,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn * callbacks */ if (set_callee_state_cb != set_callee_state) { + env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", @@ -9308,7 +9333,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) verbose(env, "to caller at %d:\n", *insn_idx); print_verifier_state(env, caller, true); } - /* clear everything in the callee */ + /* clear everything in the callee. In case of exceptional exits using + * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; return 0; @@ -9432,17 +9458,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } -static int check_reference_leak(struct bpf_verifier_env *env) +static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_func_state *state = cur_func(env); bool refs_lingering = false; int i; - if (state->frameno && !state->in_callback_fn) + if (!exception_exit && state->frameno && !state->in_callback_fn) return 0; for (i = 0; i < state->acquired_refs; i++) { - if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -9697,7 +9723,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn switch (func_id) { case BPF_FUNC_tail_call: - err = check_reference_leak(env); + err = check_reference_leak(env, false); if (err) { verbose(env, "tail_call would lead to reference leak\n"); return err; @@ -10332,6 +10358,7 @@ enum special_kfunc_type { KF_bpf_dynptr_clone, KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, + KF_bpf_throw, }; BTF_SET_START(special_kfunc_set) @@ -10354,6 +10381,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_throw) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10378,6 +10406,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_throw) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -10695,6 +10724,12 @@ static bool is_callback_calling_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +{ + return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + insn->imm == special_kfunc_list[KF_bpf_throw]; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -11480,6 +11515,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { + if (!bpf_jit_supports_exceptions()) { + verbose(env, "JIT does not support calling kfunc %s#%d\n", + func_name, meta.func_id); + return -ENOTSUPP; + } + env->seen_exception = true; + } + for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); @@ -14525,7 +14569,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * gen_ld_abs() may terminate the program at runtime, leading to * reference leak. */ - err = check_reference_leak(env); + err = check_reference_leak(env, false); if (err) { verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); return err; @@ -16539,6 +16583,7 @@ static int do_check(struct bpf_verifier_env *env) int prev_insn_idx = -1; for (;;) { + bool exception_exit = false; struct bpf_insn *insn; u8 class; int err; @@ -16753,12 +16798,17 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } } - if (insn->src_reg == BPF_PSEUDO_CALL) + if (insn->src_reg == BPF_PSEUDO_CALL) { err = check_func_call(env, insn, &env->insn_idx); - else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { err = check_kfunc_call(env, insn, &env->insn_idx); - else + if (!err && is_bpf_throw_kfunc(insn)) { + exception_exit = true; + goto process_bpf_exit_full; + } + } else { err = check_helper_call(env, insn, &env->insn_idx); + } if (err) return err; @@ -16788,7 +16838,7 @@ static int do_check(struct bpf_verifier_env *env) verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } - +process_bpf_exit_full: if (env->cur_state->active_lock.ptr && !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_spin_unlock is missing\n"); @@ -16807,10 +16857,23 @@ static int do_check(struct bpf_verifier_env *env) * function, for which reference_state must * match caller reference state when it exits. */ - err = check_reference_leak(env); + err = check_reference_leak(env, exception_exit); if (err) return err; + /* The side effect of the prepare_func_exit + * which is being skipped is that it frees + * bpf_func_state. Typically, process_bpf_exit + * will only be hit with outermost exit. + * copy_verifier_state in pop_stack will handle + * freeing of any extra bpf_func_state left over + * from not processing all nested function + * exits. We also skip return code checks as + * they are not needed for exceptional exits. + */ + if (exception_exit) + goto process_bpf_exit; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -18113,6 +18176,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; + func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + if (!i) + func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -18201,6 +18267,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; prog->aux->real_func_cnt = env->subprog_cnt; + prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; + prog->aux->exception_boundary = func[0]->aux->exception_boundary; bpf_prog_jit_attempt_done(prog); return 0; out_free: @@ -18437,7 +18505,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } /* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ -static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) +static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) { struct bpf_subprog_info *info = env->subprog_info; int cnt = env->subprog_cnt; @@ -18481,6 +18549,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_map *map_ptr; int i, ret, cnt, delta = 0; + if (env->seen_exception && !env->exception_callback_subprog) { + struct bpf_insn patch[] = { + env->prog->insnsi[insn_cnt - 1], + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + + ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); + if (ret < 0) + return ret; + prog = env->prog; + insn = prog->insnsi; + + env->exception_callback_subprog = env->subprog_cnt - 1; + /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ + env->subprog_info[env->exception_callback_subprog].is_cb = true; + env->subprog_info[env->exception_callback_subprog].is_async_cb = true; + env->subprog_info[env->exception_callback_subprog].is_exception_cb = true; + } + for (i = 0; i < insn_cnt; i++, insn++) { /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 4494eaa9937e..333b54a86e3a 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -162,4 +162,20 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym; /* Convenience macro to wrap over bpf_obj_drop_impl */ #define bpf_percpu_obj_drop(kptr) bpf_percpu_obj_drop_impl(kptr, NULL) +/* Description + * Throw a BPF exception from the program, immediately terminating its + * execution and unwinding the stack. The supplied 'cookie' parameter + * will be the return value of the program when an exception is thrown. + * + * Note that throwing an exception with lingering resources (locks, + * references, etc.) will lead to a verification error. + * + * Note that callbacks *cannot* call this helper. + * Returns + * Never. + * Throws + * An exception with the specified 'cookie' value. + */ +extern void bpf_throw(u64 cookie) __ksym; + #endif -- cgit v1.2.3 From aaa619ebccb2b78b3c6d2c0cd72d206ee8fc0025 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:02 +0200 Subject: bpf: Refactor check_btf_func and split into two phases This patch splits the check_btf_info's check_btf_func check into two separate phases. The first phase sets up the BTF and prepares func_info, but does not perform any validation of required invariants for subprogs just yet. This is left to the second phase, which happens where check_btf_info executes currently, and performs the line_info and CO-RE relocation. The reason to perform this split is to obtain the userspace supplied func_info information before we perform the add_subprog call, where we would now require finding and adding subprogs that may not have a bpf_pseudo_call or bpf_pseudo_func instruction in the program. We require this as we want to enable userspace to supply exception callbacks that can override the default hidden subprogram generated by the verifier (which performs a hardcoded action). In such a case, the exception callback may never be referenced in an instruction, but will still be suitably annotated (by way of BTF declaration tags). For finding this exception callback, we would require the program's BTF information, and the supplied func_info information which maps BTF type IDs to subprograms. Since the exception callback won't actually be referenced through instructions, later checks in check_cfg and do_check_subprogs will not verify the subprog. This means that add_subprog needs to add them in the add_subprog_and_kfunc phase before we move forward, which is why the BTF and func_info are required at that point. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 128 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9baa6f187b38..ec767ae08c2b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15115,20 +15115,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env) #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 -static int check_btf_func(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +static int check_btf_func_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) { - const struct btf_type *type, *func_proto, *ret_type; - u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); + const struct btf_type *type, *func_proto; + u32 i, nfuncs, urec_size, min_size; struct bpf_func_info *krecord; - struct bpf_func_info_aux *info_aux = NULL; struct bpf_prog *prog; const struct btf *btf; - bpfptr_t urecord; u32 prev_offset = 0; - bool scalar_return; + bpfptr_t urecord; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; @@ -15138,11 +15136,6 @@ static int check_btf_func(struct bpf_verifier_env *env, return 0; } - if (nfuncs != env->subprog_cnt) { - verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); - return -EINVAL; - } - urec_size = attr->func_info_rec_size; if (urec_size < MIN_BPF_FUNCINFO_SIZE || urec_size > MAX_FUNCINFO_REC_SIZE || @@ -15160,9 +15153,6 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); if (!krecord) return -ENOMEM; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); - if (!info_aux) - goto err_free; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -15201,11 +15191,6 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; } - if (env->subprog_info[i].start != krecord[i].insn_off) { - verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - goto err_free; - } - /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); if (!type || !btf_type_is_func(type)) { @@ -15213,12 +15198,80 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord[i].type_id); goto err_free; } - info_aux[i].linkage = BTF_INFO_VLEN(type->info); func_proto = btf_type_by_id(btf, type->type); if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) /* btf_func_check() already verified it during BTF load */ goto err_free; + + prev_offset = krecord[i].insn_off; + bpfptr_add(&urecord, urec_size); + } + + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; + return 0; + +err_free: + kvfree(krecord); + return ret; +} + +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + const struct btf_type *type, *func_proto, *ret_type; + u32 i, nfuncs, urec_size, min_size; + u32 krec_size = sizeof(struct bpf_func_info); + struct bpf_func_info *krecord; + struct bpf_func_info_aux *info_aux = NULL; + struct bpf_prog *prog; + const struct btf *btf; + bpfptr_t urecord; + u32 prev_offset = 0; + bool scalar_return; + int ret = -ENOMEM; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + + prog = env->prog; + btf = prog->aux->btf; + + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); + min_size = min_t(u32, krec_size, urec_size); + + krecord = prog->aux->func_info; + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + if (!info_aux) + return -ENOMEM; + + for (i = 0; i < nfuncs; i++) { + /* check insn_off */ + ret = -EINVAL; + + if (env->subprog_info[i].start != krecord[i].insn_off) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + goto err_free; + } + + /* Already checked type_id */ + type = btf_type_by_id(btf, krecord[i].type_id); + info_aux[i].linkage = BTF_INFO_VLEN(type->info); + /* Already checked func_proto */ + func_proto = btf_type_by_id(btf, type->type); + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); @@ -15235,13 +15288,10 @@ static int check_btf_func(struct bpf_verifier_env *env, bpfptr_add(&urecord, urec_size); } - prog->aux->func_info = krecord; - prog->aux->func_info_cnt = nfuncs; prog->aux->func_info_aux = info_aux; return 0; err_free: - kvfree(krecord); kfree(info_aux); return ret; } @@ -15459,9 +15509,9 @@ static int check_core_relo(struct bpf_verifier_env *env, return err; } -static int check_btf_info(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +static int check_btf_info_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) { struct btf *btf; int err; @@ -15481,6 +15531,24 @@ static int check_btf_info(struct bpf_verifier_env *env, } env->prog->aux->btf = btf; + err = check_btf_func_early(env, attr, uattr); + if (err) + return err; + return 0; +} + +static int check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + err = check_btf_func(env, attr, uattr); if (err) return err; @@ -19990,6 +20058,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (!env->explored_states) goto skip_full_check; + ret = check_btf_info_early(env, attr, uattr); + if (ret < 0) + goto skip_full_check; + ret = add_subprog_and_kfunc(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From b9ae0c9dd0aca79bffc17be51c2dc148d1f72708 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:03 +0200 Subject: bpf: Add support for custom exception callbacks By default, the subprog generated by the verifier to handle a thrown exception hardcodes a return value of 0. To allow user-defined logic and modification of the return value when an exception is thrown, introduce the 'exception_callback:' declaration tag, which marks a callback as the default exception handler for the program. The format of the declaration tag is 'exception_callback:', where is the name of the exception callback. Each main program can be tagged using this BTF declaratiion tag to associate it with an exception callback. In case the tag is absent, the default callback is used. As such, the exception callback cannot be modified at runtime, only set during verification. Allowing modification of the callback for the current program execution at runtime leads to issues when the programs begin to nest, as any per-CPU state maintaing this information will have to be saved and restored. We don't want it to stay in bpf_prog_aux as this takes a global effect for all programs. An alternative solution is spilling the callback pointer at a known location on the program stack on entry, and then passing this location to bpf_throw as a parameter. However, since exceptions are geared more towards a use case where they are ideally never invoked, optimizing for this use case and adding to the complexity has diminishing returns. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- include/linux/bpf_verifier.h | 1 + kernel/bpf/btf.c | 29 +++++-- kernel/bpf/verifier.c | 113 +++++++++++++++++++++++-- tools/testing/selftests/bpf/bpf_experimental.h | 31 ++++++- 5 files changed, 160 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16740ee82082..30063a760b5a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2422,9 +2422,11 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg); + struct bpf_reg_state *reg, bool is_ex_cb); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); +const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key); struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index da21a3ec5027..94ec766432f5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -300,6 +300,7 @@ struct bpf_func_state { bool in_callback_fn; struct tnum callback_ret_range; bool in_async_callback_fn; + bool in_exception_callback_fn; /* The following fields should be last. See copy_func_state() */ int acquired_refs; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 187b57276fec..f93e835d90af 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3310,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, return BTF_FIELD_FOUND; } -static const char *btf_find_decl_tag_value(const struct btf *btf, - const struct btf_type *pt, - int comp_idx, const char *tag_key) +const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key) { + const char *value = NULL; int i; for (i = 1; i < btf_nr_types(btf); i++) { @@ -3327,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf, continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; - return __btf_name_by_offset(btf, t->name_off) + len; + /* Prevent duplicate entries for same type */ + if (value) + return ERR_PTR(-EEXIST); + value = __btf_name_by_offset(btf, t->name_off) + len; } - return NULL; + if (!value) + return ERR_PTR(-ENOENT); + return value; } static int @@ -3347,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, if (t->size != sz) return BTF_FIELD_IGNORE; value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); - if (!value_type) + if (IS_ERR(value_type)) return -EINVAL; node_field_name = strstr(value_type, ":"); if (!node_field_name) @@ -6954,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) + struct bpf_reg_state *regs, bool is_ex_cb) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -7011,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function returns int */ + /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -7060,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, i, btf_type_str(t), tname); return -EINVAL; } + /* We have already ensured that the callback returns an integer, just + * like all global subprogs. We need to determine it only has a single + * scalar argument. + */ + if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) { + bpf_log(log, "exception cb only supports single integer argument\n"); + return -EINVAL; + } return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ec767ae08c2b..ec3f22312516 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2457,6 +2457,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off) return env->subprog_cnt - 1; } +static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + struct btf *btf = aux->btf; + const struct btf_type *t; + u32 main_btf_id, id; + const char *name; + int ret, i; + + /* Non-zero func_info_cnt implies valid btf */ + if (!aux->func_info_cnt) + return 0; + main_btf_id = aux->func_info[0].type_id; + + t = btf_type_by_id(btf, main_btf_id); + if (!t) { + verbose(env, "invalid btf id for main subprog in func_info\n"); + return -EINVAL; + } + + name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:"); + if (IS_ERR(name)) { + ret = PTR_ERR(name); + /* If there is no tag present, there is no exception callback */ + if (ret == -ENOENT) + ret = 0; + else if (ret == -EEXIST) + verbose(env, "multiple exception callback tags for main subprog\n"); + return ret; + } + + ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC); + if (ret < 0) { + verbose(env, "exception callback '%s' could not be found in BTF\n", name); + return ret; + } + id = ret; + t = btf_type_by_id(btf, id); + if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) { + verbose(env, "exception callback '%s' must have global linkage\n", name); + return -EINVAL; + } + ret = 0; + for (i = 0; i < aux->func_info_cnt; i++) { + if (aux->func_info[i].type_id != id) + continue; + ret = aux->func_info[i].insn_off; + /* Further func_info and subprog checks will also happen + * later, so assume this is the right insn_off for now. + */ + if (!ret) { + verbose(env, "invalid exception callback insn_off in func_info: 0\n"); + ret = -EINVAL; + } + } + if (!ret) { + verbose(env, "exception callback type id not found in func_info\n"); + ret = -EINVAL; + } + return ret; +} + #define MAX_KFUNC_DESCS 256 #define MAX_KFUNC_BTFS 256 @@ -2796,8 +2858,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; + int i, ret, insn_cnt = env->prog->len, ex_cb_insn; struct bpf_insn *insn = env->prog->insnsi; - int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); @@ -2823,6 +2885,26 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return ret; } + ret = bpf_find_exception_callback_insn_off(env); + if (ret < 0) + return ret; + ex_cb_insn = ret; + + /* If ex_cb_insn > 0, this means that the main program has a subprog + * marked using BTF decl tag to serve as the exception callback. + */ + if (ex_cb_insn) { + ret = add_subprog(env, ex_cb_insn); + if (ret < 0) + return ret; + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].start != ex_cb_insn) + continue; + env->exception_callback_subprog = i; + break; + } + } + /* Add a fake 'exit' subprog which could simplify subprog iteration * logic. 'subprog_cnt' should not be increased. */ @@ -5707,6 +5789,10 @@ continue_func: /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; + if (subprog[sidx].is_exception_cb) { + verbose(env, "insn %d cannot call exception cb directly\n", i); + return -EINVAL; + } } i = next_insn; idx = sidx; @@ -5728,8 +5814,13 @@ continue_func: * tail call counter throughout bpf2bpf calls combined with tailcalls */ if (tail_call_reachable) - for (j = 0; j < frame; j++) + for (j = 0; j < frame; j++) { + if (subprog[ret_prog[j]].is_exception_cb) { + verbose(env, "cannot tail call within exception cb\n"); + return -EINVAL; + } subprog[ret_prog[j]].tail_call_reachable = true; + } if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; @@ -14630,7 +14721,7 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog) { + if (!is_subprog || frame->in_exception_callback_fn) { switch (prog_type) { case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -14678,7 +14769,7 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - if (is_subprog) { + if (is_subprog && !frame->in_exception_callback_fn) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", reg_type_str(env, reg->type)); @@ -19334,7 +19425,7 @@ static void free_states(struct bpf_verifier_env *env) } } -static int do_check_common(struct bpf_verifier_env *env, int subprog) +static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; @@ -19365,7 +19456,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - ret = btf_prepare_func_args(env, subprog, regs); + ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb); if (ret) goto out; for (i = BPF_REG_1; i <= BPF_REG_5; i++) { @@ -19381,6 +19472,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) regs[i].id = ++env->id_gen; } } + if (is_ex_cb) { + state->frame[0]->in_exception_callback_fn = true; + env->subprog_info[subprog].is_cb = true; + env->subprog_info[subprog].is_async_cb = true; + env->subprog_info[subprog].is_exception_cb = true; + } } else { /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; @@ -19445,7 +19542,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) continue; env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); - ret = do_check_common(env, i); + ret = do_check_common(env, i, env->exception_callback_subprog == i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -19462,7 +19559,7 @@ static int do_check_main(struct bpf_verifier_env *env) int ret; env->insn_idx = 0; - ret = do_check_common(env, 0); + ret = do_check_common(env, 0, false); if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 333b54a86e3a..9a87170524ce 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -165,7 +165,16 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym; /* Description * Throw a BPF exception from the program, immediately terminating its * execution and unwinding the stack. The supplied 'cookie' parameter - * will be the return value of the program when an exception is thrown. + * will be the return value of the program when an exception is thrown, + * and the default exception callback is used. Otherwise, if an exception + * callback is set using the '__exception_cb(callback)' declaration tag + * on the main program, the 'cookie' parameter will be the callback's only + * input argument. + * + * Thus, in case of default exception callback, 'cookie' is subjected to + * constraints on the program's return value (as with R0 on exit). + * Otherwise, the return value of the marked exception callback will be + * subjected to the same checks. * * Note that throwing an exception with lingering resources (locks, * references, etc.) will lead to a verification error. @@ -178,4 +187,24 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym; */ extern void bpf_throw(u64 cookie) __ksym; +/* This macro must be used to mark the exception callback corresponding to the + * main program. For example: + * + * int exception_cb(u64 cookie) { + * return cookie; + * } + * + * SEC("tc") + * __exception_cb(exception_cb) + * int main_prog(struct __sk_buff *ctx) { + * ... + * return TC_ACT_OK; + * } + * + * Here, exception callback for the main program will be 'exception_cb'. Note + * that this attribute can only be used once, and multiple exception callbacks + * specified for the main program will lead to verification error. + */ +#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name))) + #endif -- cgit v1.2.3 From b62bf8a5e9110922f58f6ea8fe747e1759f49e61 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:04 +0200 Subject: bpf: Perform CFG walk for exception callback Since exception callbacks are not referenced using bpf_pseudo_func and bpf_pseudo_call instructions, check_cfg traversal will never explore instructions of the exception callback. Even after adding the subprog, the program will then fail with a 'unreachable insn' error. We thus need to begin walking from the start of the exception callback again in check_cfg after a complete CFG traversal finishes, so as to explore the CFG rooted at the exception callback. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ec3f22312516..863e4e6c4616 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15126,8 +15126,8 @@ static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; int *insn_stack, *insn_state; - int ret = 0; - int i; + int ex_insn_beg, i, ret = 0; + bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -15143,6 +15143,7 @@ static int check_cfg(struct bpf_verifier_env *env) insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; +walk_cfg: while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; @@ -15169,6 +15170,16 @@ static int check_cfg(struct bpf_verifier_env *env) goto err_free; } + if (env->exception_callback_subprog && !ex_done) { + ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; + + insn_state[ex_insn_beg] = DISCOVERED; + insn_stack[0] = ex_insn_beg; + env->cfg.cur_stack = 1; + ex_done = true; + goto walk_cfg; + } + for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); -- cgit v1.2.3 From a923819fb2c5be029a69c0ca53239865c9bc05dd Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:05 +0200 Subject: bpf: Treat first argument as return value for bpf_throw In case of the default exception callback, change the behavior of bpf_throw, where the passed cookie value is no longer ignored, but is instead the return value of the default exception callback. As such, we need to place restrictions on the value being passed into bpf_throw in such a case, only allowing those permitted by the check_return_code function. Thus, bpf_throw can now control the return value of the program from each call site without having the user install a custom exception callback just to override the return value when an exception is thrown. We also modify the hidden subprog instructions to now move BPF_REG_1 to BPF_REG_0, so as to set the return value before exit in the default callback. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 863e4e6c4616..0ba32b626320 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11485,6 +11485,8 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +static int check_return_code(struct bpf_verifier_env *env, int regno); + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11613,6 +11615,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -ENOTSUPP; } env->seen_exception = true; + + /* In the case of the default callback, the cookie value passed + * to bpf_throw becomes the return value of the program. + */ + if (!env->exception_callback_subprog) { + err = check_return_code(env, BPF_REG_1); + if (err < 0) + return err; + } } for (i = 0; i < CALLER_SAVED_REGS; i++) @@ -14709,7 +14720,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static int check_return_code(struct bpf_verifier_env *env) +static int check_return_code(struct bpf_verifier_env *env, int regno) { struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; @@ -14743,22 +14754,22 @@ static int check_return_code(struct bpf_verifier_env *env) * of bpf_exit, which means that program wrote * something into it earlier */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); + err = check_reg_arg(env, regno, SRC_OP); if (err) return err; - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); + if (is_pointer_value(env, regno)) { + verbose(env, "R%d leaks addr as return value\n", regno); return -EACCES; } - reg = cur_regs(env) + BPF_REG_0; + reg = cur_regs(env) + regno; if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ if (reg->type != SCALAR_VALUE) { - verbose(env, "In async callback the register R0 is not a known value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "In async callback the register R%d is not a known value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } @@ -14771,8 +14782,8 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog && !frame->in_exception_callback_fn) { if (reg->type != SCALAR_VALUE) { - verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -14854,8 +14865,8 @@ static int check_return_code(struct bpf_verifier_env *env) } if (reg->type != SCALAR_VALUE) { - verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "At program exit the register R%d is not a known value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } @@ -17053,7 +17064,7 @@ process_bpf_exit_full: continue; } - err = check_return_code(env); + err = check_return_code(env, BPF_REG_0); if (err) return err; process_bpf_exit: @@ -18722,7 +18733,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (env->seen_exception && !env->exception_callback_subprog) { struct bpf_insn patch[] = { env->prog->insnsi[insn_cnt - 1], - BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }; -- cgit v1.2.3 From ec5290a178b787b2f8b21581fdadc919bd004e12 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:07 +0200 Subject: bpf: Prevent KASAN false positive with bpf_throw The KASAN stack instrumentation when CONFIG_KASAN_STACK is true poisons the stack of a function when it is entered and unpoisons it when leaving. However, in the case of bpf_throw, we will never return as we switch our stack frame to the BPF exception callback. Later, this discrepancy will lead to confusing KASAN splats when kernel resumes execution on return from the BPF program. Fix this by unpoisoning everything below the stack pointer of the BPF program, which should cover the range that would not be unpoisoned. An example splat is below: BUG: KASAN: stack-out-of-bounds in stack_trace_consume_entry+0x14e/0x170 Write of size 8 at addr ffffc900013af958 by task test_progs/227 CPU: 0 PID: 227 Comm: test_progs Not tainted 6.5.0-rc2-g43f1c6c9052a-dirty #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-2.fc39 04/01/2014 Call Trace: dump_stack_lvl+0x4a/0x80 print_report+0xcf/0x670 ? arch_stack_walk+0x79/0x100 kasan_report+0xda/0x110 ? stack_trace_consume_entry+0x14e/0x170 ? stack_trace_consume_entry+0x14e/0x170 ? __pfx_stack_trace_consume_entry+0x10/0x10 stack_trace_consume_entry+0x14e/0x170 ? __sys_bpf+0xf2e/0x41b0 arch_stack_walk+0x8b/0x100 ? __sys_bpf+0xf2e/0x41b0 ? bpf_prog_test_run_skb+0x341/0x1c70 ? bpf_prog_test_run_skb+0x341/0x1c70 stack_trace_save+0x9b/0xd0 ? __pfx_stack_trace_save+0x10/0x10 ? __kasan_slab_free+0x109/0x180 ? bpf_prog_test_run_skb+0x341/0x1c70 ? __sys_bpf+0xf2e/0x41b0 ? __x64_sys_bpf+0x78/0xc0 ? do_syscall_64+0x3c/0x90 ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8 kasan_save_stack+0x33/0x60 ? kasan_save_stack+0x33/0x60 ? kasan_set_track+0x25/0x30 ? kasan_save_free_info+0x2b/0x50 ? __kasan_slab_free+0x109/0x180 ? kmem_cache_free+0x191/0x460 ? bpf_prog_test_run_skb+0x341/0x1c70 kasan_set_track+0x25/0x30 kasan_save_free_info+0x2b/0x50 __kasan_slab_free+0x109/0x180 kmem_cache_free+0x191/0x460 bpf_prog_test_run_skb+0x341/0x1c70 ? __pfx_bpf_prog_test_run_skb+0x10/0x10 ? __fget_light+0x51/0x220 __sys_bpf+0xf2e/0x41b0 ? __might_fault+0xa2/0x170 ? __pfx___sys_bpf+0x10/0x10 ? lock_release+0x1de/0x620 ? __might_fault+0xcd/0x170 ? __pfx_lock_release+0x10/0x10 ? __pfx_blkcg_maybe_throttle_current+0x10/0x10 __x64_sys_bpf+0x78/0xc0 ? syscall_enter_from_user_mode+0x20/0x50 do_syscall_64+0x3c/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 RIP: 0033:0x7f0fbb38880d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f3 45 12 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe13907de8 EFLAGS: 00000206 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 00007ffe13908708 RCX: 00007f0fbb38880d RDX: 0000000000000050 RSI: 00007ffe13907e20 RDI: 000000000000000a RBP: 00007ffe13907e00 R08: 0000000000000000 R09: 00007ffe13907e20 R10: 0000000000000064 R11: 0000000000000206 R12: 0000000000000003 R13: 0000000000000000 R14: 00007f0fbb532000 R15: 0000000000cfbd90 The buggy address belongs to stack of task test_progs/227 KASAN internal error: frame info validation failed; invalid marker: 0 The buggy address belongs to the virtual mapping at [ffffc900013a8000, ffffc900013b1000) created by: kernel_clone+0xcd/0x600 The buggy address belongs to the physical page: page:00000000b70f4332 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11418f flags: 0x2fffe0000000000(node=0|zone=2|lastcpupid=0x7fff) page_type: 0xffffffff() raw: 02fffe0000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffffc900013af800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffc900013af880: 00 00 00 f1 f1 f1 f1 00 00 00 f3 f3 f3 f3 f3 00 >ffffc900013af900: 00 00 00 00 00 00 00 00 00 00 00 f1 00 00 00 00 ^ ffffc900013af980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffc900013afa00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Disabling lock debugging due to kernel taint Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Andrey Konovalov Link: https://lore.kernel.org/r/20230912233214.1518551-11-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 78e8f4de6750..2c8e1ee97b71 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -2483,6 +2484,11 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN_ON_ONCE(!ctx.aux->exception_boundary); WARN_ON_ONCE(!ctx.bp); WARN_ON_ONCE(!ctx.cnt); + /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning + * deeper stack depths than ctx.sp as we do not return from bpf_throw, + * which skips compiler generated instrumentation to do the same. + */ + kasan_unpoison_task_stack_below((void *)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); } -- cgit v1.2.3 From 66d9111f3517f85ef2af0337ece02683ce0faf21 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:08 +0200 Subject: bpf: Detect IP == ksym.end as part of BPF program Now that bpf_throw kfunc is the first such call instruction that has noreturn semantics within the verifier, this also kicks in dead code elimination in unprecedented ways. For one, any instruction following a bpf_throw call will never be marked as seen. Moreover, if a callchain ends up throwing, any instructions after the call instruction to the eventually throwing subprog in callers will also never be marked as seen. The tempting way to fix this would be to emit extra 'int3' instructions which bump the jited_len of a program, and ensure that during runtime when a program throws, we can discover its boundaries even if the call instruction to bpf_throw (or to subprogs that always throw) is emitted as the final instruction in the program. An example of such a program would be this: do_something(): ... r0 = 0 exit foo(): r1 = 0 call bpf_throw r0 = 0 exit bar(cond): if r1 != 0 goto pc+2 call do_something exit call foo r0 = 0 // Never seen by verifier exit // main(ctx): r1 = ... call bar r0 = 0 exit Here, if we do end up throwing, the stacktrace would be the following: bpf_throw foo bar main In bar, the final instruction emitted will be the call to foo, as such, the return address will be the subsequent instruction (which the JIT emits as int3 on x86). This will end up lying outside the jited_len of the program, thus, when unwinding, we will fail to discover the return address as belonging to any program and end up in a panic due to the unreliable stack unwinding of BPF programs that we never expect. To remedy this case, make bpf_prog_ksym_find treat IP == ksym.end as part of the BPF program, so that is_bpf_text_address returns true when such a case occurs, and we are able to unwind reliably when the final instruction ends up being a call instruction. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-12-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7849b9cca749..8f921b6d6981 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -623,7 +623,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) if (val < ksym->start) return -1; - if (val >= ksym->end) + /* Ensure that we detect return addresses as part of the program, when + * the final instruction is a call for a program part of the stack + * trace. Therefore, do val > ksym->end instead of val >= ksym->end. + */ + if (val > ksym->end) return 1; return 0; -- cgit v1.2.3 From fd548e1a46185000191a89cae4be560e076ed6c7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:09 +0200 Subject: bpf: Disallow fentry/fexit/freplace for exception callbacks During testing, it was discovered that extensions to exception callbacks had no checks, upon running a testcase, the kernel ended up running off the end of a program having final call as bpf_throw, and hitting int3 instructions. The reason is that while the default exception callback would have reset the stack frame to return back to the main program's caller, the replacing extension program will simply return back to bpf_throw, which will instead return back to the program and the program will continue execution, now in an undefined state where anything could happen. The way to support extensions to an exception callback would be to mark the BPF_PROG_TYPE_EXT main subprog as an exception_cb, and prevent it from calling bpf_throw. This would make the JIT produce a prologue that restores saved registers and reset the stack frame. But let's not do that until there is a concrete use case for this, and simply disallow this for now. Similar issues will exist for fentry and fexit cases, where trampoline saves data on the stack when invoking exception callback, which however will then end up resetting the stack frame, and on return, the fexit program will never will invoked as the return address points to the main program's caller in the kernel. Instead of additional complexity and back and forth between the two stacks to enable such a use case, simply forbid it. One key point here to note is that currently X86_TAIL_CALL_OFFSET didn't require any modifications, even though we emit instructions before the corresponding endbr64 instruction. This is because we ensure that a main subprog never serves as an exception callback, and therefore the exception callback (which will be a global subprog) can never serve as the tail call target, eliminating any discrepancies. However, once we support a BPF_PROG_TYPE_EXT to also act as an exception callback, it will end up requiring change to the tail call offset to account for the extra instructions. For simplicitly, tail calls could be disabled for such targets. Noting the above, it appears better to wait for a concrete use case before choosing to permit extension programs to replace exception callbacks. As a precaution, we disable fentry and fexit for exception callbacks as well. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-13-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + kernel/bpf/verifier.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2c8e1ee97b71..7ff2a42f1996 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2490,6 +2490,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) */ kasan_unpoison_task_stack_below((void *)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); + WARN(1, "A call to BPF exception callback should never return\n"); } __diag_pop(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0ba32b626320..5ccb50fd74e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19750,6 +19750,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } + if (aux->func && aux->func[subprog]->aux->exception_cb) { + bpf_log(log, + "%s programs cannot attach to exception callback\n", + prog_extension ? "Extension" : "FENTRY/FEXIT"); + return -EINVAL; + } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { -- cgit v1.2.3 From 06d686f771ddc27a8554cd8f5b22e071040dc90e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 13 Sep 2023 01:32:10 +0200 Subject: bpf: Fix kfunc callback register type handling The kfunc code to handle KF_ARG_PTR_TO_CALLBACK does not check the reg type before using reg->subprogno. This can accidently permit invalid pointers from being passed into callback helpers (e.g. silently from different paths). Likewise, reg->subprogno from the per-register type union may not be meaningful either. We need to reject any other type except PTR_TO_FUNC. Acked-by: Dave Marchevsky Fixes: 5d92ddc3de1b ("bpf: Add callback validation to kfunc verifier logic") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230912233214.1518551-14-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ccb50fd74e5..a7178ecf676d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11407,6 +11407,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } case KF_ARG_PTR_TO_CALLBACK: + if (reg->type != PTR_TO_FUNC) { + verbose(env, "arg%d expected pointer to func\n", i); + return -EINVAL; + } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: -- cgit v1.2.3