From 23da464dd6b8935b66f4ee306ad8947fd32ccd75 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:51 +0530 Subject: bpf: Allow specifying volatile type modifier for kptrs This is useful in particular to mark the pointer as volatile, so that compiler treats each load and store to the field as a volatile access. The alternative is having to define and use READ_ONCE and WRITE_ONCE in the BPF program. Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20221103191013.1236066-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..f4d21eef6ebd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3225,6 +3225,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, enum bpf_kptr_type type; u32 res_id; + /* Permit modifiers on the pointer itself */ + if (btf_type_is_volatile(t)) + t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; -- cgit v1.2.3 From 261f4664caffdeb9dff4e83ee3c0334b1c3a552f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:52 +0530 Subject: bpf: Clobber stack slot when writing over spilled PTR_TO_BTF_ID When support was added for spilled PTR_TO_BTF_ID to be accessed by helper memory access, the stack slot was not overwritten to STACK_MISC (and that too is only safe when env->allow_ptr_leaks is true). This means that helpers who take ARG_PTR_TO_MEM and write to it may essentially overwrite the value while the verifier continues to track the slot for spilled register. This can cause issues when PTR_TO_BTF_ID is spilled to stack, and then overwritten by helper write access, which can then be passed to BPF helpers or kfuncs. Handle this by falling back to the case introduced in a later commit, which will also handle PTR_TO_BTF_ID along with other pointer types, i.e. cd17d38f8b28 ("bpf: Permits pointers on stack for helper calls"). Finally, include a comment on why REG_LIVE_WRITTEN is not being set when clobber is set to true. In short, the reason is that while when clobber is unset, we know that we won't be writing, when it is true, we *may* write to any of the stack slots in that range. It may be a partial or complete write, to just one or many stack slots. We cannot be sure, hence to be conservative, we leave things as is and never set REG_LIVE_WRITTEN for any stack slot. However, clobber still needs to reset them to STACK_MISC assuming writes happened. However read marks still need to be propagated upwards from liveness point of view, as parent stack slot's contents may still continue to matter to child states. Cc: Yonghong Song Fixes: 1d68f22b3d53 ("bpf: Handle spilled PTR_TO_BTF_ID properly when checking stack_boundary") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82c07fe0bfb1..7bf12c492201 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5154,10 +5154,6 @@ static int check_stack_range_initialized( goto mark; } - if (is_spilled_reg(&state->stack[spi]) && - base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) - goto mark; - if (is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { @@ -5188,6 +5184,11 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + * be sure that whether stack slot is written to or not. Hence, + * we must still conservatively propagate reads upwards even if + * helper may write to the entire memory range. + */ } return update_stack_depth(env, state, min_off); } -- cgit v1.2.3 From f5e477a861e4a20d8a1c5f7a245f3a3c3c376b03 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:53 +0530 Subject: bpf: Fix slot type check in check_stack_write_var_off For the case where allow_ptr_leaks is false, code is checking whether slot type is STACK_INVALID and STACK_SPILL and rejecting other cases. This is a consequence of incorrectly checking for register type instead of the slot type (NOT_INIT and SCALAR_VALUE respectively). Fix the check. Fixes: 01f810ace9ed ("bpf: Allow variable-offset stack access") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7bf12c492201..eb111a8034e7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3181,14 +3181,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; mark_stack_slot_scratched(env, spi); - if (!env->allow_ptr_leaks - && *stype != NOT_INIT - && *stype != SCALAR_VALUE) { - /* Reject the write if there's are spilled pointers in - * range. If we didn't reject here, the ptr status - * would be erased below (even though not all slots are - * actually overwritten), possibly opening the door to - * leaks. + if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { + /* Reject the write if range we may write to has not + * been initialized beforehand. If we didn't reject + * here, the ptr status would be erased below (even + * though not all slots are actually overwritten), + * possibly opening the door to leaks. + * + * We do however catch STACK_INVALID case below, and + * only allow reading possibly uninitialized memory + * later for CAP_PERFMON, as the write may not happen to + * that slot. */ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", insn_idx, i); -- cgit v1.2.3 From a28ace782e687424d7aa2c29a4516f54d5561a14 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:54 +0530 Subject: bpf: Drop reg_type_may_be_refcounted_or_null It is not scalable to maintain a list of types that can have non-zero ref_obj_id. It is never set for scalars anyway, so just remove the conditional on register types and print it whenever it is non-zero. Acked-by: Dave Marchevsky Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20221103191013.1236066-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb111a8034e7..14d350a25d5d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -457,13 +457,6 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) map_value_has_spin_lock(reg->map_ptr); } -static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) -{ - type = base_type(type); - return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || - type == PTR_TO_MEM || type == PTR_TO_BTF_ID; -} - static bool type_is_rdonly_mem(u32 type) { return type & MEM_RDONLY; @@ -875,7 +868,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); -- cgit v1.2.3 From aa3496accc412b3d975e4ee5d06076d73394d8b5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:55 +0530 Subject: bpf: Refactor kptr_off_tab into btf_record To prepare the BPF verifier to handle special fields in both map values and program allocated types coming from program BTF, we need to refactor the kptr_off_tab handling code into something more generic and reusable across both cases to avoid code duplication. Later patches also require passing this data to helpers at runtime, so that they can work on user defined types, initialize them, destruct them, etc. The main observation is that both map values and such allocated types point to a type in program BTF, hence they can be handled similarly. We can prepare a field metadata table for both cases and store them in struct bpf_map or struct btf depending on the use case. Hence, refactor the code into generic btf_record and btf_field member structs. The btf_record represents the fields of a specific btf_type in user BTF. The cnt indicates the number of special fields we successfully recognized, and field_mask is a bitmask of fields that were found, to enable quick determination of availability of a certain field. Subsequently, refactor the rest of the code to work with these generic types, remove assumptions about kptr and kptr_off_tab, rename variables to more meaningful names, etc. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 125 +++++++++++++++-------- include/linux/btf.h | 3 +- kernel/bpf/arraymap.c | 13 ++- kernel/bpf/btf.c | 67 ++++++------ kernel/bpf/hashtab.c | 14 ++- kernel/bpf/map_in_map.c | 14 ++- kernel/bpf/syscall.c | 263 +++++++++++++++++++++++++++--------------------- kernel/bpf/verifier.c | 96 +++++++++--------- 8 files changed, 332 insertions(+), 263 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8d948bfcb984..5f2a42033a37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,35 +165,41 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BPF map value */ - BPF_MAP_VALUE_OFF_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + /* Support at most 8 pointers in a BTF type */ + BTF_FIELDS_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + 1 + /* for bpf_spin_lock */ 1, /* for bpf_timer */ }; -enum bpf_kptr_type { - BPF_KPTR_UNREF, - BPF_KPTR_REF, +enum btf_field_type { + BPF_KPTR_UNREF = (1 << 2), + BPF_KPTR_REF = (1 << 3), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, }; -struct bpf_map_value_off_desc { +struct btf_field_kptr { + struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; + u32 btf_id; +}; + +struct btf_field { u32 offset; - enum bpf_kptr_type type; - struct { - struct btf *btf; - struct module *module; - btf_dtor_kfunc_t dtor; - u32 btf_id; - } kptr; + enum btf_field_type type; + union { + struct btf_field_kptr kptr; + }; }; -struct bpf_map_value_off { - u32 nr_off; - struct bpf_map_value_off_desc off[]; +struct btf_record { + u32 cnt; + u32 field_mask; + struct btf_field fields[]; }; -struct bpf_map_off_arr { +struct btf_field_offs { u32 cnt; u32 field_off[BPF_MAP_OFF_ARR_MAX]; u8 field_sz[BPF_MAP_OFF_ARR_MAX]; @@ -215,7 +221,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ - struct bpf_map_value_off *kptr_off_tab; + struct btf_record *record; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -227,7 +233,7 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct bpf_map_off_arr *off_arr; + struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -251,6 +257,37 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline u32 btf_field_type_size(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return sizeof(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline u32 btf_field_type_align(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return __alignof__(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) +{ + if (IS_ERR_OR_NULL(rec)) + return false; + return rec->field_mask & type; +} + static inline bool map_value_has_spin_lock(const struct bpf_map *map) { return map->spin_lock_off >= 0; @@ -261,23 +298,19 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } -static inline bool map_value_has_kptrs(const struct bpf_map *map) -{ - return !IS_ERR_OR_NULL(map->kptr_off_tab); -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); - if (unlikely(map_value_has_kptrs(map))) { - struct bpf_map_value_off *tab = map->kptr_off_tab; + if (!IS_ERR_OR_NULL(map->record)) { + struct btf_field *fields = map->record->fields; + u32 cnt = map->record->cnt; int i; - for (i = 0; i < tab->nr_off; i++) - *(u64 *)(dst + tab->off[i].offset) = 0; + for (i = 0; i < cnt; i++) + memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); } } @@ -303,7 +336,7 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); else @@ -311,11 +344,12 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memcpy(dst + curr_off, src + curr_off, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memcpy(dst + curr_off, src + curr_off, sz); + curr_off += map->field_offs->field_sz[i]; } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } @@ -335,16 +369,17 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { memset(dst, 0, map->value_size); return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memset(dst + curr_off, 0, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memset(dst + curr_off, 0, sz); + curr_off += map->field_offs->field_sz[i]; } memset(dst + curr_off, 0, map->value_size - curr_off); } @@ -1699,11 +1734,13 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); -void bpf_map_free_kptr_off_tab(struct bpf_map *map); -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); +struct btf_field *btf_record_find(const struct btf_record *rec, + u32 offset, enum btf_field_type type); +void btf_record_free(struct btf_record *rec); +void bpf_map_free_record(struct bpf_map *map); +struct btf_record *btf_record_dup(const struct btf_record *rec); +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 86aad9b2ce02..9e62717cdc7a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,8 +163,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..417f84342e98 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -310,8 +310,7 @@ static void check_and_free_fields(struct bpf_array *arr, void *val) { if (map_value_has_timer(&arr->map)) bpf_timer_cancel_and_free(val + arr->map.timer_off); - if (map_value_has_kptrs(&arr->map)) - bpf_map_free_kptrs(&arr->map, val); + bpf_obj_free_fields(arr->map.record, val); } /* Called from syscall or from eBPF program */ @@ -409,7 +408,7 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free kptr on uref dropping to zero. */ + /* We don't reset or free fields other than timer on uref dropping to zero. */ if (!map_value_has_timer(map)) return; @@ -423,22 +422,22 @@ static void array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (map_value_has_kptrs(map)) { + if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { - bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f4d21eef6ebd..8391a77138ee 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3191,7 +3191,7 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_type { +enum btf_field_info_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, BTF_FIELD_KPTR, @@ -3203,9 +3203,9 @@ enum { }; struct btf_field_info { - u32 type_id; + enum btf_field_type type; u32 off; - enum bpf_kptr_type type; + u32 type_id; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, @@ -3222,7 +3222,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { - enum bpf_kptr_type type; + enum btf_field_type type; u32 res_id; /* Permit modifiers on the pointer itself */ @@ -3259,7 +3259,7 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, const char *name, int sz, int align, - enum btf_field_type field_type, + enum btf_field_info_type field_type, struct btf_field_info *info, int info_cnt) { const struct btf_member *member; @@ -3311,7 +3311,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, const char *name, int sz, int align, - enum btf_field_type field_type, + enum btf_field_info_type field_type, struct btf_field_info *info, int info_cnt) { const struct btf_var_secinfo *vsi; @@ -3360,7 +3360,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_type field_type, + enum btf_field_info_type field_type, struct btf_field_info *info, int info_cnt) { const char *name; @@ -3423,14 +3423,13 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t) return info.off; } -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t) { - struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; - struct bpf_map_value_off *tab; + struct btf_field_info info_arr[BTF_FIELDS_MAX]; struct btf *kernel_btf = NULL; struct module *mod = NULL; - int ret, i, nr_off; + struct btf_record *rec; + int ret, i, cnt; ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) @@ -3438,12 +3437,12 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, if (!ret) return NULL; - nr_off = ret; - tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); - if (!tab) + cnt = ret; + rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + if (!rec) return ERR_PTR(-ENOMEM); - - for (i = 0; i < nr_off; i++) { + rec->cnt = 0; + for (i = 0; i < cnt; i++) { const struct btf_type *t; s32 id; @@ -3500,28 +3499,24 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, ret = -EINVAL; goto end_mod; } - tab->off[i].kptr.dtor = (void *)addr; + rec->fields[i].kptr.dtor = (void *)addr; } - tab->off[i].offset = info_arr[i].off; - tab->off[i].type = info_arr[i].type; - tab->off[i].kptr.btf_id = id; - tab->off[i].kptr.btf = kernel_btf; - tab->off[i].kptr.module = mod; + rec->field_mask |= info_arr[i].type; + rec->fields[i].offset = info_arr[i].off; + rec->fields[i].type = info_arr[i].type; + rec->fields[i].kptr.btf_id = id; + rec->fields[i].kptr.btf = kernel_btf; + rec->fields[i].kptr.module = mod; + rec->cnt++; } - tab->nr_off = nr_off; - return tab; + return rec; end_mod: module_put(mod); end_btf: btf_put(kernel_btf); end: - while (i--) { - btf_put(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - } - kfree(tab); + btf_record_free(rec); return ERR_PTR(ret); } @@ -6370,7 +6365,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* kptr_get is only true for kfunc */ if (i == 0 && kptr_get) { - struct bpf_map_value_off_desc *off_desc; + struct btf_field *kptr_field; if (reg->type != PTR_TO_MAP_VALUE) { bpf_log(log, "arg#0 expected pointer to map value\n"); @@ -6386,8 +6381,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); - if (!off_desc || off_desc->type != BPF_KPTR_REF) { + kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); + if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", reg->off + reg->var_off.value); return -EINVAL; @@ -6406,8 +6401,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; } - if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, - off_desc->kptr.btf_id, true)) { + if (!btf_struct_ids_match(log, btf, ref_id, 0, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, true)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f39ee3e05589..c5ea8f9bb7a9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -238,21 +238,20 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } -static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_kptrs(&htab->map)) + if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); - for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -766,8 +765,7 @@ static void check_and_free_fields(struct bpf_htab *htab, if (map_value_has_timer(&htab->map)) bpf_timer_cancel_and_free(map_value + htab->map.timer_off); - if (map_value_has_kptrs(&htab->map)) - bpf_map_free_kptrs(&htab->map, map_value); + bpf_obj_free_fields(htab->map.record, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -1517,11 +1515,11 @@ static void htab_map_free(struct bpf_map *map) if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { - htab_free_prealloced_kptrs(htab); + htab_free_prealloced_fields(htab); prealloc_destroy(htab); } - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 135205d0d560..d6c662183f88 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,7 +52,15 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; - inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); + inner_map_meta->record = btf_record_dup(inner_map->record); + if (IS_ERR(inner_map_meta->record)) { + /* btf_record_dup returns NULL or valid pointer in case of + * invalid/empty/valid, but ERR_PTR in case of errors. During + * equality NULL or IS_ERR is equivalent. + */ + fdput(f); + return ERR_CAST(inner_map_meta->record); + } if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -72,7 +80,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { - bpf_map_free_kptr_off_tab(map_meta); + bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -86,7 +94,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && - bpf_map_equal_kptr_off_tab(meta0, meta1); + btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5887592eeb93..b80c0e2eb73f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -495,114 +495,134 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -static int bpf_map_kptr_off_cmp(const void *a, const void *b) +static int btf_field_cmp(const void *a, const void *b) { - const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + const struct btf_field *f1 = a, *f2 = b; - if (off_desc1->offset < off_desc2->offset) + if (f1->offset < f2->offset) return -1; - else if (off_desc1->offset > off_desc2->offset) + else if (f1->offset > f2->offset) return 1; return 0; } -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, + enum btf_field_type type) { - /* Since members are iterated in btf_find_field in increasing order, - * offsets appended to kptr_off_tab are in increasing order, so we can - * do bsearch to find exact match. - */ - struct bpf_map_value_off *tab; + struct btf_field *field; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + return NULL; + field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); + if (!field || !(field->type & type)) return NULL; - tab = map->kptr_off_tab; - return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); + return field; } -void bpf_map_free_kptr_off_tab(struct bpf_map *map) +void btf_record_free(struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab; int i; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < tab->nr_off; i++) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + for (i = 0; i < rec->cnt; i++) { + switch (rec->fields[i].type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (rec->fields[i].kptr.module) + module_put(rec->fields[i].kptr.module); + btf_put(rec->fields[i].kptr.btf); + break; + default: + WARN_ON_ONCE(1); + continue; + } } - kfree(tab); - map->kptr_off_tab = NULL; + kfree(rec); +} + +void bpf_map_free_record(struct bpf_map *map) +{ + btf_record_free(map->record); + map->record = NULL; } -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +struct btf_record *btf_record_dup(const struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; - int size, i; + const struct btf_field *fields; + struct btf_record *new_rec; + int ret, size, i; - if (!map_value_has_kptrs(map)) - return ERR_PTR(-ENOENT); - size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); - new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); - if (!new_tab) + if (IS_ERR_OR_NULL(rec)) + return NULL; + size = offsetof(struct btf_record, fields[rec->cnt]); + new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_rec) return ERR_PTR(-ENOMEM); - /* Do a deep copy of the kptr_off_tab */ - for (i = 0; i < tab->nr_off; i++) { - btf_get(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { - while (i--) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + /* Do a deep copy of the btf_record */ + fields = rec->fields; + new_rec->cnt = 0; + for (i = 0; i < rec->cnt; i++) { + switch (fields[i].type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + btf_get(fields[i].kptr.btf); + if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { + ret = -ENXIO; + goto free; } - kfree(new_tab); - return ERR_PTR(-ENXIO); + break; + default: + ret = -EFAULT; + WARN_ON_ONCE(1); + goto free; } + new_rec->cnt++; } - return new_tab; + return new_rec; +free: + btf_record_free(new_rec); + return ERR_PTR(ret); } -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { - struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; - bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; - if (!a_has_kptr && !b_has_kptr) + if (!a_has_fields && !b_has_fields) return true; - if (a_has_kptr != b_has_kptr) + if (a_has_fields != b_has_fields) return false; - if (tab_a->nr_off != tab_b->nr_off) + if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); - return !memcmp(tab_a, tab_b, size); + size = offsetof(struct btf_record, fields[rec_a->cnt]); + return !memcmp(rec_a, rec_b, size); } -/* Caller must ensure map_value_has_kptrs is true. Note that this function can - * be called on a map value while the map_value is visible to BPF programs, as - * it ensures the correct synchronization, and we already enforce the same using - * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. - */ -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - unsigned long *btf_id_ptr; + const struct btf_field *fields; int i; - for (i = 0; i < tab->nr_off; i++) { - struct bpf_map_value_off_desc *off_desc = &tab->off[i]; - unsigned long old_ptr; - - btf_id_ptr = map_value + off_desc->offset; - if (off_desc->type == BPF_KPTR_UNREF) { - u64 *p = (u64 *)btf_id_ptr; - - WRITE_ONCE(*p, 0); + if (IS_ERR_OR_NULL(rec)) + return; + fields = rec->fields; + for (i = 0; i < rec->cnt; i++) { + const struct btf_field *field = &fields[i]; + void *field_ptr = obj + field->offset; + + switch (fields[i].type) { + case BPF_KPTR_UNREF: + WRITE_ONCE(*(u64 *)field_ptr, 0); + break; + case BPF_KPTR_REF: + field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + break; + default: + WARN_ON_ONCE(1); continue; } - old_ptr = xchg(btf_id_ptr, 0); - off_desc->kptr.dtor((void *)old_ptr); } } @@ -612,10 +632,10 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); - kfree(map->off_arr); + kfree(map->field_offs); bpf_map_release_memcg(map); /* implementation dependent freeing, map_free callback also does - * bpf_map_free_kptr_off_tab, if needed. + * bpf_map_free_record, if needed. */ map->ops->map_free(map); } @@ -779,7 +799,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) int err; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || map_value_has_kptrs(map)) + map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -906,7 +926,7 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) +static int map_field_offs_cmp(const void *_a, const void *_b, const void *priv) { const u32 a = *(const u32 *)_a; const u32 b = *(const u32 *)_b; @@ -918,15 +938,15 @@ static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) return 0; } -static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) +static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) { struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->off_arr->field_off; + u32 *off_base = map->field_offs->field_off; u32 *a = _a, *b = _b; u8 *sz_a, *sz_b; - sz_a = map->off_arr->field_sz + (a - off_base); - sz_b = map->off_arr->field_sz + (b - off_base); + sz_a = map->field_offs->field_sz + (a - off_base); + sz_b = map->field_offs->field_sz + (b - off_base); swap(*a, *b); swap(*sz_a, *sz_b); @@ -936,51 +956,51 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map) { bool has_spin_lock = map_value_has_spin_lock(map); bool has_timer = map_value_has_timer(map); - bool has_kptrs = map_value_has_kptrs(map); - struct bpf_map_off_arr *off_arr; + bool has_fields = !IS_ERR_OR_NULL(map->record); + struct btf_field_offs *fo; u32 i; - if (!has_spin_lock && !has_timer && !has_kptrs) { - map->off_arr = NULL; + if (!has_spin_lock && !has_timer && !has_fields) { + map->field_offs = NULL; return 0; } - off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); - if (!off_arr) + fo = kmalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); + if (!fo) return -ENOMEM; - map->off_arr = off_arr; + map->field_offs = fo; - off_arr->cnt = 0; + fo->cnt = 0; if (has_spin_lock) { - i = off_arr->cnt; + i = fo->cnt; - off_arr->field_off[i] = map->spin_lock_off; - off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); - off_arr->cnt++; + fo->field_off[i] = map->spin_lock_off; + fo->field_sz[i] = sizeof(struct bpf_spin_lock); + fo->cnt++; } if (has_timer) { - i = off_arr->cnt; + i = fo->cnt; - off_arr->field_off[i] = map->timer_off; - off_arr->field_sz[i] = sizeof(struct bpf_timer); - off_arr->cnt++; + fo->field_off[i] = map->timer_off; + fo->field_sz[i] = sizeof(struct bpf_timer); + fo->cnt++; } - if (has_kptrs) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - u32 *off = &off_arr->field_off[off_arr->cnt]; - u8 *sz = &off_arr->field_sz[off_arr->cnt]; + if (has_fields) { + struct btf_record *rec = map->record; + u32 *off = &fo->field_off[fo->cnt]; + u8 *sz = &fo->field_sz[fo->cnt]; - for (i = 0; i < tab->nr_off; i++) { - *off++ = tab->off[i].offset; - *sz++ = sizeof(u64); + for (i = 0; i < rec->cnt; i++) { + *off++ = rec->fields[i].offset; + *sz++ = btf_field_type_size(rec->fields[i].type); } - off_arr->cnt += tab->nr_off; + fo->cnt += rec->cnt; } - if (off_arr->cnt == 1) + if (fo->cnt == 1) return 0; - sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), - map_off_arr_cmp, map_off_arr_swap, map); + sort_r(fo->field_off, fo->cnt, sizeof(fo->field_off[0]), + map_field_offs_cmp, map_field_offs_swap, map); return 0; } @@ -1038,8 +1058,10 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } - map->kptr_off_tab = btf_parse_kptrs(btf, value_type); - if (map_value_has_kptrs(map)) { + map->record = btf_parse_fields(btf, value_type); + if (!IS_ERR_OR_NULL(map->record)) { + int i; + if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; @@ -1048,12 +1070,25 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ret = -EACCES; goto free_map_tab; } - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { - ret = -EOPNOTSUPP; - goto free_map_tab; + for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { + switch (map->record->field_mask & (1 << i)) { + case 0: + continue; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + default: + /* Fail if map_type checks are missing for a field type */ + ret = -EOPNOTSUPP; + goto free_map_tab; + } } } @@ -1065,7 +1100,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; free_map_tab: - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); return ret; } @@ -1186,7 +1221,7 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); free_map_off_arr: - kfree(map->off_arr); + kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1883,7 +1918,7 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || map_value_has_kptrs(map)) { + map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14d350a25d5d..5ce5364ce898 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -262,7 +262,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; - struct bpf_map_value_off_desc *kptr_off_desc; + struct btf_field *kptr_field; u8 uninit_dynptr_regno; }; @@ -3674,15 +3674,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, } static int map_kptr_match_type(struct bpf_verifier_env *env, - struct bpf_map_value_off_desc *off_desc, + struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags = PTR_MAYBE_NULL; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -3729,15 +3729,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * strict mode to true for type match. */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - off_desc->kptr.btf, off_desc->kptr.btf_id, - off_desc->type == BPF_KPTR_REF)) + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + kptr_field->type == BPF_KPTR_REF)) goto bad_type; return 0; bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), targ_name); else @@ -3747,7 +3747,7 @@ bad_type: static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, - struct bpf_map_value_off_desc *off_desc) + struct btf_field *kptr_field) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); @@ -3757,7 +3757,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * - Reject cases where variable offset may touch kptr * - size of access (must be BPF_DW) * - tnum_is_const(reg->var_off) - * - off_desc->offset == off + reg->var_off.value + * - kptr_field->offset == off + reg->var_off.value */ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ if (BPF_MODE(insn->code) != BPF_MEM) { @@ -3768,7 +3768,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -3778,19 +3778,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, - off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && - map_kptr_match_type(env, off_desc, val_reg, value_regno)) + map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { if (insn->imm) { verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", - off_desc->offset); + kptr_field->offset); return -EACCES; } } else { @@ -3809,7 +3809,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; - int err; + struct btf_record *rec; + int err, i; err = check_mem_region_access(env, regno, off, size, map->value_size, zero_size_allowed); @@ -3839,15 +3840,18 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } - if (map_value_has_kptrs(map)) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - int i; - - for (i = 0; i < tab->nr_off; i++) { - u32 p = tab->off[i].offset; - - if (reg->smin_value + off < p + sizeof(u64) && - p < reg->umax_value + off + size) { + if (IS_ERR_OR_NULL(map->record)) + return 0; + rec = map->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 p = field->offset; + + if (reg->smin_value + off < p + btf_field_type_size(field->type) && + p < reg->umax_value + off + size) { + switch (field->type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -3866,10 +3870,13 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } break; + default: + verbose(env, "field cannot be accessed directly by load/store\n"); + return -EACCES; } } } - return err; + return 0; } #define MAX_PACKET_OFF 0xffff @@ -4742,7 +4749,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { - struct bpf_map_value_off_desc *kptr_off_desc = NULL; + struct btf_field *kptr_field = NULL; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -4756,11 +4763,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; if (tnum_is_const(reg->var_off)) - kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, - off + reg->var_off.value); - if (kptr_off_desc) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, - kptr_off_desc); + kptr_field = btf_record_find(reg->map_ptr->record, + off + reg->var_off.value, BPF_KPTR); + if (kptr_field) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -5527,10 +5533,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map_value_off_desc *off_desc; struct bpf_map *map_ptr = reg->map_ptr; + struct btf_field *kptr_field; u32 kptr_off; - int ret; if (!tnum_is_const(reg->var_off)) { verbose(env, @@ -5543,30 +5548,23 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, map_ptr->name); return -EINVAL; } - if (!map_value_has_kptrs(map_ptr)) { - ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); - if (ret == -E2BIG) - verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, - BPF_MAP_VALUE_OFF_MAX); - else if (ret == -EEXIST) - verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); - else - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); return -EINVAL; } meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); - if (!off_desc) { + kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (off_desc->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } - meta->kptr_off_desc = off_desc; + meta->kptr_field = kptr_field; return 0; } @@ -5788,7 +5786,7 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { @@ -7536,8 +7534,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; if (func_id == BPF_FUNC_kptr_xchg) { - ret_btf = meta.kptr_off_desc->kptr.btf; - ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + ret_btf = meta.kptr_field->kptr.btf; + ret_btf_id = meta.kptr_field->kptr.btf_id; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); -- cgit v1.2.3 From db559117828d2448fe81ada051c60bcf39f822e9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:56 +0530 Subject: bpf: Consolidate spin_lock, timer management into btf_record Now that kptr_off_tab has been refactored into btf_record, and can hold more than one specific field type, accomodate bpf_spin_lock and bpf_timer as well. While they don't require any more metadata than offset, having all special fields in one place allows us to share the same code for allocated user defined types and handle both map values and these allocated objects in a similar fashion. As an optimization, we still keep spin_lock_off and timer_off offsets in the btf_record structure, just to avoid having to find the btf_field struct each time their offset is needed. This is mostly needed to manipulate such objects in a map value at runtime. It's ok to hardcode just one offset as more than one field is disallowed. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 53 ++++--- include/linux/btf.h | 3 +- kernel/bpf/arraymap.c | 19 +-- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/btf.c | 323 ++++++++++++++++++++++------------------- kernel/bpf/hashtab.c | 24 +-- kernel/bpf/helpers.c | 6 +- kernel/bpf/local_storage.c | 2 +- kernel/bpf/map_in_map.c | 5 +- kernel/bpf/syscall.c | 135 ++++++++--------- kernel/bpf/verifier.c | 82 +++-------- net/core/bpf_sk_storage.c | 4 +- 12 files changed, 314 insertions(+), 344 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f2a42033a37..aae85019abde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -166,13 +166,13 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + - 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + BTF_FIELDS_MAX = 10, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, }; enum btf_field_type { + BPF_SPIN_LOCK = (1 << 0), + BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, @@ -196,6 +196,8 @@ struct btf_field { struct btf_record { u32 cnt; u32 field_mask; + int spin_lock_off; + int timer_off; struct btf_field fields[]; }; @@ -220,10 +222,8 @@ struct bpf_map { u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ - struct btf_record *record; - int timer_off; /* >=0 valid offset, <0 error */ u32 id; + struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -257,9 +257,29 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline const char *btf_field_type_name(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + return "bpf_spin_lock"; + case BPF_TIMER: + return "bpf_timer"; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return "kptr"; + default: + WARN_ON_ONCE(1); + return "unknown"; + } +} + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return sizeof(struct bpf_spin_lock); + case BPF_TIMER: + return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); @@ -272,6 +292,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return __alignof__(struct bpf_spin_lock); + case BPF_TIMER: + return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); @@ -288,22 +312,8 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline bool map_value_has_timer(const struct bpf_map *map) -{ - return map->timer_off >= 0; -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - if (unlikely(map_value_has_spin_lock(map))) - memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); - if (unlikely(map_value_has_timer(map))) - memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); if (!IS_ERR_OR_NULL(map->record)) { struct btf_field *fields = map->record->fields; u32 cnt = map->record->cnt; @@ -1740,6 +1750,7 @@ void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 9e62717cdc7a..282006abd062 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,7 +163,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 417f84342e98..672eb17ac421 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -306,13 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_fields(struct bpf_array *arr, void *val) -{ - if (map_value_has_timer(&arr->map)) - bpf_timer_cancel_and_free(val + arr->map.timer_off); - bpf_obj_free_fields(arr->map.record, val); -} - /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -334,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -348,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } return 0; } @@ -385,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -409,11 +402,11 @@ static void array_map_free_timers(struct bpf_map *map) int i; /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!map_value_has_timer(map)) + if (!btf_record_has_field(map->record, BPF_TIMER)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 93d9b1b17bc8..37020078d1c1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -382,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(&smap->map))) + !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8391a77138ee..3dad828db13c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3205,16 +3205,20 @@ enum { struct btf_field_info { enum btf_field_type type; u32 off; - u32 type_id; + struct { + u32 type_id; + } kptr; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, enum btf_field_type field_type, + struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; + info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } @@ -3251,28 +3255,66 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!__btf_type_is_struct(t)) return -EINVAL; - info->type_id = res_id; - info->off = off; info->type = type; + info->off = off; + info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } -static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_info_type field_type, +static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, + int *align, int *sz) +{ + int type = 0; + + if (field_mask & BPF_SPIN_LOCK) { + if (!strcmp(name, "bpf_spin_lock")) { + if (*seen_mask & BPF_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_SPIN_LOCK; + type = BPF_SPIN_LOCK; + goto end; + } + } + if (field_mask & BPF_TIMER) { + if (!strcmp(name, "bpf_timer")) { + if (*seen_mask & BPF_TIMER) + return -E2BIG; + *seen_mask |= BPF_TIMER; + type = BPF_TIMER; + goto end; + } + } + /* Only return BPF_KPTR when all other types with matchable names fail */ + if (field_mask & BPF_KPTR) { + type = BPF_KPTR_REF; + goto end; + } + return 0; +end: + *sz = btf_field_type_size(type); + *align = btf_field_type_align(type); + return type; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_member *member; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; off = __btf_member_bit_offset(t, member); if (off % 8) @@ -3280,17 +3322,18 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t return -EINVAL; off /= 8; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, member_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3310,37 +3353,41 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_info_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_var_secinfo *vsi; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - off = vsi->offset; - - if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; + + off = vsi->offset; if (vsi->size != sz) continue; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, var_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3360,78 +3407,98 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_info_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { - const char *name; - int sz, align; - - switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - name = "bpf_spin_lock"; - sz = sizeof(struct bpf_spin_lock); - align = __alignof__(struct bpf_spin_lock); - break; - case BTF_FIELD_TIMER: - name = "bpf_timer"; - sz = sizeof(struct bpf_timer); - align = __alignof__(struct bpf_timer); - break; - case BTF_FIELD_KPTR: - name = NULL; - sz = sizeof(u64); - align = 8; - break; - default: - return -EFAULT; - } - if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); return -EINVAL; } -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; + struct module *mod = NULL; + const struct btf_type *t; + struct btf *kernel_btf; int ret; + s32 id; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; -} + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info->kptr.type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) + return id; + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info->type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } -int btf_find_timer(const struct btf *btf, const struct btf_type *t) -{ - struct btf_field_info info; - int ret; + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + field->kptr.dtor = (void *)addr; + } + + field->kptr.btf_id = id; + field->kptr.btf = kernel_btf; + field->kptr.module = mod; + return 0; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); + return ret; } -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; - struct btf *kernel_btf = NULL; - struct module *mod = NULL; struct btf_record *rec; int ret, i, cnt; - ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) @@ -3441,80 +3508,44 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); - rec->cnt = 0; - for (i = 0; i < cnt; i++) { - const struct btf_type *t; - s32 id; - /* Find type in map BTF, and use it to look up the matching type - * in vmlinux or module BTFs, by name and kind. - */ - t = btf_type_by_id(btf, info_arr[i].type_id); - id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); - if (id < 0) { - ret = id; + rec->spin_lock_off = -EINVAL; + rec->timer_off = -EINVAL; + for (i = 0; i < cnt; i++) { + if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); + ret = -EFAULT; goto end; } - /* Find and stash the function pointer for the destruction function that - * needs to be eventually invoked from the map free path. - */ - if (info_arr[i].type == BPF_KPTR_REF) { - const struct btf_type *dtor_func; - const char *dtor_func_name; - unsigned long addr; - s32 dtor_btf_id; - - /* This call also serves as a whitelist of allowed objects that - * can be used as a referenced pointer and be stored in a map at - * the same time. - */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); - if (dtor_btf_id < 0) { - ret = dtor_btf_id; - goto end_btf; - } - - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); - if (!dtor_func) { - ret = -ENOENT; - goto end_btf; - } - - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); - if (!mod) { - ret = -ENXIO; - goto end_btf; - } - } - - /* We already verified dtor_func to be btf_type_is_func - * in register_btf_id_dtor_kfuncs. - */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); - addr = kallsyms_lookup_name(dtor_func_name); - if (!addr) { - ret = -EINVAL; - goto end_mod; - } - rec->fields[i].kptr.dtor = (void *)addr; - } - rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; - rec->fields[i].kptr.btf_id = id; - rec->fields[i].kptr.btf = kernel_btf; - rec->fields[i].kptr.module = mod; + + switch (info_arr[i].type) { + case BPF_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->spin_lock_off = rec->fields[i].offset; + break; + case BPF_TIMER: + WARN_ON_ONCE(rec->timer_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->timer_off = rec->fields[i].offset; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + default: + ret = -EFAULT; + goto end; + } rec->cnt++; } return rec; -end_mod: - module_put(mod); -end_btf: - btf_put(kernel_btf); end: btf_record_free(rec); return ERR_PTR(ret); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5ea8f9bb7a9..50d254cd0709 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_timer(&htab->map)) + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -231,9 +231,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -763,8 +761,6 @@ static void check_and_free_fields(struct bpf_htab *htab, { void *map_value = elem->key + round_up(htab->map.key_size, 8); - if (map_value_has_timer(&htab->map)) - bpf_timer_cancel_and_free(map_value + htab->map.timer_off); bpf_obj_free_fields(htab->map.record, map_value); } @@ -1089,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) + if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, @@ -1472,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We don't reset or free kptr on uref dropping to zero, - * hence just free timer. - */ - bpf_timer_cancel_and_free(l->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + /* We only free timer on uref dropping to zero */ + bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } @@ -1488,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(&htab->map)) + /* We only free timer on uref dropping to zero */ + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1673,7 +1665,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 124fd199ce5c..283f55bbeb70 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -366,9 +366,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -1169,7 +1169,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 098cf336fae6..e90d9f63edc5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index d6c662183f88..8ca0cca39d49 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -29,7 +29,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-ENOTSUPP); } - if (map_value_has_spin_lock(inner_map)) { + if (btf_record_has_field(inner_map->record, BPF_SPIN_LOCK)) { fdput(f); return ERR_PTR(-ENOTSUPP); } @@ -50,8 +50,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; - inner_map_meta->spin_lock_off = inner_map->spin_lock_off; - inner_map_meta->timer_off = inner_map->timer_off; inner_map_meta->record = btf_record_dup(inner_map->record); if (IS_ERR(inner_map_meta->record)) { /* btf_record_dup returns NULL or valid pointer in case of @@ -92,7 +90,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && btf_record_equal(meta0->record, meta1->record); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b80c0e2eb73f..53d6dc5cf0e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -527,6 +527,9 @@ void btf_record_free(struct btf_record *rec) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (rec->fields[i].kptr.module) @@ -564,6 +567,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: btf_get(fields[i].kptr.btf); @@ -600,6 +606,13 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return !memcmp(rec_a, rec_b, size); } +void bpf_obj_free_timer(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) + return; + bpf_timer_cancel_and_free(obj + rec->timer_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -613,6 +626,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) void *field_ptr = obj + field->offset; switch (fields[i].type) { + case BPF_SPIN_LOCK: + break; + case BPF_TIMER: + bpf_timer_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -798,8 +816,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) + if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -954,48 +971,30 @@ static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) static int bpf_map_alloc_off_arr(struct bpf_map *map) { - bool has_spin_lock = map_value_has_spin_lock(map); - bool has_timer = map_value_has_timer(map); bool has_fields = !IS_ERR_OR_NULL(map->record); struct btf_field_offs *fo; - u32 i; + struct btf_record *rec; + u32 i, *off; + u8 *sz; - if (!has_spin_lock && !has_timer && !has_fields) { + if (!has_fields) { map->field_offs = NULL; return 0; } - fo = kmalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); + fo = kzalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); if (!fo) return -ENOMEM; map->field_offs = fo; - fo->cnt = 0; - if (has_spin_lock) { - i = fo->cnt; - - fo->field_off[i] = map->spin_lock_off; - fo->field_sz[i] = sizeof(struct bpf_spin_lock); - fo->cnt++; - } - if (has_timer) { - i = fo->cnt; - - fo->field_off[i] = map->timer_off; - fo->field_sz[i] = sizeof(struct bpf_timer); - fo->cnt++; - } - if (has_fields) { - struct btf_record *rec = map->record; - u32 *off = &fo->field_off[fo->cnt]; - u8 *sz = &fo->field_sz[fo->cnt]; - - for (i = 0; i < rec->cnt; i++) { - *off++ = rec->fields[i].offset; - *sz++ = btf_field_type_size(rec->fields[i].type); - } - fo->cnt += rec->cnt; + rec = map->record; + off = fo->field_off; + sz = fo->field_sz; + for (i = 0; i < rec->cnt; i++) { + *off++ = rec->fields[i].offset; + *sz++ = btf_field_type_size(rec->fields[i].type); } + fo->cnt = rec->cnt; if (fo->cnt == 1) return 0; @@ -1026,39 +1025,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE && - map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } - - map->timer_off = btf_find_timer(btf, value_type); - if (map_value_has_timer(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) - return -EOPNOTSUPP; - } - - map->record = btf_parse_fields(btf, value_type); + map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, + map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1074,6 +1042,26 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, switch (map->record->field_mask & (1 << i)) { case 0: continue; + case BPF_SPIN_LOCK: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_TIMER: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + return -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (map->map_type != BPF_MAP_TYPE_HASH && @@ -1153,8 +1141,6 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - map->spin_lock_off = -EINVAL; - map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1368,7 +1354,7 @@ static int map_lookup_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1441,7 +1427,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1604,7 +1590,7 @@ int generic_map_delete_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1661,7 +1647,7 @@ int generic_map_update_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1724,7 +1710,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); @@ -1846,7 +1832,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1917,8 +1903,7 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ce5364ce898..73a3516f1a48 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -454,7 +454,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); + btf_record_has_field(reg->map_ptr->record, BPF_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -1388,7 +1388,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (map_value_has_timer(map->inner_map_meta)) + if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; @@ -3817,29 +3817,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) return err; - if (map_value_has_spin_lock(map)) { - u32 lock = map->spin_lock_off; - - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) - * it is sufficient to check x1 < y2 && y1 < x2. - */ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_timer(map)) { - u32 t = map->timer_off; - - if (reg->smin_value + off < t + sizeof(struct bpf_timer) && - t < reg->umax_value + off + size) { - verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); - return -EACCES; - } - } if (IS_ERR_OR_NULL(map->record)) return 0; rec = map->record; @@ -3847,6 +3824,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct btf_field *field = &rec->fields[i]; u32 p = field->offset; + /* If any part of a field can be touched by load/store, reject + * this program. To check that [x1, x2) overlaps with [y1, y2), + * it is sufficient to check x1 < y2 && y1 < x2. + */ if (reg->smin_value + off < p + btf_field_type_size(field->type) && p < reg->umax_value + off + size) { switch (field->type) { @@ -3871,7 +3852,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, } break; default: - verbose(env, "field cannot be accessed directly by load/store\n"); + verbose(env, "%s cannot be accessed directly by load/store\n", + btf_field_type_name(field->type)); return -EACCES; } } @@ -5440,24 +5422,13 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + verbose(env, "map '%s' has no valid bpf_spin_lock\n", map->name); return -EINVAL; } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); + if (map->record->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", + val + reg->off, map->record->spin_lock_off); return -EINVAL; } if (is_lock) { @@ -5500,24 +5471,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_timer(map)) { - if (map->timer_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_timer'\n", - map->name); - else if (map->timer_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_timer'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_timer is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_TIMER)) { + verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL; } - if (map->timer_off != val + reg->off) { + if (map->record->timer_off != val + reg->off) { verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->timer_off); + val + reg->off, map->record->timer_off); return -EINVAL; } if (meta->map_ptr) { @@ -7470,7 +7430,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_type) && - map_value_has_spin_lock(meta.map_ptr)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -10381,7 +10341,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (map_value_has_spin_lock(map)) + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -12659,7 +12619,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (map_value_has_spin_lock(map)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -12676,7 +12636,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (map_value_has_timer(map)) { + if (btf_record_has_field(map->record, BPF_TIMER)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_timer yet\n"); return -EINVAL; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 49884e7de080..9d2288c0736e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -147,7 +147,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, if (!copy_selem) return NULL; - if (map_value_has_spin_lock(&smap->map)) + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data, true); else @@ -566,7 +566,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_value) goto errout; - if (map_value_has_spin_lock(&smap->map)) + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, nla_data(nla_value), sdata->data, true); else -- cgit v1.2.3 From f71b2f64177a199d5b1d2047e155d45fd98f564a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:57 +0530 Subject: bpf: Refactor map->off_arr handling Refactor map->off_arr handling into generic functions that can work on their own without hardcoding map specific code. The btf_fields_offs structure is now returned from btf_parse_field_offs, which can be reused later for types in program BTF. All functions like copy_map_value, zero_map_value call generic underlying functions so that they can also be reused later for copying to values allocated in programs which encode specific fields. Later, some helper functions will also require access to this btf_field_offs structure to be able to skip over special fields at runtime. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 +++++++++++++++++------------ include/linux/btf.h | 1 + kernel/bpf/btf.c | 55 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 73 +++++++--------------------------------------------- 4 files changed, 89 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aae85019abde..798aec816970 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -341,57 +341,64 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ -static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) +static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, + void *dst, void *src, u32 size, + bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { + if (likely(!foffs)) { if (long_memcpy) - bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + bpf_long_memcpy(dst, src, round_up(size, 8)); else - memcpy(dst, src, map->value_size); + memcpy(dst, src, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); + memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, false); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, true); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); } -static inline void zero_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { - memset(dst, 0, map->value_size); + if (likely(!foffs)) { + memset(dst, 0, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memset(dst + curr_off, 0, map->value_size - curr_off); + memset(dst + curr_off, 0, size - curr_off); +} + +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_memzero(map->field_offs, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 282006abd062..d80345fa566b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -165,6 +165,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3dad828db13c..5579ff3a5b54 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3551,6 +3551,61 @@ end: return ERR_PTR(ret); } +static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) +{ + struct btf_field_offs *foffs = (void *)priv; + u32 *off_base = foffs->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = foffs->field_sz + (a - off_base); + sz_b = foffs->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) +{ + struct btf_field_offs *foffs; + u32 i, *off; + u8 *sz; + + BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); + if (IS_ERR_OR_NULL(rec) || WARN_ON_ONCE(rec->cnt > sizeof(foffs->field_off))) + return NULL; + + foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); + if (!foffs) + return ERR_PTR(-ENOMEM); + + off = foffs->field_off; + sz = foffs->field_sz; + for (i = 0; i < rec->cnt; i++) { + off[i] = rec->fields[i].offset; + sz[i] = btf_field_type_size(rec->fields[i].type); + } + foffs->cnt = rec->cnt; + + if (foffs->cnt == 1) + return foffs; + sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), + btf_field_offs_cmp, btf_field_offs_swap, foffs); + return foffs; +} + static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53d6dc5cf0e2..85532d301124 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -943,66 +943,6 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_field_offs_cmp(const void *_a, const void *_b, const void *priv) -{ - const u32 a = *(const u32 *)_a; - const u32 b = *(const u32 *)_b; - - if (a < b) - return -1; - else if (a > b) - return 1; - return 0; -} - -static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) -{ - struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->field_offs->field_off; - u32 *a = _a, *b = _b; - u8 *sz_a, *sz_b; - - sz_a = map->field_offs->field_sz + (a - off_base); - sz_b = map->field_offs->field_sz + (b - off_base); - - swap(*a, *b); - swap(*sz_a, *sz_b); -} - -static int bpf_map_alloc_off_arr(struct bpf_map *map) -{ - bool has_fields = !IS_ERR_OR_NULL(map->record); - struct btf_field_offs *fo; - struct btf_record *rec; - u32 i, *off; - u8 *sz; - - if (!has_fields) { - map->field_offs = NULL; - return 0; - } - - fo = kzalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); - if (!fo) - return -ENOMEM; - map->field_offs = fo; - - rec = map->record; - off = fo->field_off; - sz = fo->field_sz; - for (i = 0; i < rec->cnt; i++) { - *off++ = rec->fields[i].offset; - *sz++ = btf_field_type_size(rec->fields[i].type); - } - fo->cnt = rec->cnt; - - if (fo->cnt == 1) - return 0; - sort_r(fo->field_off, fo->cnt, sizeof(fo->field_off[0]), - map_field_offs_cmp, map_field_offs_swap, map); - return 0; -} - static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -1097,6 +1037,7 @@ free_map_tab: static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1176,13 +1117,17 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = bpf_map_alloc_off_arr(map); - if (err) + + foffs = btf_parse_field_offs(map->record); + if (IS_ERR(foffs)) { + err = PTR_ERR(foffs); goto free_map; + } + map->field_offs = foffs; err = security_bpf_map_alloc(map); if (err) - goto free_map_off_arr; + goto free_map_field_offs; err = bpf_map_alloc_id(map); if (err) @@ -1206,7 +1151,7 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_off_arr: +free_map_field_offs: kfree(map->field_offs); free_map: btf_put(map->btf); -- cgit v1.2.3 From a3b666bfa9c9edc05bca62a87abafe0936bd7f97 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Nov 2022 09:36:44 -0700 Subject: bpf: propagate precision in ALU/ALU64 operations When processing ALU/ALU64 operations (apart from BPF_MOV, which is handled correctly already; and BPF_NEG and BPF_END are special and don't have source register), if destination register is already marked precise, this causes problem with potentially missing precision tracking for the source register. E.g., when we have r1 >>= r5 and r1 is marked precise, but r5 isn't, this will lead to r5 staying as imprecise. This is due to the precision backtracking logic stopping early when it sees r1 is already marked precise. If r1 wasn't precise, we'd keep backtracking and would add r5 to the set of registers that need to be marked precise. So there is a discrepancy here which can lead to invalid and incompatible states matched due to lack of precision marking on r5. If r1 wasn't precise, precision backtracking would correctly mark both r1 and r5 as precise. This is simple to fix, though. During the forward instruction simulation pass, for arithmetic operations of `scalar = scalar` form (where is ALU or ALU64 operations), if destination register is already precise, mark source register as precise. This applies only when both involved registers are SCALARs. `ptr += scalar` and `scalar += ptr` cases are already handled correctly. This does have (negative) effect on some selftest programs and few Cilium programs. ~/baseline-tmp-results.csv are veristat results with this patch, while ~/baseline-results.csv is without it. See post scriptum for instructions on how to make Cilium programs testable with veristat. Correctness has a price. $ ./veristat -C -e file,prog,insns,states ~/baseline-results.csv ~/baseline-tmp-results.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ----------------------- -------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_cubic.bpf.linked1.o bpf_cubic_cong_avoid 997 1700 +703 (+70.51%) 62 90 +28 (+45.16%) test_l4lb.bpf.linked1.o balancer_ingress 4559 5469 +910 (+19.96%) 118 126 +8 (+6.78%) ----------------------- -------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- $ ./veristat -C -e file,prog,verdict,insns,states ~/baseline-results-cilium.csv ~/baseline-tmp-results-cilium.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_host.o tail_nodeport_nat_ingress_ipv6 4448 5261 +813 (+18.28%) 234 247 +13 (+5.56%) bpf_host.o tail_nodeport_nat_ipv6_egress 3396 3446 +50 (+1.47%) 201 203 +2 (+1.00%) bpf_lxc.o tail_nodeport_nat_ingress_ipv6 4448 5261 +813 (+18.28%) 234 247 +13 (+5.56%) bpf_overlay.o tail_nodeport_nat_ingress_ipv6 4448 5261 +813 (+18.28%) 234 247 +13 (+5.56%) bpf_xdp.o tail_lb_ipv4 71736 73442 +1706 (+2.38%) 4295 4370 +75 (+1.75%) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- P.S. To make Cilium ([0]) programs libbpf-compatible and thus veristat-loadable, apply changes from topmost commit in [1], which does minimal changes to Cilium source code, mostly around SEC() annotations and BPF map definitions. [0] https://github.com/cilium/cilium/ [1] https://github.com/anakryiko/cilium/commits/libbpf-friendliness Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221104163649.121784-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73a3516f1a48..ddfb4b0ab35f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9153,6 +9153,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); + } else if (dst_reg->precise) { + /* if dst_reg is precise, src_reg should be precise as well */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; } } else { /* Pretend the src is a reg with a known value, since we only -- cgit v1.2.3 From 529409ea92d590659be487ba0839710329bd8074 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Nov 2022 09:36:45 -0700 Subject: bpf: propagate precision across all frames, not just the last one When equivalent completed state is found and it has additional precision restrictions, BPF verifier propagates precision to currently-being-verified state chain (i.e., including parent states) so that if some of the states in the chain are not yet completed, necessary precision restrictions are enforced. Unfortunately, right now this happens only for the last frame (deepest active subprogram's frame), not all the frames. This can lead to incorrect matching of states due to missing precision marker. Currently this doesn't seem possible as BPF verifier forces everything to precise when validated BPF program has any subprograms. But with the next patch lifting this restriction, this becomes problematic. In fact, without this fix, we'll start getting failure in one of the existing test_verifier test cases: #906/p precise: cross frame pruning FAIL Unexpected success to load! verification time 48 usec stack depth 0+0 processed 26 insns (limit 1000000) max_states_per_insn 3 total_states 17 peak_states 17 mark_read 8 This patch adds precision propagation across all frames. Fixes: a3ce685dd01a ("bpf: fix precision tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221104163649.121784-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 71 ++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ddfb4b0ab35f..5c708eb30664 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2756,7 +2756,7 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, } } -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, +static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) { struct bpf_verifier_state *st = env->cur_state; @@ -2773,7 +2773,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, if (!env->bpf_capable) return 0; - func = st->frame[st->curframe]; + func = st->frame[frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { @@ -2854,7 +2854,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, break; new_marks = false; - func = st->frame[st->curframe]; + func = st->frame[frame]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; @@ -2920,12 +2920,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno, -1); + return __mark_chain_precision(env, env->cur_state->curframe, regno, -1); } -static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno) { - return __mark_chain_precision(env, -1, spi); + return __mark_chain_precision(env, frame, regno, -1); +} + +static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) +{ + return __mark_chain_precision(env, frame, -1, spi); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -11794,34 +11799,36 @@ static int propagate_precision(struct bpf_verifier_env *env, { struct bpf_reg_state *state_reg; struct bpf_func_state *state; - int i, err = 0; + int i, err = 0, fr; - state = old->frame[old->curframe]; - state_reg = state->regs; - for (i = 0; i < BPF_REG_FP; i++, state_reg++) { - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating r%d\n", i); - err = mark_chain_precision(env, i); - if (err < 0) - return err; - } + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating r%d\n", i, fr); + err = mark_chain_precision_frame(env, fr, i); + if (err < 0) + return err; + } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&state->stack[i])) - continue; - state_reg = &state->stack[i].spilled_ptr; - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE); - err = mark_chain_precision_stack(env, i); - if (err < 0) - return err; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&state->stack[i])) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE, fr); + err = mark_chain_precision_stack_frame(env, fr, i); + if (err < 0) + return err; + } } return 0; } -- cgit v1.2.3 From be2ef8161572ec1973124ebc50f56dafc2925e07 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Nov 2022 09:36:46 -0700 Subject: bpf: allow precision tracking for programs with subprogs Stop forcing precise=true for SCALAR registers when BPF program has any subprograms. Current restriction means that any BPF program, as soon as it uses subprograms, will end up not getting any of the precision tracking benefits in reduction of number of verified states. This patch keeps the fallback mark_all_scalars_precise() behavior if precise marking has to cross function frames. E.g., if subprogram requires R1 (first input arg) to be marked precise, ideally we'd need to backtrack to the parent function and keep marking R1 and its dependencies as precise. But right now we give up and force all the SCALARs in any of the current and parent states to be forced to precise=true. We can lift that restriction in the future. But this patch fixes two issues identified when trying to enable precision tracking for subprogs. First, prevent "escaping" from top-most state in a global subprog. While with entry-level BPF program we never end up requesting precision for R1-R5 registers, because R2-R5 are not initialized (and so not readable in correct BPF program), and R1 is PTR_TO_CTX, not SCALAR, and so is implicitly precise. With global subprogs, though, it's different, as global subprog a) can have up to 5 SCALAR input arguments, which might get marked as precise=true and b) it is validated in isolation from its main entry BPF program. b) means that we can end up exhausting parent state chain and still not mark all registers in reg_mask as precise, which would lead to verifier bug warning. To handle that, we need to consider two cases. First, if the very first state is not immediately "checkpointed" (i.e., stored in state lookup hashtable), it will get correct first_insn_idx and last_insn_idx instruction set during state checkpointing. As such, this case is already handled and __mark_chain_precision() already handles that by just doing nothing when we reach to the very first parent state. st->parent will be NULL and we'll just stop. Perhaps some extra check for reg_mask and stack_mask is due here, but this patch doesn't address that issue. More problematic second case is when global function's initial state is immediately checkpointed before we manage to process the very first instruction. This is happening because when there is a call to global subprog from the main program the very first subprog's instruction is marked as pruning point, so before we manage to process first instruction we have to check and checkpoint state. This patch adds a special handling for such "empty" state, which is identified by having st->last_insn_idx set to -1. In such case, we check that we are indeed validating global subprog, and with some sanity checking we mark input args as precise if requested. Note that we also initialize state->first_insn_idx with correct start insn_idx offset. For main program zero is correct value, but for any subprog it's quite confusing to not have first_insn_idx set. This doesn't have any functional impact, but helps with debugging and state printing. We also explicitly initialize state->last_insns_idx instead of relying on is_state_visited() to do this with env->prev_insns_idx, which will be -1 on the very first instruction. This concludes necessary changes to handle specifically global subprog's precision tracking. Second identified problem was missed handling of BPF helper functions that call into subprogs (e.g., bpf_loop and few others). From precision tracking and backtracking logic's standpoint those are effectively calls into subprogs and should be called as BPF_PSEUDO_CALL calls. This patch takes the least intrusive way and just checks against a short list of current BPF helpers that do call subprogs, encapsulated in is_callback_calling_function() function. But to prevent accidentally forgetting to add new BPF helpers to this "list", we also do a sanity check in __check_func_call, which has to be called for each such special BPF helper, to validate that BPF helper is indeed recognized as callback-calling one. This should catch any missed checks in the future. Adding some special flags to be added in function proto definitions seemed like an overkill in this case. With the above changes, it's possible to remove forceful setting of reg->precise to true in __mark_reg_unknown, which turns on precision tracking both inside subprogs and entry progs that have subprogs. No warnings or errors were detected across all the selftests, but also when validating with veristat against internal Meta BPF objects and Cilium objects. Further, in some BPF programs there are noticeable reduction in number of states and instructions validated due to more effective precision tracking, especially benefiting syncookie test. $ ./veristat -C -e file,prog,insns,states ~/baseline-results.csv ~/subprog-precise-results.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- pyperf600_bpf_loop.bpf.linked1.o on_event 3966 3678 -288 (-7.26%) 306 276 -30 (-9.80%) pyperf_global.bpf.linked1.o on_event 7563 7530 -33 (-0.44%) 520 517 -3 (-0.58%) pyperf_subprogs.bpf.linked1.o on_event 36358 36934 +576 (+1.58%) 2499 2531 +32 (+1.28%) setget_sockopt.bpf.linked1.o skops_sockopt 3965 4038 +73 (+1.84%) 343 347 +4 (+1.17%) test_cls_redirect_subprogs.bpf.linked1.o cls_redirect 64965 64901 -64 (-0.10%) 4619 4612 -7 (-0.15%) test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1491 1307 -184 (-12.34%) 110 100 -10 (-9.09%) test_pkt_access.bpf.linked1.o test_pkt_access 354 349 -5 (-1.41%) 25 24 -1 (-4.00%) test_sock_fields.bpf.linked1.o egress_read_sock_fields 435 375 -60 (-13.79%) 22 20 -2 (-9.09%) test_sysctl_loop2.bpf.linked1.o sysctl_tcp_mem 1508 1501 -7 (-0.46%) 29 28 -1 (-3.45%) test_tc_dtime.bpf.linked1.o egress_fwdns_prio100 468 435 -33 (-7.05%) 45 41 -4 (-8.89%) test_tc_dtime.bpf.linked1.o ingress_fwdns_prio100 398 408 +10 (+2.51%) 42 39 -3 (-7.14%) test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 1096 842 -254 (-23.18%) 97 73 -24 (-24.74%) test_tcp_hdr_options.bpf.linked1.o estab 2758 2408 -350 (-12.69%) 208 181 -27 (-12.98%) test_urandom_usdt.bpf.linked1.o urand_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) test_urandom_usdt.bpf.linked1.o urand_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) test_urandom_usdt.bpf.linked1.o urandlib_read_with_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) test_urandom_usdt.bpf.linked1.o urandlib_read_without_sema 466 448 -18 (-3.86%) 31 28 -3 (-9.68%) test_xdp_noinline.bpf.linked1.o balancer_ingress_v6 4302 4294 -8 (-0.19%) 257 256 -1 (-0.39%) xdp_synproxy_kern.bpf.linked1.o syncookie_tc 583722 405757 -177965 (-30.49%) 35846 25735 -10111 (-28.21%) xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 609123 479055 -130068 (-21.35%) 35452 29145 -6307 (-17.79%) ---------------------------------------- -------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221104163649.121784-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c708eb30664..c1169ee1bc7c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,15 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_for_each_map_elem || + func_id == BPF_FUNC_timer_set_callback || + func_id == BPF_FUNC_find_vma || + func_id == BPF_FUNC_loop || + func_id == BPF_FUNC_user_ringbuf_drain; +} + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -1677,7 +1686,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; + reg->precise = !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -2646,6 +2655,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) return -ENOTSUPP; + /* BPF helpers that invoke callback subprogs are + * equivalent to BPF_PSEUDO_CALL above + */ + if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) + return -ENOTSUPP; /* regular helper call sets R0 */ *reg_mask &= ~1; if (*reg_mask & 0x3f) { @@ -2809,12 +2823,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r return 0; if (!reg_mask && !stack_mask) return 0; + for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + + if (last_idx < 0) { + /* we are at the entry into subprog, which + * is expected for global funcs, but only if + * requested precise registers are R1-R5 + * (which are global func's input arguments) + */ + if (st->curframe == 0 && + st->frame[0]->subprogno > 0 && + st->frame[0]->callsite == BPF_MAIN_FUNC && + stack_mask == 0 && (reg_mask & ~0x3e) == 0) { + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &st->frame[0]->regs[i]; + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); + continue; + } + reg->precise = true; + } + return 0; + } + + verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", + st->frame[0]->subprogno, reg_mask, stack_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + for (i = last_idx;;) { if (skip_first) { err = 0; @@ -6602,6 +6646,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx); + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) @@ -6652,6 +6700,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + /* set_callee_state is used for direct subprog calls, but we are + * interested in validating only BPF helpers that can call subprogs as + * callbacks + */ + if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { + verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + if (insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { @@ -14571,6 +14629,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog); + state->first_insn_idx = env->subprog_info[subprog].start; + state->last_insn_idx = -1; regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { -- cgit v1.2.3 From f63181b6ae79fd3b034cde641db774268c2c3acf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Nov 2022 09:36:47 -0700 Subject: bpf: stop setting precise in current state Setting reg->precise to true in current state is not necessary from correctness standpoint, but it does pessimise the whole precision (or rather "imprecision", because that's what we want to keep as much as possible) tracking. Why is somewhat subtle and my best attempt to explain this is recorded in an extensive comment for __mark_chain_precise() function. Some more careful thinking and code reading is probably required still to grok this completely, unfortunately. Whiteboarding and a bunch of extra handwaiving in person would be even more helpful, but is deemed impractical in Git commit. Next patch pushes this imprecision property even further, building on top of the insights described in this patch. End results are pretty nice, we get reduction in number of total instructions and states verified due to a better states reuse, as some of the states are now more generic and permissive due to less unnecessary precise=true requirements. SELFTESTS RESULTS ================= $ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results.csv ~/imprecise-early-results.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) --------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_iter_ksym.bpf.linked1.o dump_ksym 347 285 -62 (-17.87%) 20 19 -1 (-5.00%) pyperf600_bpf_loop.bpf.linked1.o on_event 3678 3736 +58 (+1.58%) 276 285 +9 (+3.26%) setget_sockopt.bpf.linked1.o skops_sockopt 4038 3947 -91 (-2.25%) 347 343 -4 (-1.15%) test_l4lb.bpf.linked1.o balancer_ingress 4559 2611 -1948 (-42.73%) 118 105 -13 (-11.02%) test_l4lb_noinline.bpf.linked1.o balancer_ingress 6279 6268 -11 (-0.18%) 237 236 -1 (-0.42%) test_misc_tcp_hdr_options.bpf.linked1.o misc_estab 1307 1303 -4 (-0.31%) 100 99 -1 (-1.00%) test_sk_lookup.bpf.linked1.o ctx_narrow_access 456 447 -9 (-1.97%) 39 38 -1 (-2.56%) test_sysctl_loop1.bpf.linked1.o sysctl_tcp_mem 1389 1384 -5 (-0.36%) 26 25 -1 (-3.85%) test_tc_dtime.bpf.linked1.o egress_fwdns_prio101 518 485 -33 (-6.37%) 51 46 -5 (-9.80%) test_tc_dtime.bpf.linked1.o egress_host 519 468 -51 (-9.83%) 50 44 -6 (-12.00%) test_tc_dtime.bpf.linked1.o ingress_fwdns_prio101 842 1000 +158 (+18.76%) 73 88 +15 (+20.55%) xdp_synproxy_kern.bpf.linked1.o syncookie_tc 405757 373173 -32584 (-8.03%) 25735 22882 -2853 (-11.09%) xdp_synproxy_kern.bpf.linked1.o syncookie_xdp 479055 371590 -107465 (-22.43%) 29145 22207 -6938 (-23.81%) --------------------------------------- ---------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- Slight regression in test_tc_dtime.bpf.linked1.o/ingress_fwdns_prio101 is left for a follow up, there might be some more precision-related bugs in existing BPF verifier logic. CILIUM RESULTS ============== $ ./veristat -C -e file,prog,insns,states ~/subprog-precise-results-cilium.csv ~/imprecise-early-results-cilium.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_host.o cil_from_host 762 556 -206 (-27.03%) 43 37 -6 (-13.95%) bpf_host.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%) bpf_host.o tail_nodeport_nat_egress_ipv4 33592 33566 -26 (-0.08%) 2163 2161 -2 (-0.09%) bpf_lxc.o tail_handle_nat_fwd_ipv4 23541 23426 -115 (-0.49%) 1538 1537 -1 (-0.07%) bpf_overlay.o tail_nodeport_nat_egress_ipv4 33581 33543 -38 (-0.11%) 2160 2157 -3 (-0.14%) bpf_xdp.o tail_handle_nat_fwd_ipv4 21659 20920 -739 (-3.41%) 1440 1376 -64 (-4.44%) bpf_xdp.o tail_handle_nat_fwd_ipv6 17084 17039 -45 (-0.26%) 907 905 -2 (-0.22%) bpf_xdp.o tail_lb_ipv4 73442 73430 -12 (-0.02%) 4370 4369 -1 (-0.02%) bpf_xdp.o tail_lb_ipv6 152114 151895 -219 (-0.14%) 6493 6479 -14 (-0.22%) bpf_xdp.o tail_nodeport_nat_egress_ipv4 17377 17200 -177 (-1.02%) 1125 1111 -14 (-1.24%) bpf_xdp.o tail_nodeport_nat_ingress_ipv6 6405 6397 -8 (-0.12%) 309 308 -1 (-0.32%) bpf_xdp.o tail_rev_nodeport_lb4 7126 6934 -192 (-2.69%) 414 402 -12 (-2.90%) bpf_xdp.o tail_rev_nodeport_lb6 18059 17905 -154 (-0.85%) 1105 1096 -9 (-0.81%) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221104163649.121784-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 103 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c1169ee1bc7c..ff3fc21ce99b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2749,8 +2749,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. + * We also skip current state and go straight to first parent state, + * because precision markings in current non-checkpointed state are + * not needed. See why in the comment in __mark_chain_precision below. */ - for (; st; st = st->parent) + for (st = st->parent; st; st = st->parent) { for (i = 0; i <= st->curframe; i++) { func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { @@ -2768,8 +2771,88 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, reg->precise = true; } } + } } +/* + * __mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) + * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked + * SCALARS, as well as any other registers and slots that contribute to + * a tracked state of given registers/stack slots, depending on specific BPF + * assembly instructions (see backtrack_insns() for exact instruction handling + * logic). This backtracking relies on recorded jmp_history and is able to + * traverse entire chain of parent states. This process ends only when all the + * necessary registers/slots and their transitive dependencies are marked as + * precise. + * + * One important and subtle aspect is that precise marks *do not matter* in + * the currently verified state (current state). It is important to understand + * why this is the case. + * + * First, note that current state is the state that is not yet "checkpointed", + * i.e., it is not yet put into env->explored_states, and it has no children + * states as well. It's ephemeral, and can end up either a) being discarded if + * compatible explored state is found at some point or BPF_EXIT instruction is + * reached or b) checkpointed and put into env->explored_states, branching out + * into one or more children states. + * + * In the former case, precise markings in current state are completely + * ignored by state comparison code (see regsafe() for details). Only + * checkpointed ("old") state precise markings are important, and if old + * state's register/slot is precise, regsafe() assumes current state's + * register/slot as precise and checks value ranges exactly and precisely. If + * states turn out to be compatible, current state's necessary precise + * markings and any required parent states' precise markings are enforced + * after the fact with propagate_precision() logic, after the fact. But it's + * important to realize that in this case, even after marking current state + * registers/slots as precise, we immediately discard current state. So what + * actually matters is any of the precise markings propagated into current + * state's parent states, which are always checkpointed (due to b) case above). + * As such, for scenario a) it doesn't matter if current state has precise + * markings set or not. + * + * Now, for the scenario b), checkpointing and forking into child(ren) + * state(s). Note that before current state gets to checkpointing step, any + * processed instruction always assumes precise SCALAR register/slot + * knowledge: if precise value or range is useful to prune jump branch, BPF + * verifier takes this opportunity enthusiastically. Similarly, when + * register's value is used to calculate offset or memory address, exact + * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to + * what we mentioned above about state comparison ignoring precise markings + * during state comparison, BPF verifier ignores and also assumes precise + * markings *at will* during instruction verification process. But as verifier + * assumes precision, it also propagates any precision dependencies across + * parent states, which are not yet finalized, so can be further restricted + * based on new knowledge gained from restrictions enforced by their children + * states. This is so that once those parent states are finalized, i.e., when + * they have no more active children state, state comparison logic in + * is_state_visited() would enforce strict and precise SCALAR ranges, if + * required for correctness. + * + * To build a bit more intuition, note also that once a state is checkpointed, + * the path we took to get to that state is not important. This is crucial + * property for state pruning. When state is checkpointed and finalized at + * some instruction index, it can be correctly and safely used to "short + * circuit" any *compatible* state that reaches exactly the same instruction + * index. I.e., if we jumped to that instruction from a completely different + * code path than original finalized state was derived from, it doesn't + * matter, current state can be discarded because from that instruction + * forward having a compatible state will ensure we will safely reach the + * exit. States describe preconditions for further exploration, but completely + * forget the history of how we got here. + * + * This also means that even if we needed precise SCALAR range to get to + * finalized state, but from that point forward *that same* SCALAR register is + * never used in a precise context (i.e., it's precise value is not needed for + * correctness), it's correct and safe to mark such register as "imprecise" + * (i.e., precise marking set to false). This is what we rely on when we do + * not set precise marking in current state. If no child state requires + * precision for any given SCALAR register, it's safe to dictate that it can + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise later retroactively during precise markings + * propagation from child state to parent states. + */ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) { @@ -2787,6 +2870,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r if (!env->bpf_capable) return 0; + /* Do sanity checks against current state of register and/or stack + * slot, but don't set precise flag in current state, as precision + * tracking in the current state is unnecessary. + */ func = st->frame[frame]; if (regno >= 0) { reg = &func->regs[regno]; @@ -2794,11 +2881,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r WARN_ONCE(1, "backtracing misuse"); return -EFAULT; } - if (!reg->precise) - new_marks = true; - else - reg_mask = 0; - reg->precise = true; + new_marks = true; } while (spi >= 0) { @@ -2811,11 +2894,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r stack_mask = 0; break; } - if (!reg->precise) - new_marks = true; - else - stack_mask = 0; - reg->precise = true; + new_marks = true; break; } @@ -11534,7 +11613,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (env->explore_alu_limits) return false; if (rcur->type == SCALAR_VALUE) { - if (!rold->precise && !rcur->precise) + if (!rold->precise) return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && -- cgit v1.2.3 From 7a830b53c17bbadcf99f778f28aaaa4e6c41df5f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Nov 2022 09:36:48 -0700 Subject: bpf: aggressively forget precise markings during state checkpointing Exploit the property of about-to-be-checkpointed state to be able to forget all precise markings up to that point even more aggressively. We now clear all potentially inherited precise markings right before checkpointing and branching off into child state. If any of children states require precise knowledge of any SCALAR register, those will be propagated backwards later on before this state is finalized, preserving correctness. There is a single selftests BPF program change, but tremendous one: 25x reduction in number of verified instructions and states in trace_virtqueue_add_sgs. Cilium results are more modest, but happen across wider range of programs. SELFTESTS RESULTS ================= $ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results.csv ~/imprecise-aggressive-results.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- loop6.bpf.linked1.o trace_virtqueue_add_sgs 398057 15114 -382943 (-96.20%) 8717 336 -8381 (-96.15%) ------------------- ----------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- CILIUM RESULTS ============== $ ./veristat -C -e file,prog,insns,states ~/imprecise-early-results-cilium.csv ~/imprecise-aggressive-results-cilium.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_host.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%) bpf_host.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%) bpf_host.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%) bpf_host.o tail_nodeport_nat_ipv6_egress 3446 3406 -40 (-1.16%) 203 198 -5 (-2.46%) bpf_lxc.o tail_handle_nat_fwd_ipv4 23426 23221 -205 (-0.88%) 1537 1515 -22 (-1.43%) bpf_lxc.o tail_handle_nat_fwd_ipv6 13009 12904 -105 (-0.81%) 719 708 -11 (-1.53%) bpf_lxc.o tail_ipv4_ct_egress 5074 4897 -177 (-3.49%) 255 248 -7 (-2.75%) bpf_lxc.o tail_ipv4_ct_ingress 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%) bpf_lxc.o tail_ipv4_ct_ingress_policy_only 5100 4923 -177 (-3.47%) 255 248 -7 (-2.75%) bpf_lxc.o tail_ipv6_ct_egress 4558 4536 -22 (-0.48%) 188 187 -1 (-0.53%) bpf_lxc.o tail_ipv6_ct_ingress 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%) bpf_lxc.o tail_ipv6_ct_ingress_policy_only 4578 4556 -22 (-0.48%) 188 187 -1 (-0.53%) bpf_lxc.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%) bpf_overlay.o tail_nodeport_nat_ingress_ipv6 5261 5196 -65 (-1.24%) 247 243 -4 (-1.62%) bpf_overlay.o tail_nodeport_nat_ipv6_egress 3482 3442 -40 (-1.15%) 204 201 -3 (-1.47%) bpf_xdp.o tail_nodeport_nat_egress_ipv4 17200 15619 -1581 (-9.19%) 1111 1010 -101 (-9.09%) ------------- -------------------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221104163649.121784-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff3fc21ce99b..d3b75aa0c54d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2774,6 +2774,31 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, } } +static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + } +} + /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -2852,6 +2877,14 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, * be imprecise. If any child state does require this register to be precise, * we'll mark it precise later retroactively during precise markings * propagation from child state to parent states. + * + * Skipping precise marking setting in current state is a mild version of + * relying on the above observation. But we can utilize this property even + * more aggressively by proactively forgetting any precise marking in the + * current state (which we inherited from the parent state), right before we + * checkpoint it and branch off into new child state. This is done by + * mark_all_scalars_imprecise() to hopefully get more permissive and generic + * finalized states which help in short circuiting more future states. */ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) @@ -12160,6 +12193,10 @@ next: env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; + /* forget precise markings we inherited, see __mark_chain_precision */ + if (env->bpf_capable) + mark_all_scalars_imprecise(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); -- cgit v1.2.3 From 161939abc80ba855ee1739be4a807c6b0f1b8c6c Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 7 Nov 2022 11:52:07 -0500 Subject: docs/bpf: Document BPF_MAP_TYPE_CPUMAP map Add documentation for BPF_MAP_TYPE_CPUMAP including kernel version introduced, usage and examples. Co-developed-by: Lorenzo Bianconi Signed-off-by: Maryam Tahhan Signed-off-by: Lorenzo Bianconi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221107165207.2682075-2-mtahhan@redhat.com --- Documentation/bpf/map_cpumap.rst | 166 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/cpumap.c | 9 ++- 2 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 Documentation/bpf/map_cpumap.rst (limited to 'kernel') diff --git a/Documentation/bpf/map_cpumap.rst b/Documentation/bpf/map_cpumap.rst new file mode 100644 index 000000000000..eaf57b38cafd --- /dev/null +++ b/Documentation/bpf/map_cpumap.rst @@ -0,0 +1,166 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright (C) 2022 Red Hat, Inc. + +=================== +BPF_MAP_TYPE_CPUMAP +=================== + +.. note:: + - ``BPF_MAP_TYPE_CPUMAP`` was introduced in kernel version 4.15 + +.. kernel-doc:: kernel/bpf/cpumap.c + :doc: cpu map + +An example use-case for this map type is software based Receive Side Scaling (RSS). + +The CPUMAP represents the CPUs in the system indexed as the map-key, and the +map-value is the config setting (per CPUMAP entry). Each CPUMAP entry has a dedicated +kernel thread bound to the given CPU to represent the remote CPU execution unit. + +Starting from Linux kernel version 5.9 the CPUMAP can run a second XDP program +on the remote CPU. This allows an XDP program to split its processing across +multiple CPUs. For example, a scenario where the initial CPU (that sees/receives +the packets) needs to do minimal packet processing and the remote CPU (to which +the packet is directed) can afford to spend more cycles processing the frame. The +initial CPU is where the XDP redirect program is executed. The remote CPU +receives raw ``xdp_frame`` objects. + +Usage +===== + +Kernel BPF +---------- +.. c:function:: + long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + + Redirect the packet to the endpoint referenced by ``map`` at index ``key``. + For ``BPF_MAP_TYPE_CPUMAP`` this map contains references to CPUs. + + The lower two bits of ``flags`` are used as the return code if the map lookup + fails. This is so that the return value can be one of the XDP program return + codes up to ``XDP_TX``, as chosen by the caller. + +Userspace +--------- +.. note:: + CPUMAP entries can only be updated/looked up/deleted from user space and not + from an eBPF program. Trying to call these functions from a kernel eBPF + program will result in the program failing to load and a verifier warning. + +.. c:function:: + int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags); + + CPU entries can be added or updated using the ``bpf_map_update_elem()`` + helper. This helper replaces existing elements atomically. The ``value`` parameter + can be ``struct bpf_cpumap_val``. + + .. code-block:: c + + struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; + }; + + The flags argument can be one of the following: + - BPF_ANY: Create a new element or update an existing element. + - BPF_NOEXIST: Create a new element only if it did not exist. + - BPF_EXIST: Update an existing element. + +.. c:function:: + int bpf_map_lookup_elem(int fd, const void *key, void *value); + + CPU entries can be retrieved using the ``bpf_map_lookup_elem()`` + helper. + +.. c:function:: + int bpf_map_delete_elem(int fd, const void *key); + + CPU entries can be deleted using the ``bpf_map_delete_elem()`` + helper. This helper will return 0 on success, or negative error in case of + failure. + +Examples +======== +Kernel +------ + +The following code snippet shows how to declare a ``BPF_MAP_TYPE_CPUMAP`` called +``cpu_map`` and how to redirect packets to a remote CPU using a round robin scheme. + +.. code-block:: c + + struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __type(key, __u32); + __type(value, struct bpf_cpumap_val); + __uint(max_entries, 12); + } cpu_map SEC(".maps"); + + struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 12); + } cpus_available SEC(".maps"); + + struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); + } cpus_iterator SEC(".maps"); + + SEC("xdp") + int xdp_redir_cpu_round_robin(struct xdp_md *ctx) + { + __u32 key = 0; + __u32 cpu_dest = 0; + __u32 *cpu_selected, *cpu_iterator; + __u32 cpu_idx; + + cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key); + if (!cpu_iterator) + return XDP_ABORTED; + cpu_idx = *cpu_iterator; + + *cpu_iterator += 1; + if (*cpu_iterator == bpf_num_possible_cpus()) + *cpu_iterator = 0; + + cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + if (cpu_dest >= bpf_num_possible_cpus()) + return XDP_ABORTED; + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); + } + +Userspace +--------- + +The following code snippet shows how to dynamically set the max_entries for a +CPUMAP to the max number of cpus available on the system. + +.. code-block:: c + + int set_max_cpu_entries(struct bpf_map *cpu_map) + { + if (bpf_map__set_max_entries(cpu_map, libbpf_num_possible_cpus()) < 0) { + fprintf(stderr, "Failed to set max entries for cpu_map map: %s", + strerror(errno)); + return -1; + } + return 0; + } + +References +=========== + +- https://developers.redhat.com/blog/2021/05/13/receive-side-scaling-rss-with-ebpf-and-cpumap#redirecting_into_a_cpumap diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bb03fdba73bb..6b6a78c04b90 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -4,13 +4,16 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ -/* The 'cpumap' is primarily used as a backend map for XDP BPF helper +/** + * DOC: cpu map + * The 'cpumap' is primarily used as a backend map for XDP BPF helper * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. * - * Unlike devmap which redirects XDP frames out another NIC device, + * Unlike devmap which redirects XDP frames out to another NIC device, * this map type redirects raw XDP frames to another CPU. The remote * CPU will do SKB-allocation and call the normal network stack. - * + */ +/* * This is a scalability and isolation mechanism, that allow * separating the early driver network XDP layer, from the rest of the * netstack, and assigning dedicated CPUs for this stage. This -- cgit v1.2.3