diff options
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/arraymap.c | 24 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 1055 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 6 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 665 | ||||
| -rw-r--r-- | kernel/bpf/cpumap.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/devmap.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/disasm.c | 34 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 80 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 98 | ||||
| -rw-r--r-- | kernel/bpf/local_storage.c | 106 | ||||
| -rw-r--r-- | kernel/bpf/lpm_trie.c | 61 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.c | 23 | ||||
| -rw-r--r-- | kernel/bpf/offload.c | 119 | ||||
| -rw-r--r-- | kernel/bpf/percpu_freelist.c | 41 | ||||
| -rw-r--r-- | kernel/bpf/percpu_freelist.h | 4 | ||||
| -rw-r--r-- | kernel/bpf/queue_stack_maps.c | 18 | ||||
| -rw-r--r-- | kernel/bpf/stackmap.c | 20 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 301 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 2048 |
19 files changed, 3911 insertions, 797 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 24583da9ffd1..c72e0d8e1e65 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; + char *val; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -262,17 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) + if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + if (unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) + return -EINVAL; + + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); - else - memcpy(array->value + - array->elem_size * (index & array->index_mask), - value, map->value_size); + } else { + val = array->value + + array->elem_size * (index & array->index_mask); + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, val, value, false); + else + copy_map_value(map, val, value); + } return 0; } @@ -382,6 +391,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, } static int array_map_check_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 378cef70341c..bd3921b1514b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@ #include <uapi/linux/types.h> #include <linux/seq_file.h> #include <linux/compiler.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/slab.h> #include 
<linux/anon_inodes.h> @@ -156,14 +157,14 @@ * */ -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INFO_MASK 0x8f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_VOLATILE] = "VOLATILE", [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", }; struct btf_kind_operations { @@ -271,6 +274,10 @@ struct btf_kind_operations { const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); + int (*check_kflag_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*seq_show)(const struct btf *btf, const struct btf_type *t, @@ -281,6 +288,9 @@ struct btf_kind_operations { static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -306,15 +316,33 @@ static bool btf_type_is_modifier(const struct btf_type *t) static bool btf_type_is_void(const struct btf_type *t) { - /* void => no type and size info. - * Hence, FWD is also treated as void. 
- */ - return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; + return t == &btf_void; +} + +static bool btf_type_is_fwd(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_is_func(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +} + +static bool btf_type_is_func_proto(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +} + +static bool btf_type_nosize(const struct btf_type *t) +{ + return btf_type_is_void(t) || btf_type_is_fwd(t) || + btf_type_is_func(t) || btf_type_is_func_proto(t); } -static bool btf_type_is_void_or_null(const struct btf_type *t) +static bool btf_type_nosize_or_null(const struct btf_type *t) { - return !t || btf_type_is_void(t); + return !t || btf_type_nosize(t); } /* union is only a special case of struct: @@ -327,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + static bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; @@ -395,6 +428,25 @@ static u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static bool btf_type_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static u32 btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) + : member->offset; +} + +static u32 btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? 
BTF_MEMBER_BITFIELD_SIZE(member->offset) + : 0; +} + static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -426,7 +478,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + /* offset must be valid */ + const char *src = &btf->strings[offset]; + const char *src_limit; + + if (!isalpha(*src) && *src != '_') + return false; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isalnum(*src) && *src != '_') + return false; + src++; + } + + return !*src; +} + +static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) return "(anon)"; @@ -436,7 +512,15 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } -static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + +const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { if (type_id > btf->nr_types) return NULL; @@ -446,7 +530,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) /* * Regular int is not a bit field and it must be either - * u8/u16/u32/u64. + * u8/u16/u32/u64 or __int128. 
*/ static bool btf_type_int_is_regular(const struct btf_type *t) { @@ -459,13 +543,55 @@ static bool btf_type_int_is_regular(const struct btf_type *t) if (BITS_PER_BYTE_MASKED(nr_bits) || BTF_INT_OFFSET(int_data) || (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && + nr_bytes != (2 * sizeof(u64)))) { return false; } return true; } +/* + * Check that given struct member is a regular int with expected + * offset and size. + */ +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size) +{ + const struct btf_type *t; + u32 id, int_data; + u8 nr_bits; + + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + if (!t || !btf_type_is_int(t)) + return false; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + if (btf_type_kflag(s)) { + u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); + u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); + + /* if kflag set, int should be a regular int and + * bit offset should be at byte boundary. + */ + return !bitfield_size && + BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && + BITS_ROUNDUP_BYTES(nr_bits) == expected_size; + } + + if (BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(m->offset) || + BITS_ROUNDUP_BYTES(m->offset) != expected_offset || + BITS_PER_BYTE_MASKED(nr_bits) || + BITS_ROUNDUP_BYTES(nr_bits) != expected_size) + return false; + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -506,7 +632,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name_off), + __btf_name_by_offset(btf, t->name_off), log_details ? 
" " : ""); if (log_details) @@ -549,9 +675,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); - __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name_off), - member->type, member->offset); + if (btf_type_kflag(struct_type)) + __btf_verifier_log(log, + "\t%s type_id=%u bitfield_size=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, + BTF_MEMBER_BITFIELD_SIZE(member->offset), + BTF_MEMBER_BIT_OFFSET(member->offset)); + else + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); @@ -740,11 +874,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, /* int, enum or void is a sink */ return !btf_type_needs_resolve(next_type); case RESOLVE_PTR: - /* int, enum, void, struct or array is a sink for ptr */ + /* int, enum, void, struct, array, func or func_proto is a sink + * for ptr + */ return !btf_type_is_modifier(next_type) && !btf_type_is_ptr(next_type); case RESOLVE_STRUCT_OR_ARRAY: - /* int, enum, void or ptr is a sink for struct and array */ + /* int, enum, void, ptr, func or func_proto is a sink + * for struct and array + */ return !btf_type_is_modifier(next_type) && !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); @@ -826,7 +964,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 size = 0; size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_is_void_or_null(size_type)) + if (btf_type_nosize_or_null(size_type)) return NULL; if (btf_type_has_size(size_type)) { @@ -842,7 +980,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, size = btf->resolved_sizes[size_type_id]; size_type_id = btf->resolved_ids[size_type_id]; size_type = btf_type_by_id(btf, size_type_id); - if 
(btf_type_is_void(size_type)) + if (btf_type_nosize_or_null(size_type)) return NULL; } @@ -863,6 +1001,38 @@ static int btf_df_check_member(struct btf_verifier_env *env, return -EINVAL; } +static int btf_df_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_kflag_member"); + return -EINVAL; +} + +/* Used for ptr, array and struct/union type members. + * int, enum and modifier types have their specific callback functions. + */ +static int btf_generic_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + /* bitfield size is 0, so member->offset represents bit offset only. + * It is safe to call non kflag check_member variants. 
+ */ + return btf_type_ops(member_type)->check_member(env, struct_type, + member, + member_type); +} + static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -899,9 +1069,65 @@ static int btf_int_check_member(struct btf_verifier_env *env, nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static int btf_int_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; + u32 int_data = btf_type_int(member_type); + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + + /* a regular int type is required for the kflag int member */ + if (!btf_type_int_is_regular(member_type)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member base type"); + return -EINVAL; + } + + /* check sanity of bitfield size */ + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_int_data_bits = BTF_INT_BITS(int_data); + if (!nr_bits) { + /* Not a bitfield member, member offset must be at byte + * boundary. 
+ */ + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member offset"); + return -EINVAL; + } + + nr_bits = nr_int_data_bits; + } else if (nr_bits > nr_int_data_bits) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); + if (nr_copy_bits > BITS_PER_U128) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -934,6 +1160,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", @@ -943,9 +1174,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); - if (nr_bits > BITS_PER_U64) { + if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", - BITS_PER_U64); + BITS_PER_U128); return -EINVAL; } @@ -986,43 +1217,113 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int128_print(struct seq_file *m, void *data) +{ + /* data points to a __int128 number. 
+ * Suppose + * int128_num = *(__int128 *)data; + * The below formulas shows what upper_num and lower_num represents: + * upper_num = int128_num >> 64; + * lower_num = int128_num & 0xffffffffFFFFFFFFULL; + */ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = *(u64 *)data; + lower_num = *(u64 *)(data + 8); +#else + upper_num = *(u64 *)(data + 8); + lower_num = *(u64 *)data; +#endif + if (upper_num == 0) + seq_printf(m, "0x%llx", lower_num); + else + seq_printf(m, "0x%llx%016llx", upper_num, lower_num); +} + +static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, + u16 right_shift_bits) +{ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = print_num[0]; + lower_num = print_num[1]; +#else + upper_num = print_num[1]; + lower_num = print_num[0]; +#endif + + /* shake out un-needed bits by shift/or operations */ + if (left_shift_bits >= 64) { + upper_num = lower_num << (left_shift_bits - 64); + lower_num = 0; + } else { + upper_num = (upper_num << left_shift_bits) | + (lower_num >> (64 - left_shift_bits)); + lower_num = lower_num << left_shift_bits; + } + + if (right_shift_bits >= 64) { + lower_num = upper_num >> (right_shift_bits - 64); + upper_num = 0; + } else { + lower_num = (lower_num >> right_shift_bits) | + (upper_num << (64 - right_shift_bits)); + upper_num = upper_num >> right_shift_bits; + } + +#ifdef __BIG_ENDIAN_BITFIELD + print_num[0] = upper_num; + print_num[1] = lower_num; +#else + print_num[0] = lower_num; + print_num[1] = upper_num; +#endif +} + +static void btf_bitfield_seq_show(void *data, u8 bits_offset, + u8 nr_bits, struct seq_file *m) { u16 left_shift_bits, right_shift_bits; - u32 int_data = btf_type_int(t); - u8 nr_bits = BTF_INT_BITS(int_data); - u8 total_bits_offset; u8 nr_copy_bytes; u8 nr_copy_bits; - u64 print_num; + u64 print_num[2] = {}; - /* - * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. 
- */ - total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - data += BITS_ROUNDDOWN_BYTES(total_bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num = 0; - memcpy(&print_num, data, nr_copy_bytes); + memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else - left_shift_bits = BITS_PER_U64 - nr_copy_bits; + left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif - right_shift_bits = BITS_PER_U64 - nr_bits; + right_shift_bits = BITS_PER_U128 - nr_bits; + + btf_int128_shift(print_num, left_shift_bits, right_shift_bits); + btf_int128_print(m, print_num); +} - print_num <<= left_shift_bits; - print_num >>= right_shift_bits; - seq_printf(m, "0x%llx", print_num); +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 128 bits. 
+ */ + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_bitfield_seq_show(data, bits_offset, nr_bits, m); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -1041,6 +1342,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, } switch (nr_bits) { + case 128: + btf_int128_print(m, data); + break; case 64: if (sign) seq_printf(m, "%lld", *(s64 *)data); @@ -1074,6 +1378,7 @@ static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, + .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .seq_show = btf_int_seq_show, }; @@ -1103,6 +1408,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env, resolved_type); } +static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, + &resolved_member, + resolved_type); +} + static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -1138,11 +1468,32 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + 
if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } + /* typedef type must have a valid name, and other ref types, + * volatile, const, restrict, should have a null name. + */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } else { + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1163,10 +1514,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return -EINVAL; } - /* "typedef void new_void", "const void"...etc */ - if (btf_type_is_void(next_type)) - goto resolved; - if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); @@ -1177,13 +1524,19 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * save us a few type-following when we use it later (e.g. in * pretty print). 
*/ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (env_type_is_resolved(env, next_type_id)) + next_type = btf_type_id_resolve(btf, &next_type_id); + + /* "typedef void new_void", "const void"...etc */ + if (!btf_type_is_void(next_type) && + !btf_type_is_fwd(next_type) && + !btf_type_is_func_proto(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } } -resolved: env_stack_pop_resolved(env, next_type_id, next_type_size); return 0; @@ -1196,7 +1549,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); if (!next_type) { @@ -1204,10 +1556,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, return -EINVAL; } - /* "void *" */ - if (btf_type_is_void(next_type)) - goto resolved; - if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); @@ -1234,13 +1582,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, resolved_type_id); } - if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; + if (!btf_type_id_size(btf, &next_type_id, NULL)) { + if (env_type_is_resolved(env, next_type_id)) + next_type = btf_type_id_resolve(btf, &next_type_id); + + if (!btf_type_is_void(next_type) && + !btf_type_is_fwd(next_type) && + !btf_type_is_func_proto(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } } -resolved: env_stack_pop_resolved(env, next_type_id, 0); return 0; @@ -1274,6 
+1627,7 @@ static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, + .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_modifier_seq_show, }; @@ -1282,6 +1636,7 @@ static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_ptr_seq_show, }; @@ -1300,16 +1655,30 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* fwd type must have a valid name */ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; } +static void btf_fwd_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "%s", btf_type_kflag(t) ? 
"union" : "struct"); +} + static struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, - .log_details = btf_ref_type_log, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_fwd_type_log, .seq_show = btf_df_seq_show, }; @@ -1356,11 +1725,22 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* array type should not have a name */ + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; @@ -1396,7 +1776,7 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_void_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1415,7 +1795,7 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_void_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -1484,6 +1864,7 @@ static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .seq_show = btf_array_seq_show, }; @@ -1522,6 +1903,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; + 
u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1532,6 +1914,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); last_offset = 0; @@ -1543,6 +1932,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct member either no name or a valid one */ + if (member->name_off && + !btf_name_valid_identifier(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, "Invalid name"); + return -EINVAL; + } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, @@ -1550,7 +1945,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (is_union && member->offset) { + offset = btf_member_bit_offset(t, member); + if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; @@ -1560,20 +1956,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, * ">" instead of ">=" because the last member could be * "char a[0];" */ - if (last_offset > member->offset) { + if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } - if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, - "Memmber bits_offset exceeds its struct size"); + "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); - last_offset = member->offset; + last_offset = offset; } return meta_needed; @@ -1603,9 +1999,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, last_member_type = 
btf_type_by_id(env->btf, last_member_type_id); - err = btf_type_ops(last_member_type)->check_member(env, v->t, - last_member, - last_member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, + last_member, + last_member_type); + else + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); if (err) return err; } @@ -1615,7 +2016,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_void_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -1627,9 +2028,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, return env_stack_push(env, member_type, member_type_id); } - err = btf_type_ops(member_type)->check_member(env, v->t, - member, - member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(member_type)->check_kflag_member(env, v->t, + member, + member_type); + else + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); if (err) return err; } @@ -1645,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +/* find 'struct bpf_spin_lock' in map value. 
+ * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + if (!__btf_type_is_struct(t)) + return -EINVAL; + + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + if (!__btf_type_is_struct(member_type)) + continue; + if (member_type->size != sizeof(struct bpf_spin_lock)) + continue; + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), + "bpf_spin_lock")) + continue; + if (off != -ENOENT) + /* only one 'struct bpf_spin_lock' is allowed */ + return -E2BIG; + off = btf_member_bit_offset(t, member); + if (off % 8) + /* valid C code cannot generate such BTF */ + return -EINVAL; + off /= 8; + if (off % __alignof__(struct bpf_spin_lock)) + /* valid struct bpf_spin_lock will be 4 byte aligned */ + return -EINVAL; + } + return off; +} + static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1657,17 +2100,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - u32 member_offset = member->offset; - u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); const struct btf_kind_operations *ops; + u32 member_offset, bitfield_size; + u32 bytes_offset; + u8 bits8_offset; if (i) seq_puts(m, seq); - ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + member_offset = btf_member_bit_offset(t, member); + bitfield_size = btf_member_bitfield_size(t, member); + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + if (bitfield_size) { + btf_bitfield_seq_show(data + 
bytes_offset, bits8_offset, + bitfield_size, m); + } else { + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } } seq_puts(m, "}"); } @@ -1676,6 +2128,7 @@ static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .seq_show = btf_struct_seq_show, }; @@ -1705,6 +2158,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env, return 0; } +static int btf_enum_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, bytes_end, struct_size; + u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; + + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + if (!nr_bits) { + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + nr_bits = int_bitsize; + } else if (nr_bits > int_bitsize) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); + if (struct_size < bytes_end) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1724,12 +2212,24 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size != sizeof(int)) { btf_verifier_log_type(env, t, 
"Expected size:%zu", sizeof(int)); return -EINVAL; } + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { @@ -1739,8 +2239,16 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name_off), + __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1764,7 +2272,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name_off)); + __btf_name_by_offset(btf, + enums[i].name_off)); return; } } @@ -1776,10 +2285,249 @@ static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .seq_show = btf_enum_seq_show, }; +static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_func_proto_log(struct 
btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_param *args = (const struct btf_param *)(t + 1); + u16 nr_args = btf_type_vlen(t), i; + + btf_verifier_log(env, "return=%u args=(", t->type); + if (!nr_args) { + btf_verifier_log(env, "void"); + goto done; + } + + if (nr_args == 1 && !args[0].type) { + /* Only one vararg */ + btf_verifier_log(env, "vararg"); + goto done; + } + + btf_verifier_log(env, "%u %s", args[0].type, + __btf_name_by_offset(env->btf, + args[0].name_off)); + for (i = 1; i < nr_args - 1; i++) + btf_verifier_log(env, ", %u %s", args[i].type, + __btf_name_by_offset(env->btf, + args[i].name_off)); + + if (nr_args > 1) { + const struct btf_param *last_arg = &args[nr_args - 1]; + + if (last_arg->type) + btf_verifier_log(env, ", %u %s", last_arg->type, + __btf_name_by_offset(env->btf, + last_arg->name_off)); + else + btf_verifier_log(env, ", vararg"); + } + +done: + btf_verifier_log(env, ")"); +} + +static struct btf_kind_operations func_proto_ops = { + .check_meta = btf_func_proto_check_meta, + .resolve = btf_df_resolve, + /* + * BTF_KIND_FUNC_PROTO cannot be directly referred by + * a struct's member. + * + * It should be a function pointer instead. + * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) + * + * Hence, there is no btf_func_check_member(). 
+ */ + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_func_proto_log, + .seq_show = btf_df_seq_show, +}; + +static s32 btf_func_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static struct btf_kind_operations func_ops = { + .check_meta = btf_func_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, +}; + +static int btf_func_proto_check(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_type *ret_type; + const struct btf_param *args; + const struct btf *btf; + u16 nr_args, i; + int err; + + btf = env->btf; + args = (const struct btf_param *)(t + 1); + nr_args = btf_type_vlen(t); + + /* Check func return type which could be "void" (t->type == 0) */ + if (t->type) { + u32 ret_type_id = t->type; + + ret_type = btf_type_by_id(btf, ret_type_id); + if (!ret_type) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + + if (btf_type_needs_resolve(ret_type) && + !env_type_is_resolved(env, ret_type_id)) { + err = btf_resolve(env, ret_type, ret_type_id); + if (err) + return err; + } + + /* Ensure the return type is a type that has a size */ + if (!btf_type_id_size(btf, &ret_type_id, NULL)) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + } + + if (!nr_args) + return 0; + + /* Last func arg type_id could be 0 if it is a vararg */ + if 
(!args[nr_args - 1].type) { + if (args[nr_args - 1].name_off) { + btf_verifier_log_type(env, t, "Invalid arg#%u", + nr_args); + return -EINVAL; + } + nr_args--; + } + + err = 0; + for (i = 0; i < nr_args; i++) { + const struct btf_type *arg_type; + u32 arg_type_id; + + arg_type_id = args[i].type; + arg_type = btf_type_by_id(btf, arg_type_id); + if (!arg_type) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + + if (args[i].name_off && + (!btf_name_offset_valid(btf, args[i].name_off) || + !btf_name_valid_identifier(btf, args[i].name_off))) { + btf_verifier_log_type(env, t, + "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + + if (btf_type_needs_resolve(arg_type) && + !env_type_is_resolved(env, arg_type_id)) { + err = btf_resolve(env, arg_type, arg_type_id); + if (err) + break; + } + + if (!btf_type_id_size(btf, &arg_type_id, NULL)) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + } + + return err; +} + +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_type *proto_type; + const struct btf_param *args; + const struct btf *btf; + u16 nr_args, i; + + btf = env->btf; + proto_type = btf_type_by_id(btf, t->type); + + if (!proto_type || !btf_type_is_func_proto(proto_type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + args = (const struct btf_param *)(proto_type + 1); + nr_args = btf_type_vlen(proto_type); + for (i = 0; i < nr_args; i++) { + if (!args[i].name_off && args[i].type) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; + } + } + + return 0; +} + static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_INT] = &int_ops, [BTF_KIND_PTR] = &ptr_ops, @@ -1792,6 +2540,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_VOLATILE] = &modifier_ops, [BTF_KIND_CONST] = &modifier_ops, [BTF_KIND_RESTRICT] 
= &modifier_ops, + [BTF_KIND_FUNC] = &func_ops, + [BTF_KIND_FUNC_PROTO] = &func_proto_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -1863,30 +2613,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env) return 0; } -static int btf_resolve(struct btf_verifier_env *env, - const struct btf_type *t, u32 type_id) -{ - const struct resolve_vertex *v; - int err = 0; - - env->resolve_mode = RESOLVE_TBD; - env_stack_push(env, t, type_id); - while (!err && (v = env_stack_peak(env))) { - env->log_type_id = v->type_id; - err = btf_type_ops(v->t)->resolve(env, v); - } - - env->log_type_id = type_id; - if (err == -E2BIG) - btf_verifier_log_type(env, t, - "Exceeded max resolving depth:%u", - MAX_RESOLVE_DEPTH); - else if (err == -EEXIST) - btf_verifier_log_type(env, t, "Loop detected"); - - return err; -} - static bool btf_resolve_valid(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) @@ -1920,6 +2646,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; } +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + u32 save_log_type_id = env->log_type_id; + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) { + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + } else if (err == -EEXIST) { + btf_verifier_log_type(env, t, "Loop detected"); + } + + /* Final sanity check */ + if (!err && !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + err = -EINVAL; + } + + env->log_type_id = save_log_type_id; + return err; +} + static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -1942,10 +2701,16 @@ static 
int btf_check_all_types(struct btf_verifier_env *env) return err; } - if (btf_type_needs_resolve(t) && - !btf_resolve_valid(env, t, type_id)) { - btf_verifier_log_type(env, t, "Invalid resolve state"); - return -EINVAL; + if (btf_type_is_func_proto(t)) { + err = btf_func_proto_check(env, t); + if (err) + return err; + } + + if (btf_type_is_func(t)) { + err = btf_func_check(env, t); + if (err) + return err; } } @@ -2067,56 +2832,47 @@ static int btf_check_sec_info(struct btf_verifier_env *env, return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, - u32 btf_data_size) +static int btf_parse_hdr(struct btf_verifier_env *env) { + u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; - u32 hdr_len, hdr_copy; - /* - * Minimal part of the "struct btf_header" that - * contains the hdr_len. - */ - struct btf_min_header { - u16 magic; - u8 version; - u8 flags; - u32 hdr_len; - } __user *min_hdr; struct btf *btf; int err; btf = env->btf; - min_hdr = btf_data; + btf_data_size = btf->data_size; - if (btf_data_size < sizeof(*min_hdr)) { + if (btf_data_size < + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } - if (get_user(hdr_len, &min_hdr->hdr_len)) - return -EFAULT; - + hdr = btf->data; + hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); - if (err) { - if (err == -E2BIG) - btf_verifier_log(env, "Unsupported btf_header"); - return err; + /* Ensure the unsupported header fields are zero */ + if (hdr_len > sizeof(btf->hdr)) { + u8 *expected_zero = btf->data + sizeof(btf->hdr); + u8 *end = btf->data + hdr_len; + + for (; expected_zero < end; expected_zero++) { + if (*expected_zero) { + btf_verifier_log(env, "Unsupported btf_header"); + return -E2BIG; + } + } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); - if 
(copy_from_user(&btf->hdr, btf_data, hdr_copy)) - return -EFAULT; + memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; - if (hdr->hdr_len != hdr_len) - return -EINVAL; - btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { @@ -2186,10 +2942,6 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, } env->btf = btf; - err = btf_parse_hdr(env, btf_data, btf_data_size); - if (err) - goto errout; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; @@ -2198,13 +2950,18 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; - btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + err = btf_parse_str_sec(env); if (err) goto errout; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9425c2fb872f..4e807973aa80 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -230,6 +230,7 @@ cleanup: * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach * @type: Type of attach operation + * @flags: Option flags * * Must be called with cgroup_mutex held. */ @@ -363,7 +364,7 @@ cleanup: * Must be called with cgroup_mutex held. 
*/ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 unused_flags) + enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; enum bpf_cgroup_storage_type stype; @@ -572,7 +573,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - bpf_prog_run_save_cb); + __bpf_prog_run_save_cb); bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; @@ -718,6 +719,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* fall through */ default: return NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c7eeea8cffc..ff09d32a8a1b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,12 +21,14 @@ * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ +#include <uapi/linux/btf.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/frame.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> @@ -52,6 +54,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -75,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -101,8 +104,119 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t 
gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + int cpu; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); +int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) +{ + if (!prog->aux->nr_linfo || !prog->jit_requested) + return 0; + + prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, + sizeof(*prog->aux->jited_linfo), + GFP_KERNEL | __GFP_NOWARN); + if (!prog->aux->jited_linfo) + return -ENOMEM; + + return 0; +} + +void bpf_prog_free_jited_linfo(struct bpf_prog *prog) +{ + kfree(prog->aux->jited_linfo); + prog->aux->jited_linfo = NULL; +} + +void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) +{ + if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) + bpf_prog_free_jited_linfo(prog); +} + +/* The jit engine is responsible to provide an array + * for insn_off to the jited_off mapping (insn_to_jit_off). + * + * The idx to this array is the insn_off. Hence, the insn_off + * here is relative to the prog itself instead of the main prog. + * This array has one entry for each xlated bpf insn. + * + * jited_off is the byte off to the last byte of the jited insn. + * + * Hence, with + * insn_start: + * The first bpf insn off of the prog. The insn off + * here is relative to the main prog. + * e.g. 
if prog is a subprog, insn_start > 0 + * linfo_idx: + * The prog's idx to prog->aux->linfo and jited_linfo + * + * jited_linfo[linfo_idx] = prog->bpf_func + * + * For i > linfo_idx, + * + * jited_linfo[i] = prog->bpf_func + + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] + */ +void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, + const u32 *insn_to_jit_off) +{ + u32 linfo_idx, insn_start, insn_end, nr_linfo, i; + const struct bpf_line_info *linfo; + void **jited_linfo; + + if (!prog->aux->jited_linfo) + /* Userspace did not provide linfo */ + return; + + linfo_idx = prog->aux->linfo_idx; + linfo = &prog->aux->linfo[linfo_idx]; + insn_start = linfo[0].insn_off; + insn_end = insn_start + prog->len; + + jited_linfo = &prog->aux->jited_linfo[linfo_idx]; + jited_linfo[0] = prog->bpf_func; + + nr_linfo = prog->aux->nr_linfo - linfo_idx; + + for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) + /* The verifier ensures that linfo[i].insn_off is + * strictly increasing + */ + jited_linfo[i] = prog->bpf_func + + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; +} + +void bpf_prog_free_linfo(struct bpf_prog *prog) +{ + bpf_prog_free_jited_linfo(prog); + kvfree(prog->aux->linfo); +} + struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { @@ -143,7 +257,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -219,15 +336,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, u32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s32 delta = end_new - end_old; s64 imm = insn->imm; - if (curr < pos 
&& curr + imm + 1 > pos) + if (curr < pos && curr + imm + 1 >= end_old) imm += delta; - else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; @@ -236,15 +354,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, u32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 delta = end_new - end_old; s32 off = insn->off; - if (curr < pos && curr + off + 1 > pos) + if (curr < pos && curr + off + 1 >= end_old) off += delta; - else if (curr > pos + delta && curr + off + 1 <= pos + delta) + else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; @@ -253,10 +372,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, - const bool probe_pass) +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, + s32 end_new, const bool probe_pass) { - u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); + u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; @@ -268,22 +387,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { - i += delta + 1; - insn++; + i = end_new; + insn = prog->insnsi + end_old; } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP || + if ((BPF_CLASS(code) != BPF_JMP && + BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. 
*/ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; - ret = bpf_adj_delta_to_imm(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_imm(insn, pos, end_old, + end_new, i, probe_pass); } else { - ret = bpf_adj_delta_to_off(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_off(insn, pos, end_old, + end_new, i, probe_pass); } if (ret) break; @@ -292,6 +412,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, return ret; } +static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) +{ + struct bpf_line_info *linfo; + u32 i, nr_linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo || !delta) + return; + + linfo = prog->aux->linfo; + + for (i = 0; i < nr_linfo; i++) + if (off < linfo[i].insn_off) + break; + + /* Push all off < linfo[i].insn_off by delta */ + for (; i < nr_linfo; i++) + linfo[i].insn_off += delta; +} + struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { @@ -313,7 +453,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, insn_delta, true)) + bpf_adj_branches(prog, off, off + 1, off + len, true)) return NULL; /* Several new instructions need to be inserted. Make room @@ -345,11 +485,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. 
*/ - BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); + BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); + + bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } +int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) +{ + /* Branch offsets can't overflow when program is shrinking, no need + * to call bpf_adj_branches(..., true) here + */ + memmove(prog->insnsi + off, prog->insnsi + off + cnt, + sizeof(struct bpf_insn) * (prog->len - off - cnt)); + prog->len -= cnt; + + return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); +} + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; @@ -369,6 +523,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; +long bpf_jit_limit __read_mostly; static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -384,9 +539,11 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; + const struct btf_type *type; + const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + @@ -401,6 +558,16 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + + /* prog->aux->name will be ignored if full btf name is available */ + if (prog->aux->func_info_cnt) { + type = btf_type_by_id(prog->aux->btf, + prog->aux->func_info[prog->aux->func_idx].type_id); + func_name = btf_name_by_offset(prog->aux->btf, type->name_off); + snprintf(sym, (size_t)(end - sym), "_%s", func_name); + return; + } + if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), 
"_%s", prog->aux->name); else @@ -550,7 +717,6 @@ bool is_bpf_text_address(unsigned long addr) int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog_aux *aux; unsigned int it = 0; int ret = -ERANGE; @@ -563,10 +729,9 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); bpf_get_prog_name(aux->prog, sym); - *value = symbol_start; + *value = (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; ret = 0; @@ -577,27 +742,85 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +static atomic_long_t bpf_jit_current; + +/* Can be overridden by an arch's JIT compiler if it has a custom, + * dedicated BPF backend memory area, or if neither of the two + * below apply. + */ +u64 __weak bpf_jit_alloc_exec_limit(void) +{ +#if defined(MODULES_VADDR) + return MODULES_END - MODULES_VADDR; +#else + return VMALLOC_END - VMALLOC_START; +#endif +} + +static int __init bpf_jit_charge_init(void) +{ + /* Only used as heuristic here to derive limit. 
*/ + bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + PAGE_SIZE), LONG_MAX); + return 0; +} +pure_initcall(bpf_jit_charge_init); + +static int bpf_jit_charge_modmem(u32 pages) +{ + if (atomic_long_add_return(pages, &bpf_jit_current) > + (bpf_jit_limit >> PAGE_SHIFT)) { + if (!capable(CAP_SYS_ADMIN)) { + atomic_long_sub(pages, &bpf_jit_current); + return -EPERM; + } + } + + return 0; +} + +static void bpf_jit_uncharge_modmem(u32 pages) +{ + atomic_long_sub(pages, &bpf_jit_current); +} + +void *__weak bpf_jit_alloc_exec(unsigned long size) +{ + return module_alloc(size); +} + +void __weak bpf_jit_free_exec(void *addr) +{ + module_memfree(addr); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - unsigned int size, hole, start; + u32 size, hole, start, pages; /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - hdr = module_alloc(size); - if (hdr == NULL) + pages = size / PAGE_SIZE; + + if (bpf_jit_charge_modmem(pages)) return NULL; + hdr = bpf_jit_alloc_exec(size); + if (!hdr) { + bpf_jit_uncharge_modmem(pages); + return NULL; + } /* Fill space with illegal/arch-dep instructions. 
*/ bpf_fill_ill_insns(hdr, size); - hdr->pages = size / PAGE_SIZE; + hdr->pages = pages; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -610,7 +833,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - module_memfree(hdr); + u32 pages = hdr->pages; + + bpf_jit_free_exec(hdr); + bpf_jit_uncharge_modmem(pages); } /* This symbol is only overridden by archs that have different @@ -631,6 +857,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -642,6 +902,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. 
It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! + */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { @@ -698,6 +978,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, + off); + break; + case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -834,32 +1135,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. 
*/ \ - INSN_3(ALU, ADD, X), \ - INSN_3(ALU, SUB, X), \ - INSN_3(ALU, AND, X), \ - INSN_3(ALU, OR, X), \ - INSN_3(ALU, LSH, X), \ - INSN_3(ALU, RSH, X), \ - INSN_3(ALU, XOR, X), \ - INSN_3(ALU, MUL, X), \ - INSN_3(ALU, MOV, X), \ - INSN_3(ALU, DIV, X), \ - INSN_3(ALU, MOD, X), \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, ARSH, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ - INSN_3(ALU, ADD, K), \ - INSN_3(ALU, SUB, K), \ - INSN_3(ALU, AND, K), \ - INSN_3(ALU, OR, K), \ - INSN_3(ALU, LSH, K), \ - INSN_3(ALU, RSH, K), \ - INSN_3(ALU, XOR, K), \ - INSN_3(ALU, MUL, K), \ - INSN_3(ALU, MOV, K), \ - INSN_3(ALU, DIV, K), \ - INSN_3(ALU, MOD, K), \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, ARSH, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ @@ -892,6 +1195,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ + /* 32-bit Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP32, JEQ, X), \ + INSN_3(JMP32, JNE, X), \ + INSN_3(JMP32, JGT, X), \ + INSN_3(JMP32, JLT, X), \ + INSN_3(JMP32, JGE, X), \ + INSN_3(JMP32, JLE, X), \ + INSN_3(JMP32, JSGT, X), \ + INSN_3(JMP32, JSLT, X), \ + INSN_3(JMP32, JSGE, X), \ + INSN_3(JMP32, JSLE, X), \ + INSN_3(JMP32, JSET, X), \ + /* Immediate based. 
*/ \ + INSN_3(JMP32, JEQ, K), \ + INSN_3(JMP32, JNE, K), \ + INSN_3(JMP32, JGT, K), \ + INSN_3(JMP32, JLT, K), \ + INSN_3(JMP32, JGE, K), \ + INSN_3(JMP32, JLE, K), \ + INSN_3(JMP32, JSGT, K), \ + INSN_3(JMP32, JSLT, K), \ + INSN_3(JMP32, JSGE, K), \ + INSN_3(JMP32, JSLE, K), \ + INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ @@ -964,14 +1292,14 @@ bool bpf_opcode_in_insntable(u8 code) #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context - * @ctx: is the data we are operating on + * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions + * @stack: is the eBPF storage stack * * Decode and execute eBPF instructions. */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1038,6 +1366,12 @@ select_insn: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; + ALU_ARSH_X: + DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + CONT; + ALU_ARSH_K: + DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; @@ -1045,36 +1379,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + 
do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1147,145 +1481,49 @@ select_insn: out: CONT; } - /* JMP */ JMP_JA: insn += insn->off; CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_X: - if (DST < SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_K: - if (DST < IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_X: - if (DST <= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_K: - if (DST <= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_X: - if (((s64) DST) < ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_K: - if (((s64) DST) < ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_X: - if (((s64) DST) <= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_K: - if (((s64) DST) <= ((s64) 
IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; JMP_EXIT: return BPF_R0; - + /* JMP */ +#define COND_JMP(SIGN, OPCODE, CMP_OP) \ + JMP_##OPCODE##_X: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_X: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP_##OPCODE##_K: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_K: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; + COND_JMP(u, JEQ, ==) + COND_JMP(u, JNE, !=) + COND_JMP(u, JGT, >) + COND_JMP(u, JLT, <) + COND_JMP(u, JGE, >=) + COND_JMP(u, JLE, <=) + COND_JMP(u, JSET, &) + COND_JMP(s, JSGT, >) + COND_JMP(s, JSLT, <) + COND_JMP(s, JSGE, >=) + COND_JMP(s, JSLE, <=) +#undef COND_JMP /* STX and ST and LDX*/ #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ @@ -1330,7 +1568,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ @@ -1343,7 +1581,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ @@ -1484,13 +1722,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * be JITed, but falls back to the interpreter. 
*/ if (!bpf_prog_is_dev_bound(fp->aux)) { + *err = bpf_prog_alloc_jited_linfo(fp); + if (*err) + return fp; + fp = bpf_int_jit_compile(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { + bpf_prog_free_jited_linfo(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON *err = -ENOTSUPP; return fp; - } #endif + } else { + bpf_prog_free_unused_jited_linfo(fp); + } } else { *err = bpf_prog_offload_compile(fp); if (*err) @@ -1786,6 +2031,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_spin_lock_proto __weak; +const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -1851,6 +2098,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; + /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 24aac0d0f412..8974b3755670 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -183,7 +183,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. 
*/ - frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - xdpf->headroom; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 141710b82a6c..191b79948424 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -512,8 +512,7 @@ static int dev_map_notification(struct notifier_block *notifier, struct bpf_dtab_netdev *dev, *odev; dev = READ_ONCE(dtab->netdev_map[i]); - if (!dev || - dev->dev->ifindex != netdev->ifindex) + if (!dev || netdev != dev->dev) continue; odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); if (dev == odev) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d6b76377cb6e..de73f55e42fd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = { [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", + [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; @@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", + class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { - verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? 
"(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", insn->imm); } } else if (class == BPF_STX) { @@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s %c%d goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], + class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { - verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s 0x%x goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 
'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2c1790288138..fed15cf94dca 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); BUILD_BUG_ON(offsetof(struct htab_elem, htab) != @@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr) */ return -EPERM; + if (zero_seed && !capable(CAP_SYS_ADMIN)) + /* Guard against local DoS, and discourage production use. 
*/ + return -EPERM; + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ return -EINVAL; @@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_htab; - htab->hashrnd = get_random_int(); + if (htab->map.map_flags & BPF_F_ZERO_SEED) + htab->hashrnd = 0; + else + htab->hashrnd = get_random_int(); + for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); @@ -677,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } if (htab_is_prealloc(htab)) { - pcpu_freelist_push(&htab->freelist, &l->fnode); + __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); l->htab = htab; @@ -709,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) BITS_PER_LONG == 64; } -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) -{ - u32 size = htab->map.value_size; - - if (percpu || fd_htab_map_needs_adjust(htab)) - size = round_up(size, 8); - return size; -} - static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab_size_value(htab, percpu); + u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -739,7 +739,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else { struct pcpu_freelist_node *l; - l = pcpu_freelist_pop(&htab->freelist); + l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); @@ -761,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } + check_and_init_map_lock(&htab->map, + l_new->key + round_up(key_size, 8)); } 
memcpy(l_new->key, key, key_size); if (percpu) { + size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -782,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); - } else { + } else if (fd_htab_map_needs_adjust(htab)) { + size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); + } else { + copy_map_value(&htab->map, + l_new->key + round_up(key_size, 8), + value); } l_new->hash = hash; @@ -796,11 +804,11 @@ dec_count: static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (l_old && map_flags == BPF_NOEXIST) + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; - if (!l_old && map_flags == BPF_EXIST) + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; @@ -819,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -832,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, b = __select_bucket(htab, hash); head = &b->head; + if (unlikely(map_flags & BPF_F_LOCK)) { + if (unlikely(!map_value_has_spin_lock(map))) + return -EINVAL; + /* find an element without taking the bucket lock */ + l_old = lookup_nulls_elem_raw(head, hash, key, key_size, + htab->n_buckets); + ret = check_flags(htab, l_old, map_flags); + if (ret) + return ret; + if (l_old) { + /* grab the element lock and update value in place */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + return 0; + } + /* fall through, grab the bucket lock and lookup again. 
+ * 99.9% chance that the element won't be found, + * but second lookup under lock has to be done. + */ + } + /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); @@ -841,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { + /* first lookup without the bucket lock didn't find the element, + * but second lookup with the bucket lock found it. + * This case is highly unlikely, but has to be dealt with: + * grab the element lock in addition to the bucket lock + * and update element in place + */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + ret = 0; + goto err; + } + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ab0d5e3f9892..a411fc17d265 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -223,6 +221,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg2_type = ARG_CONST_SIZE, }; +#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + union { + __u32 val; + arch_spinlock_t lock; + } u = { .lock = 
__ARCH_SPIN_LOCK_UNLOCKED }; + + compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); + BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); + BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + arch_spin_lock(l); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + + arch_spin_unlock(l); +} + +#else + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); + do { + atomic_cond_read_relaxed(l, !VAL); + } while (atomic_xchg(l, 1)); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + atomic_set_release(l, 0); +} + +#endif + +static DEFINE_PER_CPU(unsigned long, irqsave_flags); + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + local_irq_save(flags); + __bpf_spin_lock(lock); + __this_cpu_write(irqsave_flags, flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_lock_proto = { + .func = bpf_spin_lock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + flags = __this_cpu_read(irqsave_flags); + __bpf_spin_unlock(lock); + local_irq_restore(flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_unlock_proto = { + .func = bpf_spin_unlock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src) +{ + struct bpf_spin_lock *lock; + + if (lock_src) + lock = src + map->spin_lock_off; + else + lock = dst + map->spin_lock_off; + preempt_disable(); + ____bpf_spin_lock(lock); + copy_map_value(map, dst, src); + ____bpf_spin_unlock(lock); + preempt_enable(); +} + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/local_storage.c 
b/kernel/bpf/local_storage.c index c97a8f968638..6b572e2de7fb 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,14 +1,15 @@ //SPDX-License-Identifier: GPL-2.0 #include <linux/bpf-cgroup.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/bug.h> #include <linux/filter.h> #include <linux/mm.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include <uapi/linux/btf.h> -DEFINE_PER_CPU(struct bpf_cgroup_storage*, - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -130,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags != BPF_ANY && flags != BPF_EXIST) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) + return -EINVAL; + + if (unlikely(flags & BPF_NOEXIST)) + return -EINVAL; + + if (unlikely((flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -138,13 +146,20 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, if (!storage) return -ENOENT; + if (flags & BPF_F_LOCK) { + copy_map_value_locked(map, storage->buf->data, value, false); + return 0; + } + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; memcpy(&new->data[0], value, map->value_size); + check_and_init_map_lock(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); @@ -308,6 +323,85 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static int cgroup_storage_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct 
btf_member *m; + u32 offset, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) + return -EINVAL; + + return 0; +} + +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, + struct seq_file *m) +{ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map_to_storage(map), key, false); + if (!storage) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) { + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &READ_ONCE(storage->buf)->data[0], m); + seq_puts(m, "\n"); + } else { + seq_puts(m, ": {\n"); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(storage->percpu_buf, cpu), + m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + } + rcu_read_unlock(); +} + const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = 
cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -315,7 +409,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = cgroup_storage_check_btf, + .map_seq_show_elem = cgroup_storage_seq_show_elem, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) @@ -401,6 +496,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, storage->buf = kmalloc_node(size, flags, map->numa_node); if (!storage->buf) goto enomem; + check_and_init_map_lock(map, storage->buf->data); } else { storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); if (!storage->percpu_buf) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9058317ba9de..93a5cbbde421 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, const struct bpf_lpm_trie_key *key) { - size_t prefixlen = 0; - size_t i; + u32 limit = min(node->prefixlen, key->prefixlen); + u32 prefixlen = 0, i = 0; - for (i = 0; i < trie->data_size; i++) { - size_t b; + BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); - b = 8 - fls(node->data[i] ^ key->data[i]); - prefixlen += b; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) - if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) - return min(node->prefixlen, key->prefixlen); + /* data_size >= 16 has very small probability. + * We do not use a loop for optimal code generation. 
+ */ + if (trie->data_size >= 8) { + u64 diff = be64_to_cpu(*(__be64 *)node->data ^ + *(__be64 *)key->data); + + prefixlen = 64 - fls64(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i = 8; + } +#endif + + while (trie->data_size >= i + 4) { + u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^ + *(__be32 *)&key->data[i]); + + prefixlen += 32 - fls(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i += 4; + } - if (b < 8) - break; + if (trie->data_size >= i + 2) { + u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^ + *(__be16 *)&key->data[i]); + + prefixlen += 16 - fls(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i += 2; + } + + if (trie->data_size >= i + 1) { + prefixlen += 8 - fls(node->data[i] ^ key->data[i]); + + if (prefixlen >= limit) + return limit; } return prefixlen; @@ -432,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; @@ -689,6 +729,7 @@ free_stack: } static int trie_check_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 99d243e1ad6e..3dff41403583 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; + u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); @@ -36,7 +37,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (map_value_has_spin_lock(inner_map)) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + inner_map_meta_size = sizeof(*inner_map_meta); + /* In some cases verifier needs to 
access beyond just base map. */ + if (inner_map->ops == &array_map_ops) + inner_map_meta_size = sizeof(struct bpf_array); + + inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); @@ -46,8 +57,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; - inner_map_meta->ops = inner_map->ops; inner_map_meta->max_entries = inner_map->max_entries; + inner_map_meta->spin_lock_off = inner_map->spin_lock_off; + + /* Misc members not needed in bpf_map_meta_equal() check. */ + inner_map_meta->ops = inner_map->ops; + if (inner_map->ops == &array_map_ops) { + inner_map_meta->unpriv_array = inner_map->unpriv_array; + container_of(inner_map_meta, struct bpf_array, map)->index_mask = + container_of(inner_map, struct bpf_array, map)->index_mask; + } fdput(f); return inner_map_meta; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8e93c47f0779..ba635209ae9a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -33,7 +33,9 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { + const struct bpf_prog_offload_ops *ops; struct list_head netdevs; + void *priv; }; struct bpf_offload_netdev { @@ -106,6 +108,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) err = -EINVAL; goto err_unlock; } + offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); @@ -121,40 +124,20 @@ err_maybe_put: return err; } -static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, - struct netdev_bpf *data) +int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { - struct bpf_prog_offload *offload = prog->aux->offload; - struct net_device *netdev; - - ASSERT_RTNL(); - - if (!offload) - return -ENODEV; - netdev = offload->netdev; - - 
data->command = cmd; - - return netdev->netdev_ops->ndo_bpf(netdev, data); -} - -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) -{ - struct netdev_bpf data = {}; - int err; - - data.verifier.prog = env->prog; + struct bpf_prog_offload *offload; + int ret = -ENODEV; - rtnl_lock(); - err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); - if (err) - goto exit_unlock; + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + if (offload) { + ret = offload->offdev->ops->prepare(prog); + offload->dev_state = !ret; + } + up_read(&bpf_devs_lock); - env->prog->aux->offload->dev_ops = data.verifier.ops; - env->prog->aux->offload->dev_state = true; -exit_unlock: - rtnl_unlock(); - return err; + return ret; } int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, @@ -166,7 +149,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) - ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + ret = offload->offdev->ops->insn_hook(env, insn_idx, + prev_insn_idx); up_read(&bpf_devs_lock); return ret; @@ -180,8 +164,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { - if (offload->dev_ops->finalize) - ret = offload->dev_ops->finalize(env); + if (offload->offdev->ops->finalize) + ret = offload->offdev->ops->finalize(env); else ret = 0; } @@ -190,15 +174,47 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) return ret; } +void +bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, + struct bpf_insn *insn) +{ + const struct bpf_prog_offload_ops *ops; + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + ops = offload->offdev->ops; + if (!offload->opt_failed && ops->replace_insn) + ret = ops->replace_insn(env, off, insn); + 
offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + +void +bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (!offload->opt_failed && offload->offdev->ops->remove_insns) + ret = offload->offdev->ops->remove_insns(env, off, cnt); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; - struct netdev_bpf data = {}; - - data.offload.prog = prog; if (offload->dev_state) - WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + offload->offdev->ops->destroy(prog); /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ bpf_prog_free_id(prog, true); @@ -210,24 +226,22 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) void bpf_prog_offload_destroy(struct bpf_prog *prog) { - rtnl_lock(); down_write(&bpf_devs_lock); if (prog->aux->offload) __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); - rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct netdev_bpf data = {}; - int ret; - - data.offload.prog = prog; + struct bpf_prog_offload *offload; + int ret = -ENODEV; - rtnl_lock(); - ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); - rtnl_unlock(); + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + if (offload) + ret = offload->offdev->ops->translate(prog); + up_read(&bpf_devs_lock); return ret; } @@ -655,7 +669,8 @@ unlock: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -struct bpf_offload_dev *bpf_offload_dev_create(void) +struct bpf_offload_dev * +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; int err; @@ -673,6 +688,8 @@ struct bpf_offload_dev *bpf_offload_dev_create(void) if (!offdev) 
return ERR_PTR(-ENOMEM); + offdev->ops = ops; + offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; @@ -685,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); + +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) +{ + return offdev->priv; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 673fa6fe2d73..0c1b4ba9e90e 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } -static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, - struct pcpu_freelist_node *node) +static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); node->next = head->first; @@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } -void pcpu_freelist_push(struct pcpu_freelist *s, +void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - __pcpu_freelist_push(head, node); + ___pcpu_freelist_push(head, node); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + unsigned long flags; + + local_irq_save(flags); + __pcpu_freelist_push(s, node); + local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, @@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - __pcpu_freelist_push(head, buf); + ___pcpu_freelist_push(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -74,14 +84,12 @@ again: local_irq_restore(flags); } -struct pcpu_freelist_node 
*pcpu_freelist_pop(struct pcpu_freelist *s) +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - unsigned long flags; int orig_cpu, cpu; - local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) { - local_irq_restore(flags); + if (cpu == orig_cpu) return NULL; - } } } + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_node *ret; + unsigned long flags; + + local_irq_save(flags); + ret = __pcpu_freelist_pop(s); + local_irq_restore(flags); + return ret; +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3049aae8ea1e..c3960118e617 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -22,8 +22,12 @@ struct pcpu_freelist_node { struct pcpu_freelist_node *next; }; +/* pcpu_freelist_* do spin_lock_irqsave. */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. 
*/ +void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *); void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems); int pcpu_freelist_init(struct pcpu_freelist *); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 12a93fb37449..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/capability.h> #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; - - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; + u64 size, queue_size, cost; - cost = queue_size; + size = (u64) attr->max_entries + 1; + cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); @@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } @@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if 
(queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 90daf285de03..950ab2f28922 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry) struct stack_map_irq_work *work; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read(work->sem); + up_read_non_owner(work->sem); work->sem = NULL; } @@ -180,11 +180,14 @@ static inline int stack_map_parse_build_id(void *page_addr, if (nhdr->n_type == BPF_BUILD_ID && nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { memcpy(build_id, note_start + note_offs + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - BPF_BUILD_ID_SIZE); + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BPF_BUILD_ID_SIZE - nhdr->n_descsz); return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ -260,7 +263,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, return -EFAULT; /* page not mapped */ ret = -EINVAL; - page_addr = page_address(page); + page_addr = kmap_atomic(page); ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ @@ -276,6 +279,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) ret = stack_map_get_build_id_64(page_addr, build_id); out: + kunmap_atomic(page_addr); put_page(page); return ret; } @@ -310,6 +314,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); } return; } @@ -320,6 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + 
memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] @@ -332,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } else { work->sem = &current->mm->mmap_sem; irq_work_queue(&work->irq_work); + /* + * The irq_work will release the mmap_sem with + * up_read_non_owner(). The rwsem_release() is called + * here to release the lock from lockdep's perspective. + */ + rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_); } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ccb93277aae2..62f6bced3a3c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -79,7 +79,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; - if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size))) + if (unlikely(!access_ok(uaddr, actual_size))) return -EFAULT; if (actual_size <= expected_size) @@ -456,13 +456,14 @@ static int bpf_obj_name_cpy(char *dst, const char *src) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return -ENOTSUPP; } -static int map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; @@ -477,8 +478,24 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; + map->spin_lock_off = btf_find_spin_lock(btf, value_type); + + if (map_value_has_spin_lock(map)) { + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + return -ENOTSUPP; + if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > + map->value_size) { + WARN_ONCE(1, + "verifier bug spin_lock_off %d value_size
%d\n", + map->spin_lock_off, map->value_size); + return -EFAULT; + } + } + if (map->ops->map_check_btf) - ret = map->ops->map_check_btf(map, key_type, value_type); + ret = map->ops->map_check_btf(map, btf, key_type, value_type); return ret; } @@ -541,6 +558,8 @@ static int map_create(union bpf_attr *attr) map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; + } else { + map->spin_lock_off = -EINVAL; } err = security_bpf_map_alloc(map); @@ -558,12 +577,12 @@ static int map_create(union bpf_attr *attr) err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. - * bpf_map_put() is needed because the above + * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ - bpf_map_put(map); + bpf_map_put_with_uref(map); return err; } @@ -663,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { @@ -679,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_LOCK) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -689,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -712,8 +740,13 @@ static int map_lookup_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == 
BPF_MAP_TYPE_LRU_PERCPU_HASH) { + goto done; + } + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -739,11 +772,20 @@ static int map_lookup_elem(union bpf_attr *attr) err = -ENOENT; } else { err = 0; - memcpy(value, ptr, value_size); + if (attr->flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); } rcu_read_unlock(); } + this_cpu_dec(bpf_prog_active); + preempt_enable(); +done: if (err) goto free_value; @@ -799,6 +841,12 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -1210,9 +1258,13 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); + kvfree(prog->aux->func_info); + bpf_prog_free_linfo(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1232,24 +1284,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st 
= per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif @@ -1437,9 +1519,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type +#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt -static int bpf_prog_load(union bpf_attr *attr) +static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; @@ -1450,9 +1532,14 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) return -EINVAL; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* copy eBPF program license from user space */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) @@ -1464,11 +1551,6 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt == 0 || 
attr->insn_cnt > BPF_MAXINSNS) return -E2BIG; - - if (type == BPF_PROG_TYPE_KPROBE && - attr->kern_version != LINUX_VERSION_CODE) - return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !capable(CAP_SYS_ADMIN)) @@ -1525,7 +1607,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(&prog, attr); + err = bpf_check(&prog, attr, uattr); if (err < 0) goto free_used_maps; @@ -1550,9 +1632,13 @@ static int bpf_prog_load(union bpf_attr *attr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: + bpf_prog_free_linfo(prog); + kvfree(prog->aux->func_info); + btf_put(prog->aux->btf); bpf_prog_kallsyms_del_subprogs(prog); free_used_maps(prog->aux); free_prog: @@ -1597,6 +1683,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) bpf_probe_unregister(raw_tp->btp, raw_tp->prog); bpf_prog_put(raw_tp->prog); } + bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); return 0; } @@ -1622,13 +1709,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return -EFAULT; tp_name[sizeof(tp_name) - 1] = 0; - btp = bpf_find_raw_tracepoint(tp_name); + btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) - return -ENOMEM; + if (!raw_tp) { + err = -ENOMEM; + goto out_put_btp; + } raw_tp->btp = btp; prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, @@ -1656,6 +1745,8 @@ out_put_prog: bpf_prog_put(prog); out_free_tp: kfree(raw_tp); +out_put_btp: + bpf_put_raw_tracepoint(btp); return err; } @@ -1966,7 +2057,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) fd = bpf_map_new_fd(map, f_flags); if (fd < 0) - bpf_map_put(map); + bpf_map_put_with_uref(map); return fd; } @@ -2020,18 +2111,42 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) insns[i + 1].imm = 0; continue; } - - if 
(!bpf_dump_raw_ok() && - imm == (unsigned long)prog->aux) { - insns[i].imm = 0; - insns[i + 1].imm = 0; - continue; - } } return insns; } +static int set_info_rec_size(struct bpf_prog_info *info) +{ + /* + * Ensure info.*_rec_size is the same as kernel expected size + * + * or + * + * Only allow zero *_rec_size if both _rec_size and _cnt are + * zero. In this case, the kernel will set the expected + * _rec_size back to the info. + */ + + if ((info->nr_func_info || info->func_info_rec_size) && + info->func_info_rec_size != sizeof(struct bpf_func_info)) + return -EINVAL; + + if ((info->nr_line_info || info->line_info_rec_size) && + info->line_info_rec_size != sizeof(struct bpf_line_info)) + return -EINVAL; + + if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && + info->jited_line_info_rec_size != sizeof(__u64)) + return -EINVAL; + + info->func_info_rec_size = sizeof(struct bpf_func_info); + info->line_info_rec_size = sizeof(struct bpf_line_info); + info->jited_line_info_rec_size = sizeof(__u64); + + return 0; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2039,6 +2154,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2074,10 +2190,22 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + err = set_info_rec_size(&info); + if (err) + return err; + + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; + info.nr_jited_func_lens = 0; + info.nr_func_info = 0; + info.nr_line_info = 0; + info.nr_jited_line_info = 0; goto done; } @@ -2158,11 +2286,11 @@ static int 
bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_ksyms; - info.nr_jited_ksyms = prog->aux->func_cnt; - if (info.nr_jited_ksyms && ulen) { + info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; + if (ulen) { if (bpf_dump_raw_ok()) { + unsigned long ksym_addr; u64 __user *user_ksyms; - ulong ksym_addr; u32 i; /* copy the address of the kernel symbol @@ -2170,10 +2298,17 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); - for (i = 0; i < ulen; i++) { - ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - ksym_addr &= PAGE_MASK; - if (put_user((u64) ksym_addr, &user_ksyms[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + ksym_addr = (unsigned long) + prog->aux->func[i]->bpf_func; + if (put_user((u64) ksym_addr, + &user_ksyms[i])) + return -EFAULT; + } + } else { + ksym_addr = (unsigned long) prog->bpf_func; + if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { @@ -2182,8 +2317,8 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_func_lens; - info.nr_jited_func_lens = prog->aux->func_cnt; - if (info.nr_jited_func_lens && ulen) { + info.nr_jited_func_lens = prog->aux->func_cnt ? 
: 1; + if (ulen) { if (bpf_dump_raw_ok()) { u32 __user *user_lens; u32 func_len, i; @@ -2191,9 +2326,16 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); - for (i = 0; i < ulen; i++) { - func_len = prog->aux->func[i]->jited_len; - if (put_user(func_len, &user_lens[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + func_len = + prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + func_len = prog->jited_len; + if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { @@ -2201,6 +2343,77 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + if (prog->aux->btf) + info.btf_id = btf_id(prog->aux->btf); + + ulen = info.nr_func_info; + info.nr_func_info = prog->aux->func_info_cnt; + if (info.nr_func_info && ulen) { + char __user *user_finfo; + + user_finfo = u64_to_user_ptr(info.func_info); + ulen = min_t(u32, info.nr_func_info, ulen); + if (copy_to_user(user_finfo, prog->aux->func_info, + info.func_info_rec_size * ulen)) + return -EFAULT; + } + + ulen = info.nr_line_info; + info.nr_line_info = prog->aux->nr_linfo; + if (info.nr_line_info && ulen) { + __u8 __user *user_linfo; + + user_linfo = u64_to_user_ptr(info.line_info); + ulen = min_t(u32, info.nr_line_info, ulen); + if (copy_to_user(user_linfo, prog->aux->linfo, + info.line_info_rec_size * ulen)) + return -EFAULT; + } + + ulen = info.nr_jited_line_info; + if (prog->aux->jited_linfo) + info.nr_jited_line_info = prog->aux->nr_linfo; + else + info.nr_jited_line_info = 0; + if (info.nr_jited_line_info && ulen) { + if (bpf_dump_raw_ok()) { + __u64 __user *user_linfo; + u32 i; + + user_linfo = u64_to_user_ptr(info.jited_line_info); + ulen = min_t(u32, info.nr_jited_line_info, ulen); + for (i = 0; i < ulen; i++) { + if (put_user((__u64)(long)prog->aux->jited_linfo[i], + 
&user_linfo[i])) + return -EFAULT; + } + } else { + info.jited_line_info = 0; + } + } + + ulen = info.nr_prog_tags; + info.nr_prog_tags = prog->aux->func_cnt ? : 1; + if (ulen) { + __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; + u32 i; + + user_prog_tags = u64_to_user_ptr(info.prog_tags); + ulen = min_t(u32, info.nr_prog_tags, ulen); + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + if (copy_to_user(user_prog_tags[i], + prog->aux->func[i]->tag, + BPF_TAG_SIZE)) + return -EFAULT; + } + } else { + if (copy_to_user(user_prog_tags[0], + prog->tag, BPF_TAG_SIZE)) + return -EFAULT; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -2486,7 +2699,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = map_get_next_key(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr); + err = bpf_prog_load(&attr, uattr); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98fa0be35370..a7b96bf0e654 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11,10 +11,12 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
*/ +#include <uapi/linux/btf.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/bpf_verifier.h> #include <linux/filter.h> #include <net/netlink.h> @@ -24,6 +26,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/perf_event.h> +#include <linux/ctype.h> #include "disasm.h" @@ -175,6 +178,7 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ @@ -209,10 +213,32 @@ struct bpf_call_arg_meta { s64 msize_smax_value; u64 msize_umax_value; int ptr_id; + int func_id; }; static DEFINE_MUTEX(bpf_verifier_lock); +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { @@ -263,16 +289,61 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) 
+{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } +static bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL; + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -290,6 +361,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg) return type_is_refcounted(reg->type); } +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +{ + return reg->type == PTR_TO_MAP_VALUE && + map_value_has_spin_lock(reg->map_ptr); +} + static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) { return type_is_refcounted_or_null(reg->type); @@ -309,6 +386,12 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } +static bool is_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -324,6 +407,10 @@ static const char * const reg_type_str[] = { [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", + 
[PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", }; static char slot_type_char[] = { @@ -336,12 +423,14 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); } static struct bpf_func_state *func(struct bpf_verifier_env *env, @@ -548,13 +637,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) } /* release function corresponding to acquire_reference_state(). Idempotent. */ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id) { int i, last_idx; - if (!ptr_id) - return -EFAULT; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { @@ -566,21 +652,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id) return 0; } } - return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. 
- */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ - struct bpf_func_state *state = cur_func(env); - int err; - - err = __release_reference_state(state, ptr_id); - if (WARN_ON_ONCE(err != 0)) - verbose(env, "verifier internal error: can't release reference\n"); - return err; + return -EINVAL; } static int transfer_reference_state(struct bpf_func_state *dst, @@ -647,7 +719,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; + dst_state->active_spin_lock = src->active_spin_lock; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -691,7 +765,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; @@ -709,6 +784,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -1029,7 +1105,7 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; - if (BPF_CLASS(code) != BPF_JMP) + if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; @@ -1071,6 +1147,12 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... 
*/ if (writes && state->live & REG_LIVE_WRITTEN) break; + if (parent->live & REG_LIVE_DONE) { + verbose(env, "verifier BUG type %s var_off %lld off %d\n", + reg_type_str[parent->type], + parent->var_off.value, parent->off); + return -EFAULT; + } /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; @@ -1129,6 +1211,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1217,6 +1303,10 @@ static int check_stack_write(struct bpf_verifier_env *env, /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; + /* Mark slots as STACK_MISC if they belonged to spilled ptr. */ + if (state->stack[spi].slot_type[0] == STACK_SPILL) + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_MISC; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1234,6 +1324,7 @@ static int check_stack_write(struct bpf_verifier_env *env, register_is_null(&cur->regs[value_regno])) type = STACK_ZERO; + /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type; @@ -1313,6 +1404,31 @@ static int check_stack_read(struct bpf_verifier_env *env, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). 
+ */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1344,13 +1460,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (env->log.level) print_verifier_state(env, state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1377,6 +1497,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) verbose(env, "R%d max value is outside of the array range\n", regno); + + if (map_value_has_spin_lock(reg->map_ptr)) { + u32 lock = reg->map_ptr->spin_lock_off; + + /* if any part of struct bpf_spin_lock can be touched by + * load/store reject this program. + * To check that [x1, x2) overlaps with [y1, y2) + * it is sufficient to check x1 < y2 && y1 < x2. 
+ */ + if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && + lock < reg->umax_value + off + size) { + verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -1387,21 +1522,24 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, enum bpf_access_type t) { switch (env->prog->type) { + /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: - /* dst_input() and dst_output() can't write for now */ + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; /* fallthrough */ + + /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -1452,6 +1590,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, verbose(env, "R%d offset is outside of the packet\n", regno); return err; } + + /* __check_packet_access has made sure "off + size - 1" is within u16. + * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * otherwise find_good_pkt_pointers would have refused to set range info + * that __check_packet_access would have rejected this pkt access. + * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. 
+ */ + env->prog->aux->max_pkt_offset = + max_t(u32, env->prog->aux->max_pkt_offset, + off + reg->umax_value + size - 1); + return err; } @@ -1497,12 +1646,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = &regs[regno]; - struct bpf_insn_access_aux info; + struct bpf_insn_access_aux info = {}; + bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1510,13 +1661,31 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - if (!bpf_sock_is_valid_access(off, size, t, &info)) { - verbose(env, "invalid bpf_sock access off=%d size=%d\n", - off, size); - return -EACCES; + switch (reg->type) { + case PTR_TO_SOCK_COMMON: + valid = bpf_sock_common_is_valid_access(off, size, t, &info); + break; + case PTR_TO_SOCKET: + valid = bpf_sock_is_valid_access(off, size, t, &info); + break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; + default: + valid = false; } - return 0; + + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; + return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; } static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1542,8 +1711,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX || - reg->type == PTR_TO_SOCKET; + return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, 
int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1654,6 +1829,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCKET: pointer_desc = "sock "; break; + case PTR_TO_SOCK_COMMON: + pointer_desc = "sock_common "; + break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; default: break; } @@ -1857,33 +2038,22 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. */ - if (reg_type == SCALAR_VALUE) + if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); - else + } else { mark_reg_known_zero(env, regs, value_regno); + if (reg_type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + } regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). 
- */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, - size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; state = func(env, reg); err = update_stack_depth(env, state, off); @@ -1921,12 +2091,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_SOCKET) { + } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "cannot write into socket\n"); + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); return -EACCES; } - err = check_sock_access(env, regno, off, size, t); + err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { @@ -1970,7 +2141,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg)) { + is_flow_key_reg(env, insn->dst_reg) || + is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2086,6 +2258,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* Implementation details: + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * Two bpf_map_lookups (even with the same key) will have different reg->id. 
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after + * value_or_null->value transition, since the verifier only cares about + * the range of access to valid map value pointer and doesn't care about actual + * address of the map element. + * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps + * reg->id > 0 after value_or_null->value transition. By doing so + * two bpf_map_lookups will be considered two different pointers that + * point to different bpf_spin_locks. + * The verifier allows taking only one bpf_spin_lock at a time to avoid + * dead-locks. + * Since only one bpf_spin_lock is allowed the checks are simpler than + * reg_is_refcounted() logic. The verifier needs to remember only + * one spin_lock instead of array of acquired_refs. + * cur_state->active_spin_lock remembers which map value element got locked + * and clears it after bpf_spin_unlock. + */ +static int process_spin_lock(struct bpf_verifier_env *env, int regno, + bool is_lock) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_verifier_state *cur = env->cur_state; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "R%d is not a pointer to map_value\n", regno); + return -EINVAL; + } + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. 
bpf_spin_lock has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, + "map '%s' has to have BTF in order to use bpf_spin_lock\n", + map->name); + return -EINVAL; + } + if (!map_value_has_spin_lock(map)) { + if (map->spin_lock_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_spin_lock'\n", + map->name); + else if (map->spin_lock_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_spin_lock'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + map->name); + return -EINVAL; + } + if (map->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", + val + reg->off); + return -EINVAL; + } + if (is_lock) { + if (cur->active_spin_lock) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + cur->active_spin_lock = reg->id; + } else { + if (!cur->active_spin_lock) { + verbose(env, "bpf_spin_unlock without taking a lock\n"); + return -EINVAL; + } + if (cur->active_spin_lock != reg->id) { + verbose(env, "bpf_spin_unlock of different lock\n"); + return -EINVAL; + } + cur->active_spin_lock = 0; + } + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -2152,6 +2409,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { + expected_type = PTR_TO_SOCK_COMMON; + /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ + if (!type_is_sk_pointer(type)) + goto err_type; } else if (arg_type == ARG_PTR_TO_SOCKET) { expected_type = PTR_TO_SOCKET; if (type != expected_type) @@ -2162,6 +2424,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EFAULT; } meta->ptr_id = reg->id; + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == 
BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2555,7 +2828,7 @@ static int release_reference(struct bpf_verifier_env *env, for (i = 0; i <= vstate->curframe; i++) release_reg_references(env, vstate->frame[i], meta->ptr_id); - return release_reference_state(env, meta->ptr_id); + return release_reference_state(cur_func(env), meta->ptr_id); } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -2781,6 +3054,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } + meta.func_id = func_id; /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -2820,8 +3094,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } } else if (is_release_function(func_id)) { err = release_reference(env, &meta); - if (err) + if (err) { + verbose(env, "func %s#%d reference has not been acquired before\n", + func_id_name(func_id), func_id); return err; + } } regs = cur_regs(env); @@ -2849,10 +3126,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || fn->ret_type == RET_PTR_TO_MAP_VALUE) { - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - else - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -2865,14 +3138,32 @@ static int check_helper_call(struct bpf_verifier_env *env, int 
func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].id = ++env->id_gen; + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + if (map_value_has_spin_lock(meta.map_ptr)) + regs[BPF_REG_0].id = ++env->id_gen; + } else { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = id; + if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For release_reference() */ + regs[BPF_REG_0].id = id; + } else { + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = ++env->id_gen; + } + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -2963,6 +3254,125 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + 
ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +} + +static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, + u32 alu_state, u32 alu_limit) +{ + /* If we arrived here from different branches with different + * state or limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + return 0; +} + +static int sanitize_val_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? 
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + if (update_alu_sanitation_state(aux, alu_state, alu_limit)) + return -EACCES; +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. 
* If we return -EACCES, caller may want to try again treating pointer as a @@ -2981,8 +3391,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; + int ret; dst_reg = &regs[dst]; @@ -3012,9 +3423,20 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: break; } @@ -3031,6 +3453,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -3043,7 +3470,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->off = ptr_reg->off + smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. 
Note that off_reg->off @@ -3073,13 +3500,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -3105,7 +3538,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; dst_reg->off = ptr_reg->off - smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. If the subtrahend is known @@ -3131,11 +3564,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_AND: @@ -3158,6 +3592,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. 
+ */ + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + } + return 0; } @@ -3176,6 +3629,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s64 smin_val, smax_val; u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + u32 dst = insn->dst_reg; + int ret; if (insn_bitness == 32) { /* Relevant for 32-bit RSH: Information can propagate towards @@ -3210,6 +3665,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to add from different pointers or scalars\n", dst); + return ret; + } if (signed_add_overflows(dst_reg->smin_value, smin_val) || signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; @@ -3229,6 +3689,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); + return ret; + } if (signed_sub_overflows(dst_reg->smin_value, smax_val) || signed_sub_overflows(dst_reg->smax_value, smin_val)) { /* Overflow possible, we know nothing */ @@ -3564,12 +4029,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = regs + insn->src_reg; + struct bpf_reg_state *dst_reg = 
regs + insn->dst_reg; + if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - regs[insn->dst_reg] = regs[insn->src_reg]; - regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -3577,9 +4045,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; + } else if (src_reg->type == SCALAR_VALUE) { + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; + } else { + mark_reg_unknown(env, regs, + insn->dst_reg); } - mark_reg_unknown(env, regs, insn->dst_reg); - coerce_reg_to_size(&regs[insn->dst_reg], 4); + coerce_reg_to_size(dst_reg, 4); } } else { /* case: R = imm @@ -3630,11 +4103,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { - verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); - return -EINVAL; - } - if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; @@ -3745,6 +4213,147 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, } } +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. 
Example: "if (reg < 5)" is unknown when register value range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) +{ + struct bpf_reg_state reg_lo; + s64 sval; + + if (__is_pointer_value(false, reg)) + return -1; + + if (is_jmp32) { + reg_lo = *reg; + reg = &reg_lo; + /* For JMP32, only low 32 bits are compared, coerce_reg_to_size + * could truncate high bits and update umin/umax according to + * information of low bits. + */ + coerce_reg_to_size(reg, 4); + /* smin/smax need special handling. For example, after coerce, + * if smin_value is 0x00000000ffffffffLL, the value is -1 when + * used as operand to JMP32. It is a negative number from s32's + * point of view, while it is a positive number when seen as + * s64. The smin/smax are kept as s64, therefore, when used with + * JMP32, they need to be transformed into s32, then sign + * extended back to s64. + * + * Also, smin/smax were copied from umin/umax. If umin/umax has + * different sign bit, then min/max relationship doesn't + * maintain after casting into s32, for this case, set smin/smax + * to safest range. 
+ */ + if ((reg->umax_value ^ reg->umin_value) & + (1ULL << 31)) { + reg->smin_value = S32_MIN; + reg->smax_value = S32_MAX; + } + reg->smin_value = (s64)(s32)reg->smin_value; + reg->smax_value = (s64)(s32)reg->smax_value; + + val = (u32)val; + sval = (s64)(s32)val; + } else { + sval = (s64)val; + } + + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(reg->var_off)) + return !!tnum_equals_const(reg->var_off, val); + break; + case BPF_JNE: + if (tnum_is_const(reg->var_off)) + return !tnum_equals_const(reg->var_off, val); + break; + case BPF_JSET: + if ((~reg->var_off.mask & reg->var_off.value) & val) + return 1; + if (!((reg->var_off.mask | reg->var_off.value) & val)) + return 0; + break; + case BPF_JGT: + if (reg->umin_value > val) + return 1; + else if (reg->umax_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->smin_value > sval) + return 1; + else if (reg->smax_value < sval) + return 0; + break; + case BPF_JLT: + if (reg->umax_value < val) + return 1; + else if (reg->umin_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->smax_value < sval) + return 1; + else if (reg->smin_value >= sval) + return 0; + break; + case BPF_JGE: + if (reg->umin_value >= val) + return 1; + else if (reg->umax_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->smin_value >= sval) + return 1; + else if (reg->smax_value < sval) + return 0; + break; + case BPF_JLE: + if (reg->umax_value <= val) + return 1; + else if (reg->umin_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->smax_value <= sval) + return 1; + else if (reg->smin_value > sval) + return 0; + break; + } + + return -1; +} + +/* Generate min value of the high 32-bit from TNUM info. */ +static u64 gen_hi_min(struct tnum var) +{ + return var.value & ~0xffffffffULL; +} + +/* Generate max value of the high 32-bit from TNUM info. 
*/ +static u64 gen_hi_max(struct tnum var) +{ + return (var.value | var.mask) & ~0xffffffffULL; +} + +/* Return true if VAL is compared with a s64 sign extended from s32, and they + * are with the same signedness. + */ +static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +{ + return ((s32)sval >= 0 && + reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || + ((s32)sval < 0 && + reg->smax_value <= 0 && reg->smin_value >= S32_MIN); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -3752,8 +4361,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { + s64 sval; + /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. @@ -3763,51 +4374,93 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. - */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + /* For BPF_JEQ, if this is false we know nothing Jon Snow, but + * if it is true we know the value for sure. Likewise for + * BPF_JNE. 
*/ - __mark_reg_known(false_reg, val); - break; - case BPF_JGT: - false_reg->umax_value = min(false_reg->umax_value, val); - true_reg->umin_value = max(true_reg->umin_value, val + 1); - break; - case BPF_JSGT: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - break; - case BPF_JLT: - false_reg->umin_value = max(false_reg->umin_value, val); - true_reg->umax_value = min(true_reg->umax_value, val - 1); + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; - case BPF_JSLT: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); + } + case BPF_JSET: + false_reg->var_off = tnum_and(false_reg->var_off, + tnum_const(~val)); + if (is_power_of_2(val)) + true_reg->var_off = tnum_or(true_reg->var_off, + tnum_const(val)); break; case BPF_JGE: - false_reg->umax_value = min(false_reg->umax_value, val - 1); - true_reg->umin_value = max(true_reg->umin_value, val); + case BPF_JGT: + { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSGE: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); + case BPF_JSGT: + { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + + /* If the full s64 was not sign-extended from s32 then don't + * deduct further info. 
+ */ + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } case BPF_JLE: - false_reg->umin_value = max(false_reg->umin_value, val + 1); - true_reg->umax_value = min(true_reg->umax_value, val); + case BPF_JLT: + { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSLE: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + case BPF_JSLT: + { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } default: break; } @@ -3830,56 +4483,93 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { + s64 sval; + if (__is_pointer_value(false, false_reg)) return; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. 
- */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; - */ - __mark_reg_known(false_reg, val); - break; - case BPF_JGT: - true_reg->umax_value = min(true_reg->umax_value, val - 1); - false_reg->umin_value = max(false_reg->umin_value, val); - break; - case BPF_JSGT: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - break; - case BPF_JLT: - true_reg->umin_value = max(true_reg->umin_value, val + 1); - false_reg->umax_value = min(false_reg->umax_value, val); + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; - case BPF_JSLT: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); + } + case BPF_JSET: + false_reg->var_off = tnum_and(false_reg->var_off, + tnum_const(~val)); + if (is_power_of_2(val)) + true_reg->var_off = tnum_or(true_reg->var_off, + tnum_const(val)); break; case BPF_JGE: - true_reg->umax_value = min(true_reg->umax_value, val); - false_reg->umin_value = max(false_reg->umin_value, val + 1); + case BPF_JGT: + { + u64 false_umin = opcode == BPF_JGT ? val : val + 1; + u64 true_umax = opcode == BPF_JGT ? 
val - 1 : val; + + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSGE: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); + case BPF_JSGT: + { + s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } case BPF_JLE: - true_reg->umin_value = max(true_reg->umin_value, val); - false_reg->umax_value = min(false_reg->umax_value, val - 1); + case BPF_JLT: + { + u64 false_umax = opcode == BPF_JLT ? val : val - 1; + u64 true_umin = opcode == BPF_JLT ? val + 1 : val; + + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSLE: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + case BPF_JSLT: + { + s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSLT ? 
sval + 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } default: break; } @@ -3970,8 +4660,13 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; + } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { + reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } - if (is_null || !reg_is_refcounted(reg)) { + if (is_null || !(reg_is_refcounted(reg) || + reg_may_point_to_spin_lock(reg))) { /* We don't need id from this point onwards anymore, * thus we should better reset it, so that state * pruning has chances to take effect. @@ -3993,7 +4688,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; if (reg_is_refcounted_or_null(®s[regno]) && is_null) - __release_reference_state(state, id); + release_reference_state(state, id); for (i = 0; i < MAX_BPF_REG; i++) mark_ptr_or_null_reg(state, ®s[i], id, is_null); @@ -4017,6 +4712,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, if (BPF_SRC(insn->code) != BPF_X) return false; + /* Pointers are always 64-bit. */ + if (BPF_CLASS(insn->code) == BPF_JMP32) + return false; + switch (BPF_OP(insn->code)) { case BPF_JGT: if ((dst_reg->type == PTR_TO_PACKET && @@ -4109,16 +4808,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); + bool is_jmp32; int err; - if (opcode > BPF_JSLE) { - verbose(env, "invalid BPF_JMP opcode %x\n", opcode); + /* Only conditional jumps are expected to reach here. 
*/ + if (opcode == BPF_JA || opcode > BPF_JSLE) { + verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } @@ -4134,7 +4835,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } else { if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } } @@ -4145,29 +4846,26 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; dst_reg = ®s[insn->dst_reg]; + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* detect if R == 0 where R was initialized to zero earlier */ - if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == SCALAR_VALUE && - tnum_is_const(dst_reg->var_off)) { - if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || - (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { - /* if (imm == imm) goto pc+off; - * only follow the goto, ignore fall-through - */ + if (BPF_SRC(insn->code) == BPF_K) { + int pred = is_branch_taken(dst_reg, insn->imm, opcode, + is_jmp32); + + if (pred == 1) { + /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; return 0; - } else { - /* if (imm != imm) goto pc+off; - * only follow fall-through branch, since + } else if (pred == 0) { + /* only follow fall-through branch, since * that's where the program will go */ return 0; } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; @@ -4180,30 +4878,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * comparable. 
*/ if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state lo_reg0 = *dst_reg; + struct bpf_reg_state lo_reg1 = *src_reg; + struct bpf_reg_state *src_lo, *dst_lo; + + dst_lo = &lo_reg0; + src_lo = &lo_reg1; + coerce_reg_to_size(dst_lo, 4); + coerce_reg_to_size(src_lo, 4); + if (dst_reg->type == SCALAR_VALUE && - regs[insn->src_reg].type == SCALAR_VALUE) { - if (tnum_is_const(regs[insn->src_reg].var_off)) + src_reg->type == SCALAR_VALUE) { + if (tnum_is_const(src_reg->var_off) || + (is_jmp32 && tnum_is_const(src_lo->var_off))) reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].var_off.value, - opcode); - else if (tnum_is_const(dst_reg->var_off)) + dst_reg, + is_jmp32 + ? src_lo->var_off.value + : src_reg->var_off.value, + opcode, is_jmp32); + else if (tnum_is_const(dst_reg->var_off) || + (is_jmp32 && tnum_is_const(dst_lo->var_off))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - ®s[insn->src_reg], - dst_reg->var_off.value, opcode); - else if (opcode == BPF_JEQ || opcode == BPF_JNE) + src_reg, + is_jmp32 + ? dst_lo->var_off.value + : dst_reg->var_off.value, + opcode, is_jmp32); + else if (!is_jmp32 && + (opcode == BPF_JEQ || opcode == BPF_JNE)) /* Comparing for equality, we can combine knowledge */ reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], - ®s[insn->src_reg], - ®s[insn->dst_reg], opcode); + src_reg, dst_reg, opcode); } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode); + dst_reg, insn->imm, opcode, is_jmp32); } - /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ - if (BPF_SRC(insn->code) == BPF_K && + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * NOTE: these optimizations below are related with pointer comparison + * which will never be JMP32. 
+ */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && reg_type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either @@ -4345,6 +5064,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + if (env->cur_state->active_spin_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); + return -EINVAL; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4471,6 +5195,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { + verbose_linfo(env, t, "%d: ", t); verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -4488,6 +5213,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose_linfo(env, t, "%d: ", t); + verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { @@ -4510,10 +5237,6 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - ret = check_subprogs(env); - if (ret < 0) - return ret; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -4533,7 +5256,8 @@ peek_stack: goto check_state; t = insn_stack[cur_stack - 1]; - if (BPF_CLASS(insns[t].code) == BPF_JMP) { + if (BPF_CLASS(insns[t].code) == BPF_JMP || + BPF_CLASS(insns[t].code) == BPF_JMP32) { u8 opcode = BPF_OP(insns[t].code); if (opcode == BPF_EXIT) { @@ -4622,6 +5346,278 @@ err_free: return ret; } +/* The minimum supported BTF func info size */ +#define MIN_BPF_FUNCINFO_SIZE 8 +#define MAX_FUNCINFO_REC_SIZE 252 + +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union 
bpf_attr __user *uattr) +{ + u32 i, nfuncs, urec_size, min_size; + u32 krec_size = sizeof(struct bpf_func_info); + struct bpf_func_info *krecord; + const struct btf_type *type; + struct bpf_prog *prog; + const struct btf *btf; + void __user *urecord; + u32 prev_offset = 0; + int ret = 0; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) + return 0; + + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + if (urec_size < MIN_BPF_FUNCINFO_SIZE || + urec_size > MAX_FUNCINFO_REC_SIZE || + urec_size % sizeof(u32)) { + verbose(env, "invalid func info rec size %u\n", urec_size); + return -EINVAL; + } + + prog = env->prog; + btf = prog->aux->btf; + + urecord = u64_to_user_ptr(attr->func_info); + min_size = min_t(u32, krec_size, urec_size); + + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); + if (!krecord) + return -ENOMEM; + + for (i = 0; i < nfuncs; i++) { + ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); + if (ret) { + if (ret == -E2BIG) { + verbose(env, "nonzero tailing record in func info"); + /* set the size kernel expects so loader can zero + * out the rest of the record. 
+ */ + if (put_user(min_size, &uattr->func_info_rec_size)) + ret = -EFAULT; + } + goto err_free; + } + + if (copy_from_user(&krecord[i], urecord, min_size)) { + ret = -EFAULT; + goto err_free; + } + + /* check insn_off */ + if (i == 0) { + if (krecord[i].insn_off) { + verbose(env, + "nonzero insn_off %u for the first func info record", + krecord[i].insn_off); + ret = -EINVAL; + goto err_free; + } + } else if (krecord[i].insn_off <= prev_offset) { + verbose(env, + "same or smaller insn offset (%u) than previous func info record (%u)", + krecord[i].insn_off, prev_offset); + ret = -EINVAL; + goto err_free; + } + + if (env->subprog_info[i].start != krecord[i].insn_off) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + ret = -EINVAL; + goto err_free; + } + + /* check type_id */ + type = btf_type_by_id(btf, krecord[i].type_id); + if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + verbose(env, "invalid type id %d in func info", + krecord[i].type_id); + ret = -EINVAL; + goto err_free; + } + + prev_offset = krecord[i].insn_off; + urecord += urec_size; + } + + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; + return 0; + +err_free: + kvfree(krecord); + return ret; +} + +static void adjust_btf_func(struct bpf_verifier_env *env) +{ + int i; + + if (!env->prog->aux->func_info) + return; + + for (i = 0; i < env->subprog_cnt; i++) + env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; +} + +#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ + sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_btf_line(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; + struct bpf_subprog_info *sub; + struct bpf_line_info *linfo; + struct bpf_prog *prog; + const struct btf *btf; + void __user *ulinfo; + 
int err; + + nr_linfo = attr->line_info_cnt; + if (!nr_linfo) + return 0; + + rec_size = attr->line_info_rec_size; + if (rec_size < MIN_BPF_LINEINFO_SIZE || + rec_size > MAX_LINEINFO_REC_SIZE || + rec_size & (sizeof(u32) - 1)) + return -EINVAL; + + /* Need to zero it in case the userspace may + * pass in a smaller bpf_line_info object. + */ + linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), + GFP_KERNEL | __GFP_NOWARN); + if (!linfo) + return -ENOMEM; + + prog = env->prog; + btf = prog->aux->btf; + + s = 0; + sub = env->subprog_info; + ulinfo = u64_to_user_ptr(attr->line_info); + expected_size = sizeof(struct bpf_line_info); + ncopy = min_t(u32, expected_size, rec_size); + for (i = 0; i < nr_linfo; i++) { + err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in line_info"); + if (put_user(expected_size, + &uattr->line_info_rec_size)) + err = -EFAULT; + } + goto err_free; + } + + if (copy_from_user(&linfo[i], ulinfo, ncopy)) { + err = -EFAULT; + goto err_free; + } + + /* + * Check insn_off to ensure + * 1) strictly increasing AND + * 2) bounded by prog->len + * + * The linfo[0].insn_off == 0 check logically falls into + * the later "missing bpf_line_info for func..." case + * because the first linfo[0].insn_off must be the + * first sub also and the first sub must have + * subprog_info[0].start == 0. 
+ */ + if ((i && linfo[i].insn_off <= prev_offset) || + linfo[i].insn_off >= prog->len) { + verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", + i, linfo[i].insn_off, prev_offset, + prog->len); + err = -EINVAL; + goto err_free; + } + + if (!prog->insnsi[linfo[i].insn_off].code) { + verbose(env, + "Invalid insn code at line_info[%u].insn_off\n", + i); + err = -EINVAL; + goto err_free; + } + + if (!btf_name_by_offset(btf, linfo[i].line_off) || + !btf_name_by_offset(btf, linfo[i].file_name_off)) { + verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); + err = -EINVAL; + goto err_free; + } + + if (s != env->subprog_cnt) { + if (linfo[i].insn_off == sub[s].start) { + sub[s].linfo_idx = i; + s++; + } else if (sub[s].start < linfo[i].insn_off) { + verbose(env, "missing bpf_line_info for func#%u\n", s); + err = -EINVAL; + goto err_free; + } + } + + prev_offset = linfo[i].insn_off; + ulinfo += rec_size; + } + + if (s != env->subprog_cnt) { + verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", + env->subprog_cnt - s, s); + err = -EINVAL; + goto err_free; + } + + prog->aux->linfo = linfo; + prog->aux->nr_linfo = nr_linfo; + + return 0; + +err_free: + kvfree(linfo); + return err; +} + +static int check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct btf *btf; + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) + return 0; + + btf = btf_get_by_fd(attr->prog_btf_fd); + if (IS_ERR(btf)) + return PTR_ERR(btf); + env->prog->aux->btf = btf; + + err = check_btf_func(env, attr, uattr); + if (err) + return err; + + err = check_btf_line(env, attr, uattr); + if (err) + return err; + + return 0; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -4668,6 +5664,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) return false; } +static void 
clean_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *st) +{ + enum bpf_reg_liveness live; + int i, j; + + for (i = 0; i < BPF_REG_FP; i++) { + live = st->regs[i].live; + /* liveness must not touch this register anymore */ + st->regs[i].live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) + /* since the register is unused, clear its state + * to make further comparison simpler + */ + __mark_reg_not_init(&st->regs[i]); + } + + for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { + live = st->stack[i].spilled_ptr.live; + /* liveness must not touch this stack slot anymore */ + st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) { + __mark_reg_not_init(&st->stack[i].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + st->stack[i].slot_type[j] = STACK_INVALID; + } + } +} + +static void clean_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + int i; + + if (st->frame[0]->regs[0].live & REG_LIVE_DONE) + /* all regs in this state in all frames were already marked */ + return; + + for (i = 0; i <= st->curframe; i++) + clean_func_state(env, st->frame[i]); +} + +/* the parentage chains form a tree. + * the verifier states are added to state lists at given insn and + * pushed into state stack for future exploration. + * when the verifier reaches bpf_exit insn some of the verifer states + * stored in the state lists have their final liveness state already, + * but a lot of states will get revised from liveness point of view when + * the verifier explores other branches. + * Example: + * 1: r0 = 1 + * 2: if r1 == 100 goto pc+1 + * 3: r0 = 2 + * 4: exit + * when the verifier reaches exit insn the register r0 in the state list of + * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. At the insn 4 it will walk the + * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. 
+ * + * Since the verifier pushes the branch states as it sees them while exploring + * the program the condition of walking the branch instruction for the second + * time means that all states below this branch were already explored and + * their final liveness markes are already propagated. + * Hence when the verifier completes the search of state list in is_state_visited() + * we can call this clean_live_states() function to mark all liveness states + * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * will not be used. + * This function also clears the registers and stack for states that !READ + * to simplify state merging. + * + * Important note here that walking the same branch instruction in the callee + * doesn't meant that the states are DONE. The verifier has to compare + * the callsites + */ +static void clean_live_states(struct bpf_verifier_env *env, int insn, + struct bpf_verifier_state *cur) +{ + struct bpf_verifier_state_list *sl; + int i; + + sl = env->explored_states[insn]; + if (!sl) + return; + + while (sl != STATE_LIST_MARK) { + if (sl->state.curframe != cur->curframe) + goto next; + for (i = 0; i <= cur->curframe; i++) + if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) + goto next; + clean_verifier_state(env, &sl->state); +next: + sl = sl->next; + } +} + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) @@ -4713,8 +5805,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. 
- * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + * 'id' is not compared, since it's only used for maps with + * bpf_spin_lock inside map element and in such cases if + * the rest of the prog is valid for one map element then + * it's valid for all map elements regardless of the key + * used in bpf_map_lookup() */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && @@ -4762,6 +5857,10 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -4781,12 +5880,6 @@ static bool stacksafe(struct bpf_func_state *old, { int i, spi; - /* if explored stack has more populated slots than current stack - * such stacks are not equivalent - */ - if (old->allocated_stack > cur->allocated_stack) - return false; - /* walk slots of the explored stack and ignore any additional * slots in the current stack, since explored(safe) state * didn't use them @@ -4794,12 +5887,21 @@ static bool stacksafe(struct bpf_func_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { + i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; + } if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + + /* explored stack has more populated slots than current stack + * and these slots were used + */ + if (i >= cur->allocated_stack) + return false; + /* if old state was safe with misc data in the stack * it will be safe with zero-initialized stack. 
* The opposite is not true @@ -4908,6 +6010,15 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + + if (old->active_spin_lock != cur->active_spin_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -4974,7 +6085,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err; + int i, j, err, states_cnt = 0; sl = env->explored_states[insn_idx]; if (!sl) @@ -4983,6 +6094,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ return 0; + clean_live_states(env, insn_idx, cur); + while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, @@ -5001,8 +6114,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } sl = sl->next; + states_cnt++; } + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + return 0; + /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) @@ -5024,9 +6141,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; - /* connect new state to parentage chain */ - for (i = 0; i < BPF_REG_FP; i++) - cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; + /* connect new state to parentage chain. Current frame needs all + * registers connected. 
Only r6 - r9 of the callers are alive (pushed + * to the stack implicitly by JITs) so in callers' frames connect just + * r6 - r9 as an optimization. Callers will have r1 - r5 connected to + * the state of the call instruction (with WRITTEN set), and r0 comes + * from callee with its full parentage chain, anyway. + */ + for (j = 0; j <= cur->curframe; j++) + for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -5057,6 +6181,10 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return false; default: return true; @@ -5087,14 +6215,16 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len, i; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; + env->prev_linfo = NULL; + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; state->curframe = 0; + state->speculative = false; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5105,19 +6235,19 @@ static int do_check(struct bpf_verifier_env *env) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, 0 /* subprogno, zero == main subprog */); - insn_idx = 0; + for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = 
&insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -5127,30 +6257,37 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (env->log.level) { if (do_print_state) - verbose(env, "\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else - verbose(env, "%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } + if (signal_pending(current)) + return -EAGAIN; + if (need_resched()) cond_resched(); if (env->log.level > 1 || (env->log.level && do_print_state)) { if (env->log.level > 1) - verbose(env, "%d:", insn_idx); + verbose(env, "%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d:", - prev_insn_idx, insn_idx); + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? 
+ " (speculative execution)" : ""); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -5161,19 +6298,20 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; - verbose(env, "%d: ", insn_idx); + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, insn_idx, - prev_insn_idx); + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); - env->insn_aux_data[insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -5199,13 +6337,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -5230,10 +6368,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -5249,13 +6387,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), 
BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -5283,13 +6421,13 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -5297,15 +6435,22 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL) || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } + if (env->cur_state->active_spin_lock && + (insn->src_reg == BPF_PSEUDO_CALL || + insn->imm != BPF_FUNC_spin_unlock)) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &insn_idx); + err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, insn_idx); + err = check_helper_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -5313,27 +6458,34 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_JA uses reserved 
fields\n"); return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } + if (env->cur_state->active_spin_lock) { + verbose(env, "bpf_spin_unlock is missing\n"); + return -EINVAL; + } + if (state->curframe) { /* exit from nested function */ - prev_insn_idx = insn_idx; - err = prepare_func_exit(env, &insn_idx); + env->prev_insn_idx = env->insn_idx; + err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; @@ -5363,7 +6515,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &prev_insn_idx, &insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -5373,7 +6526,7 @@ process_bpf_exit: continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -5390,8 +6543,8 @@ process_bpf_exit: if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -5401,7 +6554,7 @@ process_bpf_exit: return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose(env, "processed %d insns (limit %d), stack depth ", @@ -5426,6 +6579,19 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } +static bool is_tracing_prog_type(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return true; + default: + return false; + } +} + static int 
check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) @@ -5448,6 +6614,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if ((is_tracing_prog_type(prog->type) || + prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + map_value_has_spin_lock(map)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -5644,7 +6817,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len return; /* NOTE: fake 'exit' subprog should be updated as well. */ for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) + if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } @@ -5664,6 +6837,153 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, + u32 off, u32 cnt) +{ + int i, j; + + /* find first prog starting at or after off (first to remove) */ + for (i = 0; i < env->subprog_cnt; i++) + if (env->subprog_info[i].start >= off) + break; + /* find first prog starting at or after off + cnt (first to stay) */ + for (j = i; j < env->subprog_cnt; j++) + if (env->subprog_info[j].start >= off + cnt) + break; + /* if j doesn't start exactly at off + cnt, we are just removing + * the front of previous prog + */ + if (env->subprog_info[j].start != off + cnt) + j--; + + if (j > i) { + struct bpf_prog_aux *aux = env->prog->aux; + int move; + + /* move fake 'exit' subprog as well */ + move = env->subprog_cnt + 1 - j; + + memmove(env->subprog_info + i, + env->subprog_info + j, + sizeof(*env->subprog_info) * move); + env->subprog_cnt -= j - i; + + /* remove func_info */ + if (aux->func_info) { + move = 
aux->func_info_cnt - j; + + memmove(aux->func_info + i, + aux->func_info + j, + sizeof(*aux->func_info) * move); + aux->func_info_cnt -= j - i; + /* func_info->insn_off is set after all code rewrites, + * in adjust_btf_func() - no need to adjust + */ + } + } else { + /* convert i from "first prog to remove" to "first to adjust" */ + if (env->subprog_info[i].start == off) + i++; + } + + /* update fake 'exit' subprog as well */ + for (; i <= env->subprog_cnt; i++) + env->subprog_info[i].start -= cnt; + + return 0; +} + +static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, + u32 cnt) +{ + struct bpf_prog *prog = env->prog; + u32 i, l_off, l_cnt, nr_linfo; + struct bpf_line_info *linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo) + return 0; + + linfo = prog->aux->linfo; + + /* find first line info to remove, count lines to be removed */ + for (i = 0; i < nr_linfo; i++) + if (linfo[i].insn_off >= off) + break; + + l_off = i; + l_cnt = 0; + for (; i < nr_linfo; i++) + if (linfo[i].insn_off < off + cnt) + l_cnt++; + else + break; + + /* First live insn doesn't match first live linfo, it needs to "inherit" + * last removed linfo. prog is already modified, so prog->len == off + * means no live instructions after (tail of the program was removed). + */ + if (prog->len != off && l_cnt && + (i == nr_linfo || linfo[i].insn_off != off + cnt)) { + l_cnt--; + linfo[--i].insn_off = off + cnt; + } + + /* remove the line info which refer to the removed instructions */ + if (l_cnt) { + memmove(linfo + l_off, linfo + i, + sizeof(*linfo) * (nr_linfo - i)); + + prog->aux->nr_linfo -= l_cnt; + nr_linfo = prog->aux->nr_linfo; + } + + /* pull all linfo[i].insn_off >= off + cnt in by cnt */ + for (i = l_off; i < nr_linfo; i++) + linfo[i].insn_off -= cnt; + + /* fix up all subprogs (incl. 
'exit') which start >= off */ + for (i = 0; i <= env->subprog_cnt; i++) + if (env->subprog_info[i].linfo_idx > l_off) { + /* program may have started in the removed region but + * may not be fully removed + */ + if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) + env->subprog_info[i].linfo_idx -= l_cnt; + else + env->subprog_info[i].linfo_idx = l_off; + } + + return 0; +} + +static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + unsigned int orig_prog_len = env->prog->len; + int err; + + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_remove_insns(env, off, cnt); + + err = bpf_remove_insns(env->prog, off, cnt); + if (err) + return err; + + err = adjust_subprog_starts_after_remove(env, off, cnt); + if (err) + return err; + + err = bpf_adj_linfo_after_remove(env, off, cnt); + if (err) + return err; + + memmove(aux_data + off, aux_data + off + cnt, + sizeof(*aux_data) * (orig_prog_len - off - cnt)); + + return 0; +} + /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. Malicious programs can * have dead code too. 
Therefore replace all dead at-run-time code @@ -5690,6 +7010,91 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } +static bool insn_is_cond_jump(u8 code) +{ + u8 op; + + if (BPF_CLASS(code) == BPF_JMP32) + return true; + + if (BPF_CLASS(code) != BPF_JMP) + return false; + + op = BPF_OP(code); + return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +} + +static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!insn_is_cond_jump(insn->code)) + continue; + + if (!aux_data[i + 1].seen) + ja.off = insn->off; + else if (!aux_data[i + 1 + insn->off].seen) + ja.off = 0; + else + continue; + + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_replace_insn(env, i, &ja); + + memcpy(insn, &ja, sizeof(ja)); + } +} + +static int opt_remove_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + int j; + + j = 0; + while (i + j < insn_cnt && !aux_data[i + j].seen) + j++; + if (!j) + continue; + + err = verifier_remove_insns(env, i, j); + if (err) + return err; + insn_cnt = env->prog->len; + } + + return 0; +} + +static int opt_remove_nops(struct bpf_verifier_env *env) +{ + const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + if (memcmp(&insn[i], &ja, sizeof(ja))) + continue; + + err = verifier_remove_insns(env, i, 1); + if (err) + return err; + insn_cnt--; + i--; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the 
underlying structure: * struct __sk_buff -> struct sk_buff @@ -5701,12 +7106,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; + u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; - u32 target_size; - if (ops->gen_prologue) { + if (ops->gen_prologue || env->seen_direct_write) { + if (!ops->gen_prologue) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { @@ -5778,8 +7187,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; default: continue; } @@ -5793,9 +7206,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * we will apply proper mask to the result. 
*/ is_narrower_load = size < ctx_field_size; + size_default = bpf_ctx_off_adjust_machine(ctx_field_size); + off = insn->off; if (is_narrower_load) { - u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); - u32 off = insn->off; u8 size_code; if (type == BPF_WRITE) { @@ -5823,12 +7236,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - if (ctx_field_size <= 4) + u8 shift = (off & (size_default - 1)) * 8; + + if (ctx_field_size <= 4) { + if (shift) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); - else + } else { + if (shift) + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); + } } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); @@ -5851,7 +7275,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err = -ENOMEM; + int err; if (env->subprog_cnt <= 1) return 0; @@ -5882,6 +7306,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } + err = bpf_prog_alloc_jited_linfo(prog); + if (err) + goto out_undo_insn; + + err = -ENOMEM; func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) goto out_undo_insn; @@ -5891,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. 
+ * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], @@ -5901,12 +7335,21 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->aux->func_idx = i; + /* the btf and func_info will be freed only at prog->aux */ + func[i]->aux->btf = prog->aux->btf; + func[i]->aux->func_info = prog->aux->func_info; + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->linfo = prog->aux->linfo; + func[i]->aux->nr_linfo = prog->aux->nr_linfo; + func[i]->aux->jited_linfo = prog->aux->jited_linfo; + func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -5980,6 +7423,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; + bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: for (i = 0; i < env->subprog_cnt; i++) @@ -5996,6 +7440,7 @@ out_undo_insn: insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; } + bpf_prog_free_jited_linfo(prog); return err; } @@ -6109,6 +7554,58 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + 
delta]; + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) @@ -6128,6 +7625,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ prog->cb_access = 1; env->prog->aux->stack_depth = MAX_BPF_STACK; + env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal @@ -6292,11 +7790,13 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, + union bpf_attr __user *uattr) { struct bpf_verifier_env *env; struct bpf_verifier_log *log; - int ret = -EINVAL; + int i, len, ret = -EINVAL; + bool is_priv; /* no program is valid 
*/ if (ARRAY_SIZE(bpf_verifier_ops) == 0) @@ -6310,12 +7810,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) return -ENOMEM; log = &env->log; + len = (*prog)->len; env->insn_aux_data = - vzalloc(array_size(sizeof(struct bpf_insn_aux_data), - (*prog)->len)); + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; + for (i = 0; i < len; i++) + env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -6340,13 +7842,18 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) + env->strict_alignment = false; + + is_priv = capable(CAP_SYS_ADMIN); + env->allow_ptr_leaks = is_priv; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); + ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; } @@ -6358,7 +7865,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->explored_states) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_subprogs(env); + if (ret < 0) + goto skip_full_check; + + ret = check_btf_info(env, attr, uattr); + if (ret < 0) + goto skip_full_check; ret = check_cfg(env); if (ret < 0) @@ -6378,11 +7891,21 @@ skip_full_check: free_states(env); if (ret == 0) - sanitize_dead_code(env); - - if (ret == 0) ret = check_max_stack_depth(env); + /* instruction rewrites happen after this point */ + if (is_priv) { + if (ret == 0) + opt_hard_wire_dead_code_branches(env); + if (ret == 0) + ret = opt_remove_dead_code(env); + if (ret == 0) + ret = opt_remove_nops(env); + } else { + if (ret == 0) + sanitize_dead_code(env); + } + if (ret == 0) /* program is 
valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); @@ -6421,6 +7944,9 @@ skip_full_check: convert_pseudo_ld_imm64(env); } + if (ret == 0) + adjust_btf_func(env); + err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release |
