From 11e17ae423778f48c84da6a2e215f140610e1973 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 22 Mar 2022 14:21:49 +0800 Subject: bpf: Use swap() instead of open coding it Clean the following coccicheck warning: ./kernel/trace/bpf_trace.c:2263:34-35: WARNING opportunity for swap(). ./kernel/trace/bpf_trace.c:2264:40-41: WARNING opportunity for swap(). Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220322062149.109180-1-jiapeng.chong@linux.alibaba.com --- kernel/trace/bpf_trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fa2ebc07f60..836f021cb609 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2254,15 +2254,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; - unsigned long tmp1; - u64 tmp2; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ - tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; - tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); } static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) -- cgit v1.2.3 From 8eb943fc5e5fa62751248302fe8f6c9856d81c09 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 23 Mar 2022 15:36:26 +0800 Subject: bpf: Remove redundant assignment to smap->map.value_size The attr->value_size is already assigned to smap->map.value_size in bpf_map_init_from_attr(), there is no need to do it again in stack_map_alloc(). Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Acked-by: Joanne Koong Link: https://lore.kernel.org/bpf/20220323073626.958652-1-ytcoode@gmail.com --- kernel/bpf/stackmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 34725bfa1e97..6131b4a19572 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -106,7 +106,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); -- cgit v1.2.3 From 185da3da9379948ffbe45051b16d526c428fb06e Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 31 Mar 2022 11:19:29 +0200 Subject: bpf: Replace usage of supported with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use the found variable (existed & supported) and simply checking if the variable was set, can determine if the break/goto was hit. [1] https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220331091929.647057-1-jakobkoschel@gmail.com --- kernel/bpf/bpf_iter.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 110029ede71e..dea920b3b840 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -330,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; + struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; - struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); - bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { - supported = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id && iter->btf_id == prog_btf_id) { + tinfo = iter; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { - cache_btf_id(tinfo, prog); - supported = true; + if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { + cache_btf_id(iter, prog); + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (supported) { + if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } - return supported; + return tinfo != NULL; } const struct bpf_func_proto * @@ -499,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { + struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; - struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; - bool existed = false; bpfptr_t ulinfo; int err; @@ -530,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id == prog_btf_id) { - existed = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id == prog_btf_id) { + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (!existed) + if (!tinfo) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); -- cgit v1.2.3 From cfc1d277891eb499b3b5354df33b30f598683e90 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:31 +0000 Subject: module: Move all into module/ No functional changes. This patch moves all module related code into a separate directory, modifies each file name and creates a new Makefile. Note: this effort is in preparation to refactor core module code. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- MAINTAINERS | 2 +- kernel/Makefile | 5 +- kernel/module-internal.h | 50 - kernel/module.c | 4810 -------------------------------------------- kernel/module/Makefile | 12 + kernel/module/decompress.c | 273 +++ kernel/module/internal.h | 50 + kernel/module/main.c | 4810 ++++++++++++++++++++++++++++++++++++++++++++ kernel/module/signing.c | 45 + kernel/module_decompress.c | 273 --- kernel/module_signing.c | 45 - 11 files changed, 5192 insertions(+), 5183 deletions(-) delete mode 100644 kernel/module-internal.h delete mode 100644 kernel/module.c create mode 100644 kernel/module/Makefile create mode 100644 kernel/module/decompress.c create mode 100644 kernel/module/internal.h create mode 100644 kernel/module/main.c create mode 100644 kernel/module/signing.c delete mode 100644 kernel/module_decompress.c delete mode 100644 kernel/module_signing.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index fd768d43e048..5e7778cd437f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13264,7 +13264,7 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules-next F: include/linux/module.h -F: kernel/module.c +F: kernel/module/ MONOLITHIC POWER SYSTEM PMIC DRIVER M: Saravanan Sekar diff --git a/kernel/Makefile b/kernel/Makefile index 471d71935e90..a7f8c088cc94 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -29,7 +29,6 @@ KCOV_INSTRUMENT_softirq.o := n KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n KCOV_INSTRUMENT_extable.o := n KCOV_INSTRUMENT_stacktrace.o := n # Don't self-instrument. @@ -53,6 +52,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -66,9 +66,6 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o diff --git a/kernel/module-internal.h b/kernel/module-internal.h deleted file mode 100644 index 8c381c99062f..000000000000 --- a/kernel/module-internal.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Module internals - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include -#include - -struct load_info { - const char *name; - /* pointer to module in temporary copy, freed at end of load_module() */ - struct module *mod; - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug *debug; - unsigned int num_debug; - bool sig_ok; -#ifdef CONFIG_KALLSYMS - unsigned long mod_kallsyms_init_off; -#endif -#ifdef CONFIG_MODULE_DECOMPRESS - struct page **pages; - unsigned int max_pages; - unsigned int used_pages; -#endif - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - -extern int mod_verify_sig(const void *mod, struct load_info *info); - -#ifdef CONFIG_MODULE_DECOMPRESS -int module_decompress(struct load_info *info, const void *buf, size_t size); -void module_decompress_cleanup(struct load_info *info); -#else -static inline int module_decompress(struct load_info *info, - const void *buf, size_t size) -{ - return -EOPNOTSUPP; -} -static inline void module_decompress_cleanup(struct load_info *info) -{ -} -#endif diff --git a/kernel/module.c b/kernel/module.c deleted file mode 100644 index 6cea788fd965..000000000000 --- a/kernel/module.c +++ /dev/null @@ -1,4810 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2002 Richard Henderson - * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. - */ - -#define INCLUDE_VERMAGIC - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "module-internal.h" - -#define CREATE_TRACE_POINTS -#include - -#ifndef ARCH_SHF_SMALL -#define ARCH_SHF_SMALL 0 -#endif - -/* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -# define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif - -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) - -/* - * Mutex protects: - * 1) List of modules (also safely readable with preempt_disable), - * 2) module_use links, - * 3) module_addr_min/module_addr_max. - * (delete and add uses RCU list operations). - */ -static DEFINE_MUTEX(module_mutex); -static LIST_HEAD(modules); - -/* Work queue for freeing init sections in success case */ -static void do_free_init(struct work_struct *w); -static DECLARE_WORK(init_free_wq, do_free_init); -static LLIST_HEAD(init_free_list); - -#ifdef CONFIG_MODULES_TREE_LOOKUP - -/* - * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. - * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. - */ - -static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->base; -} - -static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->size; -} - -static __always_inline bool -mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) -{ - return __mod_tree_val(a) < __mod_tree_val(b); -} - -static __always_inline int -mod_tree_comp(void *key, struct latch_tree_node *n) -{ - unsigned long val = (unsigned long)key; - unsigned long start, end; - - start = __mod_tree_val(n); - if (val < start) - return -1; - - end = start + __mod_tree_size(n); - if (val >= end) - return 1; - - return 0; -} - -static const struct latch_tree_ops mod_tree_ops = { - .less = mod_tree_less, - .comp = mod_tree_comp, -}; - -static struct mod_tree_root { - struct latch_tree_root root; - unsigned long addr_min; - unsigned long addr_max; -} mod_tree __cacheline_aligned = { - .addr_min = -1UL, -}; - -#define module_addr_min mod_tree.addr_min -#define module_addr_max mod_tree.addr_max - -static noinline void __mod_tree_insert(struct mod_tree_node *node) -{ - latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); -} - -static void __mod_tree_remove(struct mod_tree_node *node) -{ - latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); -} - -/* - * These modifications: insert, remove_init and remove; are serialized by the - * module_mutex. - */ -static void mod_tree_insert(struct module *mod) -{ - mod->core_layout.mtn.mod = mod; - mod->init_layout.mtn.mod = mod; - - __mod_tree_insert(&mod->core_layout.mtn); - if (mod->init_layout.size) - __mod_tree_insert(&mod->init_layout.mtn); -} - -static void mod_tree_remove_init(struct module *mod) -{ - if (mod->init_layout.size) - __mod_tree_remove(&mod->init_layout.mtn); -} - -static void mod_tree_remove(struct module *mod) -{ - __mod_tree_remove(&mod->core_layout.mtn); - mod_tree_remove_init(mod); -} - -static struct module *mod_find(unsigned long addr) -{ - struct latch_tree_node *ltn; - - ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); - if (!ltn) - return NULL; - - return container_of(ltn, struct mod_tree_node, node)->mod; -} - -#else /* MODULES_TREE_LOOKUP */ - -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - -static void mod_tree_insert(struct module *mod) { } -static void mod_tree_remove_init(struct module *mod) { } -static void mod_tree_remove(struct module *mod) { } - -static struct module *mod_find(unsigned long addr) -{ - struct module *mod; - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - if (within_module(addr, mod)) - return mod; - } - - return NULL; -} - -#endif /* MODULES_TREE_LOOKUP */ - -/* - * Bounds of module text, for speeding up __module_address. - * Protected by module_mutex. - */ -static void __mod_update_bounds(void *base, unsigned int size) -{ - unsigned long min = (unsigned long)base; - unsigned long max = min + size; - - if (min < module_addr_min) - module_addr_min = min; - if (max > module_addr_max) - module_addr_max = max; -} - -static void mod_update_bounds(struct module *mod) -{ - __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); - if (mod->init_layout.size) - __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); -} - -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ - -static void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - -#ifdef CONFIG_MODULE_SIG -static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); -module_param(sig_enforce, bool_enable_only, 0644); - -void set_module_sig_enforced(void) -{ - sig_enforce = true; -} -#else -#define sig_enforce false -#endif - -/* - * Export sig_enforce kernel cmdline parameter to allow other subsystems rely - * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. - */ -bool is_module_sig_enforced(void) -{ - return sig_enforce; -} -EXPORT_SYMBOL(is_module_sig_enforced); - -/* Block module loading/unloading? */ -int modules_disabled = 0; -core_param(nomodule, modules_disabled, bint, 0); - -/* Waiting for a module to finish initializing? */ -static DECLARE_WAIT_QUEUE_HEAD(module_wq); - -static BLOCKING_NOTIFIER_HEAD(module_notify_list); - -int register_module_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&module_notify_list, nb); -} -EXPORT_SYMBOL(register_module_notifier); - -int unregister_module_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&module_notify_list, nb); -} -EXPORT_SYMBOL(unregister_module_notifier); - -/* - * We require a truly strong try_module_get(): 0 means success. - * Otherwise an error is returned due to ongoing or failed - * initialization etc. - */ -static inline int strong_try_module_get(struct module *mod) -{ - BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); - if (mod && mod->state == MODULE_STATE_COMING) - return -EBUSY; - if (try_module_get(mod)) - return 0; - else - return -ENOENT; -} - -static inline void add_taint_module(struct module *mod, unsigned flag, - enum lockdep_ok lockdep_ok) -{ - add_taint(flag, lockdep_ok); - set_bit(flag, &mod->taints); -} - -/* - * A thread that wants to hold a reference to a module only while it - * is running can call this to safely exit. - */ -void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) -{ - module_put(mod); - kthread_exit(code); -} -EXPORT_SYMBOL(__module_put_and_kthread_exit); - -/* Find a module section: 0 means not found. */ -static unsigned int find_sec(const struct load_info *info, const char *name) -{ - unsigned int i; - - for (i = 1; i < info->hdr->e_shnum; i++) { - Elf_Shdr *shdr = &info->sechdrs[i]; - /* Alloc bit cleared means "ignore it." */ - if ((shdr->sh_flags & SHF_ALLOC) - && strcmp(info->secstrings + shdr->sh_name, name) == 0) - return i; - } - return 0; -} - -/* Find a module section, or NULL. */ -static void *section_addr(const struct load_info *info, const char *name) -{ - /* Section 0 has sh_addr 0. */ - return (void *)info->sechdrs[find_sec(info, name)].sh_addr; -} - -/* Find a module section, or NULL. Fill in number of "objects" in section. */ -static void *section_objs(const struct load_info *info, - const char *name, - size_t object_size, - unsigned int *num) -{ - unsigned int sec = find_sec(info, name); - - /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = info->sechdrs[sec].sh_size / object_size; - return (void *)info->sechdrs[sec].sh_addr; -} - -/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ -static unsigned int find_any_sec(const struct load_info *info, const char *name) -{ - unsigned int i; - - for (i = 1; i < info->hdr->e_shnum; i++) { - Elf_Shdr *shdr = &info->sechdrs[i]; - if (strcmp(info->secstrings + shdr->sh_name, name) == 0) - return i; - } - return 0; -} - -/* - * Find a module section, or NULL. Fill in number of "objects" in section. - * Ignores SHF_ALLOC flag. - */ -static __maybe_unused void *any_section_objs(const struct load_info *info, - const char *name, - size_t object_size, - unsigned int *num) -{ - unsigned int sec = find_any_sec(info, name); - - /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = info->sechdrs[sec].sh_size / object_size; - return (void *)info->sechdrs[sec].sh_addr; -} - -/* Provided by the linker */ -extern const struct kernel_symbol __start___ksymtab[]; -extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const s32 __start___kcrctab[]; -extern const s32 __start___kcrctab_gpl[]; - -#ifndef CONFIG_MODVERSIONS -#define symversion(base, idx) NULL -#else -#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) -#endif - -struct symsearch { - const struct kernel_symbol *start, *stop; - const s32 *crcs; - enum mod_license { - NOT_GPL_ONLY, - GPL_ONLY, - } license; -}; - -struct find_symbol_arg { - /* Input */ - const char *name; - bool gplok; - bool warn; - - /* Output */ - struct module *owner; - const s32 *crc; - const struct kernel_symbol *sym; - enum mod_license license; -}; - -static bool check_exported_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) -{ - struct find_symbol_arg *fsa = data; - - if (!fsa->gplok && syms->license == GPL_ONLY) - return false; - fsa->owner = owner; - fsa->crc = symversion(syms->crcs, symnum); - fsa->sym = &syms->start[symnum]; - fsa->license = syms->license; - return true; -} - -static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) -{ -#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS - return (unsigned long)offset_to_ptr(&sym->value_offset); -#else - return sym->value; -#endif -} - -static const char *kernel_symbol_name(const struct kernel_symbol *sym) -{ -#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS - return offset_to_ptr(&sym->name_offset); -#else - return sym->name; -#endif -} - -static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) -{ -#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS - if (!sym->namespace_offset) - return NULL; - return offset_to_ptr(&sym->namespace_offset); -#else - return sym->namespace; -#endif -} - -static int cmp_name(const void *name, const void *sym) -{ - return strcmp(name, kernel_symbol_name(sym)); -} - -static bool find_exported_symbol_in_section(const struct symsearch *syms, - struct module *owner, - void *data) -{ - struct find_symbol_arg *fsa = data; - struct kernel_symbol *sym; - - sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, - sizeof(struct kernel_symbol), cmp_name); - - if (sym != NULL && check_exported_symbol(syms, owner, - sym - syms->start, data)) - return true; - - return false; -} - -/* - * Find an exported symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. - */ -static bool find_symbol(struct find_symbol_arg *fsa) -{ - static const struct symsearch arr[] = { - { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - NOT_GPL_ONLY }, - { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, - GPL_ONLY }, - }; - struct module *mod; - unsigned int i; - - module_assert_mutex_or_preempt(); - - for (i = 0; i < ARRAY_SIZE(arr); i++) - if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) - return true; - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - struct symsearch arr[] = { - { mod->syms, mod->syms + mod->num_syms, mod->crcs, - NOT_GPL_ONLY }, - { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, - GPL_ONLY }, - }; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - for (i = 0; i < ARRAY_SIZE(arr); i++) - if (find_exported_symbol_in_section(&arr[i], mod, fsa)) - return true; - } - - pr_debug("Failed to find symbol %s\n", fsa->name); - return false; -} - -/* - * Search for module by name: must hold module_mutex (or preempt disabled - * for read-only access). - */ -static struct module *find_module_all(const char *name, size_t len, - bool even_unformed) -{ - struct module *mod; - - module_assert_mutex_or_preempt(); - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) - continue; - if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) - return mod; - } - return NULL; -} - -struct module *find_module(const char *name) -{ - return find_module_all(name, strlen(name), false); -} - -#ifdef CONFIG_SMP - -static inline void __percpu *mod_percpu(struct module *mod) -{ - return mod->percpu; -} - -static int percpu_modalloc(struct module *mod, struct load_info *info) -{ - Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; - unsigned long align = pcpusec->sh_addralign; - - if (!pcpusec->sh_size) - return 0; - - if (align > PAGE_SIZE) { - pr_warn("%s: per-cpu alignment %li > %li\n", - mod->name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); - if (!mod->percpu) { - pr_warn("%s: Could not allocate %lu bytes percpu data\n", - mod->name, (unsigned long)pcpusec->sh_size); - return -ENOMEM; - } - mod->percpu_size = pcpusec->sh_size; - return 0; -} - -static void percpu_modfree(struct module *mod) -{ - free_percpu(mod->percpu); -} - -static unsigned int find_pcpusec(struct load_info *info) -{ - return find_sec(info, ".data..percpu"); -} - -static void percpu_modcopy(struct module *mod, - const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); -} - -bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) -{ - struct module *mod; - unsigned int cpu; - - preempt_disable(); - - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (!mod->percpu_size) - continue; - for_each_possible_cpu(cpu) { - void *start = per_cpu_ptr(mod->percpu, cpu); - void *va = (void *)addr; - - if (va >= start && va < start + mod->percpu_size) { - if (can_addr) { - *can_addr = (unsigned long) (va - start); - *can_addr += (unsigned long) - per_cpu_ptr(mod->percpu, - get_boot_cpu_id()); - } - preempt_enable(); - return true; - } - } - } - - preempt_enable(); - return false; -} - -/** - * is_module_percpu_address() - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. - * - * Return: %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) -{ - return __is_module_percpu_address(addr, NULL); -} - -#else /* ... !CONFIG_SMP */ - -static inline void __percpu *mod_percpu(struct module *mod) -{ - return NULL; -} -static int percpu_modalloc(struct module *mod, struct load_info *info) -{ - /* UP modules shouldn't have this section: ENOMEM isn't quite right */ - if (info->sechdrs[info->index.pcpu].sh_size != 0) - return -ENOMEM; - return 0; -} -static inline void percpu_modfree(struct module *mod) -{ -} -static unsigned int find_pcpusec(struct load_info *info) -{ - return 0; -} -static inline void percpu_modcopy(struct module *mod, - const void *from, unsigned long size) -{ - /* pcpusec should be 0, and size of that section should be 0. */ - BUG_ON(size != 0); -} -bool is_module_percpu_address(unsigned long addr) -{ - return false; -} - -bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) -{ - return false; -} - -#endif /* CONFIG_SMP */ - -#define MODINFO_ATTR(field) \ -static void setup_modinfo_##field(struct module *mod, const char *s) \ -{ \ - mod->field = kstrdup(s, GFP_KERNEL); \ -} \ -static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ - struct module_kobject *mk, char *buffer) \ -{ \ - return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ -} \ -static int modinfo_##field##_exists(struct module *mod) \ -{ \ - return mod->field != NULL; \ -} \ -static void free_modinfo_##field(struct module *mod) \ -{ \ - kfree(mod->field); \ - mod->field = NULL; \ -} \ -static struct module_attribute modinfo_##field = { \ - .attr = { .name = __stringify(field), .mode = 0444 }, \ - .show = show_modinfo_##field, \ - .setup = setup_modinfo_##field, \ - .test = modinfo_##field##_exists, \ - .free = free_modinfo_##field, \ -}; - -MODINFO_ATTR(version); -MODINFO_ATTR(srcversion); - -static char last_unloaded_module[MODULE_NAME_LEN+1]; - -#ifdef CONFIG_MODULE_UNLOAD - -EXPORT_TRACEPOINT_SYMBOL(module_get); - -/* MODULE_REF_BASE is the base reference count by kmodule loader. */ -#define MODULE_REF_BASE 1 - -/* Init the unload section of the module. */ -static int module_unload_init(struct module *mod) -{ - /* - * Initialize reference counter to MODULE_REF_BASE. - * refcnt == 0 means module is going. - */ - atomic_set(&mod->refcnt, MODULE_REF_BASE); - - INIT_LIST_HEAD(&mod->source_list); - INIT_LIST_HEAD(&mod->target_list); - - /* Hold reference count during initialization. */ - atomic_inc(&mod->refcnt); - - return 0; -} - -/* Does a already use b? */ -static int already_uses(struct module *a, struct module *b) -{ - struct module_use *use; - - list_for_each_entry(use, &b->source_list, source_list) { - if (use->source == a) { - pr_debug("%s uses %s!\n", a->name, b->name); - return 1; - } - } - pr_debug("%s does not use %s!\n", a->name, b->name); - return 0; -} - -/* - * Module a uses b - * - we add 'a' as a "source", 'b' as a "target" of module use - * - the module_use is added to the list of 'b' sources (so - * 'b' can walk the list to see who sourced them), and of 'a' - * targets (so 'a' can see what modules it targets). - */ -static int add_module_usage(struct module *a, struct module *b) -{ - struct module_use *use; - - pr_debug("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) - return -ENOMEM; - - use->source = a; - use->target = b; - list_add(&use->source_list, &b->source_list); - list_add(&use->target_list, &a->target_list); - return 0; -} - -/* Module a uses b: caller needs module_mutex() */ -static int ref_module(struct module *a, struct module *b) -{ - int err; - - if (b == NULL || already_uses(a, b)) - return 0; - - /* If module isn't available, we fail. */ - err = strong_try_module_get(b); - if (err) - return err; - - err = add_module_usage(a, b); - if (err) { - module_put(b); - return err; - } - return 0; -} - -/* Clear the unload stuff of the module. */ -static void module_unload_free(struct module *mod) -{ - struct module_use *use, *tmp; - - mutex_lock(&module_mutex); - list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { - struct module *i = use->target; - pr_debug("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->source_list); - list_del(&use->target_list); - kfree(use); - } - mutex_unlock(&module_mutex); -} - -#ifdef CONFIG_MODULE_FORCE_UNLOAD -static inline int try_force_unload(unsigned int flags) -{ - int ret = (flags & O_TRUNC); - if (ret) - add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); - return ret; -} -#else -static inline int try_force_unload(unsigned int flags) -{ - return 0; -} -#endif /* CONFIG_MODULE_FORCE_UNLOAD */ - -/* Try to release refcount of module, 0 means success. */ -static int try_release_module_ref(struct module *mod) -{ - int ret; - - /* Try to decrement refcnt which we set at loading */ - ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); - BUG_ON(ret < 0); - if (ret) - /* Someone can put this right now, recover with checking */ - ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); - - return ret; -} - -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - /* If it's not unused, quit unless we're forcing. */ - if (try_release_module_ref(mod) != 0) { - *forced = try_force_unload(flags); - if (!(*forced)) - return -EWOULDBLOCK; - } - - /* Mark it as dying. */ - mod->state = MODULE_STATE_GOING; - - return 0; -} - -/** - * module_refcount() - return the refcount or -1 if unloading - * @mod: the module we're checking - * - * Return: - * -1 if the module is in the process of unloading - * otherwise the number of references in the kernel to the module - */ -int module_refcount(struct module *mod) -{ - return atomic_read(&mod->refcnt) - MODULE_REF_BASE; -} -EXPORT_SYMBOL(module_refcount); - -/* This exists whether we can unload or not */ -static void free_module(struct module *mod); - -SYSCALL_DEFINE2(delete_module, const char __user *, name_user, - unsigned int, flags) -{ - struct module *mod; - char name[MODULE_NAME_LEN]; - int ret, forced = 0; - - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; - - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; - - audit_log_kern_module(name); - - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - - mod = find_module(name); - if (!mod) { - ret = -ENOENT; - goto out; - } - - if (!list_empty(&mod->source_list)) { - /* Other modules depend on us: get rid of them first. */ - ret = -EWOULDBLOCK; - goto out; - } - - /* Doing init or already dying? */ - if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count damn the torpedoes */ - pr_debug("%s already dying\n", mod->name); - ret = -EBUSY; - goto out; - } - - /* If it has an init func, it must have an exit func to unload */ - if (mod->init && !mod->exit) { - forced = try_force_unload(flags); - if (!forced) { - /* This module can't be removed */ - ret = -EBUSY; - goto out; - } - } - - ret = try_stop_module(mod, flags, &forced); - if (ret != 0) - goto out; - - mutex_unlock(&module_mutex); - /* Final destruction now no one is using it. */ - if (mod->exit != NULL) - mod->exit(); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - klp_module_going(mod); - ftrace_release_mod(mod); - - async_synchronize_full(); - - /* Store the name of the last unloaded module for diagnostic purposes */ - strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - - free_module(mod); - /* someone could wait for the module in add_unformed_module() */ - wake_up_all(&module_wq); - return 0; -out: - mutex_unlock(&module_mutex); - return ret; -} - -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - struct module_use *use; - int printed_something = 0; - - seq_printf(m, " %i ", module_refcount(mod)); - - /* - * Always include a trailing , so userspace can differentiate - * between this and the old multi-field proc format. - */ - list_for_each_entry(use, &mod->source_list, source_list) { - printed_something = 1; - seq_printf(m, "%s,", use->source->name); - } - - if (mod->init != NULL && mod->exit == NULL) { - printed_something = 1; - seq_puts(m, "[permanent],"); - } - - if (!printed_something) - seq_puts(m, "-"); -} - -void __symbol_put(const char *symbol) -{ - struct find_symbol_arg fsa = { - .name = symbol, - .gplok = true, - }; - - preempt_disable(); - BUG_ON(!find_symbol(&fsa)); - module_put(fsa.owner); - preempt_enable(); -} -EXPORT_SYMBOL(__symbol_put); - -/* Note this assumes addr is a function, which it currently always is. */ -void symbol_put_addr(void *addr) -{ - struct module *modaddr; - unsigned long a = (unsigned long)dereference_function_descriptor(addr); - - if (core_kernel_text(a)) - return; - - /* - * Even though we hold a reference on the module; we still need to - * disable preemption in order to safely traverse the data structure. - */ - preempt_disable(); - modaddr = __module_text_address(a); - BUG_ON(!modaddr); - module_put(modaddr); - preempt_enable(); -} -EXPORT_SYMBOL_GPL(symbol_put_addr); - -static ssize_t show_refcnt(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - return sprintf(buffer, "%i\n", module_refcount(mk->mod)); -} - -static struct module_attribute modinfo_refcnt = - __ATTR(refcnt, 0444, show_refcnt, NULL); - -void __module_get(struct module *module) -{ - if (module) { - preempt_disable(); - atomic_inc(&module->refcnt); - trace_module_get(module, _RET_IP_); - preempt_enable(); - } -} -EXPORT_SYMBOL(__module_get); - -bool try_module_get(struct module *module) -{ - bool ret = true; - - if (module) { - preempt_disable(); - /* Note: here, we can fail to get a reference */ - if (likely(module_is_live(module) && - atomic_inc_not_zero(&module->refcnt) != 0)) - trace_module_get(module, _RET_IP_); - else - ret = false; - - preempt_enable(); - } - return ret; -} -EXPORT_SYMBOL(try_module_get); - -void module_put(struct module *module) -{ - int ret; - - if (module) { - preempt_disable(); - ret = atomic_dec_if_positive(&module->refcnt); - WARN_ON(ret < 0); /* Failed to put refcount */ - trace_module_put(module, _RET_IP_); - preempt_enable(); - } -} -EXPORT_SYMBOL(module_put); - -#else /* !CONFIG_MODULE_UNLOAD */ -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - /* We don't know the usage count, or what modules are using. */ - seq_puts(m, " - -"); -} - -static inline void module_unload_free(struct module *mod) -{ -} - -static int ref_module(struct module *a, struct module *b) -{ - return strong_try_module_get(b); -} - -static inline int module_unload_init(struct module *mod) -{ - return 0; -} -#endif /* CONFIG_MODULE_UNLOAD */ - -static size_t module_flags_taint(struct module *mod, char *buf) -{ - size_t l = 0; - int i; - - for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &mod->taints)) - buf[l++] = taint_flags[i].c_true; - } - - return l; -} - -static ssize_t show_initstate(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - const char *state = "unknown"; - - switch (mk->mod->state) { - case MODULE_STATE_LIVE: - state = "live"; - break; - case MODULE_STATE_COMING: - state = "coming"; - break; - case MODULE_STATE_GOING: - state = "going"; - break; - default: - BUG(); - } - return sprintf(buffer, "%s\n", state); -} - -static struct module_attribute modinfo_initstate = - __ATTR(initstate, 0444, show_initstate, NULL); - -static ssize_t store_uevent(struct module_attribute *mattr, - struct module_kobject *mk, - const char *buffer, size_t count) -{ - int rc; - - rc = kobject_synth_uevent(&mk->kobj, buffer, count); - return rc ? rc : count; -} - -struct module_attribute module_uevent = - __ATTR(uevent, 0200, NULL, store_uevent); - -static ssize_t show_coresize(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - return sprintf(buffer, "%u\n", mk->mod->core_layout.size); -} - -static struct module_attribute modinfo_coresize = - __ATTR(coresize, 0444, show_coresize, NULL); - -static ssize_t show_initsize(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - return sprintf(buffer, "%u\n", mk->mod->init_layout.size); -} - -static struct module_attribute modinfo_initsize = - __ATTR(initsize, 0444, show_initsize, NULL); - -static ssize_t show_taint(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - size_t l; - - l = module_flags_taint(mk->mod, buffer); - buffer[l++] = '\n'; - return l; -} - -static struct module_attribute modinfo_taint = - __ATTR(taint, 0444, show_taint, NULL); - -static struct module_attribute *modinfo_attrs[] = { - &module_uevent, - &modinfo_version, - &modinfo_srcversion, - &modinfo_initstate, - &modinfo_coresize, - &modinfo_initsize, - &modinfo_taint, -#ifdef CONFIG_MODULE_UNLOAD - &modinfo_refcnt, -#endif - NULL, -}; - -static const char vermagic[] = VERMAGIC_STRING; - -static int try_to_force_load(struct module *mod, const char *reason) -{ -#ifdef CONFIG_MODULE_FORCE_LOAD - if (!test_taint(TAINT_FORCED_MODULE)) - pr_warn("%s: %s: kernel tainted.\n", mod->name, reason); - add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); - return 0; -#else - return -ENOEXEC; -#endif -} - -#ifdef CONFIG_MODVERSIONS - -static u32 resolve_rel_crc(const s32 *crc) -{ - return *(u32 *)((void *)crc + *crc); -} - -static int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - Elf_Shdr *sechdrs = info->sechdrs; - unsigned int versindex = info->index.vers; - unsigned int i, num_versions; - struct modversion_info *versions; - - /* Exporting module didn't supply crcs? OK, we're already tainted. */ - if (!crc) - return 1; - - /* No versions at all? modprobe --force does this. */ - if (versindex == 0) - return try_to_force_load(mod, symname) == 0; - - versions = (void *) sechdrs[versindex].sh_addr; - num_versions = sechdrs[versindex].sh_size - / sizeof(struct modversion_info); - - for (i = 0; i < num_versions; i++) { - u32 crcval; - - if (strcmp(versions[i].name, symname) != 0) - continue; - - if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) - crcval = resolve_rel_crc(crc); - else - crcval = *crc; - if (versions[i].crc == crcval) - return 1; - pr_debug("Found checksum %X vs module %lX\n", - crcval, versions[i].crc); - goto bad_version; - } - - /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", info->name, symname); - return 1; - -bad_version: - pr_warn("%s: disagrees about version of symbol %s\n", - info->name, symname); - return 0; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - struct find_symbol_arg fsa = { - .name = "module_layout", - .gplok = true, - }; - - /* - * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. - */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); - return check_version(info, "module_layout", mod, fsa.crc); -} - -/* First part is kernel version, which we ignore if module has crcs. */ -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - if (has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); - } - return strcmp(amagic, bmagic) == 0; -} -#else -static inline int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - return 1; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - return 1; -} - -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - return strcmp(amagic, bmagic) == 0; -} -#endif /* CONFIG_MODVERSIONS */ - -static char *get_modinfo(const struct load_info *info, const char *tag); -static char *get_next_modinfo(const struct load_info *info, const char *tag, - char *prev); - -static int verify_namespace_is_imported(const struct load_info *info, - const struct kernel_symbol *sym, - struct module *mod) -{ - const char *namespace; - char *imported_namespace; - - namespace = kernel_symbol_namespace(sym); - if (namespace && namespace[0]) { - imported_namespace = get_modinfo(info, "import_ns"); - while (imported_namespace) { - if (strcmp(namespace, imported_namespace) == 0) - return 0; - imported_namespace = get_next_modinfo( - info, "import_ns", imported_namespace); - } -#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS - pr_warn( -#else - pr_err( -#endif - "%s: module uses symbol (%s) from namespace %s, but does not import it.\n", - mod->name, kernel_symbol_name(sym), namespace); -#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS - return -EINVAL; -#endif - } - return 0; -} - -static bool inherit_taint(struct module *mod, struct module *owner) -{ - if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) - return true; - - if (mod->using_gplonly_symbols) { - pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", - mod->name, owner->name); - return false; - } - - if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { - pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", - mod->name, owner->name); - set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); - } - return true; -} - -/* Resolve a symbol for this module. I.e. if we find one, record usage. */ -static const struct kernel_symbol *resolve_symbol(struct module *mod, - const struct load_info *info, - const char *name, - char ownername[]) -{ - struct find_symbol_arg fsa = { - .name = name, - .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), - .warn = true, - }; - int err; - - /* - * The module_mutex should not be a heavily contended lock; - * if we get the occasional sleep here, we'll go an extra iteration - * in the wait_event_interruptible(), which is harmless. - */ - sched_annotate_sleep(); - mutex_lock(&module_mutex); - if (!find_symbol(&fsa)) - goto unlock; - - if (fsa.license == GPL_ONLY) - mod->using_gplonly_symbols = true; - - if (!inherit_taint(mod, fsa.owner)) { - fsa.sym = NULL; - goto getname; - } - - if (!check_version(info, name, mod, fsa.crc)) { - fsa.sym = ERR_PTR(-EINVAL); - goto getname; - } - - err = verify_namespace_is_imported(info, fsa.sym, mod); - if (err) { - fsa.sym = ERR_PTR(err); - goto getname; - } - - err = ref_module(mod, fsa.owner); - if (err) { - fsa.sym = ERR_PTR(err); - goto getname; - } - -getname: - /* We must make copy under the lock if we failed to get ref. */ - strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); -unlock: - mutex_unlock(&module_mutex); - return fsa.sym; -} - -static const struct kernel_symbol * -resolve_symbol_wait(struct module *mod, - const struct load_info *info, - const char *name) -{ - const struct kernel_symbol *ksym; - char owner[MODULE_NAME_LEN]; - - if (wait_event_interruptible_timeout(module_wq, - !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) - || PTR_ERR(ksym) != -EBUSY, - 30 * HZ) <= 0) { - pr_warn("%s: gave up waiting for init of module %s.\n", - mod->name, owner); - } - return ksym; -} - -#ifdef CONFIG_KALLSYMS -static inline bool sect_empty(const Elf_Shdr *sect) -{ - return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; -} -#endif - -/* - * /sys/module/foo/sections stuff - * J. Corbet - */ -#ifdef CONFIG_SYSFS - -#ifdef CONFIG_KALLSYMS -struct module_sect_attr { - struct bin_attribute battr; - unsigned long address; -}; - -struct module_sect_attrs { - struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[]; -}; - -#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) -static ssize_t module_sect_read(struct file *file, struct kobject *kobj, - struct bin_attribute *battr, - char *buf, loff_t pos, size_t count) -{ - struct module_sect_attr *sattr = - container_of(battr, struct module_sect_attr, battr); - char bounce[MODULE_SECT_READ_SIZE + 1]; - size_t wrote; - - if (pos != 0) - return -EINVAL; - - /* - * Since we're a binary read handler, we must account for the - * trailing NUL byte that sprintf will write: if "buf" is - * too small to hold the NUL, or the NUL is exactly the last - * byte, the read will look like it got truncated by one byte. - * Since there is no way to ask sprintf nicely to not write - * the NUL, we have to use a bounce buffer. - */ - wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", - kallsyms_show_value(file->f_cred) - ? (void *)sattr->address : NULL); - count = min(count, wrote); - memcpy(buf, bounce, count); - - return count; -} - -static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -{ - unsigned int section; - - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].battr.attr.name); - kfree(sect_attrs); -} - -static void add_sect_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int nloaded = 0, i, size[2]; - struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct bin_attribute **gattr; - - /* Count loaded sections and allocate structures */ - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i])) - nloaded++; - size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.bin_attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); - if (sect_attrs == NULL) - return; - - /* Setup section attributes. */ - sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; - - sect_attrs->nsections = 0; - sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.bin_attrs[0]; - for (i = 0; i < info->hdr->e_shnum; i++) { - Elf_Shdr *sec = &info->sechdrs[i]; - if (sect_empty(sec)) - continue; - sysfs_bin_attr_init(&sattr->battr); - sattr->address = sec->sh_addr; - sattr->battr.attr.name = - kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (sattr->battr.attr.name == NULL) - goto out; - sect_attrs->nsections++; - sattr->battr.read = module_sect_read; - sattr->battr.size = MODULE_SECT_READ_SIZE; - sattr->battr.attr.mode = 0400; - *(gattr++) = &(sattr++)->battr; - } - *gattr = NULL; - - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) - goto out; - - mod->sect_attrs = sect_attrs; - return; - out: - free_sect_attrs(sect_attrs); -} - -static void remove_sect_attrs(struct module *mod) -{ - if (mod->sect_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->sect_attrs->grp); - /* - * We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. - */ - free_sect_attrs(mod->sect_attrs); - mod->sect_attrs = NULL; - } -} - -/* - * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. - */ - -struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[]; -}; - -static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. - */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) -{ - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } - kfree(notes_attrs); -} - -static void add_notes_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int notes, loaded, i; - struct module_notes_attrs *notes_attrs; - struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; - - /* Count notes sections and allocate structures. */ - notes = 0; - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i]) && - (info->sechdrs[i].sh_type == SHT_NOTE)) - ++notes; - - if (notes == 0) - return; - - notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), - GFP_KERNEL); - if (notes_attrs == NULL) - return; - - notes_attrs->notes = notes; - nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { - if (sect_empty(&info->sechdrs[i])) - continue; - if (info->sechdrs[i].sh_type == SHT_NOTE) { - sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; - nattr->attr.mode = S_IRUGO; - nattr->size = info->sechdrs[i].sh_size; - nattr->private = (void *) info->sechdrs[i].sh_addr; - nattr->read = module_notes_read; - ++nattr; - } - ++loaded; - } - - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) - goto out; - - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) - goto out; - - mod->notes_attrs = notes_attrs; - return; - - out: - free_notes_attrs(notes_attrs, i); -} - -static void remove_notes_attrs(struct module *mod) -{ - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); -} - -#else - -static inline void add_sect_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_sect_attrs(struct module *mod) -{ -} - -static inline void add_notes_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_notes_attrs(struct module *mod) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void del_usage_links(struct module *mod) -{ -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); - mutex_unlock(&module_mutex); -#endif -} - -static int add_usage_links(struct module *mod) -{ - int ret = 0; -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - ret = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - if (ret) - break; - } - mutex_unlock(&module_mutex); - if (ret) - del_usage_links(mod); -#endif - return ret; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end); - -static int module_add_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - struct module_attribute *temp_attr; - int error = 0; - int i; - - mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * - (ARRAY_SIZE(modinfo_attrs) + 1)), - GFP_KERNEL); - if (!mod->modinfo_attrs) - return -ENOMEM; - - temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (!attr->test || attr->test(mod)) { - memcpy(temp_attr, attr, sizeof(*temp_attr)); - sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj, - &temp_attr->attr); - if (error) - goto error_out; - ++temp_attr; - } - } - - return 0; - -error_out: - if (i > 0) - module_remove_modinfo_attrs(mod, --i); - else - kfree(mod->modinfo_attrs); - return error; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { - if (end >= 0 && i > end) - break; - /* pick a field to test for end of list */ - if (!attr->attr.name) - break; - sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); - if (attr->free) - attr->free(mod); - } - kfree(mod->modinfo_attrs); -} - -static void mod_kobject_put(struct module *mod) -{ - DECLARE_COMPLETION_ONSTACK(c); - mod->mkobj.kobj_completion = &c; - kobject_put(&mod->mkobj.kobj); - wait_for_completion(&c); -} - -static int mod_sysfs_init(struct module *mod) -{ - int err; - struct kobject *kobj; - - if (!module_sysfs_initialized) { - pr_err("%s: module sysfs not initialized\n", mod->name); - err = -EINVAL; - goto out; - } - - kobj = kset_find_obj(module_kset, mod->name); - if (kobj) { - pr_err("%s: module is already loaded\n", mod->name); - kobject_put(kobj); - err = -EINVAL; - goto out; - } - - mod->mkobj.mod = mod; - - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - mod->mkobj.kobj.kset = module_kset; - err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, - "%s", mod->name); - if (err) - mod_kobject_put(mod); - -out: - return err; -} - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - int err; - - err = mod_sysfs_init(mod); - if (err) - goto out; - - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); - if (!mod->holders_dir) { - err = -ENOMEM; - goto out_unreg; - } - - err = module_param_sysfs_setup(mod, kparam, num_params); - if (err) - goto out_unreg_holders; - - err = module_add_modinfo_attrs(mod); - if (err) - goto out_unreg_param; - - err = add_usage_links(mod); - if (err) - goto out_unreg_modinfo_attrs; - - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); - - return 0; - -out_unreg_modinfo_attrs: - module_remove_modinfo_attrs(mod, -1); -out_unreg_param: - module_param_sysfs_remove(mod); -out_unreg_holders: - kobject_put(mod->holders_dir); -out_unreg: - mod_kobject_put(mod); -out: - return err; -} - -static void mod_sysfs_fini(struct module *mod) -{ - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_put(mod); -} - -static void init_param_lock(struct module *mod) -{ - mutex_init(&mod->param_lock); -} -#else /* !CONFIG_SYSFS */ - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static void mod_sysfs_fini(struct module *mod) -{ -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ -} - -static void del_usage_links(struct module *mod) -{ -} - -static void init_param_lock(struct module *mod) -{ -} -#endif /* CONFIG_SYSFS */ - -static void mod_sysfs_teardown(struct module *mod) -{ - del_usage_links(mod); - module_remove_modinfo_attrs(mod, -1); - module_param_sysfs_remove(mod); - kobject_put(mod->mkobj.drivers_dir); - kobject_put(mod->holders_dir); - mod_sysfs_fini(mod); -} - -/* - * LKM RO/NX protection: protect module's text/ro-data - * from modification and any data from execution. - * - * General layout of module is: - * [text] [read-only-data] [ro-after-init] [writable data] - * text_size -----^ ^ ^ ^ - * ro_size ------------------------| | | - * ro_after_init_size -----------------------------| | - * size -----------------------------------------------------------| - * - * These values are always page-aligned (as is base) - */ - -/* - * Since some arches are moving towards PAGE_KERNEL module allocations instead - * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the - * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of - * whether we are strict. - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -static void frob_text(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base, - layout->text_size >> PAGE_SHIFT); -} - -static void module_enable_x(const struct module *mod) -{ - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); -} -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - -#ifdef CONFIG_STRICT_MODULE_RWX -static void frob_rodata(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->text_size, - (layout->ro_size - layout->text_size) >> PAGE_SHIFT); -} - -static void frob_ro_after_init(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); -} - -static void frob_writable_data(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_after_init_size, - (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); -} - -static void module_enable_ro(const struct module *mod, bool after_init) -{ - if (!rodata_enabled) - return; - - set_vm_flush_reset_perms(mod->core_layout.base); - set_vm_flush_reset_perms(mod->init_layout.base); - frob_text(&mod->core_layout, set_memory_ro); - - frob_rodata(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_rodata(&mod->init_layout, set_memory_ro); - - if (after_init) - frob_ro_after_init(&mod->core_layout, set_memory_ro); -} - -static void module_enable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_nx); - frob_ro_after_init(&mod->core_layout, set_memory_nx); - frob_writable_data(&mod->core_layout, set_memory_nx); - frob_rodata(&mod->init_layout, set_memory_nx); - frob_writable_data(&mod->init_layout, set_memory_nx); -} - -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; - int i; - - for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { - pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", - mod->name, secstrings + sechdrs[i].sh_name, i); - return -ENOEXEC; - } - } - - return 0; -} - -#else /* !CONFIG_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_ro(const struct module *mod, bool after_init) {} -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - return 0; -} -#endif /* CONFIG_STRICT_MODULE_RWX */ - -#ifdef CONFIG_LIVEPATCH -/* - * Persist Elf information about a module. Copy the Elf header, - * section header table, section string table, and symtab section - * index from info to mod->klp_info. - */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - unsigned int size, symndx; - int ret; - - size = sizeof(*mod->klp_info); - mod->klp_info = kmalloc(size, GFP_KERNEL); - if (mod->klp_info == NULL) - return -ENOMEM; - - /* Elf header */ - size = sizeof(mod->klp_info->hdr); - memcpy(&mod->klp_info->hdr, info->hdr, size); - - /* Elf section header table */ - size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); - if (mod->klp_info->sechdrs == NULL) { - ret = -ENOMEM; - goto free_info; - } - - /* Elf section name string table */ - size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); - if (mod->klp_info->secstrings == NULL) { - ret = -ENOMEM; - goto free_sechdrs; - } - - /* Elf symbol section index */ - symndx = info->index.sym; - mod->klp_info->symndx = symndx; - - /* - * For livepatch modules, core_kallsyms.symtab is a complete - * copy of the original symbol table. Adjust sh_addr to point - * to core_kallsyms.symtab since the copy of the symtab in module - * init memory is freed at the end of do_init_module(). - */ - mod->klp_info->sechdrs[symndx].sh_addr = \ - (unsigned long) mod->core_kallsyms.symtab; - - return 0; - -free_sechdrs: - kfree(mod->klp_info->sechdrs); -free_info: - kfree(mod->klp_info); - return ret; -} - -static void free_module_elf(struct module *mod) -{ - kfree(mod->klp_info->sechdrs); - kfree(mod->klp_info->secstrings); - kfree(mod->klp_info); -} -#else /* !CONFIG_LIVEPATCH */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - return 0; -} - -static void free_module_elf(struct module *mod) -{ -} -#endif /* CONFIG_LIVEPATCH */ - -void __weak module_memfree(void *module_region) -{ - /* - * This memory may be RO, and freeing RO memory in an interrupt is not - * supported by vmalloc. - */ - WARN_ON(in_interrupt()); - vfree(module_region); -} - -void __weak module_arch_cleanup(struct module *mod) -{ -} - -void __weak module_arch_freeing_init(struct module *mod) -{ -} - -static void cfi_cleanup(struct module *mod); - -/* Free a module, remove from lists, etc. */ -static void free_module(struct module *mod) -{ - trace_module_free(mod); - - mod_sysfs_teardown(mod); - - /* - * We leave it in list to prevent duplicate loads, but make sure - * that noone uses it while it's being deconstructed. - */ - mutex_lock(&module_mutex); - mod->state = MODULE_STATE_UNFORMED; - mutex_unlock(&module_mutex); - - /* Remove dynamic debug info */ - ddebug_remove_module(mod->name); - - /* Arch-specific cleanup. */ - module_arch_cleanup(mod); - - /* Module unload stuff */ - module_unload_free(mod); - - /* Free any allocated parameters. */ - destroy_params(mod->kp, mod->num_kp); - - if (is_livepatch_module(mod)) - free_module_elf(mod); - - /* Now we can delete it from the lists */ - mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. */ - list_del_rcu(&mod->list); - mod_tree_remove(mod); - /* Remove this module from bug list, this uses list_del_rcu */ - module_bug_cleanup(mod); - /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ - synchronize_rcu(); - mutex_unlock(&module_mutex); - - /* Clean up CFI for the module. */ - cfi_cleanup(mod); - - /* This may be empty, but that's OK */ - module_arch_freeing_init(mod); - module_memfree(mod->init_layout.base); - kfree(mod->args); - percpu_modfree(mod); - - /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); - - /* Finally, free the core (containing the module structure) */ - module_memfree(mod->core_layout.base); -} - -void *__symbol_get(const char *symbol) -{ - struct find_symbol_arg fsa = { - .name = symbol, - .gplok = true, - .warn = true, - }; - - preempt_disable(); - if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) { - preempt_enable(); - return NULL; - } - preempt_enable(); - return (void *)kernel_symbol_value(fsa.sym); -} -EXPORT_SYMBOL_GPL(__symbol_get); - -/* - * Ensure that an exported symbol [global namespace] does not already exist - * in the kernel or in some other module's exported symbol table. - * - * You must hold the module_mutex. - */ -static int verify_exported_symbols(struct module *mod) -{ - unsigned int i; - const struct kernel_symbol *s; - struct { - const struct kernel_symbol *sym; - unsigned int num; - } arr[] = { - { mod->syms, mod->num_syms }, - { mod->gpl_syms, mod->num_gpl_syms }, - }; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - struct find_symbol_arg fsa = { - .name = kernel_symbol_name(s), - .gplok = true, - }; - if (find_symbol(&fsa)) { - pr_err("%s: exports duplicate symbol %s" - " (owned by %s)\n", - mod->name, kernel_symbol_name(s), - module_name(fsa.owner)); - return -ENOEXEC; - } - } - } - return 0; -} - -static bool ignore_undef_symbol(Elf_Half emachine, const char *name) -{ - /* - * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as - * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64. - * i386 has a similar problem but may not deserve a fix. - * - * If we ever have to ignore many symbols, consider refactoring the code to - * only warn if referenced by a relocation. - */ - if (emachine == EM_386 || emachine == EM_X86_64) - return !strcmp(name, "_GLOBAL_OFFSET_TABLE_"); - return false; -} - -/* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(struct module *mod, const struct load_info *info) -{ - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - Elf_Sym *sym = (void *)symsec->sh_addr; - unsigned long secbase; - unsigned int i; - int ret = 0; - const struct kernel_symbol *ksym; - - for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { - const char *name = info->strtab + sym[i].st_name; - - switch (sym[i].st_shndx) { - case SHN_COMMON: - /* Ignore common symbols */ - if (!strncmp(name, "__gnu_lto", 9)) - break; - - /* - * We compiled with -fno-common. These are not - * supposed to happen. - */ - pr_debug("Common symbol: %s\n", name); - pr_warn("%s: please compile with -fno-common\n", - mod->name); - ret = -ENOEXEC; - break; - - case SHN_ABS: - /* Don't need to do anything */ - pr_debug("Absolute symbol: 0x%08lx\n", - (long)sym[i].st_value); - break; - - case SHN_LIVEPATCH: - /* Livepatch symbols are resolved by livepatch */ - break; - - case SHN_UNDEF: - ksym = resolve_symbol_wait(mod, info, name); - /* Ok if resolved. */ - if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = kernel_symbol_value(ksym); - break; - } - - /* Ok if weak or ignored. */ - if (!ksym && - (ELF_ST_BIND(sym[i].st_info) == STB_WEAK || - ignore_undef_symbol(info->hdr->e_machine, name))) - break; - - ret = PTR_ERR(ksym) ?: -ENOENT; - pr_warn("%s: Unknown symbol %s (err %d)\n", - mod->name, name, ret); - break; - - default: - /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == info->index.pcpu) - secbase = (unsigned long)mod_percpu(mod); - else - secbase = info->sechdrs[sym[i].st_shndx].sh_addr; - sym[i].st_value += secbase; - break; - } - } - - return ret; -} - -static int apply_relocations(struct module *mod, const struct load_info *info) -{ - unsigned int i; - int err = 0; - - /* Now do relocations. */ - for (i = 1; i < info->hdr->e_shnum; i++) { - unsigned int infosec = info->sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (infosec >= info->hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) - continue; - - if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) - err = klp_apply_section_relocs(mod, info->sechdrs, - info->secstrings, - info->strtab, - info->index.sym, i, - NULL); - else if (info->sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(info->sechdrs, info->strtab, - info->index.sym, i, mod); - else if (info->sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(info->sechdrs, info->strtab, - info->index.sym, i, mod); - if (err < 0) - break; - } - return err; -} - -/* Additional bytes needed by arch in front of individual sections */ -unsigned int __weak arch_mod_section_prepend(struct module *mod, - unsigned int section) -{ - /* default implementation just returns zero */ - return 0; -} - -/* Update size with this section: return offset. */ -static long get_offset(struct module *mod, unsigned int *size, - Elf_Shdr *sechdr, unsigned int section) -{ - long ret; - - *size += arch_mod_section_prepend(mod, section); - ret = ALIGN(*size, sechdr->sh_addralign ?: 1); - *size = ret + sechdr->sh_size; - return ret; -} - -static bool module_init_layout_section(const char *sname) -{ -#ifndef CONFIG_MODULE_UNLOAD - if (module_exit_section(sname)) - return true; -#endif - return module_init_section(sname); -} - -/* - * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld - * might -- code, read-only data, read-write data, small data. Tally - * sizes, and place the offsets into sh_entsize fields: high bit means it - * belongs in init. - */ -static void layout_sections(struct module *mod, struct load_info *info) -{ - static unsigned long const masks[][2] = { - /* - * NOTE: all executable code must be the first section - * in this array; otherwise modify the text_size - * finder in the two loops below - */ - { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, - { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, - { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, - { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, - { ARCH_SHF_SMALL | SHF_ALLOC, 0 } - }; - unsigned int m, i; - - for (i = 0; i < info->hdr->e_shnum; i++) - info->sechdrs[i].sh_entsize = ~0UL; - - pr_debug("Core section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < info->hdr->e_shnum; ++i) { - Elf_Shdr *s = &info->sechdrs[i]; - const char *sname = info->secstrings + s->sh_name; - - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || module_init_layout_section(sname)) - continue; - s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); - pr_debug("\t%s\n", sname); - } - switch (m) { - case 0: /* executable */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.text_size = mod->core_layout.size; - break; - case 1: /* RO: text and ro-data */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_size = mod->core_layout.size; - break; - case 2: /* RO after init */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_after_init_size = mod->core_layout.size; - break; - case 4: /* whole core */ - mod->core_layout.size = debug_align(mod->core_layout.size); - break; - } - } - - pr_debug("Init section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < info->hdr->e_shnum; ++i) { - Elf_Shdr *s = &info->sechdrs[i]; - const char *sname = info->secstrings + s->sh_name; - - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || !module_init_layout_section(sname)) - continue; - s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) - | INIT_OFFSET_MASK); - pr_debug("\t%s\n", sname); - } - switch (m) { - case 0: /* executable */ - mod->init_layout.size = debug_align(mod->init_layout.size); - mod->init_layout.text_size = mod->init_layout.size; - break; - case 1: /* RO: text and ro-data */ - mod->init_layout.size = debug_align(mod->init_layout.size); - mod->init_layout.ro_size = mod->init_layout.size; - break; - case 2: - /* - * RO after init doesn't apply to init_layout (only - * core_layout), so it just takes the value of ro_size. - */ - mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; - break; - case 4: /* whole init */ - mod->init_layout.size = debug_align(mod->init_layout.size); - break; - } - } -} - -static void set_license(struct module *mod, const char *license) -{ - if (!license) - license = "unspecified"; - - if (!license_is_gpl_compatible(license)) { - if (!test_taint(TAINT_PROPRIETARY_MODULE)) - pr_warn("%s: module license '%s' taints kernel.\n", - mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); - } -} - -/* Parse tag=value strings from .modinfo section */ -static char *next_string(char *string, unsigned long *secsize) -{ - /* Skip non-zero chars */ - while (string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - - /* Skip any zero padding. */ - while (!string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - return string; -} - -static char *get_next_modinfo(const struct load_info *info, const char *tag, - char *prev) -{ - char *p; - unsigned int taglen = strlen(tag); - Elf_Shdr *infosec = &info->sechdrs[info->index.info]; - unsigned long size = infosec->sh_size; - - /* - * get_modinfo() calls made before rewrite_section_headers() - * must use sh_offset, as sh_addr isn't set! - */ - char *modinfo = (char *)info->hdr + infosec->sh_offset; - - if (prev) { - size -= prev - modinfo; - modinfo = next_string(prev, &size); - } - - for (p = modinfo; p; p = next_string(p, &size)) { - if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') - return p + taglen + 1; - } - return NULL; -} - -static char *get_modinfo(const struct load_info *info, const char *tag) -{ - return get_next_modinfo(info, tag, NULL); -} - -static void setup_modinfo(struct module *mod, struct load_info *info) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (attr->setup) - attr->setup(mod, get_modinfo(info, attr->attr.name)); - } -} - -static void free_modinfo(struct module *mod) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (attr->free) - attr->free(mod); - } -} - -#ifdef CONFIG_KALLSYMS - -/* Lookup exported symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_exported_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - return bsearch(name, start, stop - start, - sizeof(struct kernel_symbol), cmp_name); -} - -static int is_exported(const char *name, unsigned long value, - const struct module *mod) -{ - const struct kernel_symbol *ks; - if (!mod) - ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); - else - ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); - - return ks != NULL && kernel_symbol_value(ks) == value; -} - -/* As per nm */ -static char elf_type(const Elf_Sym *sym, const struct load_info *info) -{ - const Elf_Shdr *sechdrs = info->sechdrs; - - if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { - if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) - return 'v'; - else - return 'w'; - } - if (sym->st_shndx == SHN_UNDEF) - return 'U'; - if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) - return 'a'; - if (sym->st_shndx >= SHN_LORESERVE) - return '?'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) - return 't'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC - && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { - if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) - return 'r'; - else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 'g'; - else - return 'd'; - } - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 's'; - else - return 'b'; - } - if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug")) { - return 'n'; - } - return '?'; -} - -static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum, unsigned int pcpundx) -{ - const Elf_Shdr *sec; - - if (src->st_shndx == SHN_UNDEF - || src->st_shndx >= shnum - || !src->st_name) - return false; - -#ifdef CONFIG_KALLSYMS_ALL - if (src->st_shndx == pcpundx) - return true; -#endif - - sec = sechdrs + src->st_shndx; - if (!(sec->sh_flags & SHF_ALLOC) -#ifndef CONFIG_KALLSYMS_ALL - || !(sec->sh_flags & SHF_EXECINSTR) -#endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) - return false; - - return true; -} - -/* - * We only allocate and copy the strings needed by the parts of symtab - * we keep. This is simple, but has the effect of making multiple - * copies of duplicates. We could be more sophisticated, see - * linux-kernel thread starting with - * <73defb5e4bca04a6431392cc341112b1@localhost>. - */ -static void layout_symtab(struct module *mod, struct load_info *info) -{ - Elf_Shdr *symsect = info->sechdrs + info->index.sym; - Elf_Shdr *strsect = info->sechdrs + info->index.str; - const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size = 0; - - /* Put symbol section at end of init part of module. */ - symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, - info->index.sym) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + symsect->sh_name); - - src = (void *)info->hdr + symsect->sh_offset; - nsrc = symsect->sh_size / sizeof(*src); - - /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = 0; i < nsrc; i++) { - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - strtab_size += strlen(&info->strtab[src[i].st_name])+1; - ndst++; - } - } - - /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_layout.size += strtab_size; - info->core_typeoffs = mod->core_layout.size; - mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = debug_align(mod->core_layout.size); - - /* Put string table section at end of init part of module. */ - strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, - info->index.str) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + strsect->sh_name); - - /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_layout.size = ALIGN(mod->init_layout.size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_layout.size; - mod->init_layout.size += sizeof(struct mod_kallsyms); - info->init_typeoffs = mod->init_layout.size; - mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = debug_align(mod->init_layout.size); -} - -/* - * We use the full symtab and strtab which layout_symtab arranged to - * be appended to the init section. Later we switch to the cut-down - * core-only ones. - */ -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ - unsigned int i, ndst; - const Elf_Sym *src; - Elf_Sym *dst; - char *s; - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - - /* Set up to point into init section. */ - mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; - - mod->kallsyms->symtab = (void *)symsec->sh_addr; - mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); - /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - - /* - * Now populate the cut down core kallsyms for after init - * and set types up while we still have access to sections. - */ - mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; - src = mod->kallsyms->symtab; - for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - mod->kallsyms->typetab[i] = elf_type(src + i, info); - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - mod->core_kallsyms.typetab[ndst] = - mod->kallsyms->typetab[i]; - dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], - KSYM_NAME_LEN) + 1; - } - } - mod->core_kallsyms.num_symtab = ndst; -} -#else -static inline void layout_symtab(struct module *mod, struct load_info *info) -{ -} - -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ -} -#endif /* CONFIG_KALLSYMS */ - -#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) -static void init_build_id(struct module *mod, const struct load_info *info) -{ - const Elf_Shdr *sechdr; - unsigned int i; - - for (i = 0; i < info->hdr->e_shnum; i++) { - sechdr = &info->sechdrs[i]; - if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && - !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, - sechdr->sh_size)) - break; - } -} -#else -static void init_build_id(struct module *mod, const struct load_info *info) -{ -} -#endif - -static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) -{ - if (!debug) - return; - ddebug_add_module(debug, num, mod->name); -} - -static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) -{ - if (debug) - ddebug_remove_module(mod->name); -} - -void * __weak module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - -bool __weak module_init_section(const char *name) -{ - return strstarts(name, ".init"); -} - -bool __weak module_exit_section(const char *name) -{ - return strstarts(name, ".exit"); -} - -#ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); - } -} -#else -static inline void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ -} -#endif - -#ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, int flags) -{ - int err = -ENODATA; - const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - const char *reason; - const void *mod = info->hdr; - bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | - MODULE_INIT_IGNORE_VERMAGIC); - /* - * Do not allow mangled modules as a module with version information - * removed is no longer the module that was signed. - */ - if (!mangled_module && - info->len > markerlen && - memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { - /* We truncate the module to discard the signature */ - info->len -= markerlen; - err = mod_verify_sig(mod, info); - if (!err) { - info->sig_ok = true; - return 0; - } - } - - /* - * We don't permit modules to be loaded into the trusted kernels - * without a valid signature on them, but if we're not enforcing, - * certain errors are non-fatal. - */ - switch (err) { - case -ENODATA: - reason = "unsigned module"; - break; - case -ENOPKG: - reason = "module with unsupported crypto"; - break; - case -ENOKEY: - reason = "module with unavailable key"; - break; - - default: - /* - * All other errors are fatal, including lack of memory, - * unparseable signatures, and signature check failures -- - * even if signatures aren't required. - */ - return err; - } - - if (is_module_sig_enforced()) { - pr_notice("Loading of %s is rejected\n", reason); - return -EKEYREJECTED; - } - - return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); -} -#else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, int flags) -{ - return 0; -} -#endif /* !CONFIG_MODULE_SIG */ - -static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) -{ -#if defined(CONFIG_64BIT) - unsigned long long secend; -#else - unsigned long secend; -#endif - - /* - * Check for both overflow and offset/size being - * too large. - */ - secend = shdr->sh_offset + shdr->sh_size; - if (secend < shdr->sh_offset || secend > info->len) - return -ENOEXEC; - - return 0; -} - -/* - * Sanity checks against invalid binaries, wrong arch, weird elf version. - * - * Also do basic validity checks against section offsets and sizes, the - * section name string table, and the indices used for it (sh_name). - */ -static int elf_validity_check(struct load_info *info) -{ - unsigned int i; - Elf_Shdr *shdr, *strhdr; - int err; - - if (info->len < sizeof(*(info->hdr))) { - pr_err("Invalid ELF header len %lu\n", info->len); - goto no_exec; - } - - if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) { - pr_err("Invalid ELF header magic: != %s\n", ELFMAG); - goto no_exec; - } - if (info->hdr->e_type != ET_REL) { - pr_err("Invalid ELF header type: %u != %u\n", - info->hdr->e_type, ET_REL); - goto no_exec; - } - if (!elf_check_arch(info->hdr)) { - pr_err("Invalid architecture in ELF header: %u\n", - info->hdr->e_machine); - goto no_exec; - } - if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) { - pr_err("Invalid ELF section header size\n"); - goto no_exec; - } - - /* - * e_shnum is 16 bits, and sizeof(Elf_Shdr) is - * known and small. So e_shnum * sizeof(Elf_Shdr) - * will not overflow unsigned long on any platform. - */ - if (info->hdr->e_shoff >= info->len - || (info->hdr->e_shnum * sizeof(Elf_Shdr) > - info->len - info->hdr->e_shoff)) { - pr_err("Invalid ELF section header overflow\n"); - goto no_exec; - } - - info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; - - /* - * Verify if the section name table index is valid. - */ - if (info->hdr->e_shstrndx == SHN_UNDEF - || info->hdr->e_shstrndx >= info->hdr->e_shnum) { - pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n", - info->hdr->e_shstrndx, info->hdr->e_shstrndx, - info->hdr->e_shnum); - goto no_exec; - } - - strhdr = &info->sechdrs[info->hdr->e_shstrndx]; - err = validate_section_offset(info, strhdr); - if (err < 0) { - pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type); - return err; - } - - /* - * The section name table must be NUL-terminated, as required - * by the spec. This makes strcmp and pr_* calls that access - * strings in the section safe. - */ - info->secstrings = (void *)info->hdr + strhdr->sh_offset; - if (info->secstrings[strhdr->sh_size - 1] != '\0') { - pr_err("ELF Spec violation: section name table isn't null terminated\n"); - goto no_exec; - } - - /* - * The code assumes that section 0 has a length of zero and - * an addr of zero, so check for it. - */ - if (info->sechdrs[0].sh_type != SHT_NULL - || info->sechdrs[0].sh_size != 0 - || info->sechdrs[0].sh_addr != 0) { - pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n", - info->sechdrs[0].sh_type); - goto no_exec; - } - - for (i = 1; i < info->hdr->e_shnum; i++) { - shdr = &info->sechdrs[i]; - switch (shdr->sh_type) { - case SHT_NULL: - case SHT_NOBITS: - continue; - case SHT_SYMTAB: - if (shdr->sh_link == SHN_UNDEF - || shdr->sh_link >= info->hdr->e_shnum) { - pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n", - shdr->sh_link, shdr->sh_link, - info->hdr->e_shnum); - goto no_exec; - } - fallthrough; - default: - err = validate_section_offset(info, shdr); - if (err < 0) { - pr_err("Invalid ELF section in module (section %u type %u)\n", - i, shdr->sh_type); - return err; - } - - if (shdr->sh_flags & SHF_ALLOC) { - if (shdr->sh_name >= strhdr->sh_size) { - pr_err("Invalid ELF section name in module (section %u type %u)\n", - i, shdr->sh_type); - return -ENOEXEC; - } - } - break; - } - } - - return 0; - -no_exec: - return -ENOEXEC; -} - -#define COPY_CHUNK_SIZE (16*PAGE_SIZE) - -static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) -{ - do { - unsigned long n = min(len, COPY_CHUNK_SIZE); - - if (copy_from_user(dst, usrc, n) != 0) - return -EFAULT; - cond_resched(); - dst += n; - usrc += n; - len -= n; - } while (len); - return 0; -} - -#ifdef CONFIG_LIVEPATCH -static int check_modinfo_livepatch(struct module *mod, struct load_info *info) -{ - if (get_modinfo(info, "livepatch")) { - mod->klp = true; - add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); - pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", - mod->name); - } - - return 0; -} -#else /* !CONFIG_LIVEPATCH */ -static int check_modinfo_livepatch(struct module *mod, struct load_info *info) -{ - if (get_modinfo(info, "livepatch")) { - pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", - mod->name); - return -ENOEXEC; - } - - return 0; -} -#endif /* CONFIG_LIVEPATCH */ - -static void check_modinfo_retpoline(struct module *mod, struct load_info *info) -{ - if (retpoline_module_ok(get_modinfo(info, "retpoline"))) - return; - - pr_warn("%s: loading module not compiled with retpoline compiler.\n", - mod->name); -} - -/* Sets info->hdr and info->len. */ -static int copy_module_from_user(const void __user *umod, unsigned long len, - struct load_info *info) -{ - int err; - - info->len = len; - if (info->len < sizeof(*(info->hdr))) - return -ENOEXEC; - - err = security_kernel_load_data(LOADING_MODULE, true); - if (err) - return err; - - /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); - if (!info->hdr) - return -ENOMEM; - - if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { - err = -EFAULT; - goto out; - } - - err = security_kernel_post_load_data((char *)info->hdr, info->len, - LOADING_MODULE, "init_module"); -out: - if (err) - vfree(info->hdr); - - return err; -} - -static void free_copy(struct load_info *info, int flags) -{ - if (flags & MODULE_INIT_COMPRESSED_FILE) - module_decompress_cleanup(info); - else - vfree(info->hdr); -} - -static int rewrite_section_headers(struct load_info *info, int flags) -{ - unsigned int i; - - /* This should always be true, but let's be sure. */ - info->sechdrs[0].sh_addr = 0; - - for (i = 1; i < info->hdr->e_shnum; i++) { - Elf_Shdr *shdr = &info->sechdrs[i]; - - /* - * Mark all sections sh_addr with their address in the - * temporary image. - */ - shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - - } - - /* Track but don't keep modinfo and version sections. */ - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - - return 0; -} - -/* - * Set up our basic convenience variables (pointers to section headers, - * search for module section index etc), and do some basic section - * verification. - * - * Set info->mod to the temporary copy of the module in info->hdr. The final one - * will be allocated in move_module(). - */ -static int setup_load_info(struct load_info *info, int flags) -{ - unsigned int i; - - /* Try to find a name early so we can log errors with a module name */ - info->index.info = find_sec(info, ".modinfo"); - if (info->index.info) - info->name = get_modinfo(info, "name"); - - /* Find internal symbols and strings. */ - for (i = 1; i < info->hdr->e_shnum; i++) { - if (info->sechdrs[i].sh_type == SHT_SYMTAB) { - info->index.sym = i; - info->index.str = info->sechdrs[i].sh_link; - info->strtab = (char *)info->hdr - + info->sechdrs[info->index.str].sh_offset; - break; - } - } - - if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", - info->name ?: "(missing .modinfo section or name field)"); - return -ENOEXEC; - } - - info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); - if (!info->index.mod) { - pr_warn("%s: No module found in object\n", - info->name ?: "(missing .modinfo section or name field)"); - return -ENOEXEC; - } - /* This is temporary: point mod into copy of data. */ - info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset; - - /* - * If we didn't load the .modinfo 'name' field earlier, fall back to - * on-disk struct mod 'name' field. - */ - if (!info->name) - info->name = info->mod->name; - - if (flags & MODULE_INIT_IGNORE_MODVERSIONS) - info->index.vers = 0; /* Pretend no __versions section! */ - else - info->index.vers = find_sec(info, "__versions"); - - info->index.pcpu = find_pcpusec(info); - - return 0; -} - -static int check_modinfo(struct module *mod, struct load_info *info, int flags) -{ - const char *modmagic = get_modinfo(info, "vermagic"); - int err; - - if (flags & MODULE_INIT_IGNORE_VERMAGIC) - modmagic = NULL; - - /* This is allowed: modprobe --force will invalidate it. */ - if (!modmagic) { - err = try_to_force_load(mod, "bad vermagic"); - if (err) - return err; - } else if (!same_magic(modmagic, vermagic, info->index.vers)) { - pr_err("%s: version magic '%s' should be '%s'\n", - info->name, modmagic, vermagic); - return -ENOEXEC; - } - - if (!get_modinfo(info, "intree")) { - if (!test_taint(TAINT_OOT_MODULE)) - pr_warn("%s: loading out-of-tree module taints kernel.\n", - mod->name); - add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); - } - - check_modinfo_retpoline(mod, info); - - if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); - pr_warn("%s: module is from the staging directory, the quality " - "is unknown, you have been warned.\n", mod->name); - } - - err = check_modinfo_livepatch(mod, info); - if (err) - return err; - - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(info, "license")); - - return 0; -} - -static int find_module_sections(struct module *mod, struct load_info *info) -{ - mod->kp = section_objs(info, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(info, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(info, "__kcrctab"); - mod->gpl_syms = section_objs(info, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); - -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(info, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); - if (!mod->ctors) - mod->ctors = section_objs(info, ".init_array", - sizeof(*mod->ctors), &mod->num_ctors); - else if (find_sec(info, ".init_array")) { - /* - * This shouldn't happen with same compiler and binutils - * building all parts of the module. - */ - pr_warn("%s: has both .ctors and .init_array.\n", - mod->name); - return -EINVAL; - } -#endif - - mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, - &mod->noinstr_text_size); - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", - sizeof(*mod->tracepoints_ptrs), - &mod->num_tracepoints); -#endif -#ifdef CONFIG_TREE_SRCU - mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", - sizeof(*mod->srcu_struct_ptrs), - &mod->num_srcu_structs); -#endif -#ifdef CONFIG_BPF_EVENTS - mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", - sizeof(*mod->bpf_raw_events), - &mod->num_bpf_raw_events); -#endif -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES - mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); -#endif -#ifdef CONFIG_JUMP_LABEL - mod->jump_entries = section_objs(info, "__jump_table", - sizeof(*mod->jump_entries), - &mod->num_jump_entries); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(info, "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - mod->trace_evals = section_objs(info, "_ftrace_eval_map", - sizeof(*mod->trace_evals), - &mod->num_trace_evals); -#endif -#ifdef CONFIG_TRACING - mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", - sizeof(*mod->trace_bprintk_fmt_start), - &mod->num_trace_bprintk_fmt); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif -#ifdef CONFIG_FUNCTION_ERROR_INJECTION - mod->ei_funcs = section_objs(info, "_error_injection_whitelist", - sizeof(*mod->ei_funcs), - &mod->num_ei_funcs); -#endif -#ifdef CONFIG_KPROBES - mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, - &mod->kprobes_text_size); - mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", - sizeof(unsigned long), - &mod->num_kprobe_blacklist); -#endif -#ifdef CONFIG_PRINTK_INDEX - mod->printk_index_start = section_objs(info, ".printk_index", - sizeof(*mod->printk_index_start), - &mod->printk_index_size); -#endif -#ifdef CONFIG_HAVE_STATIC_CALL_INLINE - mod->static_call_sites = section_objs(info, ".static_call_sites", - sizeof(*mod->static_call_sites), - &mod->num_static_call_sites); -#endif - mod->extable = section_objs(info, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - - if (section_addr(info, "__obsparm")) - pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - - info->debug = section_objs(info, "__dyndbg", - sizeof(*info->debug), &info->num_debug); - - return 0; -} - -static int move_module(struct module *mod, struct load_info *info) -{ - int i; - void *ptr; - - /* Do the allocs. */ - ptr = module_alloc(mod->core_layout.size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. Just mark it as not being a - * leak. - */ - kmemleak_not_leak(ptr); - if (!ptr) - return -ENOMEM; - - memset(ptr, 0, mod->core_layout.size); - mod->core_layout.base = ptr; - - if (mod->init_layout.size) { - ptr = module_alloc(mod->init_layout.size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ - kmemleak_ignore(ptr); - if (!ptr) { - module_memfree(mod->core_layout.base); - return -ENOMEM; - } - memset(ptr, 0, mod->init_layout.size); - mod->init_layout.base = ptr; - } else - mod->init_layout.base = NULL; - - /* Transfer each section which specifies SHF_ALLOC */ - pr_debug("final section addresses:\n"); - for (i = 0; i < info->hdr->e_shnum; i++) { - void *dest; - Elf_Shdr *shdr = &info->sechdrs[i]; - - if (!(shdr->sh_flags & SHF_ALLOC)) - continue; - - if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->init_layout.base - + (shdr->sh_entsize & ~INIT_OFFSET_MASK); - else - dest = mod->core_layout.base + shdr->sh_entsize; - - if (shdr->sh_type != SHT_NOBITS) - memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); - /* Update sh_addr to point to copy in image. */ - shdr->sh_addr = (unsigned long)dest; - pr_debug("\t0x%lx %s\n", - (long)shdr->sh_addr, info->secstrings + shdr->sh_name); - } - - return 0; -} - -static int check_module_license_and_versions(struct module *mod) -{ - int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); - - /* - * ndiswrapper is under GPL by itself, but loads proprietary modules. - * Don't use add_taint_module(), as it would prevent ndiswrapper from - * using GPL-only symbols it needs. - */ - if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); - - /* driverloader was caught wrongly pretending to be under GPL */ - if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); - - /* lve claims to be GPL but upstream won't provide source */ - if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); - - if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) - pr_warn("%s: module license taints kernel.\n", mod->name); - -#ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !mod->crcs) || - (mod->num_gpl_syms && !mod->gpl_crcs)) { - return try_to_force_load(mod, - "no versions for exported symbols"); - } -#endif - return 0; -} - -static void flush_module_icache(const struct module *mod) -{ - /* - * Flush the instruction cache, since we've played with text. - * Do it before processing of module parameters, so the module - * can provide parameter accessor functions of its own. - */ - if (mod->init_layout.base) - flush_icache_range((unsigned long)mod->init_layout.base, - (unsigned long)mod->init_layout.base - + mod->init_layout.size); - flush_icache_range((unsigned long)mod->core_layout.base, - (unsigned long)mod->core_layout.base + mod->core_layout.size); -} - -int __weak module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - -/* module_blacklist is a comma-separated list of module names */ -static char *module_blacklist; -static bool blacklisted(const char *module_name) -{ - const char *p; - size_t len; - - if (!module_blacklist) - return false; - - for (p = module_blacklist; *p; p += len) { - len = strcspn(p, ","); - if (strlen(module_name) == len && !memcmp(module_name, p, len)) - return true; - if (p[len] == ',') - len++; - } - return false; -} -core_param(module_blacklist, module_blacklist, charp, 0400); - -static struct module *layout_and_allocate(struct load_info *info, int flags) -{ - struct module *mod; - unsigned int ndx; - int err; - - err = check_modinfo(info->mod, info, flags); - if (err) - return ERR_PTR(err); - - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(info->hdr, info->sechdrs, - info->secstrings, info->mod); - if (err < 0) - return ERR_PTR(err); - - err = module_enforce_rwx_sections(info->hdr, info->sechdrs, - info->secstrings, info->mod); - if (err < 0) - return ERR_PTR(err); - - /* We will do a special allocation for per-cpu sections later. */ - info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; - - /* - * Mark ro_after_init section with SHF_RO_AFTER_INIT so that - * layout_sections() can put it in the right place. - * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. - */ - ndx = find_sec(info, ".data..ro_after_init"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; - /* - * Mark the __jump_table section as ro_after_init as well: these data - * structures are never modified, with the exception of entries that - * refer to code in the __init section, which are annotated as such - * at module load time. - */ - ndx = find_sec(info, "__jump_table"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; - - /* - * Determine total sizes, and put offsets in sh_entsize. For now - * this is done generically; there doesn't appear to be any - * special cases for the architectures. - */ - layout_sections(info->mod, info); - layout_symtab(info->mod, info); - - /* Allocate and move to the final place */ - err = move_module(info->mod, info); - if (err) - return ERR_PTR(err); - - /* Module has been copied to its final place now: return it. */ - mod = (void *)info->sechdrs[info->index.mod].sh_addr; - kmemleak_load_module(mod, info); - return mod; -} - -/* mod is no longer valid after this! */ -static void module_deallocate(struct module *mod, struct load_info *info) -{ - percpu_modfree(mod); - module_arch_freeing_init(mod); - module_memfree(mod->init_layout.base); - module_memfree(mod->core_layout.base); -} - -int __weak module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - -static int post_relocation(struct module *mod, const struct load_info *info) -{ - /* Sort exception table now relocations are done. */ - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Copy relocated percpu area over. */ - percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, - info->sechdrs[info->index.pcpu].sh_size); - - /* Setup kallsyms-specific fields. */ - add_kallsyms(mod, info); - - /* Arch-specific module finalizing. */ - return module_finalize(info->hdr, info->sechdrs, mod); -} - -/* Is this module of this name done loading? No locks held. */ -static bool finished_loading(const char *name) -{ - struct module *mod; - bool ret; - - /* - * The module_mutex should not be a heavily contended lock; - * if we get the occasional sleep here, we'll go an extra iteration - * in the wait_event_interruptible(), which is harmless. - */ - sched_annotate_sleep(); - mutex_lock(&module_mutex); - mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE; - mutex_unlock(&module_mutex); - - return ret; -} - -/* Call module constructors. */ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif -} - -/* For freeing module_init on success, in case kallsyms traversing */ -struct mod_initfree { - struct llist_node node; - void *module_init; -}; - -static void do_free_init(struct work_struct *w) -{ - struct llist_node *pos, *n, *list; - struct mod_initfree *initfree; - - list = llist_del_all(&init_free_list); - - synchronize_rcu(); - - llist_for_each_safe(pos, n, list) { - initfree = container_of(pos, struct mod_initfree, node); - module_memfree(initfree->module_init); - kfree(initfree); - } -} - -/* - * This is where the real work happens. - * - * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb - * helper command 'lx-symbols'. - */ -static noinline int do_init_module(struct module *mod) -{ - int ret = 0; - struct mod_initfree *freeinit; - - freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); - if (!freeinit) { - ret = -ENOMEM; - goto fail; - } - freeinit->module_init = mod->init_layout.base; - - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - goto fail_free_freeinit; - } - if (ret > 0) { - pr_warn("%s: '%s'->init suspiciously returned %d, it should " - "follow 0/-E convention\n" - "%s: loading module anyway...\n", - __func__, mod->name, ret, __func__); - dump_stack(); - } - - /* Now it's a first class citizen! */ - mod->state = MODULE_STATE_LIVE; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); - - /* Delay uevent until module has finished its init routine */ - kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); - - /* - * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock if synchronous module - * loading is requested from async (which is not allowed!). - * - * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous - * request_module() from async workers") for more details. - */ - if (!mod->async_probe_requested) - async_synchronize_full(); - - ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + - mod->init_layout.size); - mutex_lock(&module_mutex); - /* Drop initial reference. */ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - /* Switch to core kallsyms now init is done: kallsyms may be walking! */ - rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); -#endif - module_enable_ro(mod, true); - mod_tree_remove_init(mod); - module_arch_freeing_init(mod); - mod->init_layout.base = NULL; - mod->init_layout.size = 0; - mod->init_layout.ro_size = 0; - mod->init_layout.ro_after_init_size = 0; - mod->init_layout.text_size = 0; -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES - /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ - mod->btf_data = NULL; -#endif - /* - * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. In all the failure paths, we - * call synchronize_rcu(), but we don't want to slow down the success - * path. module_memfree() cannot be called in an interrupt, so do the - * work and call synchronize_rcu() in a work queue. - * - * Note that module_alloc() on most architectures creates W+X page - * mappings which won't be cleaned up until do_free_init() runs. Any - * code such as mark_rodata_ro() which depends on those mappings to - * be cleaned up needs to sync with the queued work - ie - * rcu_barrier() - */ - if (llist_add(&freeinit->node, &init_free_list)) - schedule_work(&init_free_wq); - - mutex_unlock(&module_mutex); - wake_up_all(&module_wq); - - return 0; - -fail_free_freeinit: - kfree(freeinit); -fail: - /* Try to protect us from buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_rcu(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - klp_module_going(mod); - ftrace_release_mod(mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; -} - -static int may_init_module(void) -{ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; - - return 0; -} - -/* - * We try to place it in the list now to make sure it's unique before - * we dedicate too many resources. In particular, temporary percpu - * memory exhaustion. - */ -static int add_unformed_module(struct module *mod) -{ - int err; - struct module *old; - - mod->state = MODULE_STATE_UNFORMED; - -again: - mutex_lock(&module_mutex); - old = find_module_all(mod->name, strlen(mod->name), true); - if (old != NULL) { - if (old->state != MODULE_STATE_LIVE) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto out_unlocked; - goto again; - } - err = -EEXIST; - goto out; - } - mod_update_bounds(mod); - list_add_rcu(&mod->list, &modules); - mod_tree_insert(mod); - err = 0; - -out: - mutex_unlock(&module_mutex); -out_unlocked: - return err; -} - -static int complete_formation(struct module *mod, struct load_info *info) -{ - int err; - - mutex_lock(&module_mutex); - - /* Find duplicate symbols (must be called under lock). */ - err = verify_exported_symbols(mod); - if (err < 0) - goto out; - - /* This relies on module_mutex for list integrity. */ - module_bug_finalize(info->hdr, info->sechdrs, mod); - - module_enable_ro(mod, false); - module_enable_nx(mod); - module_enable_x(mod); - - /* - * Mark state as coming so strong_try_module_get() ignores us, - * but kallsyms etc. can see us. - */ - mod->state = MODULE_STATE_COMING; - mutex_unlock(&module_mutex); - - return 0; - -out: - mutex_unlock(&module_mutex); - return err; -} - -static int prepare_coming_module(struct module *mod) -{ - int err; - - ftrace_module_enable(mod); - err = klp_module_coming(mod); - if (err) - return err; - - err = blocking_notifier_call_chain_robust(&module_notify_list, - MODULE_STATE_COMING, MODULE_STATE_GOING, mod); - err = notifier_to_errno(err); - if (err) - klp_module_going(mod); - - return err; -} - -static int unknown_module_param_cb(char *param, char *val, const char *modname, - void *arg) -{ - struct module *mod = arg; - int ret; - - if (strcmp(param, "async_probe") == 0) { - mod->async_probe_requested = true; - return 0; - } - - /* Check for magic 'dyndbg' arg */ - ret = ddebug_dyndbg_module_param_cb(param, val, modname); - if (ret != 0) - pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); - return 0; -} - -static void cfi_init(struct module *mod); - -/* - * Allocate and load the module: note that size of section 0 is always - * zero, and we rely on this for optional sections. - */ -static int load_module(struct load_info *info, const char __user *uargs, - int flags) -{ - struct module *mod; - long err = 0; - char *after_dashes; - - /* - * Do the signature check (if any) first. All that - * the signature check needs is info->len, it does - * not need any of the section info. That can be - * set up later. This will minimize the chances - * of a corrupt module causing problems before - * we even get to the signature check. - * - * The check will also adjust info->len by stripping - * off the sig length at the end of the module, making - * checks against info->len more correct. - */ - err = module_sig_check(info, flags); - if (err) - goto free_copy; - - /* - * Do basic sanity checks against the ELF header and - * sections. - */ - err = elf_validity_check(info); - if (err) - goto free_copy; - - /* - * Everything checks out, so set up the section info - * in the info structure. - */ - err = setup_load_info(info, flags); - if (err) - goto free_copy; - - /* - * Now that we know we have the correct module name, check - * if it's blacklisted. - */ - if (blacklisted(info->name)) { - err = -EPERM; - pr_err("Module %s is blacklisted\n", info->name); - goto free_copy; - } - - err = rewrite_section_headers(info, flags); - if (err) - goto free_copy; - - /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info, info->mod)) { - err = -ENOEXEC; - goto free_copy; - } - - /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(info, flags); - if (IS_ERR(mod)) { - err = PTR_ERR(mod); - goto free_copy; - } - - audit_log_kern_module(mod->name); - - /* Reserve our place in the list. */ - err = add_unformed_module(mod); - if (err) - goto free_module; - -#ifdef CONFIG_MODULE_SIG - mod->sig_ok = info->sig_ok; - if (!mod->sig_ok) { - pr_notice_once("%s: module verification failed: signature " - "and/or required key missing - tainting " - "kernel\n", mod->name); - add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); - } -#endif - - /* To avoid stressing percpu allocator, do this once we're unique. */ - err = percpu_modalloc(mod, info); - if (err) - goto unlink_mod; - - /* Now module is in final location, initialize linked lists, etc. */ - err = module_unload_init(mod); - if (err) - goto unlink_mod; - - init_param_lock(mod); - - /* - * Now we've got everything in the final locations, we can - * find optional sections. - */ - err = find_module_sections(mod, info); - if (err) - goto free_unload; - - err = check_module_license_and_versions(mod); - if (err) - goto free_unload; - - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, info); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, info); - if (err < 0) - goto free_modinfo; - - err = apply_relocations(mod, info); - if (err < 0) - goto free_modinfo; - - err = post_relocation(mod, info); - if (err < 0) - goto free_modinfo; - - flush_module_icache(mod); - - /* Setup CFI for the module. */ - cfi_init(mod); - - /* Now copy in args */ - mod->args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(mod->args)) { - err = PTR_ERR(mod->args); - goto free_arch_cleanup; - } - - init_build_id(mod, info); - dynamic_debug_setup(mod, info->debug, info->num_debug); - - /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ - ftrace_module_init(mod); - - /* Finally it's fully formed, ready to start executing. */ - err = complete_formation(mod, info); - if (err) - goto ddebug_cleanup; - - err = prepare_coming_module(mod); - if (err) - goto bug_cleanup; - - /* Module is ready to execute: parsing args may do that. */ - after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, mod, - unknown_module_param_cb); - if (IS_ERR(after_dashes)) { - err = PTR_ERR(after_dashes); - goto coming_cleanup; - } else if (after_dashes) { - pr_warn("%s: parameters '%s' after `--' ignored\n", - mod->name, after_dashes); - } - - /* Link in to sysfs. */ - err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); - if (err < 0) - goto coming_cleanup; - - if (is_livepatch_module(mod)) { - err = copy_module_elf(mod, info); - if (err < 0) - goto sysfs_cleanup; - } - - /* Get rid of temporary copy. */ - free_copy(info, flags); - - /* Done! */ - trace_module_load(mod); - - return do_init_module(mod); - - sysfs_cleanup: - mod_sysfs_teardown(mod); - coming_cleanup: - mod->state = MODULE_STATE_GOING; - destroy_params(mod->kp, mod->num_kp); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - klp_module_going(mod); - bug_cleanup: - mod->state = MODULE_STATE_GOING; - /* module_bug_cleanup needs module_mutex protection */ - mutex_lock(&module_mutex); - module_bug_cleanup(mod); - mutex_unlock(&module_mutex); - - ddebug_cleanup: - ftrace_release_mod(mod); - dynamic_debug_remove(mod, info->debug); - synchronize_rcu(); - kfree(mod->args); - free_arch_cleanup: - cfi_cleanup(mod); - module_arch_cleanup(mod); - free_modinfo: - free_modinfo(mod); - free_unload: - module_unload_free(mod); - unlink_mod: - mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. */ - list_del_rcu(&mod->list); - mod_tree_remove(mod); - wake_up_all(&module_wq); - /* Wait for RCU-sched synchronizing before releasing mod->list. */ - synchronize_rcu(); - mutex_unlock(&module_mutex); - free_module: - /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); - - module_deallocate(mod, info); - free_copy: - free_copy(info, flags); - return err; -} - -SYSCALL_DEFINE3(init_module, void __user *, umod, - unsigned long, len, const char __user *, uargs) -{ - int err; - struct load_info info = { }; - - err = may_init_module(); - if (err) - return err; - - pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); - - err = copy_module_from_user(umod, len, &info); - if (err) - return err; - - return load_module(&info, uargs, 0); -} - -SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) -{ - struct load_info info = { }; - void *buf = NULL; - int len; - int err; - - err = may_init_module(); - if (err) - return err; - - pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); - - if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC - |MODULE_INIT_COMPRESSED_FILE)) - return -EINVAL; - - len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, - READING_MODULE); - if (len < 0) - return len; - - if (flags & MODULE_INIT_COMPRESSED_FILE) { - err = module_decompress(&info, buf, len); - vfree(buf); /* compressed data is no longer needed */ - if (err) - return err; - } else { - info.hdr = buf; - info.len = len; - } - - return load_module(&info, uargs, flags); -} - -static inline int within(unsigned long addr, void *start, unsigned long size) -{ - return ((void *)addr >= start && (void *)addr < start + size); -} - -#ifdef CONFIG_KALLSYMS -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - if (str[0] == '.' && str[1] == 'L') - return true; - return str[0] == '$' && strchr("axtd", str[1]) - && (str[2] == '\0' || str[2] == '.'); -} - -static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) -{ - return kallsyms->strtab + kallsyms->symtab[symnum].st_name; -} - -/* - * Given a module and address, find the corresponding symbol and return its name - * while providing its size and offset if needed. - */ -static const char *find_kallsyms_symbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) -{ - unsigned int i, best = 0; - unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - /* At worse, next value is at end of module */ - if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; - else - nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; - - bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); - - /* - * Scan for closest preceding symbol, and next symbol. (ELF - * starts real symbols at 1). - */ - for (i = 1; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - unsigned long thisval = kallsyms_symbol_value(sym); - - if (sym->st_shndx == SHN_UNDEF) - continue; - - /* - * We ignore unnamed symbols: they're uninformative - * and inserted at a whim. - */ - if (*kallsyms_symbol_name(kallsyms, i) == '\0' - || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) - continue; - - if (thisval <= addr && thisval > bestval) { - best = i; - bestval = thisval; - } - if (thisval > addr && thisval < nextval) - nextval = thisval; - } - - if (!best) - return NULL; - - if (size) - *size = nextval - bestval; - if (offset) - *offset = addr - bestval; - - return kallsyms_symbol_name(kallsyms, best); -} - -void * __weak dereference_module_function_descriptor(struct module *mod, - void *ptr) -{ - return ptr; -} - -/* - * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. - */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - const char *ret = NULL; - struct module *mod; - - preempt_disable(); - mod = __module_address(addr); - if (mod) { - if (modname) - *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } - - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); - ret = namebuf; - } - preempt_enable(); - - return ret; -} - -int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, NULL, NULL); - if (!sym) - goto out; - - strlcpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strlcpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - struct mod_kallsyms *kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); - if (symnum < kallsyms->num_symtab) { - const Elf_Sym *sym = &kallsyms->symtab[symnum]; - - *value = kallsyms_symbol_value(sym); - *type = kallsyms->typetab[symnum]; - strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); - strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, *value, mod); - preempt_enable(); - return 0; - } - symnum -= kallsyms->num_symtab; - } - preempt_enable(); - return -ERANGE; -} - -/* Given a module and name of symbol, find and return the symbol's value */ -static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) -{ - unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && - sym->st_shndx != SHN_UNDEF) - return kallsyms_symbol_value(sym); - } - return 0; -} - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) -{ - struct module *mod; - char *colon; - unsigned long ret = 0; - - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); - if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { - if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = find_kallsyms_symbol_value(mod, colon+1); - } else { - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) - break; - } - } - preempt_enable(); - return ret; -} - -#ifdef CONFIG_LIVEPATCH -int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, - struct module *, unsigned long), - void *data) -{ - struct module *mod; - unsigned int i; - int ret = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) { - /* We hold module_mutex: no need for rcu_dereference_sched */ - struct mod_kallsyms *kallsyms = mod->kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (sym->st_shndx == SHN_UNDEF) - continue; - - ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms_symbol_value(sym)); - if (ret != 0) - goto out; - - cond_resched(); - } - } -out: - mutex_unlock(&module_mutex); - return ret; -} -#endif /* CONFIG_LIVEPATCH */ -#endif /* CONFIG_KALLSYMS */ - -static void cfi_init(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - initcall_t *init; - exitcall_t *exit; - - rcu_read_lock_sched(); - mod->cfi_check = (cfi_check_fn) - find_kallsyms_symbol_value(mod, "__cfi_check"); - init = (initcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); - exit = (exitcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); - rcu_read_unlock_sched(); - - /* Fix init/exit functions to point to the CFI jump table */ - if (init) - mod->init = *init; -#ifdef CONFIG_MODULE_UNLOAD - if (exit) - mod->exit = *exit; -#endif - - cfi_module_add(mod, module_addr_min); -#endif -} - -static void cfi_cleanup(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - cfi_module_remove(mod, module_addr_min); -#endif -} - -/* Maximum number of characters written by module_flags() */ -#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) - -/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ -static char *module_flags(struct module *mod, char *buf) -{ - int bx = 0; - - BUG_ON(mod->state == MODULE_STATE_UNFORMED); - if (mod->taints || - mod->state == MODULE_STATE_GOING || - mod->state == MODULE_STATE_COMING) { - buf[bx++] = '('; - bx += module_flags_taint(mod, buf + bx); - /* Show a - for module-is-being-unloaded */ - if (mod->state == MODULE_STATE_GOING) - buf[bx++] = '-'; - /* Show a + for module-is-being-loaded */ - if (mod->state == MODULE_STATE_COMING) - buf[bx++] = '+'; - buf[bx++] = ')'; - } - buf[bx] = '\0'; - - return buf; -} - -#ifdef CONFIG_PROC_FS -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - -static int m_show(struct seq_file *m, void *p) -{ - struct module *mod = list_entry(p, struct module, list); - char buf[MODULE_FLAGS_BUF_SIZE]; - void *value; - - /* We always ignore unformed modules. */ - if (mod->state == MODULE_STATE_UNFORMED) - return 0; - - seq_printf(m, "%s %u", - mod->name, mod->init_layout.size + mod->core_layout.size); - print_unload_info(m, mod); - - /* Informative for users. */ - seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading" : - mod->state == MODULE_STATE_COMING ? "Loading" : - "Live"); - /* Used by oprofile and other similar tools. */ - value = m->private ? NULL : mod->core_layout.base; - seq_printf(m, " 0x%px", value); - - /* Taints info */ - if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); - - seq_puts(m, "\n"); - return 0; -} - -/* - * Format: modulename size refcount deps address - * - * Where refcount is a number or -, and deps is a comma-separated list - * of depends or -. - */ -static const struct seq_operations modules_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = m_show -}; - -/* - * This also sets the "private" pointer to non-NULL if the - * kernel pointers should be hidden (so you can just test - * "m->private" to see if you should keep the values private). - * - * We use the same logic as for /proc/kallsyms. - */ -static int modules_open(struct inode *inode, struct file *file) -{ - int err = seq_open(file, &modules_op); - - if (!err) { - struct seq_file *m = file->private_data; - m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; - } - - return err; -} - -static const struct proc_ops modules_proc_ops = { - .proc_flags = PROC_ENTRY_PERMANENT, - .proc_open = modules_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - -static int __init proc_modules_init(void) -{ - proc_create("modules", 0, NULL, &modules_proc_ops); - return 0; -} -module_init(proc_modules_init); -#endif - -/* Given an address, look for it in the module exception tables. */ -const struct exception_table_entry *search_module_extables(unsigned long addr) -{ - const struct exception_table_entry *e = NULL; - struct module *mod; - - preempt_disable(); - mod = __module_address(addr); - if (!mod) - goto out; - - if (!mod->num_exentries) - goto out; - - e = search_extable(mod->extable, - mod->num_exentries, - addr); -out: - preempt_enable(); - - /* - * Now, if we found one, we are running inside it now, hence - * we cannot unload the module, hence no refcnt needed. - */ - return e; -} - -/** - * is_module_address() - is this address inside a module? - * @addr: the address to check. - * - * See is_module_text_address() if you simply want to see if the address - * is code (not data). - */ -bool is_module_address(unsigned long addr) -{ - bool ret; - - preempt_disable(); - ret = __module_address(addr) != NULL; - preempt_enable(); - - return ret; -} - -/** - * __module_address() - get the module which contains an address. - * @addr: the address. - * - * Must be called with preempt disabled or module mutex held so that - * module doesn't get freed during this. - */ -struct module *__module_address(unsigned long addr) -{ - struct module *mod; - - if (addr < module_addr_min || addr > module_addr_max) - return NULL; - - module_assert_mutex_or_preempt(); - - mod = mod_find(addr); - if (mod) { - BUG_ON(!within_module(addr, mod)); - if (mod->state == MODULE_STATE_UNFORMED) - mod = NULL; - } - return mod; -} - -/** - * is_module_text_address() - is this address inside module code? - * @addr: the address to check. - * - * See is_module_address() if you simply want to see if the address is - * anywhere in a module. See kernel_text_address() for testing if an - * address corresponds to kernel or module code. - */ -bool is_module_text_address(unsigned long addr) -{ - bool ret; - - preempt_disable(); - ret = __module_text_address(addr) != NULL; - preempt_enable(); - - return ret; -} - -/** - * __module_text_address() - get the module whose code contains an address. - * @addr: the address. - * - * Must be called with preempt disabled or module mutex held so that - * module doesn't get freed during this. - */ -struct module *__module_text_address(unsigned long addr) -{ - struct module *mod = __module_address(addr); - if (mod) { - /* Make sure it's within the text section. */ - if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) - && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) - mod = NULL; - } - return mod; -} - -/* Don't grab lock, we're oopsing. */ -void print_modules(void) -{ - struct module *mod; - char buf[MODULE_FLAGS_BUF_SIZE]; - - printk(KERN_DEFAULT "Modules linked in:"); - /* Most callers should already have preempt disabled, but make sure */ - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - pr_cont(" %s%s", mod->name, module_flags(mod, buf)); - } - preempt_enable(); - if (last_unloaded_module[0]) - pr_cont(" [last unloaded: %s]", last_unloaded_module); - pr_cont("\n"); -} - -#ifdef CONFIG_MODVERSIONS -/* - * Generate the signature for all relevant module structures here. - * If these change, we don't want to try to parse the module. - */ -void module_layout(struct module *mod, - struct modversion_info *ver, - struct kernel_param *kp, - struct kernel_symbol *ks, - struct tracepoint * const *tp) -{ -} -EXPORT_SYMBOL(module_layout); -#endif diff --git a/kernel/module/Makefile b/kernel/module/Makefile new file mode 100644 index 000000000000..cdd5c61b8c7f --- /dev/null +++ b/kernel/module/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for linux kernel module support +# + +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n + +obj-y += main.o +obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o +obj-$(CONFIG_MODULE_SIG) += signing.o diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c new file mode 100644 index 000000000000..d14d6443225a --- /dev/null +++ b/kernel/module/decompress.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2021 Google LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static int module_extend_max_pages(struct load_info *info, unsigned int extent) +{ + struct page **new_pages; + + new_pages = kvmalloc_array(info->max_pages + extent, + sizeof(info->pages), GFP_KERNEL); + if (!new_pages) + return -ENOMEM; + + memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages)); + kvfree(info->pages); + info->pages = new_pages; + info->max_pages += extent; + + return 0; +} + +static struct page *module_get_next_page(struct load_info *info) +{ + struct page *page; + int error; + + if (info->max_pages == info->used_pages) { + error = module_extend_max_pages(info, info->used_pages); + if (error) + return ERR_PTR(error); + } + + page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!page) + return ERR_PTR(-ENOMEM); + + info->pages[info->used_pages++] = page; + return page; +} + +#ifdef CONFIG_MODULE_COMPRESS_GZIP +#include +#define MODULE_COMPRESSION gzip +#define MODULE_DECOMPRESS_FN module_gzip_decompress + +/* + * Calculate length of the header which consists of signature, header + * flags, time stamp and operating system ID (10 bytes total), plus + * an optional filename. + */ +static size_t module_gzip_header_len(const u8 *buf, size_t size) +{ + const u8 signature[] = { 0x1f, 0x8b, 0x08 }; + size_t len = 10; + + if (size < len || memcmp(buf, signature, sizeof(signature))) + return 0; + + if (buf[3] & 0x08) { + do { + /* + * If we can't find the end of the file name we must + * be dealing with a corrupted file. + */ + if (len == size) + return 0; + } while (buf[len++] != '\0'); + } + + return len; +} + +static ssize_t module_gzip_decompress(struct load_info *info, + const void *buf, size_t size) +{ + struct z_stream_s s = { 0 }; + size_t new_size = 0; + size_t gzip_hdr_len; + ssize_t retval; + int rc; + + gzip_hdr_len = module_gzip_header_len(buf, size); + if (!gzip_hdr_len) { + pr_err("not a gzip compressed module\n"); + return -EINVAL; + } + + s.next_in = buf + gzip_hdr_len; + s.avail_in = size - gzip_hdr_len; + + s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (!s.workspace) + return -ENOMEM; + + rc = zlib_inflateInit2(&s, -MAX_WBITS); + if (rc != Z_OK) { + pr_err("failed to initialize decompressor: %d\n", rc); + retval = -EINVAL; + goto out; + } + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out_inflate_end; + } + + s.next_out = kmap(page); + s.avail_out = PAGE_SIZE; + rc = zlib_inflate(&s, 0); + kunmap(page); + + new_size += PAGE_SIZE - s.avail_out; + } while (rc == Z_OK); + + if (rc != Z_STREAM_END) { + pr_err("decompression failed with status %d\n", rc); + retval = -EINVAL; + goto out_inflate_end; + } + + retval = new_size; + +out_inflate_end: + zlib_inflateEnd(&s); +out: + kfree(s.workspace); + return retval; +} +#elif CONFIG_MODULE_COMPRESS_XZ +#include +#define MODULE_COMPRESSION xz +#define MODULE_DECOMPRESS_FN module_xz_decompress + +static ssize_t module_xz_decompress(struct load_info *info, + const void *buf, size_t size) +{ + static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 }; + struct xz_dec *xz_dec; + struct xz_buf xz_buf; + enum xz_ret xz_ret; + size_t new_size = 0; + ssize_t retval; + + if (size < sizeof(signature) || + memcmp(buf, signature, sizeof(signature))) { + pr_err("not an xz compressed module\n"); + return -EINVAL; + } + + xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1); + if (!xz_dec) + return -ENOMEM; + + xz_buf.in_size = size; + xz_buf.in = buf; + xz_buf.in_pos = 0; + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out; + } + + xz_buf.out = kmap(page); + xz_buf.out_pos = 0; + xz_buf.out_size = PAGE_SIZE; + xz_ret = xz_dec_run(xz_dec, &xz_buf); + kunmap(page); + + new_size += xz_buf.out_pos; + } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); + + if (xz_ret != XZ_STREAM_END) { + pr_err("decompression failed with status %d\n", xz_ret); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + xz_dec_end(xz_dec); + return retval; +} +#else +#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" +#endif + +int module_decompress(struct load_info *info, const void *buf, size_t size) +{ + unsigned int n_pages; + ssize_t data_size; + int error; + + /* + * Start with number of pages twice as big as needed for + * compressed data. + */ + n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2; + error = module_extend_max_pages(info, n_pages); + + data_size = MODULE_DECOMPRESS_FN(info, buf, size); + if (data_size < 0) { + error = data_size; + goto err; + } + + info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL); + if (!info->hdr) { + error = -ENOMEM; + goto err; + } + + info->len = data_size; + return 0; + +err: + module_decompress_cleanup(info); + return error; +} + +void module_decompress_cleanup(struct load_info *info) +{ + int i; + + if (info->hdr) + vunmap(info->hdr); + + for (i = 0; i < info->used_pages; i++) + __free_page(info->pages[i]); + + kvfree(info->pages); + + info->pages = NULL; + info->max_pages = info->used_pages = 0; +} + +#ifdef CONFIG_SYSFS +static ssize_t compression_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); +} +static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); + +static int __init module_decompress_sysfs_init(void) +{ + int error; + + error = sysfs_create_file(&module_kset->kobj, + &module_compression_attr.attr); + if (error) + pr_warn("Failed to create 'compression' attribute"); + + return 0; +} +late_initcall(module_decompress_sysfs_init); +#endif diff --git a/kernel/module/internal.h b/kernel/module/internal.h new file mode 100644 index 000000000000..8c381c99062f --- /dev/null +++ b/kernel/module/internal.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include + +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; + struct _ddebug *debug; + unsigned int num_debug; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif +#ifdef CONFIG_MODULE_DECOMPRESS + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +extern int mod_verify_sig(const void *mod, struct load_info *info); + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif diff --git a/kernel/module/main.c b/kernel/module/main.c new file mode 100644 index 000000000000..1a17f02f69a0 --- /dev/null +++ b/kernel/module/main.c @@ -0,0 +1,4810 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2002 Richard Henderson + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. + */ + +#define INCLUDE_VERMAGIC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y + */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX +# define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif + +/* If this is set, the section belongs in the init part of the module */ +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) + +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. + * (delete and add uses RCU list operations). + */ +static DEFINE_MUTEX(module_mutex); +static LIST_HEAD(modules); + +/* Work queue for freeing init sections in success case */ +static void do_free_init(struct work_struct *w); +static DECLARE_WORK(init_free_wq, do_free_init); +static LLIST_HEAD(init_free_list); + +#ifdef CONFIG_MODULES_TREE_LOOKUP + +/* + * Use a latched RB-tree for __module_address(); this allows us to use + * RCU-sched lookups of the address from any context. + * + * This is conditional on PERF_EVENTS || TRACING because those can really hit + * __module_address() hard by doing a lot of stack unwinding; potentially from + * NMI context. + */ + +static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->base; +} + +static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->size; +} + +static __always_inline bool +mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) +{ + return __mod_tree_val(a) < __mod_tree_val(b); +} + +static __always_inline int +mod_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long start, end; + + start = __mod_tree_val(n); + if (val < start) + return -1; + + end = start + __mod_tree_size(n); + if (val >= end) + return 1; + + return 0; +} + +static const struct latch_tree_ops mod_tree_ops = { + .less = mod_tree_less, + .comp = mod_tree_comp, +}; + +static struct mod_tree_root { + struct latch_tree_root root; + unsigned long addr_min; + unsigned long addr_max; +} mod_tree __cacheline_aligned = { + .addr_min = -1UL, +}; + +#define module_addr_min mod_tree.addr_min +#define module_addr_max mod_tree.addr_max + +static noinline void __mod_tree_insert(struct mod_tree_node *node) +{ + latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); +} + +static void __mod_tree_remove(struct mod_tree_node *node) +{ + latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); +} + +/* + * These modifications: insert, remove_init and remove; are serialized by the + * module_mutex. + */ +static void mod_tree_insert(struct module *mod) +{ + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; + + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); +} + +static void mod_tree_remove_init(struct module *mod) +{ + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); +} + +static void mod_tree_remove(struct module *mod) +{ + __mod_tree_remove(&mod->core_layout.mtn); + mod_tree_remove_init(mod); +} + +static struct module *mod_find(unsigned long addr) +{ + struct latch_tree_node *ltn; + + ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); + if (!ltn) + return NULL; + + return container_of(ltn, struct mod_tree_node, node)->mod; +} + +#else /* MODULES_TREE_LOOKUP */ + +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + +static void mod_tree_insert(struct module *mod) { } +static void mod_tree_remove_init(struct module *mod) { } +static void mod_tree_remove(struct module *mod) { } + +static struct module *mod_find(unsigned long addr) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} + +#endif /* MODULES_TREE_LOOKUP */ + +/* + * Bounds of module text, for speeding up __module_address. + * Protected by module_mutex. + */ +static void __mod_update_bounds(void *base, unsigned int size) +{ + unsigned long min = (unsigned long)base; + unsigned long max = min + size; + + if (min < module_addr_min) + module_addr_min = min; + if (max > module_addr_max) + module_addr_max = max; +} + +static void mod_update_bounds(struct module *mod) +{ + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); +} + +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + +static void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON_ONCE(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + +#ifdef CONFIG_MODULE_SIG +static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); +module_param(sig_enforce, bool_enable_only, 0644); + +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} +#else +#define sig_enforce false +#endif + +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + +/* Block module loading/unloading? */ +int modules_disabled = 0; +core_param(nomodule, modules_disabled, bint, 0); + +/* Waiting for a module to finish initializing? */ +static DECLARE_WAIT_QUEUE_HEAD(module_wq); + +static BLOCKING_NOTIFIER_HEAD(module_notify_list); + +int register_module_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&module_notify_list, nb); +} +EXPORT_SYMBOL(register_module_notifier); + +int unregister_module_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&module_notify_list, nb); +} +EXPORT_SYMBOL(unregister_module_notifier); + +/* + * We require a truly strong try_module_get(): 0 means success. + * Otherwise an error is returned due to ongoing or failed + * initialization etc. + */ +static inline int strong_try_module_get(struct module *mod) +{ + BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); + if (mod && mod->state == MODULE_STATE_COMING) + return -EBUSY; + if (try_module_get(mod)) + return 0; + else + return -ENOENT; +} + +static inline void add_taint_module(struct module *mod, unsigned flag, + enum lockdep_ok lockdep_ok) +{ + add_taint(flag, lockdep_ok); + set_bit(flag, &mod->taints); +} + +/* + * A thread that wants to hold a reference to a module only while it + * is running can call this to safely exit. + */ +void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) +{ + module_put(mod); + kthread_exit(code); +} +EXPORT_SYMBOL(__module_put_and_kthread_exit); + +/* Find a module section: 0 means not found. */ +static unsigned int find_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + /* Alloc bit cleared means "ignore it." */ + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* Find a module section, or NULL. */ +static void *section_addr(const struct load_info *info, const char *name) +{ + /* Section 0 has sh_addr 0. */ + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; +} + +/* Find a module section, or NULL. Fill in number of "objects" in section. */ +static void *section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + +/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ +static unsigned int find_any_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* + * Find a module section, or NULL. Fill in number of "objects" in section. + * Ignores SHF_ALLOC flag. + */ +static __maybe_unused void *any_section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_any_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const s32 __start___kcrctab[]; +extern const s32 __start___kcrctab_gpl[]; + +#ifndef CONFIG_MODVERSIONS +#define symversion(base, idx) NULL +#else +#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) +#endif + +struct symsearch { + const struct kernel_symbol *start, *stop; + const s32 *crcs; + enum mod_license { + NOT_GPL_ONLY, + GPL_ONLY, + } license; +}; + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const s32 *crc; + const struct kernel_symbol *sym; + enum mod_license license; +}; + +static bool check_exported_symbol(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) +{ + struct find_symbol_arg *fsa = data; + + if (!fsa->gplok && syms->license == GPL_ONLY) + return false; + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, symnum); + fsa->sym = &syms->start[symnum]; + fsa->license = syms->license; + return true; +} + +static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +static const char *kernel_symbol_name(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->name_offset); +#else + return sym->name; +#endif +} + +static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + if (!sym->namespace_offset) + return NULL; + return offset_to_ptr(&sym->namespace_offset); +#else + return sym->namespace; +#endif +} + +static int cmp_name(const void *name, const void *sym) +{ + return strcmp(name, kernel_symbol_name(sym)); +} + +static bool find_exported_symbol_in_section(const struct symsearch *syms, + struct module *owner, + void *data) +{ + struct find_symbol_arg *fsa = data; + struct kernel_symbol *sym; + + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, + sizeof(struct kernel_symbol), cmp_name); + + if (sym != NULL && check_exported_symbol(syms, owner, + sym - syms->start, data)) + return true; + + return false; +} + +/* + * Find an exported symbol and return it, along with, (optional) crc and + * (optional) module which owns it. Needs preempt disabled or module_mutex. + */ +static bool find_symbol(struct find_symbol_arg *fsa) +{ + static const struct symsearch arr[] = { + { __start___ksymtab, __stop___ksymtab, __start___kcrctab, + NOT_GPL_ONLY }, + { __start___ksymtab_gpl, __stop___ksymtab_gpl, + __start___kcrctab_gpl, + GPL_ONLY }, + }; + struct module *mod; + unsigned int i; + + module_assert_mutex_or_preempt(); + + for (i = 0; i < ARRAY_SIZE(arr); i++) + if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) + return true; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + struct symsearch arr[] = { + { mod->syms, mod->syms + mod->num_syms, mod->crcs, + NOT_GPL_ONLY }, + { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, + mod->gpl_crcs, + GPL_ONLY }, + }; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + for (i = 0; i < ARRAY_SIZE(arr); i++) + if (find_exported_symbol_in_section(&arr[i], mod, fsa)) + return true; + } + + pr_debug("Failed to find symbol %s\n", fsa->name); + return false; +} + +/* + * Search for module by name: must hold module_mutex (or preempt disabled + * for read-only access). + */ +static struct module *find_module_all(const char *name, size_t len, + bool even_unformed) +{ + struct module *mod; + + module_assert_mutex_or_preempt(); + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) + continue; + if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) + return mod; + } + return NULL; +} + +struct module *find_module(const char *name) +{ + return find_module_all(name, strlen(name), false); +} + +#ifdef CONFIG_SMP + +static inline void __percpu *mod_percpu(struct module *mod) +{ + return mod->percpu; +} + +static int percpu_modalloc(struct module *mod, struct load_info *info) +{ + Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; + unsigned long align = pcpusec->sh_addralign; + + if (!pcpusec->sh_size) + return 0; + + if (align > PAGE_SIZE) { + pr_warn("%s: per-cpu alignment %li > %li\n", + mod->name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); + if (!mod->percpu) { + pr_warn("%s: Could not allocate %lu bytes percpu data\n", + mod->name, (unsigned long)pcpusec->sh_size); + return -ENOMEM; + } + mod->percpu_size = pcpusec->sh_size; + return 0; +} + +static void percpu_modfree(struct module *mod) +{ + free_percpu(mod->percpu); +} + +static unsigned int find_pcpusec(struct load_info *info) +{ + return find_sec(info, ".data..percpu"); +} + +static void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); +} + +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + struct module *mod; + unsigned int cpu; + + preempt_disable(); + + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (!mod->percpu_size) + continue; + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(mod->percpu, cpu); + void *va = (void *)addr; + + if (va >= start && va < start + mod->percpu_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(mod->percpu, + get_boot_cpu_id()); + } + preempt_enable(); + return true; + } + } + } + + preempt_enable(); + return false; +} + +/** + * is_module_percpu_address() - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * Return: %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + return __is_module_percpu_address(addr, NULL); +} + +#else /* ... !CONFIG_SMP */ + +static inline void __percpu *mod_percpu(struct module *mod) +{ + return NULL; +} +static int percpu_modalloc(struct module *mod, struct load_info *info) +{ + /* UP modules shouldn't have this section: ENOMEM isn't quite right */ + if (info->sechdrs[info->index.pcpu].sh_size != 0) + return -ENOMEM; + return 0; +} +static inline void percpu_modfree(struct module *mod) +{ +} +static unsigned int find_pcpusec(struct load_info *info) +{ + return 0; +} +static inline void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) +{ + /* pcpusec should be 0, and size of that section should be 0. */ + BUG_ON(size != 0); +} +bool is_module_percpu_address(unsigned long addr) +{ + return false; +} + +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + +#endif /* CONFIG_SMP */ + +#define MODINFO_ATTR(field) \ +static void setup_modinfo_##field(struct module *mod, const char *s) \ +{ \ + mod->field = kstrdup(s, GFP_KERNEL); \ +} \ +static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ + struct module_kobject *mk, char *buffer) \ +{ \ + return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ +} \ +static int modinfo_##field##_exists(struct module *mod) \ +{ \ + return mod->field != NULL; \ +} \ +static void free_modinfo_##field(struct module *mod) \ +{ \ + kfree(mod->field); \ + mod->field = NULL; \ +} \ +static struct module_attribute modinfo_##field = { \ + .attr = { .name = __stringify(field), .mode = 0444 }, \ + .show = show_modinfo_##field, \ + .setup = setup_modinfo_##field, \ + .test = modinfo_##field##_exists, \ + .free = free_modinfo_##field, \ +}; + +MODINFO_ATTR(version); +MODINFO_ATTR(srcversion); + +static char last_unloaded_module[MODULE_NAME_LEN+1]; + +#ifdef CONFIG_MODULE_UNLOAD + +EXPORT_TRACEPOINT_SYMBOL(module_get); + +/* MODULE_REF_BASE is the base reference count by kmodule loader. */ +#define MODULE_REF_BASE 1 + +/* Init the unload section of the module. */ +static int module_unload_init(struct module *mod) +{ + /* + * Initialize reference counter to MODULE_REF_BASE. + * refcnt == 0 means module is going. + */ + atomic_set(&mod->refcnt, MODULE_REF_BASE); + + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); + + /* Hold reference count during initialization. */ + atomic_inc(&mod->refcnt); + + return 0; +} + +/* Does a already use b? */ +static int already_uses(struct module *a, struct module *b) +{ + struct module_use *use; + + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { + pr_debug("%s uses %s!\n", a->name, b->name); + return 1; + } + } + pr_debug("%s does not use %s!\n", a->name, b->name); + return 0; +} + +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) +{ + struct module_use *use; + + pr_debug("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) + return -ENOMEM; + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + return 0; +} + +/* Module a uses b: caller needs module_mutex() */ +static int ref_module(struct module *a, struct module *b) +{ + int err; + + if (b == NULL || already_uses(a, b)) + return 0; + + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); + if (err) + return err; + + err = add_module_usage(a, b); + if (err) { + module_put(b); + return err; + } + return 0; +} + +/* Clear the unload stuff of the module. */ +static void module_unload_free(struct module *mod) +{ + struct module_use *use, *tmp; + + mutex_lock(&module_mutex); + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + pr_debug("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); + } + mutex_unlock(&module_mutex); +} + +#ifdef CONFIG_MODULE_FORCE_UNLOAD +static inline int try_force_unload(unsigned int flags) +{ + int ret = (flags & O_TRUNC); + if (ret) + add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); + return ret; +} +#else +static inline int try_force_unload(unsigned int flags) +{ + return 0; +} +#endif /* CONFIG_MODULE_FORCE_UNLOAD */ + +/* Try to release refcount of module, 0 means success. */ +static int try_release_module_ref(struct module *mod) +{ + int ret; + + /* Try to decrement refcnt which we set at loading */ + ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); + BUG_ON(ret < 0); + if (ret) + /* Someone can put this right now, recover with checking */ + ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); + + return ret; +} + +static int try_stop_module(struct module *mod, int flags, int *forced) +{ + /* If it's not unused, quit unless we're forcing. */ + if (try_release_module_ref(mod) != 0) { + *forced = try_force_unload(flags); + if (!(*forced)) + return -EWOULDBLOCK; + } + + /* Mark it as dying. */ + mod->state = MODULE_STATE_GOING; + + return 0; +} + +/** + * module_refcount() - return the refcount or -1 if unloading + * @mod: the module we're checking + * + * Return: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) +{ + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; +} +EXPORT_SYMBOL(module_refcount); + +/* This exists whether we can unload or not */ +static void free_module(struct module *mod); + +SYSCALL_DEFINE2(delete_module, const char __user *, name_user, + unsigned int, flags) +{ + struct module *mod; + char name[MODULE_NAME_LEN]; + int ret, forced = 0; + + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) + return -EFAULT; + name[MODULE_NAME_LEN-1] = '\0'; + + audit_log_kern_module(name); + + if (mutex_lock_interruptible(&module_mutex) != 0) + return -EINTR; + + mod = find_module(name); + if (!mod) { + ret = -ENOENT; + goto out; + } + + if (!list_empty(&mod->source_list)) { + /* Other modules depend on us: get rid of them first. */ + ret = -EWOULDBLOCK; + goto out; + } + + /* Doing init or already dying? */ + if (mod->state != MODULE_STATE_LIVE) { + /* FIXME: if (force), slam module count damn the torpedoes */ + pr_debug("%s already dying\n", mod->name); + ret = -EBUSY; + goto out; + } + + /* If it has an init func, it must have an exit func to unload */ + if (mod->init && !mod->exit) { + forced = try_force_unload(flags); + if (!forced) { + /* This module can't be removed */ + ret = -EBUSY; + goto out; + } + } + + ret = try_stop_module(mod, flags, &forced); + if (ret != 0) + goto out; + + mutex_unlock(&module_mutex); + /* Final destruction now no one is using it. */ + if (mod->exit != NULL) + mod->exit(); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + klp_module_going(mod); + ftrace_release_mod(mod); + + async_synchronize_full(); + + /* Store the name of the last unloaded module for diagnostic purposes */ + strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + + free_module(mod); + /* someone could wait for the module in add_unformed_module() */ + wake_up_all(&module_wq); + return 0; +out: + mutex_unlock(&module_mutex); + return ret; +} + +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + struct module_use *use; + int printed_something = 0; + + seq_printf(m, " %i ", module_refcount(mod)); + + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ + list_for_each_entry(use, &mod->source_list, source_list) { + printed_something = 1; + seq_printf(m, "%s,", use->source->name); + } + + if (mod->init != NULL && mod->exit == NULL) { + printed_something = 1; + seq_puts(m, "[permanent],"); + } + + if (!printed_something) + seq_puts(m, "-"); +} + +void __symbol_put(const char *symbol) +{ + struct find_symbol_arg fsa = { + .name = symbol, + .gplok = true, + }; + + preempt_disable(); + BUG_ON(!find_symbol(&fsa)); + module_put(fsa.owner); + preempt_enable(); +} +EXPORT_SYMBOL(__symbol_put); + +/* Note this assumes addr is a function, which it currently always is. */ +void symbol_put_addr(void *addr) +{ + struct module *modaddr; + unsigned long a = (unsigned long)dereference_function_descriptor(addr); + + if (core_kernel_text(a)) + return; + + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. + */ + preempt_disable(); + modaddr = __module_text_address(a); + BUG_ON(!modaddr); + module_put(modaddr); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(symbol_put_addr); + +static ssize_t show_refcnt(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%i\n", module_refcount(mk->mod)); +} + +static struct module_attribute modinfo_refcnt = + __ATTR(refcnt, 0444, show_refcnt, NULL); + +void __module_get(struct module *module) +{ + if (module) { + preempt_disable(); + atomic_inc(&module->refcnt); + trace_module_get(module, _RET_IP_); + preempt_enable(); + } +} +EXPORT_SYMBOL(__module_get); + +bool try_module_get(struct module *module) +{ + bool ret = true; + + if (module) { + preempt_disable(); + /* Note: here, we can fail to get a reference */ + if (likely(module_is_live(module) && + atomic_inc_not_zero(&module->refcnt) != 0)) + trace_module_get(module, _RET_IP_); + else + ret = false; + + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(try_module_get); + +void module_put(struct module *module) +{ + int ret; + + if (module) { + preempt_disable(); + ret = atomic_dec_if_positive(&module->refcnt); + WARN_ON(ret < 0); /* Failed to put refcount */ + trace_module_put(module, _RET_IP_); + preempt_enable(); + } +} +EXPORT_SYMBOL(module_put); + +#else /* !CONFIG_MODULE_UNLOAD */ +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + /* We don't know the usage count, or what modules are using. */ + seq_puts(m, " - -"); +} + +static inline void module_unload_free(struct module *mod) +{ +} + +static int ref_module(struct module *a, struct module *b) +{ + return strong_try_module_get(b); +} + +static inline int module_unload_init(struct module *mod) +{ + return 0; +} +#endif /* CONFIG_MODULE_UNLOAD */ + +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + int i; + + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + if (taint_flags[i].module && test_bit(i, &mod->taints)) + buf[l++] = taint_flags[i].c_true; + } + + return l; +} + +static ssize_t show_initstate(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + const char *state = "unknown"; + + switch (mk->mod->state) { + case MODULE_STATE_LIVE: + state = "live"; + break; + case MODULE_STATE_COMING: + state = "coming"; + break; + case MODULE_STATE_GOING: + state = "going"; + break; + default: + BUG(); + } + return sprintf(buffer, "%s\n", state); +} + +static struct module_attribute modinfo_initstate = + __ATTR(initstate, 0444, show_initstate, NULL); + +static ssize_t store_uevent(struct module_attribute *mattr, + struct module_kobject *mk, + const char *buffer, size_t count) +{ + int rc; + + rc = kobject_synth_uevent(&mk->kobj, buffer, count); + return rc ? rc : count; +} + +struct module_attribute module_uevent = + __ATTR(uevent, 0200, NULL, store_uevent); + +static ssize_t show_coresize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); +} + +static struct module_attribute modinfo_coresize = + __ATTR(coresize, 0444, show_coresize, NULL); + +static ssize_t show_initsize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); +} + +static struct module_attribute modinfo_initsize = + __ATTR(initsize, 0444, show_initsize, NULL); + +static ssize_t show_taint(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + size_t l; + + l = module_flags_taint(mk->mod, buffer); + buffer[l++] = '\n'; + return l; +} + +static struct module_attribute modinfo_taint = + __ATTR(taint, 0444, show_taint, NULL); + +static struct module_attribute *modinfo_attrs[] = { + &module_uevent, + &modinfo_version, + &modinfo_srcversion, + &modinfo_initstate, + &modinfo_coresize, + &modinfo_initsize, + &modinfo_taint, +#ifdef CONFIG_MODULE_UNLOAD + &modinfo_refcnt, +#endif + NULL, +}; + +static const char vermagic[] = VERMAGIC_STRING; + +static int try_to_force_load(struct module *mod, const char *reason) +{ +#ifdef CONFIG_MODULE_FORCE_LOAD + if (!test_taint(TAINT_FORCED_MODULE)) + pr_warn("%s: %s: kernel tainted.\n", mod->name, reason); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); + return 0; +#else + return -ENOEXEC; +#endif +} + +#ifdef CONFIG_MODVERSIONS + +static u32 resolve_rel_crc(const s32 *crc) +{ + return *(u32 *)((void *)crc + *crc); +} + +static int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; + unsigned int i, num_versions; + struct modversion_info *versions; + + /* Exporting module didn't supply crcs? OK, we're already tainted. */ + if (!crc) + return 1; + + /* No versions at all? modprobe --force does this. */ + if (versindex == 0) + return try_to_force_load(mod, symname) == 0; + + versions = (void *) sechdrs[versindex].sh_addr; + num_versions = sechdrs[versindex].sh_size + / sizeof(struct modversion_info); + + for (i = 0; i < num_versions; i++) { + u32 crcval; + + if (strcmp(versions[i].name, symname) != 0) + continue; + + if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) + crcval = resolve_rel_crc(crc); + else + crcval = *crc; + if (versions[i].crc == crcval) + return 1; + pr_debug("Found checksum %X vs module %lX\n", + crcval, versions[i].crc); + goto bad_version; + } + + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); + return 1; + +bad_version: + pr_warn("%s: disagrees about version of symbol %s\n", + info->name, symname); + return 0; +} + +static inline int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + struct find_symbol_arg fsa = { + .name = "module_layout", + .gplok = true, + }; + + /* + * Since this should be found in kernel (which can't be removed), no + * locking is necessary -- use preempt_disable() to placate lockdep. + */ + preempt_disable(); + if (!find_symbol(&fsa)) { + preempt_enable(); + BUG(); + } + preempt_enable(); + return check_version(info, "module_layout", mod, fsa.crc); +} + +/* First part is kernel version, which we ignore if module has crcs. */ +static inline int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) +{ + if (has_crcs) { + amagic += strcspn(amagic, " "); + bmagic += strcspn(bmagic, " "); + } + return strcmp(amagic, bmagic) == 0; +} +#else +static inline int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + return 1; +} + +static inline int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + return 1; +} + +static inline int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) +{ + return strcmp(amagic, bmagic) == 0; +} +#endif /* CONFIG_MODVERSIONS */ + +static char *get_modinfo(const struct load_info *info, const char *tag); +static char *get_next_modinfo(const struct load_info *info, const char *tag, + char *prev); + +static int verify_namespace_is_imported(const struct load_info *info, + const struct kernel_symbol *sym, + struct module *mod) +{ + const char *namespace; + char *imported_namespace; + + namespace = kernel_symbol_namespace(sym); + if (namespace && namespace[0]) { + imported_namespace = get_modinfo(info, "import_ns"); + while (imported_namespace) { + if (strcmp(namespace, imported_namespace) == 0) + return 0; + imported_namespace = get_next_modinfo( + info, "import_ns", imported_namespace); + } +#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + pr_warn( +#else + pr_err( +#endif + "%s: module uses symbol (%s) from namespace %s, but does not import it.\n", + mod->name, kernel_symbol_name(sym), namespace); +#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + return -EINVAL; +#endif + } + return 0; +} + +static bool inherit_taint(struct module *mod, struct module *owner) +{ + if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) + return true; + + if (mod->using_gplonly_symbols) { + pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", + mod->name, owner->name); + return false; + } + + if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { + pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", + mod->name, owner->name); + set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); + } + return true; +} + +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ +static const struct kernel_symbol *resolve_symbol(struct module *mod, + const struct load_info *info, + const char *name, + char ownername[]) +{ + struct find_symbol_arg fsa = { + .name = name, + .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), + .warn = true, + }; + int err; + + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. + */ + sched_annotate_sleep(); + mutex_lock(&module_mutex); + if (!find_symbol(&fsa)) + goto unlock; + + if (fsa.license == GPL_ONLY) + mod->using_gplonly_symbols = true; + + if (!inherit_taint(mod, fsa.owner)) { + fsa.sym = NULL; + goto getname; + } + + if (!check_version(info, name, mod, fsa.crc)) { + fsa.sym = ERR_PTR(-EINVAL); + goto getname; + } + + err = verify_namespace_is_imported(info, fsa.sym, mod); + if (err) { + fsa.sym = ERR_PTR(err); + goto getname; + } + + err = ref_module(mod, fsa.owner); + if (err) { + fsa.sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); +unlock: + mutex_unlock(&module_mutex); + return fsa.sym; +} + +static const struct kernel_symbol * +resolve_symbol_wait(struct module *mod, + const struct load_info *info, + const char *name) +{ + const struct kernel_symbol *ksym; + char owner[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) + || PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + pr_warn("%s: gave up waiting for init of module %s.\n", + mod->name, owner); + } + return ksym; +} + +#ifdef CONFIG_KALLSYMS +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} +#endif + +/* + * /sys/module/foo/sections stuff + * J. Corbet + */ +#ifdef CONFIG_SYSFS + +#ifdef CONFIG_KALLSYMS +struct module_sect_attr { + struct bin_attribute battr; + unsigned long address; +}; + +struct module_sect_attrs { + struct attribute_group grp; + unsigned int nsections; + struct module_sect_attr attrs[]; +}; + +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) +{ + struct module_sect_attr *sattr = + container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; + + if (pos != 0) + return -EINVAL; + + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; +} + +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + unsigned int section; + + for (section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].battr.attr.name); + kfree(sect_attrs); +} + +static void add_sect_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int nloaded = 0, i, size[2]; + struct module_sect_attrs *sect_attrs; + struct module_sect_attr *sattr; + struct bin_attribute **gattr; + + /* Count loaded sections and allocate structures */ + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) + nloaded++; + size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (sect_attrs == NULL) + return; + + /* Setup section attributes. */ + sect_attrs->grp.name = "sections"; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; + + sect_attrs->nsections = 0; + sattr = §_attrs->attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + if (sect_empty(sec)) + continue; + sysfs_bin_attr_init(&sattr->battr); + sattr->address = sec->sh_addr; + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (sattr->battr.attr.name == NULL) + goto out; + sect_attrs->nsections++; + sattr->battr.read = module_sect_read; + sattr->battr.size = MODULE_SECT_READ_SIZE; + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; + } + *gattr = NULL; + + if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + goto out; + + mod->sect_attrs = sect_attrs; + return; + out: + free_sect_attrs(sect_attrs); +} + +static void remove_sect_attrs(struct module *mod) +{ + if (mod->sect_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->sect_attrs->grp); + /* + * We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. + */ + free_sect_attrs(mod->sect_attrs); + mod->sect_attrs = NULL; + } +} + +/* + * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. + */ + +struct module_notes_attrs { + struct kobject *dir; + unsigned int notes; + struct bin_attribute attrs[]; +}; + +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + /* + * The caller checked the pos and count against our size. + */ + memcpy(buf, bin_attr->private + pos, count); + return count; +} + +static void free_notes_attrs(struct module_notes_attrs *notes_attrs, + unsigned int i) +{ + if (notes_attrs->dir) { + while (i-- > 0) + sysfs_remove_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i]); + kobject_put(notes_attrs->dir); + } + kfree(notes_attrs); +} + +static void add_notes_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int notes, loaded, i; + struct module_notes_attrs *notes_attrs; + struct bin_attribute *nattr; + + /* failed to create section attributes, so can't create notes */ + if (!mod->sect_attrs) + return; + + /* Count notes sections and allocate structures. */ + notes = 0; + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + (info->sechdrs[i].sh_type == SHT_NOTE)) + ++notes; + + if (notes == 0) + return; + + notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), + GFP_KERNEL); + if (notes_attrs == NULL) + return; + + notes_attrs->notes = notes; + nattr = ¬es_attrs->attrs[0]; + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) + continue; + if (info->sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; + nattr->attr.mode = S_IRUGO; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *) info->sechdrs[i].sh_addr; + nattr->read = module_notes_read; + ++nattr; + } + ++loaded; + } + + notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); + if (!notes_attrs->dir) + goto out; + + for (i = 0; i < notes; ++i) + if (sysfs_create_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i])) + goto out; + + mod->notes_attrs = notes_attrs; + return; + + out: + free_notes_attrs(notes_attrs, i); +} + +static void remove_notes_attrs(struct module *mod) +{ + if (mod->notes_attrs) + free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); +} + +#else + +static inline void add_sect_attrs(struct module *mod, + const struct load_info *info) +{ +} + +static inline void remove_sect_attrs(struct module *mod) +{ +} + +static inline void add_notes_attrs(struct module *mod, + const struct load_info *info) +{ +} + +static inline void remove_notes_attrs(struct module *mod) +{ +} +#endif /* CONFIG_KALLSYMS */ + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int add_usage_links(struct module *mod) +{ + int ret = 0; +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } + mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); +#endif + return ret; +} + +static void module_remove_modinfo_attrs(struct module *mod, int end); + +static int module_add_modinfo_attrs(struct module *mod) +{ + struct module_attribute *attr; + struct module_attribute *temp_attr; + int error = 0; + int i; + + mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * + (ARRAY_SIZE(modinfo_attrs) + 1)), + GFP_KERNEL); + if (!mod->modinfo_attrs) + return -ENOMEM; + + temp_attr = mod->modinfo_attrs; + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (!attr->test || attr->test(mod)) { + memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); + if (error) + goto error_out; + ++temp_attr; + } + } + + return 0; + +error_out: + if (i > 0) + module_remove_modinfo_attrs(mod, --i); + else + kfree(mod->modinfo_attrs); + return error; +} + +static void module_remove_modinfo_attrs(struct module *mod, int end) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + if (end >= 0 && i > end) + break; + /* pick a field to test for end of list */ + if (!attr->attr.name) + break; + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); + if (attr->free) + attr->free(mod); + } + kfree(mod->modinfo_attrs); +} + +static void mod_kobject_put(struct module *mod) +{ + DECLARE_COMPLETION_ONSTACK(c); + mod->mkobj.kobj_completion = &c; + kobject_put(&mod->mkobj.kobj); + wait_for_completion(&c); +} + +static int mod_sysfs_init(struct module *mod) +{ + int err; + struct kobject *kobj; + + if (!module_sysfs_initialized) { + pr_err("%s: module sysfs not initialized\n", mod->name); + err = -EINVAL; + goto out; + } + + kobj = kset_find_obj(module_kset, mod->name); + if (kobj) { + pr_err("%s: module is already loaded\n", mod->name); + kobject_put(kobj); + err = -EINVAL; + goto out; + } + + mod->mkobj.mod = mod; + + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + mod->mkobj.kobj.kset = module_kset; + err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, + "%s", mod->name); + if (err) + mod_kobject_put(mod); + +out: + return err; +} + +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + err = mod_sysfs_init(mod); + if (err) + goto out; + + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); + if (!mod->holders_dir) { + err = -ENOMEM; + goto out_unreg; + } + + err = module_param_sysfs_setup(mod, kparam, num_params); + if (err) + goto out_unreg_holders; + + err = module_add_modinfo_attrs(mod); + if (err) + goto out_unreg_param; + + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); + + return 0; + +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod, -1); +out_unreg_param: + module_param_sysfs_remove(mod); +out_unreg_holders: + kobject_put(mod->holders_dir); +out_unreg: + mod_kobject_put(mod); +out: + return err; +} + +static void mod_sysfs_fini(struct module *mod) +{ + remove_notes_attrs(mod); + remove_sect_attrs(mod); + mod_kobject_put(mod); +} + +static void init_param_lock(struct module *mod) +{ + mutex_init(&mod->param_lock); +} +#else /* !CONFIG_SYSFS */ + +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static void mod_sysfs_fini(struct module *mod) +{ +} + +static void module_remove_modinfo_attrs(struct module *mod, int end) +{ +} + +static void del_usage_links(struct module *mod) +{ +} + +static void init_param_lock(struct module *mod) +{ +} +#endif /* CONFIG_SYSFS */ + +static void mod_sysfs_teardown(struct module *mod) +{ + del_usage_links(mod); + module_remove_modinfo_attrs(mod, -1); + module_param_sysfs_remove(mod); + kobject_put(mod->mkobj.drivers_dir); + kobject_put(mod->holders_dir); + mod_sysfs_fini(mod); +} + +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| + * + * These values are always page-aligned (as is base) + */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the + * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of + * whether we are strict. + */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); +} + +static void module_enable_x(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} +#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ +static void module_enable_x(const struct module *mod) { } +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + +#ifdef CONFIG_STRICT_MODULE_RWX +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); +} + +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); +} + +static void module_enable_ro(const struct module *mod, bool after_init) +{ + if (!rodata_enabled) + return; + + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); + frob_text(&mod->core_layout, set_memory_ro); + + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); +} + +static void module_enable_nx(const struct module *mod) +{ + frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); +} + +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; + int i; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); + return -ENOEXEC; + } + } + + return 0; +} + +#else /* !CONFIG_STRICT_MODULE_RWX */ +static void module_enable_nx(const struct module *mod) { } +static void module_enable_ro(const struct module *mod, bool after_init) {} +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + return 0; +} +#endif /* CONFIG_STRICT_MODULE_RWX */ + +#ifdef CONFIG_LIVEPATCH +/* + * Persist Elf information about a module. Copy the Elf header, + * section header table, section string table, and symtab section + * index from info to mod->klp_info. + */ +static int copy_module_elf(struct module *mod, struct load_info *info) +{ + unsigned int size, symndx; + int ret; + + size = sizeof(*mod->klp_info); + mod->klp_info = kmalloc(size, GFP_KERNEL); + if (mod->klp_info == NULL) + return -ENOMEM; + + /* Elf header */ + size = sizeof(mod->klp_info->hdr); + memcpy(&mod->klp_info->hdr, info->hdr, size); + + /* Elf section header table */ + size = sizeof(*info->sechdrs) * info->hdr->e_shnum; + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); + if (mod->klp_info->sechdrs == NULL) { + ret = -ENOMEM; + goto free_info; + } + + /* Elf section name string table */ + size = info->sechdrs[info->hdr->e_shstrndx].sh_size; + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); + if (mod->klp_info->secstrings == NULL) { + ret = -ENOMEM; + goto free_sechdrs; + } + + /* Elf symbol section index */ + symndx = info->index.sym; + mod->klp_info->symndx = symndx; + + /* + * For livepatch modules, core_kallsyms.symtab is a complete + * copy of the original symbol table. Adjust sh_addr to point + * to core_kallsyms.symtab since the copy of the symtab in module + * init memory is freed at the end of do_init_module(). + */ + mod->klp_info->sechdrs[symndx].sh_addr = \ + (unsigned long) mod->core_kallsyms.symtab; + + return 0; + +free_sechdrs: + kfree(mod->klp_info->sechdrs); +free_info: + kfree(mod->klp_info); + return ret; +} + +static void free_module_elf(struct module *mod) +{ + kfree(mod->klp_info->sechdrs); + kfree(mod->klp_info->secstrings); + kfree(mod->klp_info); +} +#else /* !CONFIG_LIVEPATCH */ +static int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static void free_module_elf(struct module *mod) +{ +} +#endif /* CONFIG_LIVEPATCH */ + +void __weak module_memfree(void *module_region) +{ + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. + */ + WARN_ON(in_interrupt()); + vfree(module_region); +} + +void __weak module_arch_cleanup(struct module *mod) +{ +} + +void __weak module_arch_freeing_init(struct module *mod) +{ +} + +static void cfi_cleanup(struct module *mod); + +/* Free a module, remove from lists, etc. */ +static void free_module(struct module *mod) +{ + trace_module_free(mod); + + mod_sysfs_teardown(mod); + + /* + * We leave it in list to prevent duplicate loads, but make sure + * that noone uses it while it's being deconstructed. + */ + mutex_lock(&module_mutex); + mod->state = MODULE_STATE_UNFORMED; + mutex_unlock(&module_mutex); + + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); + + /* Arch-specific cleanup. */ + module_arch_cleanup(mod); + + /* Module unload stuff */ + module_unload_free(mod); + + /* Free any allocated parameters. */ + destroy_params(mod->kp, mod->num_kp); + + if (is_livepatch_module(mod)) + free_module_elf(mod); + + /* Now we can delete it from the lists */ + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + mod_tree_remove(mod); + /* Remove this module from bug list, this uses list_del_rcu */ + module_bug_cleanup(mod); + /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + synchronize_rcu(); + mutex_unlock(&module_mutex); + + /* Clean up CFI for the module. */ + cfi_cleanup(mod); + + /* This may be empty, but that's OK */ + module_arch_freeing_init(mod); + module_memfree(mod->init_layout.base); + kfree(mod->args); + percpu_modfree(mod); + + /* Free lock-classes; relies on the preceding sync_rcu(). */ + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + + /* Finally, free the core (containing the module structure) */ + module_memfree(mod->core_layout.base); +} + +void *__symbol_get(const char *symbol) +{ + struct find_symbol_arg fsa = { + .name = symbol, + .gplok = true, + .warn = true, + }; + + preempt_disable(); + if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) { + preempt_enable(); + return NULL; + } + preempt_enable(); + return (void *)kernel_symbol_value(fsa.sym); +} +EXPORT_SYMBOL_GPL(__symbol_get); + +/* + * Ensure that an exported symbol [global namespace] does not already exist + * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. + */ +static int verify_exported_symbols(struct module *mod) +{ + unsigned int i; + const struct kernel_symbol *s; + struct { + const struct kernel_symbol *sym; + unsigned int num; + } arr[] = { + { mod->syms, mod->num_syms }, + { mod->gpl_syms, mod->num_gpl_syms }, + }; + + for (i = 0; i < ARRAY_SIZE(arr); i++) { + for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { + struct find_symbol_arg fsa = { + .name = kernel_symbol_name(s), + .gplok = true, + }; + if (find_symbol(&fsa)) { + pr_err("%s: exports duplicate symbol %s" + " (owned by %s)\n", + mod->name, kernel_symbol_name(s), + module_name(fsa.owner)); + return -ENOEXEC; + } + } + } + return 0; +} + +static bool ignore_undef_symbol(Elf_Half emachine, const char *name) +{ + /* + * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as + * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64. + * i386 has a similar problem but may not deserve a fix. + * + * If we ever have to ignore many symbols, consider refactoring the code to + * only warn if referenced by a relocation. + */ + if (emachine == EM_386 || emachine == EM_X86_64) + return !strcmp(name, "_GLOBAL_OFFSET_TABLE_"); + return false; +} + +/* Change all symbols so that st_value encodes the pointer directly. */ +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; + unsigned long secbase; + unsigned int i; + int ret = 0; + const struct kernel_symbol *ksym; + + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + + switch (sym[i].st_shndx) { + case SHN_COMMON: + /* Ignore common symbols */ + if (!strncmp(name, "__gnu_lto", 9)) + break; + + /* + * We compiled with -fno-common. These are not + * supposed to happen. + */ + pr_debug("Common symbol: %s\n", name); + pr_warn("%s: please compile with -fno-common\n", + mod->name); + ret = -ENOEXEC; + break; + + case SHN_ABS: + /* Don't need to do anything */ + pr_debug("Absolute symbol: 0x%08lx\n", + (long)sym[i].st_value); + break; + + case SHN_LIVEPATCH: + /* Livepatch symbols are resolved by livepatch */ + break; + + case SHN_UNDEF: + ksym = resolve_symbol_wait(mod, info, name); + /* Ok if resolved. */ + if (ksym && !IS_ERR(ksym)) { + sym[i].st_value = kernel_symbol_value(ksym); + break; + } + + /* Ok if weak or ignored. */ + if (!ksym && + (ELF_ST_BIND(sym[i].st_info) == STB_WEAK || + ignore_undef_symbol(info->hdr->e_machine, name))) + break; + + ret = PTR_ERR(ksym) ?: -ENOENT; + pr_warn("%s: Unknown symbol %s (err %d)\n", + mod->name, name, ret); + break; + + default: + /* Divert to percpu allocation if a percpu var. */ + if (sym[i].st_shndx == info->index.pcpu) + secbase = (unsigned long)mod_percpu(mod); + else + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; + sym[i].st_value += secbase; + break; + } + } + + return ret; +} + +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) + err = klp_apply_section_relocs(mod, info->sechdrs, + info->secstrings, + info->strtab, + info->index.sym, i, + NULL); + else if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + +/* Additional bytes needed by arch in front of individual sections */ +unsigned int __weak arch_mod_section_prepend(struct module *mod, + unsigned int section) +{ + /* default implementation just returns zero */ + return 0; +} + +/* Update size with this section: return offset. */ +static long get_offset(struct module *mod, unsigned int *size, + Elf_Shdr *sechdr, unsigned int section) +{ + long ret; + + *size += arch_mod_section_prepend(mod, section); + ret = ALIGN(*size, sechdr->sh_addralign ?: 1); + *size = ret + sechdr->sh_size; + return ret; +} + +static bool module_init_layout_section(const char *sname) +{ +#ifndef CONFIG_MODULE_UNLOAD + if (module_exit_section(sname)) + return true; +#endif + return module_init_section(sname); +} + +/* + * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld + * might -- code, read-only data, read-write data, small data. Tally + * sizes, and place the offsets into sh_entsize fields: high bit means it + * belongs in init. + */ +static void layout_sections(struct module *mod, struct load_info *info) +{ + static unsigned long const masks[][2] = { + /* + * NOTE: all executable code must be the first section + * in this array; otherwise modify the text_size + * finder in the two loops below + */ + { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, + { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, + { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, + { ARCH_SHF_SMALL | SHF_ALLOC, 0 } + }; + unsigned int m, i; + + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; + + pr_debug("Core section allocation order:\n"); + for (m = 0; m < ARRAY_SIZE(masks); ++m) { + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; + + if ((s->sh_flags & masks[m][0]) != masks[m][0] + || (s->sh_flags & masks[m][1]) + || s->sh_entsize != ~0UL + || module_init_layout_section(sname)) + continue; + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); + pr_debug("\t%s\n", sname); + } + switch (m) { + case 0: /* executable */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; + break; + case 1: /* RO: text and ro-data */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; + break; + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ + mod->core_layout.size = debug_align(mod->core_layout.size); + break; + } + } + + pr_debug("Init section allocation order:\n"); + for (m = 0; m < ARRAY_SIZE(masks); ++m) { + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; + + if ((s->sh_flags & masks[m][0]) != masks[m][0] + || (s->sh_flags & masks[m][1]) + || s->sh_entsize != ~0UL + || !module_init_layout_section(sname)) + continue; + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) + | INIT_OFFSET_MASK); + pr_debug("\t%s\n", sname); + } + switch (m) { + case 0: /* executable */ + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; + break; + case 1: /* RO: text and ro-data */ + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; + break; + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ + mod->init_layout.size = debug_align(mod->init_layout.size); + break; + } + } +} + +static void set_license(struct module *mod, const char *license) +{ + if (!license) + license = "unspecified"; + + if (!license_is_gpl_compatible(license)) { + if (!test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license '%s' taints kernel.\n", + mod->name, license); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); + } +} + +/* Parse tag=value strings from .modinfo section */ +static char *next_string(char *string, unsigned long *secsize) +{ + /* Skip non-zero chars */ + while (string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + + /* Skip any zero padding. */ + while (!string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + return string; +} + +static char *get_next_modinfo(const struct load_info *info, const char *tag, + char *prev) +{ + char *p; + unsigned int taglen = strlen(tag); + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; + + /* + * get_modinfo() calls made before rewrite_section_headers() + * must use sh_offset, as sh_addr isn't set! + */ + char *modinfo = (char *)info->hdr + infosec->sh_offset; + + if (prev) { + size -= prev - modinfo; + modinfo = next_string(prev, &size); + } + + for (p = modinfo; p; p = next_string(p, &size)) { + if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') + return p + taglen + 1; + } + return NULL; +} + +static char *get_modinfo(const struct load_info *info, const char *tag) +{ + return get_next_modinfo(info, tag, NULL); +} + +static void setup_modinfo(struct module *mod, struct load_info *info) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->setup) + attr->setup(mod, get_modinfo(info, attr->attr.name)); + } +} + +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + +#ifdef CONFIG_KALLSYMS + +/* Lookup exported symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_exported_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); +} + +static int is_exported(const char *name, unsigned long value, + const struct module *mod) +{ + const struct kernel_symbol *ks; + if (!mod) + ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); + else + ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); + + return ks != NULL && kernel_symbol_value(ks) == value; +} + +/* As per nm */ +static char elf_type(const Elf_Sym *sym, const struct load_info *info) +{ + const Elf_Shdr *sechdrs = info->sechdrs; + + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) + return 'v'; + else + return 'w'; + } + if (sym->st_shndx == SHN_UNDEF) + return 'U'; + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) + return 'a'; + if (sym->st_shndx >= SHN_LORESERVE) + return '?'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) + return 't'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC + && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { + if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) + return 'r'; + else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 'g'; + else + return 'd'; + } + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 's'; + else + return 'b'; + } + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { + return 'n'; + } + return '?'; +} + +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int pcpundx) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ +static void layout_symtab(struct module *mod, struct load_info *info) +{ + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; + const Elf_Sym *src; + unsigned int i, nsrc, ndst, strtab_size = 0; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, + info->index.sym) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); + + src = (void *)info->hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + strtab_size += strlen(&info->strtab[src[i].st_name])+1; + ndst++; + } + } + + /* Append room for core symbols at end of core part. */ + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); + mod->core_layout.size = debug_align(mod->core_layout.size); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, + info->index.str) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); + mod->init_layout.size = debug_align(mod->init_layout.size); +} + +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ +static void add_kallsyms(struct module *mod, const struct load_info *info) +{ + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; + + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. + */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } + } + mod->core_kallsyms.num_symtab = ndst; +} +#else +static inline void layout_symtab(struct module *mod, struct load_info *info) +{ +} + +static void add_kallsyms(struct module *mod, const struct load_info *info) +{ +} +#endif /* CONFIG_KALLSYMS */ + +#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) +static void init_build_id(struct module *mod, const struct load_info *info) +{ + const Elf_Shdr *sechdr; + unsigned int i; + + for (i = 0; i < info->hdr->e_shnum; i++) { + sechdr = &info->sechdrs[i]; + if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && + !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, + sechdr->sh_size)) + break; + } +} +#else +static void init_build_id(struct module *mod, const struct load_info *info) +{ +} +#endif + +static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) +{ + if (!debug) + return; + ddebug_add_module(debug, num, mod->name); +} + +static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(mod->name); +} + +void * __weak module_alloc(unsigned long size) +{ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); +} + +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + +bool __weak module_exit_section(const char *name) +{ + return strstarts(name, ".exit"); +} + +#ifdef CONFIG_DEBUG_KMEMLEAK +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < info->hdr->e_shnum; i++) { + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); + } +} +#else +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ +} +#endif + +#ifdef CONFIG_MODULE_SIG +static int module_sig_check(struct load_info *info, int flags) +{ + int err = -ENODATA; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; + const void *mod = info->hdr; + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); + /* + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. + */ + if (!mangled_module && + info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + info->len -= markerlen; + err = mod_verify_sig(mod, info); + if (!err) { + info->sig_ok = true; + return 0; + } + } + + /* + * We don't permit modules to be loaded into the trusted kernels + * without a valid signature on them, but if we're not enforcing, + * certain errors are non-fatal. + */ + switch (err) { + case -ENODATA: + reason = "unsigned module"; + break; + case -ENOPKG: + reason = "module with unsupported crypto"; + break; + case -ENOKEY: + reason = "module with unavailable key"; + break; + + default: + /* + * All other errors are fatal, including lack of memory, + * unparseable signatures, and signature check failures -- + * even if signatures aren't required. + */ + return err; + } + + if (is_module_sig_enforced()) { + pr_notice("Loading of %s is rejected\n", reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); +} +#else /* !CONFIG_MODULE_SIG */ +static int module_sig_check(struct load_info *info, int flags) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) +{ +#if defined(CONFIG_64BIT) + unsigned long long secend; +#else + unsigned long secend; +#endif + + /* + * Check for both overflow and offset/size being + * too large. + */ + secend = shdr->sh_offset + shdr->sh_size; + if (secend < shdr->sh_offset || secend > info->len) + return -ENOEXEC; + + return 0; +} + +/* + * Sanity checks against invalid binaries, wrong arch, weird elf version. + * + * Also do basic validity checks against section offsets and sizes, the + * section name string table, and the indices used for it (sh_name). + */ +static int elf_validity_check(struct load_info *info) +{ + unsigned int i; + Elf_Shdr *shdr, *strhdr; + int err; + + if (info->len < sizeof(*(info->hdr))) { + pr_err("Invalid ELF header len %lu\n", info->len); + goto no_exec; + } + + if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) { + pr_err("Invalid ELF header magic: != %s\n", ELFMAG); + goto no_exec; + } + if (info->hdr->e_type != ET_REL) { + pr_err("Invalid ELF header type: %u != %u\n", + info->hdr->e_type, ET_REL); + goto no_exec; + } + if (!elf_check_arch(info->hdr)) { + pr_err("Invalid architecture in ELF header: %u\n", + info->hdr->e_machine); + goto no_exec; + } + if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) { + pr_err("Invalid ELF section header size\n"); + goto no_exec; + } + + /* + * e_shnum is 16 bits, and sizeof(Elf_Shdr) is + * known and small. So e_shnum * sizeof(Elf_Shdr) + * will not overflow unsigned long on any platform. + */ + if (info->hdr->e_shoff >= info->len + || (info->hdr->e_shnum * sizeof(Elf_Shdr) > + info->len - info->hdr->e_shoff)) { + pr_err("Invalid ELF section header overflow\n"); + goto no_exec; + } + + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + + /* + * Verify if the section name table index is valid. + */ + if (info->hdr->e_shstrndx == SHN_UNDEF + || info->hdr->e_shstrndx >= info->hdr->e_shnum) { + pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n", + info->hdr->e_shstrndx, info->hdr->e_shstrndx, + info->hdr->e_shnum); + goto no_exec; + } + + strhdr = &info->sechdrs[info->hdr->e_shstrndx]; + err = validate_section_offset(info, strhdr); + if (err < 0) { + pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type); + return err; + } + + /* + * The section name table must be NUL-terminated, as required + * by the spec. This makes strcmp and pr_* calls that access + * strings in the section safe. + */ + info->secstrings = (void *)info->hdr + strhdr->sh_offset; + if (info->secstrings[strhdr->sh_size - 1] != '\0') { + pr_err("ELF Spec violation: section name table isn't null terminated\n"); + goto no_exec; + } + + /* + * The code assumes that section 0 has a length of zero and + * an addr of zero, so check for it. + */ + if (info->sechdrs[0].sh_type != SHT_NULL + || info->sechdrs[0].sh_size != 0 + || info->sechdrs[0].sh_addr != 0) { + pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n", + info->sechdrs[0].sh_type); + goto no_exec; + } + + for (i = 1; i < info->hdr->e_shnum; i++) { + shdr = &info->sechdrs[i]; + switch (shdr->sh_type) { + case SHT_NULL: + case SHT_NOBITS: + continue; + case SHT_SYMTAB: + if (shdr->sh_link == SHN_UNDEF + || shdr->sh_link >= info->hdr->e_shnum) { + pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n", + shdr->sh_link, shdr->sh_link, + info->hdr->e_shnum); + goto no_exec; + } + fallthrough; + default: + err = validate_section_offset(info, shdr); + if (err < 0) { + pr_err("Invalid ELF section in module (section %u type %u)\n", + i, shdr->sh_type); + return err; + } + + if (shdr->sh_flags & SHF_ALLOC) { + if (shdr->sh_name >= strhdr->sh_size) { + pr_err("Invalid ELF section name in module (section %u type %u)\n", + i, shdr->sh_type); + return -ENOEXEC; + } + } + break; + } + } + + return 0; + +no_exec: + return -ENOEXEC; +} + +#define COPY_CHUNK_SIZE (16*PAGE_SIZE) + +static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) +{ + do { + unsigned long n = min(len, COPY_CHUNK_SIZE); + + if (copy_from_user(dst, usrc, n) != 0) + return -EFAULT; + cond_resched(); + dst += n; + usrc += n; + len -= n; + } while (len); + return 0; +} + +#ifdef CONFIG_LIVEPATCH +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) +{ + if (get_modinfo(info, "livepatch")) { + mod->klp = true; + add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", + mod->name); + } + + return 0; +} +#else /* !CONFIG_LIVEPATCH */ +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) +{ + if (get_modinfo(info, "livepatch")) { + pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", + mod->name); + return -ENOEXEC; + } + + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) +{ + int err; + + info->len = len; + if (info->len < sizeof(*(info->hdr))) + return -ENOEXEC; + + err = security_kernel_load_data(LOADING_MODULE, true); + if (err) + return err; + + /* Suck in entire file: we'll want most of it. */ + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); + if (!info->hdr) + return -ENOMEM; + + if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { + err = -EFAULT; + goto out; + } + + err = security_kernel_post_load_data((char *)info->hdr, info->len, + LOADING_MODULE, "init_module"); +out: + if (err) + vfree(info->hdr); + + return err; +} + +static void free_copy(struct load_info *info, int flags) +{ + if (flags & MODULE_INIT_COMPRESSED_FILE) + module_decompress_cleanup(info); + else + vfree(info->hdr); +} + +static int rewrite_section_headers(struct load_info *info, int flags) +{ + unsigned int i; + + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + + /* + * Mark all sections sh_addr with their address in the + * temporary image. + */ + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; + + } + + /* Track but don't keep modinfo and version sections. */ + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. + * + * Set info->mod to the temporary copy of the module in info->hdr. The final one + * will be allocated in move_module(). + */ +static int setup_load_info(struct load_info *info, int flags) +{ + unsigned int i; + + /* Try to find a name early so we can log errors with a module name */ + info->index.info = find_sec(info, ".modinfo"); + if (info->index.info) + info->name = get_modinfo(info, "name"); + + /* Find internal symbols and strings. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + if (info->index.sym == 0) { + pr_warn("%s: module has no symbols (stripped?)\n", + info->name ?: "(missing .modinfo section or name field)"); + return -ENOEXEC; + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { + pr_warn("%s: No module found in object\n", + info->name ?: "(missing .modinfo section or name field)"); + return -ENOEXEC; + } + /* This is temporary: point mod into copy of data. */ + info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset; + + /* + * If we didn't load the .modinfo 'name' field earlier, fall back to + * on-disk struct mod 'name' field. + */ + if (!info->name) + info->name = info->mod->name; + + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); + + info->index.pcpu = find_pcpusec(info); + + return 0; +} + +static int check_modinfo(struct module *mod, struct load_info *info, int flags) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; + + if (flags & MODULE_INIT_IGNORE_VERMAGIC) + modmagic = NULL; + + /* This is allowed: modprobe --force will invalidate it. */ + if (!modmagic) { + err = try_to_force_load(mod, "bad vermagic"); + if (err) + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { + pr_err("%s: version magic '%s' should be '%s'\n", + info->name, modmagic, vermagic); + return -ENOEXEC; + } + + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } + + check_modinfo_retpoline(mod, info); + + if (get_modinfo(info, "staging")) { + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); + pr_warn("%s: module is from the staging directory, the quality " + "is unknown, you have been warned.\n", mod->name); + } + + err = check_modinfo_livepatch(mod, info); + if (err) + return err; + + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); + + return 0; +} + +static int find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); + if (!mod->ctors) + mod->ctors = section_objs(info, ".init_array", + sizeof(*mod->ctors), &mod->num_ctors); + else if (find_sec(info, ".init_array")) { + /* + * This shouldn't happen with same compiler and binutils + * building all parts of the module. + */ + pr_warn("%s: has both .ctors and .init_array.\n", + mod->name); + return -EINVAL; + } +#endif + + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", + sizeof(*mod->tracepoints_ptrs), + &mod->num_tracepoints); +#endif +#ifdef CONFIG_TREE_SRCU + mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", + sizeof(*mod->srcu_struct_ptrs), + &mod->num_srcu_structs); +#endif +#ifdef CONFIG_BPF_EVENTS + mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", + sizeof(*mod->bpf_raw_events), + &mod->num_bpf_raw_events); +#endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); +#endif +#ifdef CONFIG_JUMP_LABEL + mod->jump_entries = section_objs(info, "__jump_table", + sizeof(*mod->jump_entries), + &mod->num_jump_entries); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + mod->trace_evals = section_objs(info, "_ftrace_eval_map", + sizeof(*mod->trace_evals), + &mod->num_trace_evals); +#endif +#ifdef CONFIG_TRACING + mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", + sizeof(*mod->trace_bprintk_fmt_start), + &mod->num_trace_bprintk_fmt); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); +#endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); +#endif +#ifdef CONFIG_PRINTK_INDEX + mod->printk_index_start = section_objs(info, ".printk_index", + sizeof(*mod->printk_index_start), + &mod->printk_index_size); +#endif +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + mod->static_call_sites = section_objs(info, ".static_call_sites", + sizeof(*mod->static_call_sites), + &mod->num_static_call_sites); +#endif + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + + if (section_addr(info, "__obsparm")) + pr_warn("%s: Ignoring obsolete parameters\n", mod->name); + + info->debug = section_objs(info, "__dyndbg", + sizeof(*info->debug), &info->num_debug); + + return 0; +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; + + /* Do the allocs. */ + ptr = module_alloc(mod->core_layout.size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. + */ + kmemleak_not_leak(ptr); + if (!ptr) + return -ENOMEM; + + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; + + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr) { + module_memfree(mod->core_layout.base); + return -ENOMEM; + } + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; + } else + mod->init_layout.base = NULL; + + /* Transfer each section which specifies SHF_ALLOC */ + pr_debug("final section addresses:\n"); + for (i = 0; i < info->hdr->e_shnum; i++) { + void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; + + if (!(shdr->sh_flags & SHF_ALLOC)) + continue; + + if (shdr->sh_entsize & INIT_OFFSET_MASK) + dest = mod->init_layout.base + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); + else + dest = mod->core_layout.base + shdr->sh_entsize; + + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); + /* Update sh_addr to point to copy in image. */ + shdr->sh_addr = (unsigned long)dest; + pr_debug("\t0x%lx %s\n", + (long)shdr->sh_addr, info->secstrings + shdr->sh_name); + } + + return 0; +} + +static int check_module_license_and_versions(struct module *mod) +{ + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + + /* + * ndiswrapper is under GPL by itself, but loads proprietary modules. + * Don't use add_taint_module(), as it would prevent ndiswrapper from + * using GPL-only symbols it needs. + */ + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); + + /* driverloader was caught wrongly pretending to be under GPL */ + if (strcmp(mod->name, "driverloader") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); + + /* lve claims to be GPL but upstream won't provide source */ + if (strcmp(mod->name, "lve") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); + + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); + +#ifdef CONFIG_MODVERSIONS + if ((mod->num_syms && !mod->crcs) || + (mod->num_gpl_syms && !mod->gpl_crcs)) { + return try_to_force_load(mod, + "no versions for exported symbols"); + } +#endif + return 0; +} + +static void flush_module_icache(const struct module *mod) +{ + /* + * Flush the instruction cache, since we've played with text. + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. + */ + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); +} + +int __weak module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +/* module_blacklist is a comma-separated list of module names */ +static char *module_blacklist; +static bool blacklisted(const char *module_name) +{ + const char *p; + size_t len; + + if (!module_blacklist) + return false; + + for (p = module_blacklist; *p; p += len) { + len = strcspn(p, ","); + if (strlen(module_name) == len && !memcmp(module_name, p, len)) + return true; + if (p[len] == ',') + len++; + } + return false; +} +core_param(module_blacklist, module_blacklist, charp, 0400); + +static struct module *layout_and_allocate(struct load_info *info, int flags) +{ + struct module *mod; + unsigned int ndx; + int err; + + err = check_modinfo(info->mod, info, flags); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, info->mod); + if (err < 0) + return ERR_PTR(err); + + err = module_enforce_rwx_sections(info->hdr, info->sechdrs, + info->secstrings, info->mod); + if (err < 0) + return ERR_PTR(err); + + /* We will do a special allocation for per-cpu sections later. */ + info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* + * Mark the __jump_table section as ro_after_init as well: these data + * structures are never modified, with the exception of entries that + * refer to code in the __init section, which are annotated as such + * at module load time. + */ + ndx = find_sec(info, "__jump_table"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + + /* + * Determine total sizes, and put offsets in sh_entsize. For now + * this is done generically; there doesn't appear to be any + * special cases for the architectures. + */ + layout_sections(info->mod, info); + layout_symtab(info->mod, info); + + /* Allocate and move to the final place */ + err = move_module(info->mod, info); + if (err) + return ERR_PTR(err); + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; +} + +/* mod is no longer valid after this! */ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + percpu_modfree(mod); + module_arch_freeing_init(mod); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); +} + +int __weak module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + return 0; +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Is this module of this name done loading? No locks held. */ +static bool finished_loading(const char *name) +{ + struct module *mod; + bool ret; + + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. + */ + sched_annotate_sleep(); + mutex_lock(&module_mutex); + mod = find_module_all(name, strlen(name), true); + ret = !mod || mod->state == MODULE_STATE_LIVE; + mutex_unlock(&module_mutex); + + return ret; +} + +/* Call module constructors. */ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* For freeing module_init on success, in case kallsyms traversing */ +struct mod_initfree { + struct llist_node node; + void *module_init; +}; + +static void do_free_init(struct work_struct *w) +{ + struct llist_node *pos, *n, *list; + struct mod_initfree *initfree; + + list = llist_del_all(&init_free_list); + + synchronize_rcu(); + + llist_for_each_safe(pos, n, list) { + initfree = container_of(pos, struct mod_initfree, node); + module_memfree(initfree->module_init); + kfree(initfree); + } +} + +/* + * This is where the real work happens. + * + * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb + * helper command 'lx-symbols'. + */ +static noinline int do_init_module(struct module *mod) +{ + int ret = 0; + struct mod_initfree *freeinit; + + freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); + if (!freeinit) { + ret = -ENOMEM; + goto fail; + } + freeinit->module_init = mod->init_layout.base; + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + goto fail_free_freeinit; + } + if (ret > 0) { + pr_warn("%s: '%s'->init suspiciously returned %d, it should " + "follow 0/-E convention\n" + "%s: loading module anyway...\n", + __func__, mod->name, ret, __func__); + dump_stack(); + } + + /* Now it's a first class citizen! */ + mod->state = MODULE_STATE_LIVE; + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* Delay uevent until module has finished its init routine */ + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); + + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). + * + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. + */ + if (!mod->async_probe_requested) + async_synchronize_full(); + + ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + + mod->init_layout.size); + mutex_lock(&module_mutex); + /* Drop initial reference. */ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); +#endif + module_enable_ro(mod, true); + mod_tree_remove_init(mod); + module_arch_freeing_init(mod); + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; + mod->init_layout.text_size = 0; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ + mod->btf_data = NULL; +#endif + /* + * We want to free module_init, but be aware that kallsyms may be + * walking this with preempt disabled. In all the failure paths, we + * call synchronize_rcu(), but we don't want to slow down the success + * path. module_memfree() cannot be called in an interrupt, so do the + * work and call synchronize_rcu() in a work queue. + * + * Note that module_alloc() on most architectures creates W+X page + * mappings which won't be cleaned up until do_free_init() runs. Any + * code such as mark_rodata_ro() which depends on those mappings to + * be cleaned up needs to sync with the queued work - ie + * rcu_barrier() + */ + if (llist_add(&freeinit->node, &init_free_list)) + schedule_work(&init_free_wq); + + mutex_unlock(&module_mutex); + wake_up_all(&module_wq); + + return 0; + +fail_free_freeinit: + kfree(freeinit); +fail: + /* Try to protect us from buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_rcu(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + klp_module_going(mod); + ftrace_release_mod(mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; +} + +static int may_init_module(void) +{ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + return 0; +} + +/* + * We try to place it in the list now to make sure it's unique before + * we dedicate too many resources. In particular, temporary percpu + * memory exhaustion. + */ +static int add_unformed_module(struct module *mod) +{ + int err; + struct module *old; + + mod->state = MODULE_STATE_UNFORMED; + +again: + mutex_lock(&module_mutex); + old = find_module_all(mod->name, strlen(mod->name), true); + if (old != NULL) { + if (old->state != MODULE_STATE_LIVE) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto out_unlocked; + goto again; + } + err = -EEXIST; + goto out; + } + mod_update_bounds(mod); + list_add_rcu(&mod->list, &modules); + mod_tree_insert(mod); + err = 0; + +out: + mutex_unlock(&module_mutex); +out_unlocked: + return err; +} + +static int complete_formation(struct module *mod, struct load_info *info) +{ + int err; + + mutex_lock(&module_mutex); + + /* Find duplicate symbols (must be called under lock). */ + err = verify_exported_symbols(mod); + if (err < 0) + goto out; + + /* This relies on module_mutex for list integrity. */ + module_bug_finalize(info->hdr, info->sechdrs, mod); + + module_enable_ro(mod, false); + module_enable_nx(mod); + module_enable_x(mod); + + /* + * Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. + */ + mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + return 0; + +out: + mutex_unlock(&module_mutex); + return err; +} + +static int prepare_coming_module(struct module *mod) +{ + int err; + + ftrace_module_enable(mod); + err = klp_module_coming(mod); + if (err) + return err; + + err = blocking_notifier_call_chain_robust(&module_notify_list, + MODULE_STATE_COMING, MODULE_STATE_GOING, mod); + err = notifier_to_errno(err); + if (err) + klp_module_going(mod); + + return err; +} + +static int unknown_module_param_cb(char *param, char *val, const char *modname, + void *arg) +{ + struct module *mod = arg; + int ret; + + if (strcmp(param, "async_probe") == 0) { + mod->async_probe_requested = true; + return 0; + } + + /* Check for magic 'dyndbg' arg */ + ret = ddebug_dyndbg_module_param_cb(param, val, modname); + if (ret != 0) + pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); + return 0; +} + +static void cfi_init(struct module *mod); + +/* + * Allocate and load the module: note that size of section 0 is always + * zero, and we rely on this for optional sections. + */ +static int load_module(struct load_info *info, const char __user *uargs, + int flags) +{ + struct module *mod; + long err = 0; + char *after_dashes; + + /* + * Do the signature check (if any) first. All that + * the signature check needs is info->len, it does + * not need any of the section info. That can be + * set up later. This will minimize the chances + * of a corrupt module causing problems before + * we even get to the signature check. + * + * The check will also adjust info->len by stripping + * off the sig length at the end of the module, making + * checks against info->len more correct. + */ + err = module_sig_check(info, flags); + if (err) + goto free_copy; + + /* + * Do basic sanity checks against the ELF header and + * sections. + */ + err = elf_validity_check(info); + if (err) + goto free_copy; + + /* + * Everything checks out, so set up the section info + * in the info structure. + */ + err = setup_load_info(info, flags); + if (err) + goto free_copy; + + /* + * Now that we know we have the correct module name, check + * if it's blacklisted. + */ + if (blacklisted(info->name)) { + err = -EPERM; + pr_err("Module %s is blacklisted\n", info->name); + goto free_copy; + } + + err = rewrite_section_headers(info, flags); + if (err) + goto free_copy; + + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(info, info->mod)) { + err = -ENOEXEC; + goto free_copy; + } + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(info, flags); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + audit_log_kern_module(mod->name); + + /* Reserve our place in the list. */ + err = add_unformed_module(mod); + if (err) + goto free_module; + +#ifdef CONFIG_MODULE_SIG + mod->sig_ok = info->sig_ok; + if (!mod->sig_ok) { + pr_notice_once("%s: module verification failed: signature " + "and/or required key missing - tainting " + "kernel\n", mod->name); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); + } +#endif + + /* To avoid stressing percpu allocator, do this once we're unique. */ + err = percpu_modalloc(mod, info); + if (err) + goto unlink_mod; + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto unlink_mod; + + init_param_lock(mod); + + /* + * Now we've got everything in the final locations, we can + * find optional sections. + */ + err = find_module_sections(mod, info); + if (err) + goto free_unload; + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Setup CFI for the module. */ + cfi_init(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + init_build_id(mod, info); + dynamic_debug_setup(mod, info->debug, info->num_debug); + + /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ + ftrace_module_init(mod); + + /* Finally it's fully formed, ready to start executing. */ + err = complete_formation(mod, info); + if (err) + goto ddebug_cleanup; + + err = prepare_coming_module(mod); + if (err) + goto bug_cleanup; + + /* Module is ready to execute: parsing args may do that. */ + after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, mod, + unknown_module_param_cb); + if (IS_ERR(after_dashes)) { + err = PTR_ERR(after_dashes); + goto coming_cleanup; + } else if (after_dashes) { + pr_warn("%s: parameters '%s' after `--' ignored\n", + mod->name, after_dashes); + } + + /* Link in to sysfs. */ + err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); + if (err < 0) + goto coming_cleanup; + + if (is_livepatch_module(mod)) { + err = copy_module_elf(mod, info); + if (err < 0) + goto sysfs_cleanup; + } + + /* Get rid of temporary copy. */ + free_copy(info, flags); + + /* Done! */ + trace_module_load(mod); + + return do_init_module(mod); + + sysfs_cleanup: + mod_sysfs_teardown(mod); + coming_cleanup: + mod->state = MODULE_STATE_GOING; + destroy_params(mod->kp, mod->num_kp); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + klp_module_going(mod); + bug_cleanup: + mod->state = MODULE_STATE_GOING; + /* module_bug_cleanup needs module_mutex protection */ + mutex_lock(&module_mutex); + module_bug_cleanup(mod); + mutex_unlock(&module_mutex); + + ddebug_cleanup: + ftrace_release_mod(mod); + dynamic_debug_remove(mod, info->debug); + synchronize_rcu(); + kfree(mod->args); + free_arch_cleanup: + cfi_cleanup(mod); + module_arch_cleanup(mod); + free_modinfo: + free_modinfo(mod); + free_unload: + module_unload_free(mod); + unlink_mod: + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + mod_tree_remove(mod); + wake_up_all(&module_wq); + /* Wait for RCU-sched synchronizing before releasing mod->list. */ + synchronize_rcu(); + mutex_unlock(&module_mutex); + free_module: + /* Free lock-classes; relies on the preceding sync_rcu() */ + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + + module_deallocate(mod, info); + free_copy: + free_copy(info, flags); + return err; +} + +SYSCALL_DEFINE3(init_module, void __user *, umod, + unsigned long, len, const char __user *, uargs) +{ + int err; + struct load_info info = { }; + + err = may_init_module(); + if (err) + return err; + + pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + err = copy_module_from_user(umod, len, &info); + if (err) + return err; + + return load_module(&info, uargs, 0); +} + +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + struct load_info info = { }; + void *buf = NULL; + int len; + int err; + + err = may_init_module(); + if (err) + return err; + + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) + return -EINVAL; + + len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, + READING_MODULE); + if (len < 0) + return len; + + if (flags & MODULE_INIT_COMPRESSED_FILE) { + err = module_decompress(&info, buf, len); + vfree(buf); /* compressed data is no longer needed */ + if (err) + return err; + } else { + info.hdr = buf; + info.len = len; + } + + return load_module(&info, uargs, flags); +} + +static inline int within(unsigned long addr, void *start, unsigned long size) +{ + return ((void *)addr >= start && (void *)addr < start + size); +} + +#ifdef CONFIG_KALLSYMS +/* + * This ignores the intensely annoying "mapping symbols" found + * in ARM ELF files: $a, $t and $d. + */ +static inline int is_arm_mapping_symbol(const char *str) +{ + if (str[0] == '.' && str[1] == 'L') + return true; + return str[0] == '$' && strchr("axtd", str[1]) + && (str[2] == '\0' || str[2] == '.'); +} + +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + +/* + * Given a module and address, find the corresponding symbol and return its name + * while providing its size and offset if needed. + */ +static const char *find_kallsyms_symbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) +{ + unsigned int i, best = 0; + unsigned long nextval, bestval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + /* At worse, next value is at end of module */ + if (within_module_init(addr, mod)) + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; + else + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; + + bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); + + /* + * Scan for closest preceding symbol, and next symbol. (ELF + * starts real symbols at 1). + */ + for (i = 1; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + unsigned long thisval = kallsyms_symbol_value(sym); + + if (sym->st_shndx == SHN_UNDEF) + continue; + + /* + * We ignore unnamed symbols: they're uninformative + * and inserted at a whim. + */ + if (*kallsyms_symbol_name(kallsyms, i) == '\0' + || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) + continue; + + if (thisval <= addr && thisval > bestval) { + best = i; + bestval = thisval; + } + if (thisval > addr && thisval < nextval) + nextval = thisval; + } + + if (!best) + return NULL; + + if (size) + *size = nextval - bestval; + if (offset) + *offset = addr - bestval; + + return kallsyms_symbol_name(kallsyms, best); +} + +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + +/* + * For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + const char *ret = NULL; + struct module *mod; + + preempt_disable(); + mod = __module_address(addr); + if (mod) { + if (modname) + *modname = mod->name; + if (modbuildid) { +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) + *modbuildid = mod->build_id; +#else + *modbuildid = NULL; +#endif + } + + ret = find_kallsyms_symbol(mod, addr, size, offset); + } + /* Make a copy in here where it's safe */ + if (ret) { + strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + ret = namebuf; + } + preempt_enable(); + + return ret; +} + +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + + strlcpy(symname, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strlcpy(modname, mod->name, MODULE_NAME_LEN); + if (name) + strlcpy(name, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + const Elf_Sym *sym = &kallsyms->symtab[symnum]; + + *value = kallsyms_symbol_value(sym); + *type = kallsyms->typetab[symnum]; + strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); + strlcpy(module_name, mod->name, MODULE_NAME_LEN); + *exported = is_exported(name, *value, mod); + preempt_enable(); + return 0; + } + symnum -= kallsyms->num_symtab; + } + preempt_enable(); + return -ERANGE; +} + +/* Given a module and name of symbol, find and return the symbol's value */ +static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) +{ + unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && + sym->st_shndx != SHN_UNDEF) + return kallsyms_symbol_value(sym); + } + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + struct module *mod; + char *colon; + unsigned long ret = 0; + + /* Don't lock: we're in enough trouble already. */ + preempt_disable(); + if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + if ((mod = find_module_all(name, colon - name, false)) != NULL) + ret = find_kallsyms_symbol_value(mod, colon+1); + } else { + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) + break; + } + } + preempt_enable(); + return ret; +} + +#ifdef CONFIG_LIVEPATCH +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (sym->st_shndx == SHN_UNDEF) + continue; + + ret = fn(data, kallsyms_symbol_name(kallsyms, i), + mod, kallsyms_symbol_value(sym)); + if (ret != 0) + goto out; + + cond_resched(); + } + } +out: + mutex_unlock(&module_mutex); + return ret; +} +#endif /* CONFIG_LIVEPATCH */ +#endif /* CONFIG_KALLSYMS */ + +static void cfi_init(struct module *mod) +{ +#ifdef CONFIG_CFI_CLANG + initcall_t *init; + exitcall_t *exit; + + rcu_read_lock_sched(); + mod->cfi_check = (cfi_check_fn) + find_kallsyms_symbol_value(mod, "__cfi_check"); + init = (initcall_t *) + find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); + exit = (exitcall_t *) + find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); + rcu_read_unlock_sched(); + + /* Fix init/exit functions to point to the CFI jump table */ + if (init) + mod->init = *init; +#ifdef CONFIG_MODULE_UNLOAD + if (exit) + mod->exit = *exit; +#endif + + cfi_module_add(mod, module_addr_min); +#endif +} + +static void cfi_cleanup(struct module *mod) +{ +#ifdef CONFIG_CFI_CLANG + cfi_module_remove(mod, module_addr_min); +#endif +} + +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ +static char *module_flags(struct module *mod, char *buf) +{ + int bx = 0; + + BUG_ON(mod->state == MODULE_STATE_UNFORMED); + if (mod->taints || + mod->state == MODULE_STATE_GOING || + mod->state == MODULE_STATE_COMING) { + buf[bx++] = '('; + bx += module_flags_taint(mod, buf + bx); + /* Show a - for module-is-being-unloaded */ + if (mod->state == MODULE_STATE_GOING) + buf[bx++] = '-'; + /* Show a + for module-is-being-loaded */ + if (mod->state == MODULE_STATE_COMING) + buf[bx++] = '+'; + buf[bx++] = ')'; + } + buf[bx] = '\0'; + + return buf; +} + +#ifdef CONFIG_PROC_FS +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + +static int m_show(struct seq_file *m, void *p) +{ + struct module *mod = list_entry(p, struct module, list); + char buf[MODULE_FLAGS_BUF_SIZE]; + void *value; + + /* We always ignore unformed modules. */ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + + seq_printf(m, "%s %u", + mod->name, mod->init_layout.size + mod->core_layout.size); + print_unload_info(m, mod); + + /* Informative for users. */ + seq_printf(m, " %s", + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : + "Live"); + /* Used by oprofile and other similar tools. */ + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); + + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", module_flags(mod, buf)); + + seq_puts(m, "\n"); + return 0; +} + +/* + * Format: modulename size refcount deps address + * + * Where refcount is a number or -, and deps is a comma-separated list + * of depends or -. + */ +static const struct seq_operations modules_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show +}; + +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ +static int modules_open(struct inode *inode, struct file *file) +{ + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; + } + + return err; +} + +static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = modules_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &modules_proc_ops); + return 0; +} +module_init(proc_modules_init); +#endif + +/* Given an address, look for it in the module exception tables. */ +const struct exception_table_entry *search_module_extables(unsigned long addr) +{ + const struct exception_table_entry *e = NULL; + struct module *mod; + + preempt_disable(); + mod = __module_address(addr); + if (!mod) + goto out; + + if (!mod->num_exentries) + goto out; + + e = search_extable(mod->extable, + mod->num_exentries, + addr); +out: + preempt_enable(); + + /* + * Now, if we found one, we are running inside it now, hence + * we cannot unload the module, hence no refcnt needed. + */ + return e; +} + +/** + * is_module_address() - is this address inside a module? + * @addr: the address to check. + * + * See is_module_text_address() if you simply want to see if the address + * is code (not data). + */ +bool is_module_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); + ret = __module_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/** + * __module_address() - get the module which contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +struct module *__module_address(unsigned long addr) +{ + struct module *mod; + + if (addr < module_addr_min || addr > module_addr_max) + return NULL; + + module_assert_mutex_or_preempt(); + + mod = mod_find(addr); + if (mod) { + BUG_ON(!within_module(addr, mod)); + if (mod->state == MODULE_STATE_UNFORMED) + mod = NULL; + } + return mod; +} + +/** + * is_module_text_address() - is this address inside module code? + * @addr: the address to check. + * + * See is_module_address() if you simply want to see if the address is + * anywhere in a module. See kernel_text_address() for testing if an + * address corresponds to kernel or module code. + */ +bool is_module_text_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); + ret = __module_text_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/** + * __module_text_address() - get the module whose code contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +struct module *__module_text_address(unsigned long addr) +{ + struct module *mod = __module_address(addr); + if (mod) { + /* Make sure it's within the text section. */ + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) + mod = NULL; + } + return mod; +} + +/* Don't grab lock, we're oopsing. */ +void print_modules(void) +{ + struct module *mod; + char buf[MODULE_FLAGS_BUF_SIZE]; + + printk(KERN_DEFAULT "Modules linked in:"); + /* Most callers should already have preempt disabled, but make sure */ + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + pr_cont(" %s%s", mod->name, module_flags(mod, buf)); + } + preempt_enable(); + if (last_unloaded_module[0]) + pr_cont(" [last unloaded: %s]", last_unloaded_module); + pr_cont("\n"); +} + +#ifdef CONFIG_MODVERSIONS +/* + * Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. + */ +void module_layout(struct module *mod, + struct modversion_info *ver, + struct kernel_param *kp, + struct kernel_symbol *ks, + struct tracepoint * const *tp) +{ +} +EXPORT_SYMBOL(module_layout); +#endif diff --git a/kernel/module/signing.c b/kernel/module/signing.c new file mode 100644 index 000000000000..8aeb6d2ee94b --- /dev/null +++ b/kernel/module/signing.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, struct load_info *info) +{ + struct module_signature ms; + size_t sig_len, modlen = info->len; + int ret; + + pr_devel("==>%s(,%zu)\n", __func__, modlen); + + if (modlen <= sizeof(ms)) + return -EBADMSG; + + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + + ret = mod_check_sig(&ms, modlen, "module"); + if (ret) + return ret; + + sig_len = be32_to_cpu(ms.sig_len); + modlen -= sig_len + sizeof(ms); + info->len = modlen; + + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); +} diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c deleted file mode 100644 index ffef98a20320..000000000000 --- a/kernel/module_decompress.c +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2021 Google LLC. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "module-internal.h" - -static int module_extend_max_pages(struct load_info *info, unsigned int extent) -{ - struct page **new_pages; - - new_pages = kvmalloc_array(info->max_pages + extent, - sizeof(info->pages), GFP_KERNEL); - if (!new_pages) - return -ENOMEM; - - memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages)); - kvfree(info->pages); - info->pages = new_pages; - info->max_pages += extent; - - return 0; -} - -static struct page *module_get_next_page(struct load_info *info) -{ - struct page *page; - int error; - - if (info->max_pages == info->used_pages) { - error = module_extend_max_pages(info, info->used_pages); - if (error) - return ERR_PTR(error); - } - - page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!page) - return ERR_PTR(-ENOMEM); - - info->pages[info->used_pages++] = page; - return page; -} - -#ifdef CONFIG_MODULE_COMPRESS_GZIP -#include -#define MODULE_COMPRESSION gzip -#define MODULE_DECOMPRESS_FN module_gzip_decompress - -/* - * Calculate length of the header which consists of signature, header - * flags, time stamp and operating system ID (10 bytes total), plus - * an optional filename. - */ -static size_t module_gzip_header_len(const u8 *buf, size_t size) -{ - const u8 signature[] = { 0x1f, 0x8b, 0x08 }; - size_t len = 10; - - if (size < len || memcmp(buf, signature, sizeof(signature))) - return 0; - - if (buf[3] & 0x08) { - do { - /* - * If we can't find the end of the file name we must - * be dealing with a corrupted file. - */ - if (len == size) - return 0; - } while (buf[len++] != '\0'); - } - - return len; -} - -static ssize_t module_gzip_decompress(struct load_info *info, - const void *buf, size_t size) -{ - struct z_stream_s s = { 0 }; - size_t new_size = 0; - size_t gzip_hdr_len; - ssize_t retval; - int rc; - - gzip_hdr_len = module_gzip_header_len(buf, size); - if (!gzip_hdr_len) { - pr_err("not a gzip compressed module\n"); - return -EINVAL; - } - - s.next_in = buf + gzip_hdr_len; - s.avail_in = size - gzip_hdr_len; - - s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); - if (!s.workspace) - return -ENOMEM; - - rc = zlib_inflateInit2(&s, -MAX_WBITS); - if (rc != Z_OK) { - pr_err("failed to initialize decompressor: %d\n", rc); - retval = -EINVAL; - goto out; - } - - do { - struct page *page = module_get_next_page(info); - if (!page) { - retval = -ENOMEM; - goto out_inflate_end; - } - - s.next_out = kmap(page); - s.avail_out = PAGE_SIZE; - rc = zlib_inflate(&s, 0); - kunmap(page); - - new_size += PAGE_SIZE - s.avail_out; - } while (rc == Z_OK); - - if (rc != Z_STREAM_END) { - pr_err("decompression failed with status %d\n", rc); - retval = -EINVAL; - goto out_inflate_end; - } - - retval = new_size; - -out_inflate_end: - zlib_inflateEnd(&s); -out: - kfree(s.workspace); - return retval; -} -#elif CONFIG_MODULE_COMPRESS_XZ -#include -#define MODULE_COMPRESSION xz -#define MODULE_DECOMPRESS_FN module_xz_decompress - -static ssize_t module_xz_decompress(struct load_info *info, - const void *buf, size_t size) -{ - static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 }; - struct xz_dec *xz_dec; - struct xz_buf xz_buf; - enum xz_ret xz_ret; - size_t new_size = 0; - ssize_t retval; - - if (size < sizeof(signature) || - memcmp(buf, signature, sizeof(signature))) { - pr_err("not an xz compressed module\n"); - return -EINVAL; - } - - xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1); - if (!xz_dec) - return -ENOMEM; - - xz_buf.in_size = size; - xz_buf.in = buf; - xz_buf.in_pos = 0; - - do { - struct page *page = module_get_next_page(info); - if (!page) { - retval = -ENOMEM; - goto out; - } - - xz_buf.out = kmap(page); - xz_buf.out_pos = 0; - xz_buf.out_size = PAGE_SIZE; - xz_ret = xz_dec_run(xz_dec, &xz_buf); - kunmap(page); - - new_size += xz_buf.out_pos; - } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); - - if (xz_ret != XZ_STREAM_END) { - pr_err("decompression failed with status %d\n", xz_ret); - retval = -EINVAL; - goto out; - } - - retval = new_size; - - out: - xz_dec_end(xz_dec); - return retval; -} -#else -#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" -#endif - -int module_decompress(struct load_info *info, const void *buf, size_t size) -{ - unsigned int n_pages; - ssize_t data_size; - int error; - - /* - * Start with number of pages twice as big as needed for - * compressed data. - */ - n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2; - error = module_extend_max_pages(info, n_pages); - - data_size = MODULE_DECOMPRESS_FN(info, buf, size); - if (data_size < 0) { - error = data_size; - goto err; - } - - info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL); - if (!info->hdr) { - error = -ENOMEM; - goto err; - } - - info->len = data_size; - return 0; - -err: - module_decompress_cleanup(info); - return error; -} - -void module_decompress_cleanup(struct load_info *info) -{ - int i; - - if (info->hdr) - vunmap(info->hdr); - - for (i = 0; i < info->used_pages; i++) - __free_page(info->pages[i]); - - kvfree(info->pages); - - info->pages = NULL; - info->max_pages = info->used_pages = 0; -} - -#ifdef CONFIG_SYSFS -static ssize_t compression_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); -} -static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); - -static int __init module_decompress_sysfs_init(void) -{ - int error; - - error = sysfs_create_file(&module_kset->kobj, - &module_compression_attr.attr); - if (error) - pr_warn("Failed to create 'compression' attribute"); - - return 0; -} -late_initcall(module_decompress_sysfs_init); -#endif diff --git a/kernel/module_signing.c b/kernel/module_signing.c deleted file mode 100644 index 8723ae70ea1f..000000000000 --- a/kernel/module_signing.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Module signature checker - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include "module-internal.h" - -/* - * Verify the signature on a module. - */ -int mod_verify_sig(const void *mod, struct load_info *info) -{ - struct module_signature ms; - size_t sig_len, modlen = info->len; - int ret; - - pr_devel("==>%s(,%zu)\n", __func__, modlen); - - if (modlen <= sizeof(ms)) - return -EBADMSG; - - memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - - ret = mod_check_sig(&ms, modlen, "module"); - if (ret) - return ret; - - sig_len = be32_to_cpu(ms.sig_len); - modlen -= sig_len + sizeof(ms); - info->len = modlen; - - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); -} -- cgit v1.2.3 From 8ab4ed08a24f88359f22439e37cac65c95cf6ac2 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:32 +0000 Subject: module: Simple refactor in preparation for split No functional change. This patch makes it possible to move non-essential code out of core module code. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 21 +++++++++++++++++++++ kernel/module/main.c | 22 ++-------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 8c381c99062f..ea8c4c02614c 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -7,6 +7,27 @@ #include #include +#include + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* If this is set, the section belongs in the init part of the module */ +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1)) +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +extern struct mutex module_mutex; +extern struct list_head modules; + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const s32 __start___kcrctab[]; +extern const s32 __start___kcrctab_gpl[]; struct load_info { const char *name; diff --git a/kernel/module/main.c b/kernel/module/main.c index 1a17f02f69a0..5898a1af41a9 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -63,10 +63,6 @@ #define CREATE_TRACE_POINTS #include -#ifndef ARCH_SHF_SMALL -#define ARCH_SHF_SMALL 0 -#endif - /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but @@ -78,9 +74,6 @@ # define debug_align(X) (X) #endif -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) - /* * Mutex protects: * 1) List of modules (also safely readable with preempt_disable), @@ -88,8 +81,8 @@ * 3) module_addr_min/module_addr_max. * (delete and add uses RCU list operations). */ -static DEFINE_MUTEX(module_mutex); -static LIST_HEAD(modules); +DEFINE_MUTEX(module_mutex); +LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ static void do_free_init(struct work_struct *w); @@ -408,14 +401,6 @@ static __maybe_unused void *any_section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } -/* Provided by the linker */ -extern const struct kernel_symbol __start___ksymtab[]; -extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const s32 __start___kcrctab[]; -extern const s32 __start___kcrctab_gpl[]; - #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL #else @@ -4542,9 +4527,6 @@ static void cfi_cleanup(struct module *mod) #endif } -/* Maximum number of characters written by module_flags() */ -#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) - /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ static char *module_flags(struct module *mod, char *buf) { -- cgit v1.2.3 From 5aff4dfdb4ae2741cfff759d917f597f2c7f70aa Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:33 +0000 Subject: module: Make internal.h and decompress.c more compliant This patch will address the following warning and style violations generated by ./scripts/checkpatch.pl in strict mode: WARNING: Use #include instead of #10: FILE: kernel/module/internal.h:10: +#include CHECK: spaces preferred around that '-' (ctx:VxV) #18: FILE: kernel/module/internal.h:18: +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) CHECK: Please use a blank line after function/struct/union/enum declarations #69: FILE: kernel/module/internal.h:69: +} +static inline void module_decompress_cleanup(struct load_info *info) ^ CHECK: extern prototypes should be avoided in .h files #84: FILE: kernel/module/internal.h:84: +extern int mod_verify_sig(const void *mod, struct load_info *info); WARNING: Missing a blank line after declarations #116: FILE: kernel/module/decompress.c:116: + struct page *page = module_get_next_page(info); + if (!page) { WARNING: Missing a blank line after declarations #174: FILE: kernel/module/decompress.c:174: + struct page *page = module_get_next_page(info); + if (!page) { CHECK: Please use a blank line after function/struct/union/enum declarations #258: FILE: kernel/module/decompress.c:258: +} +static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); Note: Fortunately, the multiple-include optimisation found in include/linux/module.h will prevent duplication/or inclusion more than once. Fixes: f314dfea16a0 ("modsign: log module name in the event of an error") Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/decompress.c | 3 +++ kernel/module/internal.h | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index d14d6443225a..2fc7081dd7c1 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -113,6 +113,7 @@ static ssize_t module_gzip_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); + if (!page) { retval = -ENOMEM; goto out_inflate_end; @@ -171,6 +172,7 @@ static ssize_t module_xz_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); + if (!page) { retval = -ENOMEM; goto out; @@ -256,6 +258,7 @@ static ssize_t compression_show(struct kobject *kobj, { return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); } + static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); static int __init module_decompress_sysfs_init(void) diff --git a/kernel/module/internal.h b/kernel/module/internal.h index ea8c4c02614c..e0775e66bcf7 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -6,7 +6,8 @@ */ #include -#include +#include +#include #include #ifndef ARCH_SHF_SMALL @@ -54,7 +55,7 @@ struct load_info { } index; }; -extern int mod_verify_sig(const void *mod, struct load_info *info); +int mod_verify_sig(const void *mod, struct load_info *info); #ifdef CONFIG_MODULE_DECOMPRESS int module_decompress(struct load_info *info, const void *buf, size_t size); @@ -65,6 +66,7 @@ static inline int module_decompress(struct load_info *info, { return -EOPNOTSUPP; } + static inline void module_decompress_cleanup(struct load_info *info) { } -- cgit v1.2.3 From 1be9473e31ab87ad1b6ecf9fd11df461930ede85 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:34 +0000 Subject: module: Move livepatch support to a separate file No functional change. This patch migrates livepatch support (i.e. used during module add/or load and remove/or deletion) from core module code into kernel/module/livepatch.c. At the moment it contains code to persist Elf information about a given livepatch module, only. The new file was added to MAINTAINERS. Reviewed-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- MAINTAINERS | 1 + include/linux/module.h | 9 ++-- kernel/module/Makefile | 1 + kernel/module/internal.h | 22 ++++++++++ kernel/module/livepatch.c | 74 +++++++++++++++++++++++++++++++++ kernel/module/main.c | 102 +++++----------------------------------------- 6 files changed, 111 insertions(+), 98 deletions(-) create mode 100644 kernel/module/livepatch.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index 5e7778cd437f..6dcd93fb3a96 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11360,6 +11360,7 @@ F: arch/s390/include/asm/livepatch.h F: arch/x86/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ +F: kernel/module/livepatch.c F: lib/livepatch/ F: samples/livepatch/ F: tools/testing/selftests/livepatch/ diff --git a/include/linux/module.h b/include/linux/module.h index 1e135fd5c076..7ec9715de7dc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -663,17 +663,14 @@ static inline bool module_requested_async_probing(struct module *module) return module && module->async_probe_requested; } -#ifdef CONFIG_LIVEPATCH static inline bool is_livepatch_module(struct module *mod) { +#ifdef CONFIG_LIVEPATCH return mod->klp; -} -#else /* !CONFIG_LIVEPATCH */ -static inline bool is_livepatch_module(struct module *mod) -{ +#else return false; +#endif } -#endif /* CONFIG_LIVEPATCH */ bool is_module_sig_enforced(void); void set_module_sig_enforced(void); diff --git a/kernel/module/Makefile b/kernel/module/Makefile index cdd5c61b8c7f..ed3aacb04f17 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -10,3 +10,4 @@ KCOV_INSTRUMENT_module.o := n obj-y += main.o obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o obj-$(CONFIG_MODULE_SIG) += signing.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index e0775e66bcf7..ad7a444253ed 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -57,6 +57,28 @@ struct load_info { int mod_verify_sig(const void *mod, struct load_info *info); +#ifdef CONFIG_LIVEPATCH +int copy_module_elf(struct module *mod, struct load_info *info); +void free_module_elf(struct module *mod); +#else /* !CONFIG_LIVEPATCH */ +static inline int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static inline void free_module_elf(struct module *mod) { } +#endif /* CONFIG_LIVEPATCH */ + +static inline bool set_livepatch_module(struct module *mod) +{ +#ifdef CONFIG_LIVEPATCH + mod->klp = true; + return true; +#else + return false; +#endif +} + #ifdef CONFIG_MODULE_DECOMPRESS int module_decompress(struct load_info *info, const void *buf, size_t size); void module_decompress_cleanup(struct load_info *info); diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c new file mode 100644 index 000000000000..486d4ff92719 --- /dev/null +++ b/kernel/module/livepatch.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module livepatch support + * + * Copyright (C) 2016 Jessica Yu + */ + +#include +#include +#include +#include "internal.h" + +/* + * Persist Elf information about a module. Copy the Elf header, + * section header table, section string table, and symtab section + * index from info to mod->klp_info. + */ +int copy_module_elf(struct module *mod, struct load_info *info) +{ + unsigned int size, symndx; + int ret; + + size = sizeof(*mod->klp_info); + mod->klp_info = kmalloc(size, GFP_KERNEL); + if (!mod->klp_info) + return -ENOMEM; + + /* Elf header */ + size = sizeof(mod->klp_info->hdr); + memcpy(&mod->klp_info->hdr, info->hdr, size); + + /* Elf section header table */ + size = sizeof(*info->sechdrs) * info->hdr->e_shnum; + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); + if (!mod->klp_info->sechdrs) { + ret = -ENOMEM; + goto free_info; + } + + /* Elf section name string table */ + size = info->sechdrs[info->hdr->e_shstrndx].sh_size; + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); + if (!mod->klp_info->secstrings) { + ret = -ENOMEM; + goto free_sechdrs; + } + + /* Elf symbol section index */ + symndx = info->index.sym; + mod->klp_info->symndx = symndx; + + /* + * For livepatch modules, core_kallsyms.symtab is a complete + * copy of the original symbol table. Adjust sh_addr to point + * to core_kallsyms.symtab since the copy of the symtab in module + * init memory is freed at the end of do_init_module(). + */ + mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab; + + return 0; + +free_sechdrs: + kfree(mod->klp_info->sechdrs); +free_info: + kfree(mod->klp_info); + return ret; +} + +void free_module_elf(struct module *mod) +{ + kfree(mod->klp_info->sechdrs); + kfree(mod->klp_info->secstrings); + kfree(mod->klp_info); +} diff --git a/kernel/module/main.c b/kernel/module/main.c index 5898a1af41a9..915143827069 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2043,81 +2043,6 @@ static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, } #endif /* CONFIG_STRICT_MODULE_RWX */ -#ifdef CONFIG_LIVEPATCH -/* - * Persist Elf information about a module. Copy the Elf header, - * section header table, section string table, and symtab section - * index from info to mod->klp_info. - */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - unsigned int size, symndx; - int ret; - - size = sizeof(*mod->klp_info); - mod->klp_info = kmalloc(size, GFP_KERNEL); - if (mod->klp_info == NULL) - return -ENOMEM; - - /* Elf header */ - size = sizeof(mod->klp_info->hdr); - memcpy(&mod->klp_info->hdr, info->hdr, size); - - /* Elf section header table */ - size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); - if (mod->klp_info->sechdrs == NULL) { - ret = -ENOMEM; - goto free_info; - } - - /* Elf section name string table */ - size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); - if (mod->klp_info->secstrings == NULL) { - ret = -ENOMEM; - goto free_sechdrs; - } - - /* Elf symbol section index */ - symndx = info->index.sym; - mod->klp_info->symndx = symndx; - - /* - * For livepatch modules, core_kallsyms.symtab is a complete - * copy of the original symbol table. Adjust sh_addr to point - * to core_kallsyms.symtab since the copy of the symtab in module - * init memory is freed at the end of do_init_module(). - */ - mod->klp_info->sechdrs[symndx].sh_addr = \ - (unsigned long) mod->core_kallsyms.symtab; - - return 0; - -free_sechdrs: - kfree(mod->klp_info->sechdrs); -free_info: - kfree(mod->klp_info); - return ret; -} - -static void free_module_elf(struct module *mod) -{ - kfree(mod->klp_info->sechdrs); - kfree(mod->klp_info->secstrings); - kfree(mod->klp_info); -} -#else /* !CONFIG_LIVEPATCH */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - return 0; -} - -static void free_module_elf(struct module *mod) -{ -} -#endif /* CONFIG_LIVEPATCH */ - void __weak module_memfree(void *module_region) { /* @@ -3092,30 +3017,23 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l return 0; } -#ifdef CONFIG_LIVEPATCH static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { - if (get_modinfo(info, "livepatch")) { - mod->klp = true; + if (!get_modinfo(info, "livepatch")) + /* Nothing more to do */ + return 0; + + if (set_livepatch_module(mod)) { add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", - mod->name); - } - - return 0; -} -#else /* !CONFIG_LIVEPATCH */ -static int check_modinfo_livepatch(struct module *mod, struct load_info *info) -{ - if (get_modinfo(info, "livepatch")) { - pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", - mod->name); - return -ENOEXEC; + mod->name); + return 0; } - return 0; + pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", + mod->name); + return -ENOEXEC; } -#endif /* CONFIG_LIVEPATCH */ static void check_modinfo_retpoline(struct module *mod, struct load_info *info) { -- cgit v1.2.3 From 58d208de3e8d87dbe196caf0b57cc58c7a3836ca Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:35 +0000 Subject: module: Move latched RB-tree support to a separate file No functional change. This patch migrates module latched RB-tree support (e.g. see __module_address()) from core module code into kernel/module/tree_lookup.c. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/internal.h | 33 +++++++++++ kernel/module/main.c | 130 ++------------------------------------------ kernel/module/tree_lookup.c | 109 +++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+), 126 deletions(-) create mode 100644 kernel/module/tree_lookup.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index ed3aacb04f17..88774e386276 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -11,3 +11,4 @@ obj-y += main.o obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o obj-$(CONFIG_MODULE_SIG) += signing.o obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index ad7a444253ed..f1682e3677be 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 @@ -93,3 +94,35 @@ static inline void module_decompress_cleanup(struct load_info *info) { } #endif + +#ifdef CONFIG_MODULES_TREE_LOOKUP +struct mod_tree_root { + struct latch_tree_root root; + unsigned long addr_min; + unsigned long addr_max; +}; + +extern struct mod_tree_root mod_tree; + +void mod_tree_insert(struct module *mod); +void mod_tree_remove_init(struct module *mod); +void mod_tree_remove(struct module *mod); +struct module *mod_find(unsigned long addr); +#else /* !CONFIG_MODULES_TREE_LOOKUP */ + +static inline void mod_tree_insert(struct module *mod) { } +static inline void mod_tree_remove_init(struct module *mod) { } +static inline void mod_tree_remove(struct module *mod) { } +static inline struct module *mod_find(unsigned long addr) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} +#endif /* CONFIG_MODULES_TREE_LOOKUP */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 915143827069..0d0fdd82359b 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -90,138 +90,16 @@ static DECLARE_WORK(init_free_wq, do_free_init); static LLIST_HEAD(init_free_list); #ifdef CONFIG_MODULES_TREE_LOOKUP - -/* - * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. - * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. - */ - -static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->base; -} - -static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->size; -} - -static __always_inline bool -mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) -{ - return __mod_tree_val(a) < __mod_tree_val(b); -} - -static __always_inline int -mod_tree_comp(void *key, struct latch_tree_node *n) -{ - unsigned long val = (unsigned long)key; - unsigned long start, end; - - start = __mod_tree_val(n); - if (val < start) - return -1; - - end = start + __mod_tree_size(n); - if (val >= end) - return 1; - - return 0; -} - -static const struct latch_tree_ops mod_tree_ops = { - .less = mod_tree_less, - .comp = mod_tree_comp, -}; - -static struct mod_tree_root { - struct latch_tree_root root; - unsigned long addr_min; - unsigned long addr_max; -} mod_tree __cacheline_aligned = { +struct mod_tree_root mod_tree __cacheline_aligned = { .addr_min = -1UL, }; #define module_addr_min mod_tree.addr_min #define module_addr_max mod_tree.addr_max -static noinline void __mod_tree_insert(struct mod_tree_node *node) -{ - latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); -} - -static void __mod_tree_remove(struct mod_tree_node *node) -{ - latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); -} - -/* - * These modifications: insert, remove_init and remove; are serialized by the - * module_mutex. - */ -static void mod_tree_insert(struct module *mod) -{ - mod->core_layout.mtn.mod = mod; - mod->init_layout.mtn.mod = mod; - - __mod_tree_insert(&mod->core_layout.mtn); - if (mod->init_layout.size) - __mod_tree_insert(&mod->init_layout.mtn); -} - -static void mod_tree_remove_init(struct module *mod) -{ - if (mod->init_layout.size) - __mod_tree_remove(&mod->init_layout.mtn); -} - -static void mod_tree_remove(struct module *mod) -{ - __mod_tree_remove(&mod->core_layout.mtn); - mod_tree_remove_init(mod); -} - -static struct module *mod_find(unsigned long addr) -{ - struct latch_tree_node *ltn; - - ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); - if (!ltn) - return NULL; - - return container_of(ltn, struct mod_tree_node, node)->mod; -} - -#else /* MODULES_TREE_LOOKUP */ - -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - -static void mod_tree_insert(struct module *mod) { } -static void mod_tree_remove_init(struct module *mod) { } -static void mod_tree_remove(struct module *mod) { } - -static struct module *mod_find(unsigned long addr) -{ - struct module *mod; - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - if (within_module(addr, mod)) - return mod; - } - - return NULL; -} - -#endif /* MODULES_TREE_LOOKUP */ +#else /* !CONFIG_MODULES_TREE_LOOKUP */ +static unsigned long module_addr_min = -1UL, module_addr_max; +#endif /* CONFIG_MODULES_TREE_LOOKUP */ /* * Bounds of module text, for speeding up __module_address. diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c new file mode 100644 index 000000000000..0bc4ec3b22ce --- /dev/null +++ b/kernel/module/tree_lookup.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Modules tree lookup + * + * Copyright (C) 2015 Peter Zijlstra + * Copyright (C) 2015 Rusty Russell + */ + +#include +#include +#include "internal.h" + +/* + * Use a latched RB-tree for __module_address(); this allows us to use + * RCU-sched lookups of the address from any context. + * + * This is conditional on PERF_EVENTS || TRACING because those can really hit + * __module_address() hard by doing a lot of stack unwinding; potentially from + * NMI context. + */ + +static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->base; +} + +static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->size; +} + +static __always_inline bool +mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) +{ + return __mod_tree_val(a) < __mod_tree_val(b); +} + +static __always_inline int +mod_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long start, end; + + start = __mod_tree_val(n); + if (val < start) + return -1; + + end = start + __mod_tree_size(n); + if (val >= end) + return 1; + + return 0; +} + +static const struct latch_tree_ops mod_tree_ops = { + .less = mod_tree_less, + .comp = mod_tree_comp, +}; + +static noinline void __mod_tree_insert(struct mod_tree_node *node) +{ + latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); +} + +static void __mod_tree_remove(struct mod_tree_node *node) +{ + latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); +} + +/* + * These modifications: insert, remove_init and remove; are serialized by the + * module_mutex. + */ +void mod_tree_insert(struct module *mod) +{ + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; + + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); +} + +void mod_tree_remove_init(struct module *mod) +{ + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); +} + +void mod_tree_remove(struct module *mod) +{ + __mod_tree_remove(&mod->core_layout.mtn); + mod_tree_remove_init(mod); +} + +struct module *mod_find(unsigned long addr) +{ + struct latch_tree_node *ltn; + + ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); + if (!ltn) + return NULL; + + return container_of(ltn, struct mod_tree_node, node)->mod; +} -- cgit v1.2.3 From b33465fe9c52a3719f013deeca261bd82af235ee Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:36 +0000 Subject: module: Move strict rwx support to a separate file No functional change. This patch migrates code that makes module text and rodata memory read-only and non-text memory non-executable from core module code into kernel/module/strict_rwx.c. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/internal.h | 32 +++++++++++++++ kernel/module/main.c | 99 +--------------------------------------------- kernel/module/strict_rwx.c | 85 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 97 deletions(-) create mode 100644 kernel/module/strict_rwx.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 88774e386276..d313c8472cb3 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -12,3 +12,4 @@ obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o obj-$(CONFIG_MODULE_SIG) += signing.o obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o +obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index f1682e3677be..a6895bb5598a 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -20,6 +20,17 @@ /* Maximum number of characters written by module_flags() */ #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y + */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX +# define debug_align(X) PAGE_ALIGN(X) +#else +# define debug_align(X) (X) +#endif + extern struct mutex module_mutex; extern struct list_head modules; @@ -126,3 +137,24 @@ static inline struct module *mod_find(unsigned long addr) return NULL; } #endif /* CONFIG_MODULES_TREE_LOOKUP */ + +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX +void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, + int num_pages)); +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + +#ifdef CONFIG_STRICT_MODULE_RWX +void module_enable_ro(const struct module *mod, bool after_init); +void module_enable_nx(const struct module *mod); +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod); + +#else /* !CONFIG_STRICT_MODULE_RWX */ +static inline void module_enable_nx(const struct module *mod) { } +static inline void module_enable_ro(const struct module *mod, bool after_init) {} +static inline int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + return 0; +} +#endif /* CONFIG_STRICT_MODULE_RWX */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 0d0fdd82359b..d55a2a8338a1 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -63,17 +63,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -# define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif - /* * Mutex protects: * 1) List of modules (also safely readable with preempt_disable), @@ -1819,8 +1808,8 @@ static void mod_sysfs_teardown(struct module *mod) * whether we are strict. */ #ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -static void frob_text(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) +void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); @@ -1837,90 +1826,6 @@ static void module_enable_x(const struct module *mod) static void module_enable_x(const struct module *mod) { } #endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -#ifdef CONFIG_STRICT_MODULE_RWX -static void frob_rodata(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->text_size, - (layout->ro_size - layout->text_size) >> PAGE_SHIFT); -} - -static void frob_ro_after_init(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); -} - -static void frob_writable_data(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_after_init_size, - (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); -} - -static void module_enable_ro(const struct module *mod, bool after_init) -{ - if (!rodata_enabled) - return; - - set_vm_flush_reset_perms(mod->core_layout.base); - set_vm_flush_reset_perms(mod->init_layout.base); - frob_text(&mod->core_layout, set_memory_ro); - - frob_rodata(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_rodata(&mod->init_layout, set_memory_ro); - - if (after_init) - frob_ro_after_init(&mod->core_layout, set_memory_ro); -} - -static void module_enable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_nx); - frob_ro_after_init(&mod->core_layout, set_memory_nx); - frob_writable_data(&mod->core_layout, set_memory_nx); - frob_rodata(&mod->init_layout, set_memory_nx); - frob_writable_data(&mod->init_layout, set_memory_nx); -} - -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; - int i; - - for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { - pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", - mod->name, secstrings + sechdrs[i].sh_name, i); - return -ENOEXEC; - } - } - - return 0; -} - -#else /* !CONFIG_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_ro(const struct module *mod, bool after_init) {} -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - return 0; -} -#endif /* CONFIG_STRICT_MODULE_RWX */ - void __weak module_memfree(void *module_region) { /* diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c new file mode 100644 index 000000000000..7949dfd449c2 --- /dev/null +++ b/kernel/module/strict_rwx.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module strict rwx + * + * Copyright (C) 2015 Rusty Russell + */ + +#include +#include +#include +#include +#include "internal.h" + +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON(!PAGE_ALIGNED(layout->base)); + BUG_ON(!PAGE_ALIGNED(layout->text_size)); + BUG_ON(!PAGE_ALIGNED(layout->ro_size)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); +} + +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON(!PAGE_ALIGNED(layout->base)); + BUG_ON(!PAGE_ALIGNED(layout->ro_size)); + BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON(!PAGE_ALIGNED(layout->base)); + BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size)); + BUG_ON(!PAGE_ALIGNED(layout->size)); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); +} + +void module_enable_ro(const struct module *mod, bool after_init) +{ + if (!rodata_enabled) + return; + + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); + frob_text(&mod->core_layout, set_memory_ro); + + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); +} + +void module_enable_nx(const struct module *mod) +{ + frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); +} + +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; + int i; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); + return -ENOEXEC; + } + } + + return 0; +} -- cgit v1.2.3 From 0c1e42805c25c87eb7a6f3b18bdbf3b3b7840aff Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:37 +0000 Subject: module: Move extra signature support out of core code No functional change. This patch migrates additional module signature check code from core module code into kernel/module/signing.c. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- include/linux/module.h | 12 ++++--- kernel/module/internal.h | 9 +++++ kernel/module/main.c | 87 ------------------------------------------------ kernel/module/signing.c | 77 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 7ec9715de7dc..5e2059f3afc7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -672,7 +672,6 @@ static inline bool is_livepatch_module(struct module *mod) #endif } -bool is_module_sig_enforced(void); void set_module_sig_enforced(void); #else /* !CONFIG_MODULES... */ @@ -799,10 +798,6 @@ static inline bool module_requested_async_probing(struct module *module) return false; } -static inline bool is_module_sig_enforced(void) -{ - return false; -} static inline void set_module_sig_enforced(void) { @@ -854,11 +849,18 @@ static inline bool retpoline_module_ok(bool has_retpoline) #endif #ifdef CONFIG_MODULE_SIG +bool is_module_sig_enforced(void); + static inline bool module_sig_ok(struct module *module) { return module->sig_ok; } #else /* !CONFIG_MODULE_SIG */ +static inline bool is_module_sig_enforced(void) +{ + return false; +} + static inline bool module_sig_ok(struct module *module) { return true; diff --git a/kernel/module/internal.h b/kernel/module/internal.h index a6895bb5598a..d6f646a5da41 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -158,3 +158,12 @@ static inline int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, return 0; } #endif /* CONFIG_STRICT_MODULE_RWX */ + +#ifdef CONFIG_MODULE_SIG +int module_sig_check(struct load_info *info, int flags); +#else /* !CONFIG_MODULE_SIG */ +static inline int module_sig_check(struct load_info *info, int flags) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ diff --git a/kernel/module/main.c b/kernel/module/main.c index d55a2a8338a1..c349c91e53fa 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -127,28 +126,6 @@ static void module_assert_mutex_or_preempt(void) #endif } -#ifdef CONFIG_MODULE_SIG -static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); -module_param(sig_enforce, bool_enable_only, 0644); - -void set_module_sig_enforced(void) -{ - sig_enforce = true; -} -#else -#define sig_enforce false -#endif - -/* - * Export sig_enforce kernel cmdline parameter to allow other subsystems rely - * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. - */ -bool is_module_sig_enforced(void) -{ - return sig_enforce; -} -EXPORT_SYMBOL(is_module_sig_enforced); - /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); @@ -2569,70 +2546,6 @@ static inline void kmemleak_load_module(const struct module *mod, } #endif -#ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, int flags) -{ - int err = -ENODATA; - const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - const char *reason; - const void *mod = info->hdr; - bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | - MODULE_INIT_IGNORE_VERMAGIC); - /* - * Do not allow mangled modules as a module with version information - * removed is no longer the module that was signed. - */ - if (!mangled_module && - info->len > markerlen && - memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { - /* We truncate the module to discard the signature */ - info->len -= markerlen; - err = mod_verify_sig(mod, info); - if (!err) { - info->sig_ok = true; - return 0; - } - } - - /* - * We don't permit modules to be loaded into the trusted kernels - * without a valid signature on them, but if we're not enforcing, - * certain errors are non-fatal. - */ - switch (err) { - case -ENODATA: - reason = "unsigned module"; - break; - case -ENOPKG: - reason = "module with unsupported crypto"; - break; - case -ENOKEY: - reason = "module with unavailable key"; - break; - - default: - /* - * All other errors are fatal, including lack of memory, - * unparseable signatures, and signature check failures -- - * even if signatures aren't required. - */ - return err; - } - - if (is_module_sig_enforced()) { - pr_notice("Loading of %s is rejected\n", reason); - return -EKEYREJECTED; - } - - return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); -} -#else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, int flags) -{ - return 0; -} -#endif /* !CONFIG_MODULE_SIG */ - static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) { #if defined(CONFIG_64BIT) diff --git a/kernel/module/signing.c b/kernel/module/signing.c index 8aeb6d2ee94b..85c8999dfecf 100644 --- a/kernel/module/signing.c +++ b/kernel/module/signing.c @@ -11,9 +11,29 @@ #include #include #include +#include #include +#include #include "internal.h" +static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); +module_param(sig_enforce, bool_enable_only, 0644); + +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* * Verify the signature on a module. */ @@ -43,3 +63,60 @@ int mod_verify_sig(const void *mod, struct load_info *info) VERIFYING_MODULE_SIGNATURE, NULL, NULL); } + +int module_sig_check(struct load_info *info, int flags) +{ + int err = -ENODATA; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; + const void *mod = info->hdr; + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); + /* + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. + */ + if (!mangled_module && + info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + info->len -= markerlen; + err = mod_verify_sig(mod, info); + if (!err) { + info->sig_ok = true; + return 0; + } + } + + /* + * We don't permit modules to be loaded into the trusted kernels + * without a valid signature on them, but if we're not enforcing, + * certain errors are non-fatal. + */ + switch (err) { + case -ENODATA: + reason = "unsigned module"; + break; + case -ENOPKG: + reason = "module with unsupported crypto"; + break; + case -ENOKEY: + reason = "module with unavailable key"; + break; + + default: + /* + * All other errors are fatal, including lack of memory, + * unparseable signatures, and signature check failures -- + * even if signatures aren't required. + */ + return err; + } + + if (is_module_sig_enforced()) { + pr_notice("Loading of %s is rejected\n", reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); +} -- cgit v1.2.3 From 473c84d1856e83faebf059a52a8e49bdb89026d3 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:38 +0000 Subject: module: Move kmemleak support to a separate file No functional change. This patch migrates kmemleak code out of core module code into kernel/module/debug_kmemleak.c Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/debug_kmemleak.c | 30 ++++++++++++++++++++++++++++++ kernel/module/internal.h | 7 +++++++ kernel/module/main.c | 27 --------------------------- 4 files changed, 38 insertions(+), 27 deletions(-) create mode 100644 kernel/module/debug_kmemleak.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index d313c8472cb3..12388627725c 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_MODULE_SIG) += signing.o obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c new file mode 100644 index 000000000000..12a569d361e8 --- /dev/null +++ b/kernel/module/debug_kmemleak.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kmemleak support + * + * Copyright (C) 2009 Catalin Marinas + */ + +#include +#include +#include "internal.h" + +void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < info->hdr->e_shnum; i++) { + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); + } +} diff --git a/kernel/module/internal.h b/kernel/module/internal.h index d6f646a5da41..b0c360839f63 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -167,3 +167,10 @@ static inline int module_sig_check(struct load_info *info, int flags) return 0; } #endif /* !CONFIG_MODULE_SIG */ + +#ifdef CONFIG_DEBUG_KMEMLEAK +void kmemleak_load_module(const struct module *mod, const struct load_info *info); +#else /* !CONFIG_DEBUG_KMEMLEAK */ +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } +#endif /* CONFIG_DEBUG_KMEMLEAK */ diff --git a/kernel/module/main.c b/kernel/module/main.c index c349c91e53fa..d400c477c35a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2519,33 +2519,6 @@ bool __weak module_exit_section(const char *name) return strstarts(name, ".exit"); } -#ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); - } -} -#else -static inline void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ -} -#endif - static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) { #if defined(CONFIG_64BIT) -- cgit v1.2.3 From 91fb02f31505dc22262b13a129550f470ab90a79 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:39 +0000 Subject: module: Move kallsyms support into a separate file No functional change. This patch migrates kallsyms code out of core module code kernel/module/kallsyms.c Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/internal.h | 29 +++ kernel/module/kallsyms.c | 502 ++++++++++++++++++++++++++++++++++++++++++++ kernel/module/main.c | 531 +---------------------------------------------- 4 files changed, 538 insertions(+), 525 deletions(-) create mode 100644 kernel/module/kallsyms.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 12388627725c..9901bed3ab5b 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o +obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index b0c360839f63..44ca05b9eb8f 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -68,6 +68,19 @@ struct load_info { }; int mod_verify_sig(const void *mod, struct load_info *info); +struct module *find_module_all(const char *name, size_t len, bool even_unformed); +int cmp_name(const void *name, const void *sym); +long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, + unsigned int section); + +static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} #ifdef CONFIG_LIVEPATCH int copy_module_elf(struct module *mod, struct load_info *info); @@ -174,3 +187,19 @@ void kmemleak_load_module(const struct module *mod, const struct load_info *info static inline void kmemleak_load_module(const struct module *mod, const struct load_info *info) { } #endif /* CONFIG_DEBUG_KMEMLEAK */ + +#ifdef CONFIG_KALLSYMS +void init_build_id(struct module *mod, const struct load_info *info); +void layout_symtab(struct module *mod, struct load_info *info); +void add_kallsyms(struct module *mod, const struct load_info *info); +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} +#else /* !CONFIG_KALLSYMS */ +static inline void init_build_id(struct module *mod, const struct load_info *info) { } +static inline void layout_symtab(struct module *mod, struct load_info *info) { } +static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } +#endif /* CONFIG_KALLSYMS */ diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c new file mode 100644 index 000000000000..1b0780e20aab --- /dev/null +++ b/kernel/module/kallsyms.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kallsyms support + * + * Copyright (C) 2010 Rusty Russell + */ + +#include +#include +#include +#include +#include "internal.h" + +/* Lookup exported symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_exported_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); +} + +static int is_exported(const char *name, unsigned long value, + const struct module *mod) +{ + const struct kernel_symbol *ks; + + if (!mod) + ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); + else + ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); + + return ks && kernel_symbol_value(ks) == value; +} + +/* As per nm */ +static char elf_type(const Elf_Sym *sym, const struct load_info *info) +{ + const Elf_Shdr *sechdrs = info->sechdrs; + + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) + return 'v'; + else + return 'w'; + } + if (sym->st_shndx == SHN_UNDEF) + return 'U'; + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) + return 'a'; + if (sym->st_shndx >= SHN_LORESERVE) + return '?'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) + return 't'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC && + sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { + if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) + return 'r'; + else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 'g'; + else + return 'd'; + } + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 's'; + else + return 'b'; + } + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { + return 'n'; + } + return '?'; +} + +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int pcpundx) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF || + src->st_shndx >= shnum || + !src->st_name) + return false; + +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ +void layout_symtab(struct module *mod, struct load_info *info) +{ + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; + const Elf_Sym *src; + unsigned int i, nsrc, ndst, strtab_size = 0; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect, + info->index.sym) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); + + src = (void *)info->hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + strtab_size += strlen(&info->strtab[src[i].st_name]) + 1; + ndst++; + } + } + + /* Append room for core symbols at end of core part. */ + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); + mod->core_layout.size = debug_align(mod->core_layout.size); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect, + info->index.str) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); + mod->init_layout.size = debug_align(mod->init_layout.size); +} + +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ +void add_kallsyms(struct module *mod, const struct load_info *info) +{ + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + /* The following is safe since this pointer cannot change */ + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; + + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. + */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strscpy(s, &mod->kallsyms->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } + } + mod->core_kallsyms.num_symtab = ndst; +} + +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) +void init_build_id(struct module *mod, const struct load_info *info) +{ + const Elf_Shdr *sechdr; + unsigned int i; + + for (i = 0; i < info->hdr->e_shnum; i++) { + sechdr = &info->sechdrs[i]; + if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && + !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, + sechdr->sh_size)) + break; + } +} +#else +void init_build_id(struct module *mod, const struct load_info *info) +{ +} +#endif + +/* + * This ignores the intensely annoying "mapping symbols" found + * in ARM ELF files: $a, $t and $d. + */ +static inline int is_arm_mapping_symbol(const char *str) +{ + if (str[0] == '.' && str[1] == 'L') + return true; + return str[0] == '$' && strchr("axtd", str[1]) && + (str[2] == '\0' || str[2] == '.'); +} + +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + +/* + * Given a module and address, find the corresponding symbol and return its name + * while providing its size and offset if needed. + */ +static const char *find_kallsyms_symbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) +{ + unsigned int i, best = 0; + unsigned long nextval, bestval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + /* At worse, next value is at end of module */ + if (within_module_init(addr, mod)) + nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size; + else + nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size; + + bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); + + /* + * Scan for closest preceding symbol, and next symbol. (ELF + * starts real symbols at 1). + */ + for (i = 1; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + unsigned long thisval = kallsyms_symbol_value(sym); + + if (sym->st_shndx == SHN_UNDEF) + continue; + + /* + * We ignore unnamed symbols: they're uninformative + * and inserted at a whim. + */ + if (*kallsyms_symbol_name(kallsyms, i) == '\0' || + is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) + continue; + + if (thisval <= addr && thisval > bestval) { + best = i; + bestval = thisval; + } + if (thisval > addr && thisval < nextval) + nextval = thisval; + } + + if (!best) + return NULL; + + if (size) + *size = nextval - bestval; + if (offset) + *offset = addr - bestval; + + return kallsyms_symbol_name(kallsyms, best); +} + +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + +/* + * For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + const char *ret = NULL; + struct module *mod; + + preempt_disable(); + mod = __module_address(addr); + if (mod) { + if (modname) + *modname = mod->name; + if (modbuildid) { +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) + *modbuildid = mod->build_id; +#else + *modbuildid = NULL; +#endif + } + + ret = find_kallsyms_symbol(mod, addr, size, offset); + } + /* Make a copy in here where it's safe */ + if (ret) { + strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + ret = namebuf; + } + preempt_enable(); + + return ret; +} + +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + + strscpy(symname, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strscpy(modname, mod->name, MODULE_NAME_LEN); + if (name) + strscpy(name, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + const Elf_Sym *sym = &kallsyms->symtab[symnum]; + + *value = kallsyms_symbol_value(sym); + *type = kallsyms->typetab[symnum]; + strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); + strscpy(module_name, mod->name, MODULE_NAME_LEN); + *exported = is_exported(name, *value, mod); + preempt_enable(); + return 0; + } + symnum -= kallsyms->num_symtab; + } + preempt_enable(); + return -ERANGE; +} + +/* Given a module and name of symbol, find and return the symbol's value */ +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) +{ + unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && + sym->st_shndx != SHN_UNDEF) + return kallsyms_symbol_value(sym); + } + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + struct module *mod; + char *colon; + unsigned long ret = 0; + + /* Don't lock: we're in enough trouble already. */ + preempt_disable(); + if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + if ((mod = find_module_all(name, colon - name, false)) != NULL) + ret = find_kallsyms_symbol_value(mod, colon + 1); + } else { + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) + break; + } + } + preempt_enable(); + return ret; +} + +#ifdef CONFIG_LIVEPATCH +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (sym->st_shndx == SHN_UNDEF) + continue; + + ret = fn(data, kallsyms_symbol_name(kallsyms, i), + mod, kallsyms_symbol_value(sym)); + if (ret != 0) + goto out; + } + } +out: + mutex_unlock(&module_mutex); + return ret; +} +#endif /* CONFIG_LIVEPATCH */ diff --git a/kernel/module/main.c b/kernel/module/main.c index d400c477c35a..4d74988e1814 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -288,15 +288,6 @@ static bool check_exported_symbol(const struct symsearch *syms, return true; } -static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) -{ -#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS - return (unsigned long)offset_to_ptr(&sym->value_offset); -#else - return sym->value; -#endif -} - static const char *kernel_symbol_name(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -317,7 +308,7 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) #endif } -static int cmp_name(const void *name, const void *sym) +int cmp_name(const void *name, const void *sym) { return strcmp(name, kernel_symbol_name(sym)); } @@ -387,8 +378,8 @@ static bool find_symbol(struct find_symbol_arg *fsa) * Search for module by name: must hold module_mutex (or preempt disabled * for read-only access). */ -static struct module *find_module_all(const char *name, size_t len, - bool even_unformed) +struct module *find_module_all(const char *name, size_t len, + bool even_unformed) { struct module *mod; @@ -1294,13 +1285,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -#ifdef CONFIG_KALLSYMS -static inline bool sect_empty(const Elf_Shdr *sect) -{ - return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; -} -#endif - /* * /sys/module/foo/sections stuff * J. Corbet @@ -2065,7 +2049,7 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod, } /* Update size with this section: return offset. */ -static long get_offset(struct module *mod, unsigned int *size, +long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section) { long ret; @@ -2121,7 +2105,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || module_init_layout_section(sname)) continue; - s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); + s->sh_entsize = module_get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { @@ -2154,7 +2138,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !module_init_layout_section(sname)) continue; - s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) + s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } @@ -2267,228 +2251,6 @@ static void free_modinfo(struct module *mod) } } -#ifdef CONFIG_KALLSYMS - -/* Lookup exported symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_exported_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - return bsearch(name, start, stop - start, - sizeof(struct kernel_symbol), cmp_name); -} - -static int is_exported(const char *name, unsigned long value, - const struct module *mod) -{ - const struct kernel_symbol *ks; - if (!mod) - ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); - else - ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); - - return ks != NULL && kernel_symbol_value(ks) == value; -} - -/* As per nm */ -static char elf_type(const Elf_Sym *sym, const struct load_info *info) -{ - const Elf_Shdr *sechdrs = info->sechdrs; - - if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { - if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) - return 'v'; - else - return 'w'; - } - if (sym->st_shndx == SHN_UNDEF) - return 'U'; - if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) - return 'a'; - if (sym->st_shndx >= SHN_LORESERVE) - return '?'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) - return 't'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC - && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { - if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) - return 'r'; - else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 'g'; - else - return 'd'; - } - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 's'; - else - return 'b'; - } - if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug")) { - return 'n'; - } - return '?'; -} - -static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum, unsigned int pcpundx) -{ - const Elf_Shdr *sec; - - if (src->st_shndx == SHN_UNDEF - || src->st_shndx >= shnum - || !src->st_name) - return false; - -#ifdef CONFIG_KALLSYMS_ALL - if (src->st_shndx == pcpundx) - return true; -#endif - - sec = sechdrs + src->st_shndx; - if (!(sec->sh_flags & SHF_ALLOC) -#ifndef CONFIG_KALLSYMS_ALL - || !(sec->sh_flags & SHF_EXECINSTR) -#endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) - return false; - - return true; -} - -/* - * We only allocate and copy the strings needed by the parts of symtab - * we keep. This is simple, but has the effect of making multiple - * copies of duplicates. We could be more sophisticated, see - * linux-kernel thread starting with - * <73defb5e4bca04a6431392cc341112b1@localhost>. - */ -static void layout_symtab(struct module *mod, struct load_info *info) -{ - Elf_Shdr *symsect = info->sechdrs + info->index.sym; - Elf_Shdr *strsect = info->sechdrs + info->index.str; - const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size = 0; - - /* Put symbol section at end of init part of module. */ - symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, - info->index.sym) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + symsect->sh_name); - - src = (void *)info->hdr + symsect->sh_offset; - nsrc = symsect->sh_size / sizeof(*src); - - /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = 0; i < nsrc; i++) { - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - strtab_size += strlen(&info->strtab[src[i].st_name])+1; - ndst++; - } - } - - /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_layout.size += strtab_size; - info->core_typeoffs = mod->core_layout.size; - mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = debug_align(mod->core_layout.size); - - /* Put string table section at end of init part of module. */ - strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, - info->index.str) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + strsect->sh_name); - - /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_layout.size = ALIGN(mod->init_layout.size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_layout.size; - mod->init_layout.size += sizeof(struct mod_kallsyms); - info->init_typeoffs = mod->init_layout.size; - mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = debug_align(mod->init_layout.size); -} - -/* - * We use the full symtab and strtab which layout_symtab arranged to - * be appended to the init section. Later we switch to the cut-down - * core-only ones. - */ -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ - unsigned int i, ndst; - const Elf_Sym *src; - Elf_Sym *dst; - char *s; - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - - /* Set up to point into init section. */ - mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; - - mod->kallsyms->symtab = (void *)symsec->sh_addr; - mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); - /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - - /* - * Now populate the cut down core kallsyms for after init - * and set types up while we still have access to sections. - */ - mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; - src = mod->kallsyms->symtab; - for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - mod->kallsyms->typetab[i] = elf_type(src + i, info); - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - mod->core_kallsyms.typetab[ndst] = - mod->kallsyms->typetab[i]; - dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], - KSYM_NAME_LEN) + 1; - } - } - mod->core_kallsyms.num_symtab = ndst; -} -#else -static inline void layout_symtab(struct module *mod, struct load_info *info) -{ -} - -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ -} -#endif /* CONFIG_KALLSYMS */ - -#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) -static void init_build_id(struct module *mod, const struct load_info *info) -{ - const Elf_Shdr *sechdr; - unsigned int i; - - for (i = 0; i < info->hdr->e_shnum; i++) { - sechdr = &info->sechdrs[i]; - if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && - !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, - sechdr->sh_size)) - break; - } -} -#else -static void init_build_id(struct module *mod, const struct load_info *info) -{ -} -#endif - static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) { if (!debug) @@ -3799,287 +3561,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size) return ((void *)addr >= start && (void *)addr < start + size); } -#ifdef CONFIG_KALLSYMS -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - if (str[0] == '.' && str[1] == 'L') - return true; - return str[0] == '$' && strchr("axtd", str[1]) - && (str[2] == '\0' || str[2] == '.'); -} - -static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) -{ - return kallsyms->strtab + kallsyms->symtab[symnum].st_name; -} - -/* - * Given a module and address, find the corresponding symbol and return its name - * while providing its size and offset if needed. - */ -static const char *find_kallsyms_symbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) -{ - unsigned int i, best = 0; - unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - /* At worse, next value is at end of module */ - if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; - else - nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; - - bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); - - /* - * Scan for closest preceding symbol, and next symbol. (ELF - * starts real symbols at 1). - */ - for (i = 1; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - unsigned long thisval = kallsyms_symbol_value(sym); - - if (sym->st_shndx == SHN_UNDEF) - continue; - - /* - * We ignore unnamed symbols: they're uninformative - * and inserted at a whim. - */ - if (*kallsyms_symbol_name(kallsyms, i) == '\0' - || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) - continue; - - if (thisval <= addr && thisval > bestval) { - best = i; - bestval = thisval; - } - if (thisval > addr && thisval < nextval) - nextval = thisval; - } - - if (!best) - return NULL; - - if (size) - *size = nextval - bestval; - if (offset) - *offset = addr - bestval; - - return kallsyms_symbol_name(kallsyms, best); -} - -void * __weak dereference_module_function_descriptor(struct module *mod, - void *ptr) -{ - return ptr; -} - -/* - * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. - */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - const char *ret = NULL; - struct module *mod; - - preempt_disable(); - mod = __module_address(addr); - if (mod) { - if (modname) - *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } - - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); - ret = namebuf; - } - preempt_enable(); - - return ret; -} - -int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, NULL, NULL); - if (!sym) - goto out; - - strlcpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strlcpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - struct mod_kallsyms *kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); - if (symnum < kallsyms->num_symtab) { - const Elf_Sym *sym = &kallsyms->symtab[symnum]; - - *value = kallsyms_symbol_value(sym); - *type = kallsyms->typetab[symnum]; - strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); - strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, *value, mod); - preempt_enable(); - return 0; - } - symnum -= kallsyms->num_symtab; - } - preempt_enable(); - return -ERANGE; -} - -/* Given a module and name of symbol, find and return the symbol's value */ -static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) -{ - unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && - sym->st_shndx != SHN_UNDEF) - return kallsyms_symbol_value(sym); - } - return 0; -} - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) -{ - struct module *mod; - char *colon; - unsigned long ret = 0; - - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); - if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { - if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = find_kallsyms_symbol_value(mod, colon+1); - } else { - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) - break; - } - } - preempt_enable(); - return ret; -} - -#ifdef CONFIG_LIVEPATCH -int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, - struct module *, unsigned long), - void *data) -{ - struct module *mod; - unsigned int i; - int ret = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) { - /* We hold module_mutex: no need for rcu_dereference_sched */ - struct mod_kallsyms *kallsyms = mod->kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (sym->st_shndx == SHN_UNDEF) - continue; - - ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms_symbol_value(sym)); - if (ret != 0) - goto out; - - cond_resched(); - } - } -out: - mutex_unlock(&module_mutex); - return ret; -} -#endif /* CONFIG_LIVEPATCH */ -#endif /* CONFIG_KALLSYMS */ - static void cfi_init(struct module *mod) { #ifdef CONFIG_CFI_CLANG -- cgit v1.2.3 From 08126db5ff739fa011fc5b8af683ad759f2cba9a Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:40 +0000 Subject: module: kallsyms: Fix suspicious rcu usage No functional change. The purpose of this patch is to address the various Sparse warnings due to the incorrect dereference/or access of an __rcu pointer. Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/kallsyms.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 1b0780e20aab..a3da0686a2a6 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -171,14 +171,17 @@ void add_kallsyms(struct module *mod, const struct load_info *info) Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; /* Set up to point into init section. */ - mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + mod->kallsyms = (void __rcu *)mod->init_layout.base + + info->mod_kallsyms_init_off; + preempt_disable(); /* The following is safe since this pointer cannot change */ - mod->kallsyms->symtab = (void *)symsec->sh_addr; - mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr; + rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; + rcu_dereference_sched(mod->kallsyms)->strtab = + (void *)info->sechdrs[info->index.str].sh_addr; + rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -187,20 +190,22 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; - src = mod->kallsyms->symtab; - for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - mod->kallsyms->typetab[i] = elf_type(src + i, info); + src = rcu_dereference_sched(mod->kallsyms)->symtab; + for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) { + rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { mod->core_kallsyms.typetab[ndst] = - mod->kallsyms->typetab[i]; + rcu_dereference_sched(mod->kallsyms)->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strscpy(s, &mod->kallsyms->strtab[src[i].st_name], + s += strscpy(s, + &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } + preempt_enable(); mod->core_kallsyms.num_symtab = ndst; } @@ -478,11 +483,16 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, mutex_lock(&module_mutex); list_for_each_entry(mod, &modules, list) { - /* We hold module_mutex: no need for rcu_dereference_sched */ - struct mod_kallsyms *kallsyms = mod->kallsyms; + struct mod_kallsyms *kallsyms; if (mod->state == MODULE_STATE_UNFORMED) continue; + + /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ + preempt_disable(); + kallsyms = rcu_dereference_sched(mod->kallsyms); + preempt_enable(); + for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; -- cgit v1.2.3 From 0ffc40f6c8ab684e694774ebc835b198398129a8 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:41 +0000 Subject: module: Move procfs support into a separate file No functional change. This patch migrates code that allows one to generate a list of loaded/or linked modules via /proc when procfs support is enabled into kernel/module/procfs.c. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/internal.h | 1 + kernel/module/main.c | 131 +------------------------------------------ kernel/module/procfs.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 130 deletions(-) create mode 100644 kernel/module/procfs.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 9901bed3ab5b..94296c98a67f 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -15,3 +15,4 @@ obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_PROC_FS) += procfs.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 44ca05b9eb8f..6af40c2d145f 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -72,6 +72,7 @@ struct module *find_module_all(const char *name, size_t len, bool even_unformed) int cmp_name(const void *name, const void *sym); long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section); +char *module_flags(struct module *mod, char *buf); static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { diff --git a/kernel/module/main.c b/kernel/module/main.c index 4d74988e1814..63b98452fd7d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -805,31 +804,6 @@ out: return ret; } -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - struct module_use *use; - int printed_something = 0; - - seq_printf(m, " %i ", module_refcount(mod)); - - /* - * Always include a trailing , so userspace can differentiate - * between this and the old multi-field proc format. - */ - list_for_each_entry(use, &mod->source_list, source_list) { - printed_something = 1; - seq_printf(m, "%s,", use->source->name); - } - - if (mod->init != NULL && mod->exit == NULL) { - printed_something = 1; - seq_puts(m, "[permanent],"); - } - - if (!printed_something) - seq_puts(m, "-"); -} - void __symbol_put(const char *symbol) { struct find_symbol_arg fsa = { @@ -919,12 +893,6 @@ void module_put(struct module *module) EXPORT_SYMBOL(module_put); #else /* !CONFIG_MODULE_UNLOAD */ -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - /* We don't know the usage count, or what modules are using. */ - seq_puts(m, " - -"); -} - static inline void module_unload_free(struct module *mod) { } @@ -3596,7 +3564,7 @@ static void cfi_cleanup(struct module *mod) } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ -static char *module_flags(struct module *mod, char *buf) +char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -3619,103 +3587,6 @@ static char *module_flags(struct module *mod, char *buf) return buf; } -#ifdef CONFIG_PROC_FS -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - -static int m_show(struct seq_file *m, void *p) -{ - struct module *mod = list_entry(p, struct module, list); - char buf[MODULE_FLAGS_BUF_SIZE]; - void *value; - - /* We always ignore unformed modules. */ - if (mod->state == MODULE_STATE_UNFORMED) - return 0; - - seq_printf(m, "%s %u", - mod->name, mod->init_layout.size + mod->core_layout.size); - print_unload_info(m, mod); - - /* Informative for users. */ - seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading" : - mod->state == MODULE_STATE_COMING ? "Loading" : - "Live"); - /* Used by oprofile and other similar tools. */ - value = m->private ? NULL : mod->core_layout.base; - seq_printf(m, " 0x%px", value); - - /* Taints info */ - if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); - - seq_puts(m, "\n"); - return 0; -} - -/* - * Format: modulename size refcount deps address - * - * Where refcount is a number or -, and deps is a comma-separated list - * of depends or -. - */ -static const struct seq_operations modules_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = m_show -}; - -/* - * This also sets the "private" pointer to non-NULL if the - * kernel pointers should be hidden (so you can just test - * "m->private" to see if you should keep the values private). - * - * We use the same logic as for /proc/kallsyms. - */ -static int modules_open(struct inode *inode, struct file *file) -{ - int err = seq_open(file, &modules_op); - - if (!err) { - struct seq_file *m = file->private_data; - m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; - } - - return err; -} - -static const struct proc_ops modules_proc_ops = { - .proc_flags = PROC_ENTRY_PERMANENT, - .proc_open = modules_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - -static int __init proc_modules_init(void) -{ - proc_create("modules", 0, NULL, &modules_proc_ops); - return 0; -} -module_init(proc_modules_init); -#endif - /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c new file mode 100644 index 000000000000..2717e130788e --- /dev/null +++ b/kernel/module/procfs.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module proc support + * + * Copyright (C) 2008 Alexey Dobriyan + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef CONFIG_MODULE_UNLOAD +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + struct module_use *use; + int printed_something = 0; + + seq_printf(m, " %i ", module_refcount(mod)); + + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ + list_for_each_entry(use, &mod->source_list, source_list) { + printed_something = 1; + seq_printf(m, "%s,", use->source->name); + } + + if (mod->init && !mod->exit) { + printed_something = 1; + seq_puts(m, "[permanent],"); + } + + if (!printed_something) + seq_puts(m, "-"); +} +#else /* !CONFIG_MODULE_UNLOAD */ +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + /* We don't know the usage count, or what modules are using. */ + seq_puts(m, " - -"); +} +#endif /* CONFIG_MODULE_UNLOAD */ + +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + +static int m_show(struct seq_file *m, void *p) +{ + struct module *mod = list_entry(p, struct module, list); + char buf[MODULE_FLAGS_BUF_SIZE]; + void *value; + + /* We always ignore unformed modules. */ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + + seq_printf(m, "%s %u", + mod->name, mod->init_layout.size + mod->core_layout.size); + print_unload_info(m, mod); + + /* Informative for users. */ + seq_printf(m, " %s", + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : + "Live"); + /* Used by oprofile and other similar tools. */ + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); + + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", module_flags(mod, buf)); + + seq_puts(m, "\n"); + return 0; +} + +/* + * Format: modulename size refcount deps address + * + * Where refcount is a number or -, and deps is a comma-separated list + * of depends or -. + */ +static const struct seq_operations modules_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show +}; + +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ +static int modules_open(struct inode *inode, struct file *file) +{ + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; + } + + return err; +} + +static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = modules_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &modules_proc_ops); + return 0; +} +module_init(proc_modules_init); -- cgit v1.2.3 From 44c09535de4784f31d151aa1047efcf4797ca3cd Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:42 +0000 Subject: module: Move sysfs support into a separate file No functional change. This patch migrates module sysfs support out of core code into kernel/module/sysfs.c. In addition simple code refactoring to make this possible. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/internal.h | 21 +++ kernel/module/main.c | 469 +---------------------------------------------- kernel/module/sysfs.c | 436 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 461 insertions(+), 466 deletions(-) create mode 100644 kernel/module/sysfs.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 94296c98a67f..cf8dcdc6b55f 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PROC_FS) += procfs.o +obj-$(CONFIG_SYSFS) += sysfs.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 6af40c2d145f..62d749ef695e 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -34,6 +34,9 @@ extern struct mutex module_mutex; extern struct list_head modules; +extern struct module_attribute *modinfo_attrs[]; +extern size_t modinfo_attrs_count; + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -204,3 +207,21 @@ static inline void init_build_id(struct module *mod, const struct load_info *inf static inline void layout_symtab(struct module *mod, struct load_info *info) { } static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ + +#ifdef CONFIG_SYSFS +int mod_sysfs_setup(struct module *mod, const struct load_info *info, + struct kernel_param *kparam, unsigned int num_params); +void mod_sysfs_teardown(struct module *mod); +void init_param_lock(struct module *mod); +#else /* !CONFIG_SYSFS */ +static inline int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline void mod_sysfs_teardown(struct module *mod) { } +static inline void init_param_lock(struct module *mod) { } +#endif /* CONFIG_SYSFS */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 63b98452fd7d..0cd0590dd411 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -14,9 +14,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -989,7 +987,7 @@ static ssize_t show_taint(struct module_attribute *mattr, static struct module_attribute modinfo_taint = __ATTR(taint, 0444, show_taint, NULL); -static struct module_attribute *modinfo_attrs[] = { +struct module_attribute *modinfo_attrs[] = { &module_uevent, &modinfo_version, &modinfo_srcversion, @@ -1003,6 +1001,8 @@ static struct module_attribute *modinfo_attrs[] = { NULL, }; +size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs); + static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *reason) @@ -1253,469 +1253,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -/* - * /sys/module/foo/sections stuff - * J. Corbet - */ -#ifdef CONFIG_SYSFS - -#ifdef CONFIG_KALLSYMS -struct module_sect_attr { - struct bin_attribute battr; - unsigned long address; -}; - -struct module_sect_attrs { - struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[]; -}; - -#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) -static ssize_t module_sect_read(struct file *file, struct kobject *kobj, - struct bin_attribute *battr, - char *buf, loff_t pos, size_t count) -{ - struct module_sect_attr *sattr = - container_of(battr, struct module_sect_attr, battr); - char bounce[MODULE_SECT_READ_SIZE + 1]; - size_t wrote; - - if (pos != 0) - return -EINVAL; - - /* - * Since we're a binary read handler, we must account for the - * trailing NUL byte that sprintf will write: if "buf" is - * too small to hold the NUL, or the NUL is exactly the last - * byte, the read will look like it got truncated by one byte. - * Since there is no way to ask sprintf nicely to not write - * the NUL, we have to use a bounce buffer. - */ - wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", - kallsyms_show_value(file->f_cred) - ? (void *)sattr->address : NULL); - count = min(count, wrote); - memcpy(buf, bounce, count); - - return count; -} - -static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -{ - unsigned int section; - - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].battr.attr.name); - kfree(sect_attrs); -} - -static void add_sect_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int nloaded = 0, i, size[2]; - struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct bin_attribute **gattr; - - /* Count loaded sections and allocate structures */ - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i])) - nloaded++; - size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.bin_attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); - if (sect_attrs == NULL) - return; - - /* Setup section attributes. */ - sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; - - sect_attrs->nsections = 0; - sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.bin_attrs[0]; - for (i = 0; i < info->hdr->e_shnum; i++) { - Elf_Shdr *sec = &info->sechdrs[i]; - if (sect_empty(sec)) - continue; - sysfs_bin_attr_init(&sattr->battr); - sattr->address = sec->sh_addr; - sattr->battr.attr.name = - kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (sattr->battr.attr.name == NULL) - goto out; - sect_attrs->nsections++; - sattr->battr.read = module_sect_read; - sattr->battr.size = MODULE_SECT_READ_SIZE; - sattr->battr.attr.mode = 0400; - *(gattr++) = &(sattr++)->battr; - } - *gattr = NULL; - - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) - goto out; - - mod->sect_attrs = sect_attrs; - return; - out: - free_sect_attrs(sect_attrs); -} - -static void remove_sect_attrs(struct module *mod) -{ - if (mod->sect_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->sect_attrs->grp); - /* - * We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. - */ - free_sect_attrs(mod->sect_attrs); - mod->sect_attrs = NULL; - } -} - -/* - * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. - */ - -struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[]; -}; - -static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. - */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) -{ - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } - kfree(notes_attrs); -} - -static void add_notes_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int notes, loaded, i; - struct module_notes_attrs *notes_attrs; - struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; - - /* Count notes sections and allocate structures. */ - notes = 0; - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i]) && - (info->sechdrs[i].sh_type == SHT_NOTE)) - ++notes; - - if (notes == 0) - return; - - notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), - GFP_KERNEL); - if (notes_attrs == NULL) - return; - - notes_attrs->notes = notes; - nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { - if (sect_empty(&info->sechdrs[i])) - continue; - if (info->sechdrs[i].sh_type == SHT_NOTE) { - sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; - nattr->attr.mode = S_IRUGO; - nattr->size = info->sechdrs[i].sh_size; - nattr->private = (void *) info->sechdrs[i].sh_addr; - nattr->read = module_notes_read; - ++nattr; - } - ++loaded; - } - - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) - goto out; - - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) - goto out; - - mod->notes_attrs = notes_attrs; - return; - - out: - free_notes_attrs(notes_attrs, i); -} - -static void remove_notes_attrs(struct module *mod) -{ - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); -} - -#else - -static inline void add_sect_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_sect_attrs(struct module *mod) -{ -} - -static inline void add_notes_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_notes_attrs(struct module *mod) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void del_usage_links(struct module *mod) -{ -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); - mutex_unlock(&module_mutex); -#endif -} - -static int add_usage_links(struct module *mod) -{ - int ret = 0; -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - ret = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - if (ret) - break; - } - mutex_unlock(&module_mutex); - if (ret) - del_usage_links(mod); -#endif - return ret; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end); - -static int module_add_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - struct module_attribute *temp_attr; - int error = 0; - int i; - - mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * - (ARRAY_SIZE(modinfo_attrs) + 1)), - GFP_KERNEL); - if (!mod->modinfo_attrs) - return -ENOMEM; - - temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (!attr->test || attr->test(mod)) { - memcpy(temp_attr, attr, sizeof(*temp_attr)); - sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj, - &temp_attr->attr); - if (error) - goto error_out; - ++temp_attr; - } - } - - return 0; - -error_out: - if (i > 0) - module_remove_modinfo_attrs(mod, --i); - else - kfree(mod->modinfo_attrs); - return error; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { - if (end >= 0 && i > end) - break; - /* pick a field to test for end of list */ - if (!attr->attr.name) - break; - sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); - if (attr->free) - attr->free(mod); - } - kfree(mod->modinfo_attrs); -} - -static void mod_kobject_put(struct module *mod) -{ - DECLARE_COMPLETION_ONSTACK(c); - mod->mkobj.kobj_completion = &c; - kobject_put(&mod->mkobj.kobj); - wait_for_completion(&c); -} - -static int mod_sysfs_init(struct module *mod) -{ - int err; - struct kobject *kobj; - - if (!module_sysfs_initialized) { - pr_err("%s: module sysfs not initialized\n", mod->name); - err = -EINVAL; - goto out; - } - - kobj = kset_find_obj(module_kset, mod->name); - if (kobj) { - pr_err("%s: module is already loaded\n", mod->name); - kobject_put(kobj); - err = -EINVAL; - goto out; - } - - mod->mkobj.mod = mod; - - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - mod->mkobj.kobj.kset = module_kset; - err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, - "%s", mod->name); - if (err) - mod_kobject_put(mod); - -out: - return err; -} - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - int err; - - err = mod_sysfs_init(mod); - if (err) - goto out; - - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); - if (!mod->holders_dir) { - err = -ENOMEM; - goto out_unreg; - } - - err = module_param_sysfs_setup(mod, kparam, num_params); - if (err) - goto out_unreg_holders; - - err = module_add_modinfo_attrs(mod); - if (err) - goto out_unreg_param; - - err = add_usage_links(mod); - if (err) - goto out_unreg_modinfo_attrs; - - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); - - return 0; - -out_unreg_modinfo_attrs: - module_remove_modinfo_attrs(mod, -1); -out_unreg_param: - module_param_sysfs_remove(mod); -out_unreg_holders: - kobject_put(mod->holders_dir); -out_unreg: - mod_kobject_put(mod); -out: - return err; -} - -static void mod_sysfs_fini(struct module *mod) -{ - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_put(mod); -} - -static void init_param_lock(struct module *mod) -{ - mutex_init(&mod->param_lock); -} -#else /* !CONFIG_SYSFS */ - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static void mod_sysfs_fini(struct module *mod) -{ -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ -} - -static void del_usage_links(struct module *mod) -{ -} - -static void init_param_lock(struct module *mod) -{ -} -#endif /* CONFIG_SYSFS */ - -static void mod_sysfs_teardown(struct module *mod) -{ - del_usage_links(mod); - module_remove_modinfo_attrs(mod, -1); - module_param_sysfs_remove(mod); - kobject_put(mod->mkobj.drivers_dir); - kobject_put(mod->holders_dir); - mod_sysfs_fini(mod); -} - /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c new file mode 100644 index 000000000000..ce68f821dcd1 --- /dev/null +++ b/kernel/module/sysfs.c @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module sysfs support + * + * Copyright (C) 2008 Rusty Russell + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * /sys/module/foo/sections stuff + * J. Corbet + */ +#ifdef CONFIG_KALLSYMS +struct module_sect_attr { + struct bin_attribute battr; + unsigned long address; +}; + +struct module_sect_attrs { + struct attribute_group grp; + unsigned int nsections; + struct module_sect_attr attrs[]; +}; + +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) +{ + struct module_sect_attr *sattr = + container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; + + if (pos != 0) + return -EINVAL; + + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; +} + +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + unsigned int section; + + for (section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].battr.attr.name); + kfree(sect_attrs); +} + +static void add_sect_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int nloaded = 0, i, size[2]; + struct module_sect_attrs *sect_attrs; + struct module_sect_attr *sattr; + struct bin_attribute **gattr; + + /* Count loaded sections and allocate structures */ + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) + nloaded++; + size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (!sect_attrs) + return; + + /* Setup section attributes. */ + sect_attrs->grp.name = "sections"; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; + + sect_attrs->nsections = 0; + sattr = §_attrs->attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + + if (sect_empty(sec)) + continue; + sysfs_bin_attr_init(&sattr->battr); + sattr->address = sec->sh_addr; + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (!sattr->battr.attr.name) + goto out; + sect_attrs->nsections++; + sattr->battr.read = module_sect_read; + sattr->battr.size = MODULE_SECT_READ_SIZE; + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; + } + *gattr = NULL; + + if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + goto out; + + mod->sect_attrs = sect_attrs; + return; +out: + free_sect_attrs(sect_attrs); +} + +static void remove_sect_attrs(struct module *mod) +{ + if (mod->sect_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->sect_attrs->grp); + /* + * We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. + */ + free_sect_attrs(mod->sect_attrs); + mod->sect_attrs = NULL; + } +} + +/* + * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. + */ + +struct module_notes_attrs { + struct kobject *dir; + unsigned int notes; + struct bin_attribute attrs[]; +}; + +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + /* + * The caller checked the pos and count against our size. + */ + memcpy(buf, bin_attr->private + pos, count); + return count; +} + +static void free_notes_attrs(struct module_notes_attrs *notes_attrs, + unsigned int i) +{ + if (notes_attrs->dir) { + while (i-- > 0) + sysfs_remove_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i]); + kobject_put(notes_attrs->dir); + } + kfree(notes_attrs); +} + +static void add_notes_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int notes, loaded, i; + struct module_notes_attrs *notes_attrs; + struct bin_attribute *nattr; + + /* failed to create section attributes, so can't create notes */ + if (!mod->sect_attrs) + return; + + /* Count notes sections and allocate structures. */ + notes = 0; + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + info->sechdrs[i].sh_type == SHT_NOTE) + ++notes; + + if (notes == 0) + return; + + notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), + GFP_KERNEL); + if (!notes_attrs) + return; + + notes_attrs->notes = notes; + nattr = ¬es_attrs->attrs[0]; + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) + continue; + if (info->sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; + nattr->attr.mode = 0444; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *)info->sechdrs[i].sh_addr; + nattr->read = module_notes_read; + ++nattr; + } + ++loaded; + } + + notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); + if (!notes_attrs->dir) + goto out; + + for (i = 0; i < notes; ++i) + if (sysfs_create_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i])) + goto out; + + mod->notes_attrs = notes_attrs; + return; + +out: + free_notes_attrs(notes_attrs, i); +} + +static void remove_notes_attrs(struct module *mod) +{ + if (mod->notes_attrs) + free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); +} + +#else /* !CONFIG_KALLSYMS */ +static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { } +static inline void remove_sect_attrs(struct module *mod) { } +static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { } +static inline void remove_notes_attrs(struct module *mod) { } +#endif /* CONFIG_KALLSYMS */ + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int add_usage_links(struct module *mod) +{ + int ret = 0; +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } + mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); +#endif + return ret; +} + +static void module_remove_modinfo_attrs(struct module *mod, int end) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + if (end >= 0 && i > end) + break; + /* pick a field to test for end of list */ + if (!attr->attr.name) + break; + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); + if (attr->free) + attr->free(mod); + } + kfree(mod->modinfo_attrs); +} + +static int module_add_modinfo_attrs(struct module *mod) +{ + struct module_attribute *attr; + struct module_attribute *temp_attr; + int error = 0; + int i; + + mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * + (modinfo_attrs_count + 1)), + GFP_KERNEL); + if (!mod->modinfo_attrs) + return -ENOMEM; + + temp_attr = mod->modinfo_attrs; + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (!attr->test || attr->test(mod)) { + memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); + if (error) + goto error_out; + ++temp_attr; + } + } + + return 0; + +error_out: + if (i > 0) + module_remove_modinfo_attrs(mod, --i); + else + kfree(mod->modinfo_attrs); + return error; +} + +static void mod_kobject_put(struct module *mod) +{ + DECLARE_COMPLETION_ONSTACK(c); + + mod->mkobj.kobj_completion = &c; + kobject_put(&mod->mkobj.kobj); + wait_for_completion(&c); +} + +static int mod_sysfs_init(struct module *mod) +{ + int err; + struct kobject *kobj; + + if (!module_sysfs_initialized) { + pr_err("%s: module sysfs not initialized\n", mod->name); + err = -EINVAL; + goto out; + } + + kobj = kset_find_obj(module_kset, mod->name); + if (kobj) { + pr_err("%s: module is already loaded\n", mod->name); + kobject_put(kobj); + err = -EINVAL; + goto out; + } + + mod->mkobj.mod = mod; + + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + mod->mkobj.kobj.kset = module_kset; + err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, + "%s", mod->name); + if (err) + mod_kobject_put(mod); + +out: + return err; +} + +int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + err = mod_sysfs_init(mod); + if (err) + goto out; + + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); + if (!mod->holders_dir) { + err = -ENOMEM; + goto out_unreg; + } + + err = module_param_sysfs_setup(mod, kparam, num_params); + if (err) + goto out_unreg_holders; + + err = module_add_modinfo_attrs(mod); + if (err) + goto out_unreg_param; + + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); + + return 0; + +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod, -1); +out_unreg_param: + module_param_sysfs_remove(mod); +out_unreg_holders: + kobject_put(mod->holders_dir); +out_unreg: + mod_kobject_put(mod); +out: + return err; +} + +static void mod_sysfs_fini(struct module *mod) +{ + remove_notes_attrs(mod); + remove_sect_attrs(mod); + mod_kobject_put(mod); +} + +void mod_sysfs_teardown(struct module *mod) +{ + del_usage_links(mod); + module_remove_modinfo_attrs(mod, -1); + module_param_sysfs_remove(mod); + kobject_put(mod->mkobj.drivers_dir); + kobject_put(mod->holders_dir); + mod_sysfs_fini(mod); +} + +void init_param_lock(struct module *mod) +{ + mutex_init(&mod->param_lock); +} -- cgit v1.2.3 From f64205a42046d3802c423fa2059e7fca39af127c Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:43 +0000 Subject: module: Move kdb module related code out of main kdb code No functional change. This patch migrates the kdb 'lsmod' command support out of main kdb code into its own file under kernel/module. In addition to the above, a minor style warning i.e. missing a blank line after declarations, was resolved too. The new file was added to MAINTAINERS. Finally we remove linux/module.h as it is entirely redundant. Reviewed-by: Daniel Thompson Acked-by: Daniel Thompson Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- MAINTAINERS | 1 + include/linux/kdb.h | 1 + kernel/debug/kdb/kdb_io.c | 1 - kernel/debug/kdb/kdb_keyboard.c | 1 - kernel/debug/kdb/kdb_main.c | 49 ------------------------------------ kernel/debug/kdb/kdb_private.h | 4 --- kernel/debug/kdb/kdb_support.c | 1 - kernel/module/Makefile | 1 + kernel/module/kdb.c | 56 +++++++++++++++++++++++++++++++++++++++++ kernel/module/main.c | 4 --- 10 files changed, 59 insertions(+), 60 deletions(-) create mode 100644 kernel/module/kdb.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index 6dcd93fb3a96..87cc05c6b462 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10907,6 +10907,7 @@ F: drivers/tty/serial/kgdboc.c F: include/linux/kdb.h F: include/linux/kgdb.h F: kernel/debug/ +F: kernel/module/kdb.c KHADAS MCU MFD DRIVER M: Neil Armstrong diff --git a/include/linux/kdb.h b/include/linux/kdb.h index ea0f5e580fac..07dfb6a20a1c 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -222,5 +222,6 @@ enum { extern int kdbgetintenv(const char *, int *); extern int kdb_set(int, const char **); +int kdb_lsmod(int argc, const char **argv); #endif /* !_KDB_H */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6735ac36b718..67d3c48a1522 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -9,7 +9,6 @@ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. */ -#include #include #include #include diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index f877a0a0d7cf..f87c750d3eb3 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -11,7 +11,6 @@ #include #include #include -#include #include /* Keyboard Controller Registers on normal PCs. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0852a537dad4..f3a30cd5037f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -2004,54 +2003,6 @@ static int kdb_ef(int argc, const char **argv) return 0; } -#if defined(CONFIG_MODULES) -/* - * kdb_lsmod - This function implements the 'lsmod' command. Lists - * currently loaded kernel modules. - * Mostly taken from userland lsmod. - */ -static int kdb_lsmod(int argc, const char **argv) -{ - struct module *mod; - - if (argc != 0) - return KDB_ARGCOUNT; - - kdb_printf("Module Size modstruct Used by\n"); - list_for_each_entry(mod, kdb_modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - kdb_printf("%-20s%8u 0x%px ", mod->name, - mod->core_layout.size, (void *)mod); -#ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); -#endif - if (mod->state == MODULE_STATE_GOING) - kdb_printf(" (Unloading)"); - else if (mod->state == MODULE_STATE_COMING) - kdb_printf(" (Loading)"); - else - kdb_printf(" (Live)"); - kdb_printf(" 0x%px", mod->core_layout.base); - -#ifdef CONFIG_MODULE_UNLOAD - { - struct module_use *use; - kdb_printf(" [ "); - list_for_each_entry(use, &mod->source_list, - source_list) - kdb_printf("%s ", use->target->name); - kdb_printf("]\n"); - } -#endif - } - - return 0; -} - -#endif /* CONFIG_MODULES */ - /* * kdb_env - This function implements the 'env' command. Display the * current environment variables. diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 0d2f9feea0a4..1f8c519a5f81 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -226,10 +226,6 @@ extern void kdb_kbd_cleanup_state(void); #define kdb_kbd_cleanup_state() #endif /* ! CONFIG_KDB_KEYBOARD */ -#ifdef CONFIG_MODULES -extern struct list_head *kdb_modules; -#endif /* CONFIG_MODULES */ - extern char kdb_prompt_str[]; #define KDB_WORD_SIZE ((int)sizeof(unsigned long)) diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 85cb51c4a17e..0a39497140bf 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/module/Makefile b/kernel/module/Makefile index cf8dcdc6b55f..88f5cdcdb067 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -17,3 +17,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PROC_FS) += procfs.o obj-$(CONFIG_SYSFS) += sysfs.o +obj-$(CONFIG_KGDB_KDB) += kdb.o diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c new file mode 100644 index 000000000000..a446c699db0a --- /dev/null +++ b/kernel/module/kdb.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kdb support + * + * Copyright (C) 2010 Jason Wessel + */ + +#include +#include +#include "internal.h" + +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. + */ +int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + kdb_printf("%-20s%8u 0x%px ", mod->name, + mod->core_layout.size, (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + kdb_printf(" 0x%px", mod->core_layout.base); + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + + kdb_printf(" [ "); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} diff --git a/kernel/module/main.c b/kernel/module/main.c index 0cd0590dd411..a2dc54726621 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -108,10 +108,6 @@ static void mod_update_bounds(struct module *mod) __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ - static void module_assert_mutex_or_preempt(void) { #ifdef CONFIG_LOCKDEP -- cgit v1.2.3 From 47889798da4307ed78346f04c5d95c87abbf696b Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:44 +0000 Subject: module: Move version support into a separate file No functional change. This patch migrates module version support out of core code into kernel/module/version.c. In addition simple code refactoring to make this possible. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 1 + kernel/module/internal.h | 48 +++++++++++++++ kernel/module/main.c | 156 +++-------------------------------------------- kernel/module/version.c | 109 +++++++++++++++++++++++++++++++++ 4 files changed, 166 insertions(+), 148 deletions(-) create mode 100644 kernel/module/version.c (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 88f5cdcdb067..e2eff9853a28 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PROC_FS) += procfs.o obj-$(CONFIG_SYSFS) += sysfs.o obj-$(CONFIG_KGDB_KDB) += kdb.o +obj-$(CONFIG_MODVERSIONS) += version.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 62d749ef695e..3fc139d5074b 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -70,7 +70,27 @@ struct load_info { } index; }; +enum mod_license { + NOT_GPL_ONLY, + GPL_ONLY, +}; + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const s32 *crc; + const struct kernel_symbol *sym; + enum mod_license license; +}; + int mod_verify_sig(const void *mod, struct load_info *info); +int try_to_force_load(struct module *mod, const char *reason); +bool find_symbol(struct find_symbol_arg *fsa); struct module *find_module_all(const char *name, size_t len, bool even_unformed); int cmp_name(const void *name, const void *sym); long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, @@ -225,3 +245,31 @@ static inline int mod_sysfs_setup(struct module *mod, static inline void mod_sysfs_teardown(struct module *mod) { } static inline void init_param_lock(struct module *mod) { } #endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODVERSIONS +int check_version(const struct load_info *info, + const char *symname, struct module *mod, const s32 *crc); +void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, + struct kernel_symbol *ks, struct tracepoint * const *tp); +int check_modstruct_version(const struct load_info *info, struct module *mod); +int same_magic(const char *amagic, const char *bmagic, bool has_crcs); +#else /* !CONFIG_MODVERSIONS */ +static inline int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + return 1; +} + +static inline int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + return 1; +} + +static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs) +{ + return strcmp(amagic, bmagic) == 0; +} +#endif /* CONFIG_MODVERSIONS */ diff --git a/kernel/module/main.c b/kernel/module/main.c index a2dc54726621..5f06c06df9e1 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -86,6 +86,12 @@ struct mod_tree_root mod_tree __cacheline_aligned = { static unsigned long module_addr_min = -1UL, module_addr_max; #endif /* CONFIG_MODULES_TREE_LOOKUP */ +struct symsearch { + const struct kernel_symbol *start, *stop; + const s32 *crcs; + enum mod_license license; +}; + /* * Bounds of module text, for speeding up __module_address. * Protected by module_mutex. @@ -244,28 +250,6 @@ static __maybe_unused void *any_section_objs(const struct load_info *info, #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -struct symsearch { - const struct kernel_symbol *start, *stop; - const s32 *crcs; - enum mod_license { - NOT_GPL_ONLY, - GPL_ONLY, - } license; -}; - -struct find_symbol_arg { - /* Input */ - const char *name; - bool gplok; - bool warn; - - /* Output */ - struct module *owner; - const s32 *crc; - const struct kernel_symbol *sym; - enum mod_license license; -}; - static bool check_exported_symbol(const struct symsearch *syms, struct module *owner, unsigned int symnum, void *data) @@ -327,7 +311,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, * Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -static bool find_symbol(struct find_symbol_arg *fsa) +bool find_symbol(struct find_symbol_arg *fsa) { static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, @@ -1001,7 +985,7 @@ size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs); static const char vermagic[] = VERMAGIC_STRING; -static int try_to_force_load(struct module *mod, const char *reason) +int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) @@ -1013,115 +997,6 @@ static int try_to_force_load(struct module *mod, const char *reason) #endif } -#ifdef CONFIG_MODVERSIONS - -static u32 resolve_rel_crc(const s32 *crc) -{ - return *(u32 *)((void *)crc + *crc); -} - -static int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - Elf_Shdr *sechdrs = info->sechdrs; - unsigned int versindex = info->index.vers; - unsigned int i, num_versions; - struct modversion_info *versions; - - /* Exporting module didn't supply crcs? OK, we're already tainted. */ - if (!crc) - return 1; - - /* No versions at all? modprobe --force does this. */ - if (versindex == 0) - return try_to_force_load(mod, symname) == 0; - - versions = (void *) sechdrs[versindex].sh_addr; - num_versions = sechdrs[versindex].sh_size - / sizeof(struct modversion_info); - - for (i = 0; i < num_versions; i++) { - u32 crcval; - - if (strcmp(versions[i].name, symname) != 0) - continue; - - if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) - crcval = resolve_rel_crc(crc); - else - crcval = *crc; - if (versions[i].crc == crcval) - return 1; - pr_debug("Found checksum %X vs module %lX\n", - crcval, versions[i].crc); - goto bad_version; - } - - /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", info->name, symname); - return 1; - -bad_version: - pr_warn("%s: disagrees about version of symbol %s\n", - info->name, symname); - return 0; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - struct find_symbol_arg fsa = { - .name = "module_layout", - .gplok = true, - }; - - /* - * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. - */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); - return check_version(info, "module_layout", mod, fsa.crc); -} - -/* First part is kernel version, which we ignore if module has crcs. */ -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - if (has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); - } - return strcmp(amagic, bmagic) == 0; -} -#else -static inline int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - return 1; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - return 1; -} - -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - return strcmp(amagic, bmagic) == 0; -} -#endif /* CONFIG_MODVERSIONS */ - static char *get_modinfo(const struct load_info *info, const char *tag); static char *get_next_modinfo(const struct load_info *info, const char *tag, char *prev); @@ -3247,18 +3122,3 @@ void print_modules(void) pr_cont(" [last unloaded: %s]", last_unloaded_module); pr_cont("\n"); } - -#ifdef CONFIG_MODVERSIONS -/* - * Generate the signature for all relevant module structures here. - * If these change, we don't want to try to parse the module. - */ -void module_layout(struct module *mod, - struct modversion_info *ver, - struct kernel_param *kp, - struct kernel_symbol *ks, - struct tracepoint * const *tp) -{ -} -EXPORT_SYMBOL(module_layout); -#endif diff --git a/kernel/module/version.c b/kernel/module/version.c new file mode 100644 index 000000000000..adaedce1dc97 --- /dev/null +++ b/kernel/module/version.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module version support + * + * Copyright (C) 2008 Rusty Russell + */ + +#include +#include +#include +#include "internal.h" + +static u32 resolve_rel_crc(const s32 *crc) +{ + return *(u32 *)((void *)crc + *crc); +} + +int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; + unsigned int i, num_versions; + struct modversion_info *versions; + + /* Exporting module didn't supply crcs? OK, we're already tainted. */ + if (!crc) + return 1; + + /* No versions at all? modprobe --force does this. */ + if (versindex == 0) + return try_to_force_load(mod, symname) == 0; + + versions = (void *)sechdrs[versindex].sh_addr; + num_versions = sechdrs[versindex].sh_size + / sizeof(struct modversion_info); + + for (i = 0; i < num_versions; i++) { + u32 crcval; + + if (strcmp(versions[i].name, symname) != 0) + continue; + + if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) + crcval = resolve_rel_crc(crc); + else + crcval = *crc; + if (versions[i].crc == crcval) + return 1; + pr_debug("Found checksum %X vs module %lX\n", + crcval, versions[i].crc); + goto bad_version; + } + + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); + return 1; + +bad_version: + pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname); + return 0; +} + +int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + struct find_symbol_arg fsa = { + .name = "module_layout", + .gplok = true, + }; + + /* + * Since this should be found in kernel (which can't be removed), no + * locking is necessary -- use preempt_disable() to placate lockdep. + */ + preempt_disable(); + if (!find_symbol(&fsa)) { + preempt_enable(); + BUG(); + } + preempt_enable(); + return check_version(info, "module_layout", mod, fsa.crc); +} + +/* First part is kernel version, which we ignore if module has crcs. */ +int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) +{ + if (has_crcs) { + amagic += strcspn(amagic, " "); + bmagic += strcspn(bmagic, " "); + } + return strcmp(amagic, bmagic) == 0; +} + +/* + * Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. + */ +void module_layout(struct module *mod, + struct modversion_info *ver, + struct kernel_param *kp, + struct kernel_symbol *ks, + struct tracepoint * const *tp) +{ +} +EXPORT_SYMBOL(module_layout); -- cgit v1.2.3 From 0597579356fe5b6c0b99196e4743d4c2978f654a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 10:00:58 +0100 Subject: module: Make module_enable_x() independent of CONFIG_ARCH_HAS_STRICT_MODULE_RWX module_enable_x() has nothing to do with CONFIG_ARCH_HAS_STRICT_MODULE_RWX allthough by coincidence architectures who need module_enable_x() are selection CONFIG_ARCH_HAS_STRICT_MODULE_RWX. Enable module_enable_x() for everyone everytime. If an architecture already has module text set executable, it's a no-op. Don't check text_size alignment. When CONFIG_STRICT_MODULE_RWX is set the verification is already done in frob_rodata(). When CONFIG_STRICT_MODULE_RWX is not set it is not a big deal to have the start of data as executable. Just make sure we entirely get the last page when the boundary is not aligned. And don't BUG on misaligned base as some architectures like nios2 use kmalloc() for allocating modules. So just bail out in that case. If that's a problem, a page fault will occur later anyway. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 6 ++---- kernel/module/main.c | 12 +++++------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 3fc139d5074b..972bc811dcd2 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -23,9 +23,9 @@ /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but - * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y + * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX +#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) PAGE_ALIGN(X) #else # define debug_align(X) (X) @@ -175,10 +175,8 @@ static inline struct module *mod_find(unsigned long addr) } #endif /* CONFIG_MODULES_TREE_LOOKUP */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)); -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ #ifdef CONFIG_STRICT_MODULE_RWX void module_enable_ro(const struct module *mod, bool after_init); diff --git a/kernel/module/main.c b/kernel/module/main.c index 5f06c06df9e1..2bf8c617a901 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1144,24 +1144,22 @@ resolve_symbol_wait(struct module *mod, * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of * whether we are strict. */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); set_memory((unsigned long)layout->base, - layout->text_size >> PAGE_SHIFT); + PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT); } static void module_enable_x(const struct module *mod) { + if (!PAGE_ALIGNED(mod->core_layout.base) || + !PAGE_ALIGNED(mod->init_layout.base)) + return; + frob_text(&mod->core_layout, set_memory_x); frob_text(&mod->init_layout, set_memory_x); } -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ void __weak module_memfree(void *module_region) { -- cgit v1.2.3 From 32a08c17d8096f0fd2c6600bc5fe8464aaf68ea7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 10:00:59 +0100 Subject: module: Move module_enable_x() and frob_text() in strict_rwx.c Move module_enable_x() together with module_enable_nx() and module_enable_ro(). Those three functions are going together, they are all used to set up the correct page flags on the different sections. As module_enable_x() is used independently of CONFIG_STRICT_MODULE_RWX, build strict_rwx.c all the time and use IS_ENABLED(CONFIG_STRICT_MODULE_RWX) when relevant. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/Makefile | 3 +-- kernel/module/internal.h | 15 +-------------- kernel/module/main.c | 37 ----------------------------------- kernel/module/strict_rwx.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/module/Makefile b/kernel/module/Makefile index e2eff9853a28..d1ca799c12e2 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -7,12 +7,11 @@ # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n -obj-y += main.o +obj-y += main.o strict_rwx.o obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o obj-$(CONFIG_MODULE_SIG) += signing.o obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o -obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PROC_FS) += procfs.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 972bc811dcd2..c59473b232df 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -175,25 +175,12 @@ static inline struct module *mod_find(unsigned long addr) } #endif /* CONFIG_MODULES_TREE_LOOKUP */ -void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, - int num_pages)); - -#ifdef CONFIG_STRICT_MODULE_RWX void module_enable_ro(const struct module *mod, bool after_init); void module_enable_nx(const struct module *mod); +void module_enable_x(const struct module *mod); int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod); -#else /* !CONFIG_STRICT_MODULE_RWX */ -static inline void module_enable_nx(const struct module *mod) { } -static inline void module_enable_ro(const struct module *mod, bool after_init) {} -static inline int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - return 0; -} -#endif /* CONFIG_STRICT_MODULE_RWX */ - #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); #else /* !CONFIG_MODULE_SIG */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 2bf8c617a901..7175b44ba88a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1124,43 +1124,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -/* - * LKM RO/NX protection: protect module's text/ro-data - * from modification and any data from execution. - * - * General layout of module is: - * [text] [read-only-data] [ro-after-init] [writable data] - * text_size -----^ ^ ^ ^ - * ro_size ------------------------| | | - * ro_after_init_size -----------------------------| | - * size -----------------------------------------------------------| - * - * These values are always page-aligned (as is base) - */ - -/* - * Since some arches are moving towards PAGE_KERNEL module allocations instead - * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the - * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of - * whether we are strict. - */ -void frob_text(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - set_memory((unsigned long)layout->base, - PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT); -} - -static void module_enable_x(const struct module *mod) -{ - if (!PAGE_ALIGNED(mod->core_layout.base) || - !PAGE_ALIGNED(mod->init_layout.base)) - return; - - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); -} - void __weak module_memfree(void *module_region) { /* diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 7949dfd449c2..43332b4416b0 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -11,6 +11,34 @@ #include #include "internal.h" +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| + * + * These values are always page-aligned (as is base) when + * CONFIG_STRICT_MODULE_RWX is set. + */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of + * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we + * are strict. + */ +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base, + PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT); +} + static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { @@ -41,10 +69,24 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } +void module_enable_x(const struct module *mod) +{ + if (!PAGE_ALIGNED(mod->core_layout.base) || + !PAGE_ALIGNED(mod->init_layout.base)) + return; + + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} + void module_enable_ro(const struct module *mod, bool after_init) { + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return; +#ifdef CONFIG_STRICT_MODULE_RWX if (!rodata_enabled) return; +#endif set_vm_flush_reset_perms(mod->core_layout.base); set_vm_flush_reset_perms(mod->init_layout.base); @@ -60,6 +102,9 @@ void module_enable_ro(const struct module *mod, bool after_init) void module_enable_nx(const struct module *mod) { + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return; + frob_rodata(&mod->core_layout, set_memory_nx); frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); @@ -73,6 +118,9 @@ int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; int i; + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return 0; + for (i = 0; i < hdr->e_shnum; i++) { if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", -- cgit v1.2.3 From ef505058dc5524488a84423b4d5e8a7598a23a2e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 10:01:00 +0100 Subject: module: Rework layout alignment to avoid BUG_ON()s Perform layout alignment verification up front and WARN_ON() and fail module loading instead of crashing the machine. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 1 + kernel/module/main.c | 5 +++++ kernel/module/strict_rwx.c | 27 ++++++++++++++++++--------- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index c59473b232df..e94defbeda00 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -180,6 +180,7 @@ void module_enable_nx(const struct module *mod); void module_enable_x(const struct module *mod); int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod); +bool module_check_misalignment(const struct module *mod); #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); diff --git a/kernel/module/main.c b/kernel/module/main.c index 7175b44ba88a..55e710bc7f46 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2550,6 +2550,9 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + if (module_check_misalignment(mod)) + goto out_misaligned; + module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); @@ -2563,6 +2566,8 @@ static int complete_formation(struct module *mod, struct load_info *info) return 0; +out_misaligned: + err = -EINVAL; out: mutex_unlock(&module_mutex); return err; diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 43332b4416b0..f36ea54c1dac 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -42,9 +42,6 @@ static void frob_text(const struct module_layout *layout, static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { - BUG_ON(!PAGE_ALIGNED(layout->base)); - BUG_ON(!PAGE_ALIGNED(layout->text_size)); - BUG_ON(!PAGE_ALIGNED(layout->ro_size)); set_memory((unsigned long)layout->base + layout->text_size, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } @@ -52,9 +49,6 @@ static void frob_rodata(const struct module_layout *layout, static void frob_ro_after_init(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { - BUG_ON(!PAGE_ALIGNED(layout->base)); - BUG_ON(!PAGE_ALIGNED(layout->ro_size)); - BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size)); set_memory((unsigned long)layout->base + layout->ro_size, (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); } @@ -62,13 +56,28 @@ static void frob_ro_after_init(const struct module_layout *layout, static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { - BUG_ON(!PAGE_ALIGNED(layout->base)); - BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size)); - BUG_ON(!PAGE_ALIGNED(layout->size)); set_memory((unsigned long)layout->base + layout->ro_after_init_size, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } +static bool layout_check_misalignment(const struct module_layout *layout) +{ + return WARN_ON(!PAGE_ALIGNED(layout->base)) || + WARN_ON(!PAGE_ALIGNED(layout->text_size)) || + WARN_ON(!PAGE_ALIGNED(layout->ro_size)) || + WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) || + WARN_ON(!PAGE_ALIGNED(layout->size)); +} + +bool module_check_misalignment(const struct module *mod) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return false; + + return layout_check_misalignment(&mod->core_layout) || + layout_check_misalignment(&mod->init_layout); +} + void module_enable_x(const struct module *mod) { if (!PAGE_ALIGNED(mod->core_layout.base) || -- cgit v1.2.3 From 7337f929d5672e37a80c8582d357f084320f475f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 10:01:01 +0100 Subject: module: Rename debug_align() as strict_align() debug_align() was added by commit 84e1c6bb38eb ("x86: Add RO/NX protection for loadable kernel modules") At that time the config item was CONFIG_DEBUG_SET_MODULE_RONX. But nowadays it has changed to CONFIG_STRICT_MODULE_RWX and debug_align() is confusing because it has nothing to do with DEBUG. Rename it strict_align() Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 4 ++-- kernel/module/kallsyms.c | 4 ++-- kernel/module/main.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index e94defbeda00..cbc268af23ae 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -26,9 +26,9 @@ * only when CONFIG_STRICT_MODULE_RWX=y */ #ifdef CONFIG_STRICT_MODULE_RWX -# define debug_align(X) PAGE_ALIGN(X) +# define strict_align(X) PAGE_ALIGN(X) #else -# define debug_align(X) (X) +# define strict_align(X) (X) #endif extern struct mutex module_mutex; diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index a3da0686a2a6..438492e103c8 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -139,7 +139,7 @@ void layout_symtab(struct module *mod, struct load_info *info) mod->core_layout.size += strtab_size; info->core_typeoffs = mod->core_layout.size; mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; @@ -154,7 +154,7 @@ void layout_symtab(struct module *mod, struct load_info *info) mod->init_layout.size += sizeof(struct mod_kallsyms); info->init_typeoffs = mod->init_layout.size; mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); } /* diff --git a/kernel/module/main.c b/kernel/module/main.c index 55e710bc7f46..e6a72c3651f6 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1447,19 +1447,19 @@ static void layout_sections(struct module *mod, struct load_info *info) } switch (m) { case 0: /* executable */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; case 2: /* RO after init */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); mod->core_layout.ro_after_init_size = mod->core_layout.size; break; case 4: /* whole core */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); break; } } @@ -1481,11 +1481,11 @@ static void layout_sections(struct module *mod, struct load_info *info) } switch (m) { case 0: /* executable */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; case 2: @@ -1496,7 +1496,7 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; break; case 4: /* whole init */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); break; } } -- cgit v1.2.3 From 80b8bf4369906aefbcb63a03012aed7a1abcbd18 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 13:02:11 +0100 Subject: module: Always have struct mod_tree_root In order to separate text and data, we need to setup two rb trees. This means that struct mod_tree_root is required even without MODULES_TREE_LOOKUP. Signed-off-by: Christophe Leroy Reviewed-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 4 +++- kernel/module/main.c | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index cbc268af23ae..ac64e53ac5e3 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -143,15 +143,17 @@ static inline void module_decompress_cleanup(struct load_info *info) } #endif -#ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_root { +#ifdef CONFIG_MODULES_TREE_LOOKUP struct latch_tree_root root; +#endif unsigned long addr_min; unsigned long addr_max; }; extern struct mod_tree_root mod_tree; +#ifdef CONFIG_MODULES_TREE_LOOKUP void mod_tree_insert(struct module *mod); void mod_tree_remove_init(struct module *mod); void mod_tree_remove(struct module *mod); diff --git a/kernel/module/main.c b/kernel/module/main.c index e6a72c3651f6..84d567c57575 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -74,7 +74,6 @@ static void do_free_init(struct work_struct *w); static DECLARE_WORK(init_free_wq, do_free_init); static LLIST_HEAD(init_free_list); -#ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_root mod_tree __cacheline_aligned = { .addr_min = -1UL, }; @@ -82,10 +81,6 @@ struct mod_tree_root mod_tree __cacheline_aligned = { #define module_addr_min mod_tree.addr_min #define module_addr_max mod_tree.addr_max -#else /* !CONFIG_MODULES_TREE_LOOKUP */ -static unsigned long module_addr_min = -1UL, module_addr_max; -#endif /* CONFIG_MODULES_TREE_LOOKUP */ - struct symsearch { const struct kernel_symbol *start, *stop; const s32 *crcs; -- cgit v1.2.3 From 446d55666d5599ca58c1ceac25ce4b5191e70835 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 13:02:12 +0100 Subject: module: Prepare for handling several RB trees In order to separate text and data, we need to setup two rb trees. Modify functions to give the tree as a parameter. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 4 ++-- kernel/module/main.c | 16 ++++++++-------- kernel/module/tree_lookup.c | 20 ++++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index ac64e53ac5e3..0f3146347256 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -157,13 +157,13 @@ extern struct mod_tree_root mod_tree; void mod_tree_insert(struct module *mod); void mod_tree_remove_init(struct module *mod); void mod_tree_remove(struct module *mod); -struct module *mod_find(unsigned long addr); +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree); #else /* !CONFIG_MODULES_TREE_LOOKUP */ static inline void mod_tree_insert(struct module *mod) { } static inline void mod_tree_remove_init(struct module *mod) { } static inline void mod_tree_remove(struct module *mod) { } -static inline struct module *mod_find(unsigned long addr) +static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) { struct module *mod; diff --git a/kernel/module/main.c b/kernel/module/main.c index 84d567c57575..392ac847d90a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -91,22 +91,22 @@ struct symsearch { * Bounds of module text, for speeding up __module_address. * Protected by module_mutex. */ -static void __mod_update_bounds(void *base, unsigned int size) +static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree) { unsigned long min = (unsigned long)base; unsigned long max = min + size; - if (min < module_addr_min) - module_addr_min = min; - if (max > module_addr_max) - module_addr_max = max; + if (min < tree->addr_min) + tree->addr_min = min; + if (max > tree->addr_max) + tree->addr_max = max; } static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree); if (mod->init_layout.size) - __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree); } static void module_assert_mutex_or_preempt(void) @@ -3017,7 +3017,7 @@ struct module *__module_address(unsigned long addr) module_assert_mutex_or_preempt(); - mod = mod_find(addr); + mod = mod_find(addr, &mod_tree); if (mod) { BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 0bc4ec3b22ce..995fe68059db 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -61,14 +61,14 @@ static const struct latch_tree_ops mod_tree_ops = { .comp = mod_tree_comp, }; -static noinline void __mod_tree_insert(struct mod_tree_node *node) +static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree) { - latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); + latch_tree_insert(&node->node, &tree->root, &mod_tree_ops); } -static void __mod_tree_remove(struct mod_tree_node *node) +static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree) { - latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); + latch_tree_erase(&node->node, &tree->root, &mod_tree_ops); } /* @@ -80,28 +80,28 @@ void mod_tree_insert(struct module *mod) mod->core_layout.mtn.mod = mod; mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->core_layout.mtn); + __mod_tree_insert(&mod->core_layout.mtn, &mod_tree); if (mod->init_layout.size) - __mod_tree_insert(&mod->init_layout.mtn); + __mod_tree_insert(&mod->init_layout.mtn, &mod_tree); } void mod_tree_remove_init(struct module *mod) { if (mod->init_layout.size) - __mod_tree_remove(&mod->init_layout.mtn); + __mod_tree_remove(&mod->init_layout.mtn, &mod_tree); } void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->core_layout.mtn); + __mod_tree_remove(&mod->core_layout.mtn, &mod_tree); mod_tree_remove_init(mod); } -struct module *mod_find(unsigned long addr) +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) { struct latch_tree_node *ltn; - ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); + ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops); if (!ltn) return NULL; -- cgit v1.2.3 From 6ab9942c44b2d213a16b2620e4baf0223122222f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 13:02:13 +0100 Subject: module: Introduce data_layout In order to allow separation of data from text, add another layout, called data_layout. For architectures requesting separation of text and data, only text will go in core_layout and data will go in data_layout. For architectures which keep text and data together, make data_layout an alias of core_layout, that way data_layout can be used for all data manipulations, regardless of whether data is in core_layout or data_layout. Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 2 ++ kernel/module/kallsyms.c | 18 +++++++++--------- kernel/module/main.c | 20 ++++++++++++-------- kernel/module/strict_rwx.c | 10 +++++----- 4 files changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 0f3146347256..0aabbf5cbcd1 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -20,6 +20,8 @@ /* Maximum number of characters written by module_flags() */ #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) +#define data_layout core_layout + /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 438492e103c8..3e11523bc6f6 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -134,12 +134,12 @@ void layout_symtab(struct module *mod, struct load_info *info) } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_layout.size += strtab_size; - info->core_typeoffs = mod->core_layout.size; - mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = strict_align(mod->core_layout.size); + info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->data_layout.size += strtab_size; + info->core_typeoffs = mod->data_layout.size; + mod->data_layout.size += ndst * sizeof(char); + mod->data_layout.size = strict_align(mod->data_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; @@ -187,9 +187,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info) * Now populate the cut down core kallsyms for after init * and set types up while we still have access to sections. */ - mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; + mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; src = rcu_dereference_sched(mod->kallsyms)->symtab; for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) { rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info); diff --git a/kernel/module/main.c b/kernel/module/main.c index 392ac847d90a..78658283408d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1190,7 +1190,7 @@ static void free_module(struct module *mod) percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); /* Finally, free the core (containing the module structure) */ module_memfree(mod->core_layout.base); @@ -1431,13 +1431,15 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; const char *sname = info->secstrings + s->sh_name; + unsigned int *sizep; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL || module_init_layout_section(sname)) continue; - s->sh_entsize = module_get_offset(mod, &mod->core_layout.size, s, i); + sizep = m ? &mod->data_layout.size : &mod->core_layout.size; + s->sh_entsize = module_get_offset(mod, sizep, s, i); pr_debug("\t%s\n", sname); } switch (m) { @@ -1446,15 +1448,15 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_layout.size = strict_align(mod->core_layout.size); - mod->core_layout.ro_size = mod->core_layout.size; + mod->data_layout.size = strict_align(mod->data_layout.size); + mod->data_layout.ro_size = mod->data_layout.size; break; case 2: /* RO after init */ - mod->core_layout.size = strict_align(mod->core_layout.size); - mod->core_layout.ro_after_init_size = mod->core_layout.size; + mod->data_layout.size = strict_align(mod->data_layout.size); + mod->data_layout.ro_after_init_size = mod->data_layout.size; break; case 4: /* whole core */ - mod->core_layout.size = strict_align(mod->core_layout.size); + mod->data_layout.size = strict_align(mod->data_layout.size); break; } } @@ -2134,6 +2136,8 @@ static int move_module(struct module *mod, struct load_info *info) if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); + else if (!(shdr->sh_flags & SHF_EXECINSTR)) + dest = mod->data_layout.base + shdr->sh_entsize; else dest = mod->core_layout.base + shdr->sh_entsize; @@ -2829,7 +2833,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); module_deallocate(mod, info); free_copy: diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index f36ea54c1dac..fe3c10891407 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -101,12 +101,12 @@ void module_enable_ro(const struct module *mod, bool after_init) set_vm_flush_reset_perms(mod->init_layout.base); frob_text(&mod->core_layout, set_memory_ro); - frob_rodata(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->data_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); if (after_init) - frob_ro_after_init(&mod->core_layout, set_memory_ro); + frob_ro_after_init(&mod->data_layout, set_memory_ro); } void module_enable_nx(const struct module *mod) @@ -114,9 +114,9 @@ void module_enable_nx(const struct module *mod) if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) return; - frob_rodata(&mod->core_layout, set_memory_nx); - frob_ro_after_init(&mod->core_layout, set_memory_nx); - frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->data_layout, set_memory_nx); + frob_ro_after_init(&mod->data_layout, set_memory_nx); + frob_writable_data(&mod->data_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); } -- cgit v1.2.3 From 01dc0386efb769056257410ba5754558384006a7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 13:02:14 +0100 Subject: module: Add CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC Add CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC to allow architectures to request having modules data in vmalloc area instead of module area. This is required on powerpc book3s/32 in order to set data non executable, because it is not possible to set executability on page basis, this is done per 256 Mbytes segments. The module area has exec right, vmalloc area has noexec. This can also be useful on other powerpc/32 in order to maximize the chance of code being close enough to kernel core to avoid branch trampolines. Cc: Jason Wessel Acked-by: Daniel Thompson Cc: Douglas Anderson Signed-off-by: Christophe Leroy [mcgrof: rebased in light of kernel/module/kdb.c move] Signed-off-by: Luis Chamberlain --- arch/Kconfig | 6 +++++ include/linux/module.h | 8 +++++++ kernel/module/internal.h | 3 +++ kernel/module/kdb.c | 10 ++++++-- kernel/module/main.c | 58 +++++++++++++++++++++++++++++++++++++++++++-- kernel/module/procfs.c | 8 +++++-- kernel/module/strict_rwx.c | 1 + kernel/module/tree_lookup.c | 8 +++++++ 8 files changed, 96 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 29b0167c088b..24945cee808b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -888,6 +888,12 @@ config MODULES_USE_ELF_REL Modules only use ELF REL relocations. Modules with ELF RELA relocations will give an error. +config ARCH_WANTS_MODULES_DATA_IN_VMALLOC + bool + help + For architectures like powerpc/32 which have constraints on module + allocation and need to allocate module data outside of module area. + config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/include/linux/module.h b/include/linux/module.h index 5e2059f3afc7..46d4d5f2516e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -422,6 +422,9 @@ struct module { /* Core layout: rbtree is accessed frequently, so keep together. */ struct module_layout core_layout __module_layout_align; struct module_layout init_layout; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + struct module_layout data_layout; +#endif /* Arch-specific module values */ struct mod_arch_specific arch; @@ -569,6 +572,11 @@ bool is_module_text_address(unsigned long addr); static inline bool within_module_core(unsigned long addr, const struct module *mod) { +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + if ((unsigned long)mod->data_layout.base <= addr && + addr < (unsigned long)mod->data_layout.base + mod->data_layout.size) + return true; +#endif return (unsigned long)mod->core_layout.base <= addr && addr < (unsigned long)mod->core_layout.base + mod->core_layout.size; } diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 0aabbf5cbcd1..3e23bef5884d 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -20,7 +20,9 @@ /* Maximum number of characters written by module_flags() */ #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) +#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC #define data_layout core_layout +#endif /* * Modules' sections will be aligned on page boundaries @@ -154,6 +156,7 @@ struct mod_tree_root { }; extern struct mod_tree_root mod_tree; +extern struct mod_tree_root mod_data_tree; #ifdef CONFIG_MODULES_TREE_LOOKUP void mod_tree_insert(struct module *mod); diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c index a446c699db0a..f4317f92e189 100644 --- a/kernel/module/kdb.c +++ b/kernel/module/kdb.c @@ -26,8 +26,11 @@ int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u 0x%px ", mod->name, - mod->core_layout.size, (void *)mod); + kdb_printf("%-20s%8u", mod->name, mod->core_layout.size); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + kdb_printf("/%8u", mod->data_layout.size); +#endif + kdb_printf(" 0x%px ", (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -38,6 +41,9 @@ int kdb_lsmod(int argc, const char **argv) else kdb_printf(" (Live)"); kdb_printf(" 0x%px", mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + kdb_printf("/0x%px", mod->data_layout.base); +#endif #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/module/main.c b/kernel/module/main.c index 78658283408d..84b828431dcb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -78,6 +78,12 @@ struct mod_tree_root mod_tree __cacheline_aligned = { .addr_min = -1UL, }; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +struct mod_tree_root mod_data_tree __cacheline_aligned = { + .addr_min = -1UL, +}; +#endif + #define module_addr_min mod_tree.addr_min #define module_addr_max mod_tree.addr_max @@ -107,6 +113,9 @@ static void mod_update_bounds(struct module *mod) __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree); if (mod->init_layout.size) __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + __mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree); +#endif } static void module_assert_mutex_or_preempt(void) @@ -940,6 +949,17 @@ static ssize_t show_coresize(struct module_attribute *mattr, static struct module_attribute modinfo_coresize = __ATTR(coresize, 0444, show_coresize, NULL); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +static ssize_t show_datasize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->data_layout.size); +} + +static struct module_attribute modinfo_datasize = + __ATTR(datasize, 0444, show_datasize, NULL); +#endif + static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { @@ -968,6 +988,9 @@ struct module_attribute *modinfo_attrs[] = { &modinfo_srcversion, &modinfo_initstate, &modinfo_coresize, +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + &modinfo_datasize, +#endif &modinfo_initsize, &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD @@ -1194,6 +1217,9 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_memfree(mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + vfree(mod->data_layout.base); +#endif } void *__symbol_get(const char *symbol) @@ -2124,6 +2150,24 @@ static int move_module(struct module *mod, struct load_info *info) } else mod->init_layout.base = NULL; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + /* Do the allocs. */ + ptr = vmalloc(mod->data_layout.size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. + */ + kmemleak_not_leak(ptr); + if (!ptr) { + module_memfree(mod->core_layout.base); + module_memfree(mod->init_layout.base); + return -ENOMEM; + } + + memset(ptr, 0, mod->data_layout.size); + mod->data_layout.base = ptr; +#endif /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { @@ -2299,6 +2343,9 @@ static void module_deallocate(struct module *mod, struct load_info *info) module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); module_memfree(mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + vfree(mod->data_layout.base); +#endif } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3015,13 +3062,20 @@ bool is_module_address(unsigned long addr) struct module *__module_address(unsigned long addr) { struct module *mod; + struct mod_tree_root *tree; - if (addr < module_addr_min || addr > module_addr_max) + if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max) + tree = &mod_tree; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max) + tree = &mod_data_tree; +#endif + else return NULL; module_assert_mutex_or_preempt(); - mod = mod_find(addr, &mod_tree); + mod = mod_find(addr, tree); if (mod) { BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c index 2717e130788e..9a8f4f0f6329 100644 --- a/kernel/module/procfs.c +++ b/kernel/module/procfs.c @@ -67,13 +67,17 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; void *value; + unsigned int size; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) return 0; - seq_printf(m, "%s %u", - mod->name, mod->init_layout.size + mod->core_layout.size); + size = mod->init_layout.size + mod->core_layout.size; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + size += mod->data_layout.size; +#endif + seq_printf(m, "%s %u", mod->name, size); print_unload_info(m, mod); /* Informative for users. */ diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index fe3c10891407..14fbea66f12f 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -75,6 +75,7 @@ bool module_check_misalignment(const struct module *mod) return false; return layout_check_misalignment(&mod->core_layout) || + layout_check_misalignment(&mod->data_layout) || layout_check_misalignment(&mod->init_layout); } diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 995fe68059db..8ec5cfd60496 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -83,6 +83,11 @@ void mod_tree_insert(struct module *mod) __mod_tree_insert(&mod->core_layout.mtn, &mod_tree); if (mod->init_layout.size) __mod_tree_insert(&mod->init_layout.mtn, &mod_tree); + +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + mod->data_layout.mtn.mod = mod; + __mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree); +#endif } void mod_tree_remove_init(struct module *mod) @@ -95,6 +100,9 @@ void mod_tree_remove(struct module *mod) { __mod_tree_remove(&mod->core_layout.mtn, &mod_tree); mod_tree_remove_init(mod); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + __mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree); +#endif } struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) -- cgit v1.2.3 From 55ce556dbf92ec779b65336593d213ceef3f26f3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 13:02:15 +0100 Subject: module: Remove module_addr_min and module_addr_max Replace module_addr_min and module_addr_max by mod_tree.addr_min and mod_tree.addr_max Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 84b828431dcb..05a42d8fcd7a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -63,7 +63,7 @@ * Mutex protects: * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, - * 3) module_addr_min/module_addr_max. + * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); @@ -2972,14 +2972,14 @@ static void cfi_init(struct module *mod) mod->exit = *exit; #endif - cfi_module_add(mod, module_addr_min); + cfi_module_add(mod, mod_tree.addr_min); #endif } static void cfi_cleanup(struct module *mod) { #ifdef CONFIG_CFI_CLANG - cfi_module_remove(mod, module_addr_min); + cfi_module_remove(mod, mod_tree.addr_min); #endif } -- cgit v1.2.3 From be77354a3d7ebd4897ee18eca26dca6df9224c76 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Mar 2022 13:38:23 +0530 Subject: bpf: Do write access check for kfunc and global func When passing pointer to some map value to kfunc or global func, in verifier we are passing meta as NULL to various functions, which uses meta->raw_mode to check whether memory is being written to. Since some kfunc or global funcs may also write to memory pointers they receive as arguments, we must check for write access to memory. E.g. in some case map may be read only and this will be missed by current checks. However meta->raw_mode allows for uninitialized memory (e.g. on stack), since there is not enough info available through BTF, we must perform one call for read access (raw_mode = false), and one for write access (raw_mode = true). Fixes: e5069b9c23b3 ("bpf: Support pointers in global func args") Fixes: d583691c47dc ("bpf: Introduce mem, size argument pair support for kfunc") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220319080827.73251-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d175b70067b3..e9807e6e1090 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4919,8 +4919,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - if (meta) - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -4963,24 +4962,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { + bool may_be_null = type_may_be_null(reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + if (register_is_null(reg)) return 0; - if (type_may_be_null(reg->type)) { - /* Assuming that the register contains a value check if the memory - * access is safe. Temporarily save and restore the register's state as - * the conversion shouldn't be visible to a caller. - */ - const struct bpf_reg_state saved_reg = *reg; - int rv; - + memset(&meta, 0, sizeof(meta)); + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + if (may_be_null) { + saved_reg = *reg; mark_ptr_not_null_reg(reg); - rv = check_helper_mem_access(env, regno, mem_size, true, NULL); - *reg = saved_reg; - return rv; } - return check_helper_mem_access(env, regno, mem_size, true, NULL); + err = check_helper_mem_access(env, regno, mem_size, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + + if (may_be_null) + *reg = saved_reg; + + return err; } int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, @@ -4989,16 +4997,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; int err; WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + memset(&meta, 0, sizeof(meta)); + if (may_be_null) { saved_reg = *mem_reg; mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, NULL); + err = check_mem_size_reg(env, reg, regno, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); if (may_be_null) *mem_reg = saved_reg; -- cgit v1.2.3 From 97e6d7dab1ca4648821c790a2b7913d6d5d549db Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Mar 2022 13:38:24 +0530 Subject: bpf: Check PTR_TO_MEM | MEM_RDONLY in check_helper_mem_access The commit being fixed was aiming to disallow users from incorrectly obtaining writable pointer to memory that is only meant to be read. This is enforced now using a MEM_RDONLY flag. For instance, in case of global percpu variables, when the BTF type is not struct (e.g. bpf_prog_active), the verifier marks register type as PTR_TO_MEM | MEM_RDONLY from bpf_this_cpu_ptr or bpf_per_cpu_ptr helpers. However, when passing such pointer to kfunc, global funcs, or BPF helpers, in check_helper_mem_access, there is no expectation MEM_RDONLY flag will be set, hence it is checked as pointer to writable memory. Later, verifier sets up argument type of global func as PTR_TO_MEM | PTR_MAYBE_NULL, so user can use a global func to get around the limitations imposed by this flag. This check will also cover global non-percpu variables that may be introduced in kernel BTF in future. Also, we update the log message for PTR_TO_BUF case to be similar to PTR_TO_MEM case, so that the reason for error is clear to user. Fixes: 34d3a78c681e ("bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM.") Reviewed-by: Hao Luo Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220319080827.73251-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9807e6e1090..d953e62b5268 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4871,13 +4871,23 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MEM: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + } return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); return -EACCES; + } max_access = &env->prog->aux->max_rdonly_access; } else { -- cgit v1.2.3 From 7b3552d3f9f6897851fc453b5131a967167e43c2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Mar 2022 13:38:25 +0530 Subject: bpf: Reject writes for PTR_TO_MAP_KEY in check_helper_mem_access It is not permitted to write to PTR_TO_MAP_KEY, but the current code in check_helper_mem_access would allow for it, reject this case as well, as helpers taking ARG_PTR_TO_UNINIT_MEM also take PTR_TO_MAP_KEY. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220319080827.73251-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d953e62b5268..9c1a02b82ecd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4861,6 +4861,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: -- cgit v1.2.3 From a60707d74bd1d119cf7bcc5101cda912fc46d5e3 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:57 +0800 Subject: sched: Move child_runs_first sysctls to fair.c move child_runs_first sysctls to fair.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 2 -- kernel/sched/fair.c | 19 +++++++++++++++++++ kernel/sched/sched.h | 2 ++ kernel/sysctl.c | 7 ------- 4 files changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c1076b5e17fb..1d2ff3cd1728 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -14,8 +14,6 @@ extern unsigned long sysctl_hung_task_timeout_secs; enum { sysctl_hung_task_timeout_secs = 0 }; #endif -extern unsigned int sysctl_sched_child_runs_first; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..788b1d6a3248 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -109,6 +109,25 @@ static unsigned int sched_nr_latency = 8; * parent will (try to) run first. */ unsigned int sysctl_sched_child_runs_first __read_mostly; +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_child_runs_first_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int __init sched_child_runs_first_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_child_runs_first_sysctls); + return 0; +} +late_initcall(sched_child_runs_first_sysctl_init); +#endif /* * SCHED_OTHER wake-up granularity. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58263f90c559..767fc1de9646 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -108,6 +108,8 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern unsigned int sysctl_sched_child_runs_first; + extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..6bbb8e1af675 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,13 +1659,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit v1.2.3 From f5ef06d58be8311a9425e6a54a053ecb350952f3 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:58 +0800 Subject: sched: Move schedstats sysctls to core.c move schedstats sysctls to core.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 2 -- kernel/sched/core.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 11 ----------- 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1d2ff3cd1728..6c7a6850559b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -64,8 +64,6 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..04440da5955f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4430,7 +4430,7 @@ out: __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4449,6 +4449,26 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, set_schedstats(state); return err; } + +static struct ctl_table sched_schedstats_sysctls[] = { + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_schedstats_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_schedstats_sysctls); + return 0; +} +late_initcall(sched_schedstats_sysctl_init); #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6bbb8e1af675..fc0eeca20718 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,17 +1659,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_TASK_DELAY_ACCT { .procname = "task_delayacct", -- cgit v1.2.3 From d9ab0e63fa7f8405fbb19e28c5191e0880a7f2db Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:59 +0800 Subject: sched: Move rt_period/runtime sysctls to rt.c move rt_period/runtime sysctls to rt.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 11 ----------- kernel/rcu/rcu.h | 2 ++ kernel/sched/core.c | 13 ------------- kernel/sched/rt.c | 43 ++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 4 ++++ kernel/sysctl.c | 14 -------------- 6 files changed, 48 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 6c7a6850559b..4391c1307945 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,15 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -/* - * control realtime throttling: - * - * /proc/sys/kernel/sched_rt_period_us - * /proc/sys/kernel/sched_rt_runtime_us - */ -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - extern unsigned int sysctl_sched_dl_period_max; extern unsigned int sysctl_sched_dl_period_min; @@ -58,8 +49,6 @@ extern int sched_rr_timeslice; int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..7812c740b3bf 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,8 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +extern int sysctl_sched_rt_runtime; + /* * Return the counter portion of a sequence number previously returned * by rcu_seq_snap() or rcu_seq_current(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 04440da5955f..774f3229db37 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -145,12 +145,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = 8; const_debug unsigned int sysctl_sched_nr_migrate = 32; #endif -/* - * period over which we measure -rt task CPU usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - __read_mostly int scheduler_running; #ifdef CONFIG_SCHED_CORE @@ -444,13 +438,6 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ -/* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - - /* * Serialization rules: * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a32c46889af8..5663bb5ff890 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -13,6 +13,47 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + {} +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -2925,7 +2966,7 @@ static void sched_rt_do_global(void) raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_period, old_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 767fc1de9646..3b406c78a8e9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -114,6 +114,10 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; + /* * Helpers for converting nanosecond timing to jiffy resolution */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fc0eeca20718..029bfe06c68d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,20 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, -- cgit v1.2.3 From 84227c12888b1105725cd2de14705b029bcbb4b2 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:00 +0800 Subject: sched: Move deadline_period sysctls to deadline.c move deadline_period sysctls to deadline.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 3 --- kernel/sched/deadline.c | 42 ++++++++++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 -------------- 3 files changed, 34 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4391c1307945..7da9b94c5e1c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,9 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -extern unsigned int sysctl_sched_dl_period_max; -extern unsigned int sysctl_sched_dl_period_min; - #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb4255ae0b2c..82e10b74a7b2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,40 @@ * Fabio Checconi */ +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { return container_of(dl_se, struct task_struct, dl); @@ -2879,14 +2913,6 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_flags |= dl_se->flags; } -/* - * Default limits for DL period; on the top end we guard against small util - * tasks still getting ridiculously long effective runtimes, on the bottom end we - * guard against timer DoS. - */ -unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ -unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ - /* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 029bfe06c68d..2c8c75e11a37 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,20 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - { - .procname = "sched_deadline_period_max_us", - .data = &sysctl_sched_dl_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_deadline_period_min_us", - .data = &sysctl_sched_dl_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, -- cgit v1.2.3 From dafd7a9dad22fadcb290b24dff54e2eae3b89776 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:01 +0800 Subject: sched: Move rr_timeslice sysctls to rt.c move rr_timeslice sysctls to rt.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 5 ----- kernel/sched/rt.c | 13 +++++++++++-- kernel/sched/sched.h | 1 + kernel/sysctl.c | 7 ------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7da9b94c5e1c..3d307e512d1f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,11 +41,6 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -extern int sysctl_sched_rr_timeslice; -extern int sched_rr_timeslice; - -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5663bb5ff890..71791be36065 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,7 +5,7 @@ */ int sched_rr_timeslice = RR_TIMESLICE; -int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -27,6 +27,8 @@ int sysctl_sched_rt_runtime = 950000; static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #ifdef CONFIG_SYSCTL static struct ctl_table sched_rt_sysctls[] = { { @@ -43,6 +45,13 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, {} }; @@ -3005,7 +3014,7 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3b406c78a8e9..ae0f6e5a76f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -117,6 +117,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count); extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; /* * Helpers for converting nanosecond timing to jiffy resolution diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2c8c75e11a37..b074f70a3e11 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,13 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, #ifdef CONFIG_UCLAMP_TASK { .procname = "sched_util_clamp_min", -- cgit v1.2.3 From 28f152cd0926596e69d412467b11b6fe6fe4e864 Mon Sep 17 00:00:00 2001 From: Baisong Zhong Date: Fri, 18 Mar 2022 10:54:17 +0800 Subject: sched/rt: fix build error when CONFIG_SYSCTL is disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid random build errors which do not select CONFIG_SYSCTL by depending on it in Kconfig. This fixes the following warning: In file included from kernel/sched/build_policy.c:43: At top level: kernel/sched/rt.c:3017:12: error: ‘sched_rr_handler’ defined but not used [-Werror=unused-function] 3017 | static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, | ^~~~~~~~~~~~~~~~ kernel/sched/rt.c:2978:12: error: ‘sched_rt_handler’ defined but not used [-Werror=unused-function] 2978 | static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, | ^~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors make[2]: *** [scripts/Makefile.build:310: kernel/sched/build_policy.o] Error 1 make[1]: *** [scripts/Makefile.build:638: kernel/sched] Error 2 make[1]: *** Waiting for unfinished jobs.... Reported-by: Hulk Robot Signed-off-by: Baisong Zhong [mcgrof: small build fix, we need sched_rt_can_attach() even when CONFIG_SYSCTL is disabled] Signed-off-by: Luis Chamberlain --- kernel/sched/rt.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 71791be36065..b491a0f8c25d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,7 +5,6 @@ */ int sched_rr_timeslice = RR_TIMESLICE; -static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -25,11 +24,12 @@ unsigned int sysctl_sched_rt_period = 1000000; */ int sysctl_sched_rt_runtime = 950000; +#ifdef CONFIG_SYSCTL +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_SYSCTL static struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", @@ -2911,6 +2911,7 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { int ret = 0; @@ -2921,6 +2922,7 @@ static int sched_rt_global_constraints(void) return ret; } +#endif /* CONFIG_SYSCTL */ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { @@ -2932,6 +2934,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) } #else /* !CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { unsigned long flags; @@ -2949,8 +2953,10 @@ static int sched_rt_global_constraints(void) return 0; } +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { if (sysctl_sched_rt_period <= 0) @@ -3035,6 +3041,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, return ret; } +#endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) -- cgit v1.2.3 From 3267e0156c3341ac25b37a0f60551cdae1634b60 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:02 +0800 Subject: sched: Move uclamp_util sysctls to core.c move uclamp_util sysctls to core.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 8 -------- kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++---------- kernel/sysctl.c | 23 --------------------- 3 files changed, 37 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3d307e512d1f..0934b21a57a4 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,18 +31,10 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -#ifdef CONFIG_UCLAMP_TASK -extern unsigned int sysctl_sched_uclamp_util_min; -extern unsigned int sysctl_sched_uclamp_util_max; -extern unsigned int sysctl_sched_uclamp_util_min_rt_default; -#endif - #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 774f3229db37..ef31751c5799 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1306,10 +1306,10 @@ static void set_load_weight(struct task_struct *p, bool update_load) static DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* * By default RT tasks run at the maximum performance point/capacity of the @@ -1326,7 +1326,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. */ -unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -1779,7 +1779,7 @@ static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { } #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; @@ -4436,8 +4436,12 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, set_schedstats(state); return err; } +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_SCHEDSTATS */ -static struct ctl_table sched_schedstats_sysctls[] = { +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", .data = NULL, @@ -4447,17 +4451,39 @@ static struct ctl_table sched_schedstats_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif /* CONFIG_UCLAMP_TASK */ {} }; - -static int __init sched_schedstats_sysctl_init(void) +static int __init sched_core_sysctl_init(void) { - register_sysctl_init("kernel", sched_schedstats_sysctls); + register_sysctl_init("kernel", sched_core_sysctls); return 0; } -late_initcall(sched_schedstats_sysctl_init); -#endif /* CONFIG_PROC_SYSCTL */ -#endif /* CONFIG_SCHEDSTATS */ +late_initcall(sched_core_sysctl_init); +#endif /* CONFIG_SYSCTL */ /* * fork()/clone()-time setup: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b074f70a3e11..a48c090d57f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,29 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_min_rt_default", - .data = &sysctl_sched_uclamp_util_min_rt_default, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -- cgit v1.2.3 From d4ae80ffa64f87b9c355692b680b603add084e96 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:03 +0800 Subject: sched: Move cfs_bandwidth_slice sysctls to fair.c move cfs_bandwidth_slice sysctls to fair.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 4 ---- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++----------------- kernel/sysctl.c | 10 --------- 3 files changed, 31 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0934b21a57a4..198f77c8a873 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,10 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 788b1d6a3248..265bf7a75a37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -109,25 +109,6 @@ static unsigned int sched_nr_latency = 8; * parent will (try to) run first. */ unsigned int sysctl_sched_child_runs_first __read_mostly; -#ifdef CONFIG_SYSCTL -static struct ctl_table sched_child_runs_first_sysctls[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - {} -}; - -static int __init sched_child_runs_first_sysctl_init(void) -{ - register_sysctl_init("kernel", sched_child_runs_first_sysctls); - return 0; -} -late_initcall(sched_child_runs_first_sysctl_init); -#endif /* * SCHED_OTHER wake-up granularity. @@ -192,7 +173,37 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a48c090d57f9..10ab81c7c457 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,16 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) { .procname = "sched_energy_aware", -- cgit v1.2.3 From 8a0441415b3f9b9a920a6a5086580ea3daa7b884 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:04 +0800 Subject: sched: Move energy_aware sysctls to topology.c move energy_aware sysctls to topology.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 6 ------ kernel/sched/topology.c | 25 +++++++++++++++++++++++-- kernel/sysctl.c | 11 ----------- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 198f77c8a873..e650946816d0 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -34,10 +34,4 @@ extern int sysctl_numa_balancing_mode; int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -extern unsigned int sysctl_sched_energy_aware; -int sched_energy_aware_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 810750e62118..05b6c2ad90b9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -206,7 +206,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); -unsigned int sysctl_sched_energy_aware = 1; +static unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -220,7 +220,7 @@ void rebuild_sched_domains_energy(void) } #ifdef CONFIG_PROC_SYSCTL -int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -237,6 +237,27 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table sched_energy_aware_sysctls[] = { + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_energy_aware_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_energy_aware_sysctls); + return 0; +} + +late_initcall(sched_energy_aware_sysctl_init); #endif static void free_pd(struct perf_domain *pd) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 10ab81c7c457..8241c5401ee8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,17 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- cgit v1.2.3 From 06d177662fb86b80c7fc2290667b9a14cb0bd925 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Thu, 17 Feb 2022 12:23:21 +0800 Subject: kernel/reboot: move reboot sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the poweroff_cmd and ctrl-alt-del sysctls to its own file, kernel/reboot.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/reboot.h | 4 ---- kernel/reboot.c | 34 ++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 14 -------------- 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index af907a3d68d1..a2429648d831 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -71,12 +71,8 @@ extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); -extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); -#define POWEROFF_CMD_PATH_LEN 256 -extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; - extern void orderly_poweroff(bool force); extern void orderly_reboot(void); void hw_protection_shutdown(const char *reason, int ms_until_forced); diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..ed4e6dfb7d44 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -23,7 +23,7 @@ * this indicates whether you can reboot with ctrl-alt-del: the default is yes */ -int C_A_D = 1; +static int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); @@ -417,9 +417,37 @@ void ctrl_alt_del(void) kill_cad_pid(SIGINT, 1); } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +#define POWEROFF_CMD_PATH_LEN 256 +static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; static const char reboot_cmd[] = "/sbin/reboot"; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_reboot_table[] = { + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static void __init kernel_reboot_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_reboot_table); +} +#else +#define kernel_reboot_sysctls_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static int run_cmd(const char *cmd) { char **argv; @@ -886,6 +914,8 @@ static int __init reboot_ksysfs_init(void) return ret; } + kernel_reboot_sysctls_init(); + return 0; } late_initcall(reboot_ksysfs_init); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8241c5401ee8..5e43569ce2be 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1798,13 +1798,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_FUNCTION_TRACER { .procname = "ftrace_enabled", @@ -2111,13 +2104,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, #ifdef CONFIG_KEYS { .procname = "keys", -- cgit v1.2.3 From 43fe219aa56a2fdd8f0623c9470a32b14b0617a5 Mon Sep 17 00:00:00 2001 From: sujiaxun Date: Thu, 17 Feb 2022 18:51:48 -0800 Subject: mm: move oom_kill sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the oom_kill sysctls to their own file, mm/oom_kill.c [sfr@canb.auug.org.au: null-terminate the array] Link: https://lkml.kernel.org/r/20220216193202.28838626@canb.auug.org.au Link: https://lkml.kernel.org/r/20220215093203.31032-1-sujiaxun@uniontech.com Signed-off-by: sujiaxun Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Luis Chamberlain --- include/linux/oom.h | 4 ---- kernel/sysctl.c | 23 ----------------------- mm/oom_kill.c | 38 +++++++++++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 2db9a1432511..02d1e7bbd8cd 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -123,8 +123,4 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -/* sysctls */ -extern int sysctl_oom_dump_tasks; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_panic_on_oom; #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5e43569ce2be..a21c0ea396f3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2241,29 +2241,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7ec38194f8e1..7cc338a9e9e4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -52,9 +52,38 @@ #define CREATE_TRACE_POINTS #include -int sysctl_panic_on_oom; -int sysctl_oom_kill_allocating_task; -int sysctl_oom_dump_tasks = 1; +static int sysctl_panic_on_oom; +static int sysctl_oom_kill_allocating_task; +static int sysctl_oom_dump_tasks = 1; + +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_oom_kill_table[] = { + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* * Serializes oom killer invocations (out_of_memory()) from all contexts to @@ -677,6 +706,9 @@ static void wake_oom_reaper(struct task_struct *tsk) static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_oom_kill_table); +#endif return 0; } subsys_initcall(oom_init) -- cgit v1.2.3 From aa779e5102195e1d9ade95dcbc0bfbd8f916eb59 Mon Sep 17 00:00:00 2001 From: zhanglianjie Date: Thu, 17 Feb 2022 18:51:51 -0800 Subject: mm: move page-writeback sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the page-writeback sysctls to its own file. [akpm@linux-foundation.org: coding-style cleanups] akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warnings] Link: https://lkml.kernel.org/r/20220129012955.26594-1-zhanglianjie@uniontech.com Signed-off-by: zhanglianjie Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Luis Chamberlain --- include/linux/writeback.h | 15 ------- kernel/sysctl.c | 69 ------------------------------- mm/page-writeback.c | 103 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec..dc2b94e6a94f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -345,28 +345,13 @@ void wb_domain_exit(struct wb_domain *dom); extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ -extern int dirty_background_ratio; -extern unsigned long dirty_background_bytes; -extern int vm_dirty_ratio; -extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; -extern int vm_highmem_is_dirtyable; extern int laptop_mode; -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a21c0ea396f3..36bfe1c92d44 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,8 +100,6 @@ static const int six_hundred_forty_kb = 640 * 1024; #endif -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2263,55 +2261,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, @@ -2483,13 +2432,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, @@ -2587,17 +2529,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e2da284e427..438762173a59 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -70,30 +70,33 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ -int dirty_background_ratio = 10; +static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ -unsigned long dirty_background_bytes; +static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ -int vm_highmem_is_dirtyable; +static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +static int vm_dirty_ratio = 20; + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ -unsigned long vm_dirty_bytes; +static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks @@ -491,7 +494,8 @@ bool node_dirty_ok(struct pglist_data *pgdat) return nr_pages <= limit; } -int dirty_background_ratio_handler(struct ctl_table *table, int write, +#ifdef CONFIG_SYSCTL +static int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -502,7 +506,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -int dirty_background_bytes_handler(struct ctl_table *table, int write, +static int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -513,7 +517,7 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, +static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; @@ -527,7 +531,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, return ret; } -int dirty_bytes_handler(struct ctl_table *table, int write, +static int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; @@ -540,6 +544,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, } return ret; } +#endif static unsigned long wp_next_time(unsigned long cur_time) { @@ -1981,10 +1986,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } +#ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, +static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; @@ -2005,6 +2011,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } +#endif void laptop_mode_timer_fn(struct timer_list *t) { @@ -2069,6 +2076,79 @@ static int page_writeback_cpu_online(unsigned int cpu) return 0; } +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_page_writeback_sysctls[] = { + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif + /* * Called early on to tune the page writeback dirty limits. * @@ -2093,6 +2173,9 @@ void __init page_writeback_init(void) page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_page_writeback_sysctls); +#endif } /** -- cgit v1.2.3 From f79c9b8ae8bde10126586c1bb55b5fd027276d8e Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:58:57 +0800 Subject: kernel/lockdep: move lockdep sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the prove_locking and lock_stat sysctls to its own file, kernel/lockdep.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/lockdep.h | 4 ---- kernel/locking/lockdep.c | 35 +++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 21 --------------------- 3 files changed, 33 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 467b94257105..37951c17908e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -16,10 +16,6 @@ struct task_struct; -/* for sysctl */ -extern int prove_locking; -extern int lock_stat; - #ifdef CONFIG_LOCKDEP #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c06cab6546ed..a4382ae1be59 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -64,19 +64,50 @@ #include #ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; +static int prove_locking = 1; module_param(prove_locking, int, 0644); #else #define prove_locking 0 #endif #ifdef CONFIG_LOCK_STAT -int lock_stat = 1; +static int lock_stat = 1; module_param(lock_stat, int, 0644); #else #define lock_stat 0 #endif +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_lockdep_table[] = { +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_LOCK_STAT */ + { } +}; + +static __init int kernel_lockdep_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_lockdep_table); + return 0; +} +late_initcall(kernel_lockdep_sysctls_init); +#endif /* CONFIG_SYSCTL */ + DEFINE_PER_CPU(unsigned int, lockdep_recursion); EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 36bfe1c92d44..95380d250c8c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -88,9 +88,6 @@ #ifdef CONFIG_RT_MUTEXES #include #endif -#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) -#include -#endif #if defined(CONFIG_SYSCTL) @@ -1679,24 +1676,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "panic", .data = &panic_timeout, -- cgit v1.2.3 From 9df918698408fd914493aba0b7858fef50eba63a Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:12 +0800 Subject: kernel/panic: move panic sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the oops_all_cpu_backtrace sysctl to its own file, kernel/panic.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/panic.h | 6 ------ kernel/panic.c | 26 +++++++++++++++++++++++++- kernel/sysctl.c | 11 ----------- 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/panic.h b/include/linux/panic.h index f5844908a089..e71161da69c4 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -15,12 +15,6 @@ extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); -#ifdef CONFIG_SMP -extern unsigned int sysctl_oops_all_cpu_backtrace; -#else -#define sysctl_oops_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; diff --git a/kernel/panic.c b/kernel/panic.c index eb4dfb932c85..eb3f2fe4f6d7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -43,7 +43,9 @@ * Should we dump all CPUs backtraces in an oops event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; @@ -73,6 +75,28 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +static struct ctl_table kern_panic_table[] = { + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_panic_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_panic_table); + return 0; +} +late_initcall(kernel_panic_sysctls_init); +#endif + static long no_blink(int state) { return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 95380d250c8c..90fc2212b536 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1922,17 +1922,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SMP - { - .procname = "oops_all_cpu_backtrace", - .data = &sysctl_oops_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, -- cgit v1.2.3 From 801b501439d1b366d524dee4fc1e6b3473a95b9a Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:23 +0800 Subject: kernel/acct: move acct sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the acct sysctl to its own file, kernel/acct.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/acct.h | 1 - kernel/acct.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 12 ------------ 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/acct.h b/include/linux/acct.h index bc70e81895c0..2718c4854815 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -21,7 +21,6 @@ #ifdef CONFIG_BSD_PROCESS_ACCT struct pid_namespace; -extern int acct_parm[]; /* for sysctl */ extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); diff --git a/kernel/acct.c b/kernel/acct.c index 3df53cf1dcd5..13706356ec54 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -70,11 +70,31 @@ * Turned into sysctl-controllable parameters. AV, 12/11/98 */ -int acct_parm[3] = {4, 2, 30}; +static int acct_parm[3] = {4, 2, 30}; #define RESUME (acct_parm[0]) /* >foo% free space - resume */ #define SUSPEND (acct_parm[1]) /* #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -#include -#endif #ifdef CONFIG_RT_MUTEXES #include #endif @@ -1856,15 +1853,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MAGIC_SYSRQ { .procname = "sysrq", -- cgit v1.2.3 From 1186618a6a35d43a865448472a261184b608d13c Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:36 +0800 Subject: kernel/delayacct: move delayacct sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the delayacct sysctl to its own file, kernel/delayacct.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/delayacct.h | 3 --- kernel/delayacct.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 12 ------------ 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 3e03d010bd2e..6b16a6930a19 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -61,9 +61,6 @@ extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); -extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index c5e8cea9e05f..2c1e18f7c5cf 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,7 +44,7 @@ void delayacct_init(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, +static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int state = delayacct_on; @@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, set_delayacct(state); return err; } + +static struct ctl_table kern_delayacct_table[] = { + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_delayacct_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_delayacct_table); + return 0; +} +late_initcall(kernel_delayacct_sysctls_init); #endif void __delayacct_tsk_init(struct task_struct *tsk) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5421e28dbb25..9b74ba12a711 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,7 +67,6 @@ #include #include #include -#include #include "../lib/kstrtox.h" @@ -1651,17 +1650,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { -#ifdef CONFIG_TASK_DELAY_ACCT - { - .procname = "task_delayacct", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_delayacct, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", -- cgit v1.2.3 From d772cc2c321900f3f463a124eebeb7218e66dda6 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:49 +0800 Subject: kernel/do_mount_initrd: move real_root_dev sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the real_root_dev sysctl to its own file, kernel/do_mount_initrd.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/initrd.h | 2 -- init/do_mounts_initrd.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 9 --------- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 1bbe9af48dc3..f1a1f4c92ded 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -29,8 +29,6 @@ static inline void wait_for_initramfs(void) {} extern phys_addr_t phys_initrd_start; extern unsigned long phys_initrd_size; -extern unsigned int real_root_dev; - extern char __initramfs_start[]; extern unsigned long __initramfs_size; diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 533d81ed74d4..327962ea354c 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -14,12 +14,32 @@ unsigned long initrd_start, initrd_end; int initrd_below_start_ok; -unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ +static unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ static int __initdata mount_initrd = 1; phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_do_mounts_initrd_table[] = { + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static __init int kernel_do_mounts_initrd_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_do_mounts_initrd_table); + return 0; +} +late_initcall(kernel_do_mounts_initrd_sysctls_init); +#endif /* CONFIG_SYSCTL */ + static int __init no_initrd(char *str) { mount_initrd = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9b74ba12a711..10a551f8fcab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1693,15 +1693,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_latencytop, }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "print-fatal-signals", -- cgit v1.2.3 From 8e4e83b2278bdfb55cb2b13de07cf0a721ce8af7 Mon Sep 17 00:00:00 2001 From: Wei Xiao Date: Wed, 23 Feb 2022 19:11:53 +0800 Subject: ftrace: move sysctl_ftrace_enabled to ftrace.c This moves ftrace_enabled to trace/ftrace.c. We move sysctls to places where features actually belong to improve the readability of the code and reduce the risk of code merge conflicts. At the same time, the proc-sysctl maintainers do not want to know what sysctl knobs you wish to add for your owner piece of code, we just care about the core logic. Signed-off-by: Wei Xiao Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- include/linux/ftrace.h | 3 --- kernel/sysctl.c | 9 --------- kernel/trace/ftrace.c | 22 +++++++++++++++++++++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..088b915853dd 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -101,9 +101,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; -extern int -ftrace_enable_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 10a551f8fcab..21172d3dad6e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1751,15 +1751,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif #ifdef CONFIG_STACK_TRACER { .procname = "stack_tracer_enabled", diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..a5efbbc289b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7921,7 +7921,8 @@ static bool is_permanent_ops_registered(void) return false; } -int +#ifdef CONFIG_SYSCTL +static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -7964,3 +7965,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static struct ctl_table ftrace_sysctls[] = { + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, + {} +}; + +static int __init ftrace_sysctl_init(void) +{ + register_sysctl_init("kernel", ftrace_sysctls); + return 0; +} +late_initcall(ftrace_sysctl_init); +#endif -- cgit v1.2.3 From 807ff7ed34d2c83e09d7987bd6506cc3d520dcdf Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 7 Apr 2022 13:43:15 +0200 Subject: futex: add missing rtmutex.h include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn't included here any more since the removal of ww_mutex.h from seqlock.h which causes a build break. Signed-off-by: Christian König Acked-by: Shashank Sharma Fixes: e84815cbbc76 ("seqlock: drop seqcount_ww_mutex_t") Link: https://patchwork.freedesktop.org/patch/msgid/20220407114619.961750-1-christian.koenig@amd.com --- kernel/futex/futex.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c264cbeab71c..b5379c0e6d6d 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -3,6 +3,7 @@ #define _FUTEX_H #include +#include #include #ifdef CONFIG_PREEMPT_RT -- cgit v1.2.3 From b45043192b3e481304062938a6561da2ceea46a6 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 7 Apr 2022 21:04:23 +0800 Subject: bpf: Fix excessive memory allocation in stack_map_alloc() The 'n_buckets * (value_size + sizeof(struct stack_map_bucket))' part of the allocated memory for 'smap' is never used after the memlock accounting was removed, thus get rid of it. [ Note, Daniel: Commit b936ca643ade ("bpf: rework memlock-based memory accounting for maps") moved `cost += n_buckets * (value_size + sizeof(struct stack_map_bucket))` up and therefore before the bpf_map_area_alloc() allocation, sigh. In a later step commit c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()"), and the overflow checks of `cost >= U32_MAX - PAGE_SIZE` moved into bpf_map_charge_init(). And then 370868107bf6 ("bpf: Eliminate rlimit-based memory accounting for stackmap maps") finally removed the bpf_map_charge_init(). Anyway, the original code did the allocation same way as /after/ this fix. ] Fixes: b936ca643ade ("bpf: rework memlock-based memory accounting for maps") Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220407130423.798386-1-ytcoode@gmail.com --- kernel/bpf/stackmap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6131b4a19572..1dd5266fbebb 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -100,7 +100,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From aa1b02e674fe69acd04624f5bcdef94928bc8695 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sun, 10 Apr 2022 14:00:19 +0800 Subject: bpf: Remove redundant assignment to meta.seq in __task_seq_show() The seq argument is assigned to meta.seq twice, the second one is redundant, remove it. This patch also removes a redundant space in bpf_iter_link_attach(). Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220410060020.307283-1-ytcoode@gmail.com --- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/task_iter.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dea920b3b840..d5d96ceca105 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -545,7 +545,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index d94696198ef8..8c921799def4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -99,7 +99,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task, if (!prog) return 0; - meta.seq = seq; ctx.meta = &meta; ctx.task = task; return bpf_iter_run_prog(prog, &ctx); -- cgit v1.2.3 From 2900005ea287b11dcc8c1b9fcf24893b7ff41d6d Mon Sep 17 00:00:00 2001 From: Yan Zhu Date: Thu, 7 Apr 2022 15:07:59 +0800 Subject: bpf: Move BPF sysctls from kernel/sysctl.c to BPF core We're moving sysctls out of kernel/sysctl.c as it is a mess. We already moved all filesystem sysctls out. And with time the goal is to move all sysctls out to their own subsystem/actual user. kernel/sysctl.c has grown to an insane mess and its easy to run into conflicts with it. The effort to move them out into various subsystems is part of this. Signed-off-by: Yan Zhu Signed-off-by: Daniel Borkmann Cc: Luis Chamberlain Link: https://lore.kernel.org/bpf/20220407070759.29506-1-zhuyan34@huawei.com --- kernel/bpf/syscall.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 79 ----------------------------------------------- 2 files changed, 87 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cdaa1152436a..e9621cfa09f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4908,3 +4908,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = { const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; + +#ifdef CONFIG_SYSCTL +static int bpf_stats_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} + +void __weak unpriv_ebpf_notify(int new_state) +{ +} + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + + unpriv_ebpf_notify(unpriv_enable); + + return ret; +} + +static struct ctl_table bpf_syscall_table[] = { + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = bpf_stats_handler, + }, + { } +}; + +static int __init bpf_syscall_sysctl_init(void) +{ + register_sysctl_init("kernel", bpf_syscall_table); + return 0; +} +late_initcall(bpf_syscall_sysctl_init); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..47139877f62d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,7 +62,6 @@ #include #include #include -#include #include #include #include @@ -148,66 +147,6 @@ static const int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int bpf_stats_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct static_key *key = (struct static_key *)table->data; - static int saved_val; - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&bpf_stats_enabled_mutex); - val = saved_val; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret && val != saved_val) { - if (val) - static_key_slow_inc(key); - else - static_key_slow_dec(key); - saved_val = val; - } - mutex_unlock(&bpf_stats_enabled_mutex); - return ret; -} - -void __weak unpriv_ebpf_notify(int new_state) -{ -} - -static int bpf_unpriv_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret, unpriv_enable = *(int *)table->data; - bool locked_state = unpriv_enable == 1; - struct ctl_table tmp = *table; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - tmp.data = &unpriv_enable; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (locked_state && unpriv_enable != 1) - return -EPERM; - *(int *)table->data = unpriv_enable; - } - - unpriv_ebpf_notify(unpriv_enable); - - return ret; -} -#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ - /* * /proc/sys support */ @@ -2299,24 +2238,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - .proc_handler = bpf_unpriv_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = bpf_stats_handler, - }, -#endif #if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", -- cgit v1.2.3 From 241d50ec5d79b94694adf13853c1f55d0f0b85e6 Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Tue, 12 Apr 2022 18:50:48 -0700 Subject: bpf: Remove unnecessary type castings Remove/clean up unnecessary void * type castings. Signed-off-by: Yu Zhe Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220413015048.12319-1-yuzhe@nfschina.com --- kernel/bpf/bpf_struct_ops.c | 4 ++-- kernel/bpf/hashtab.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 21069dbe9138..de01d37c2d3b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -263,7 +263,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, /* No lock is needed. state and refcnt do not need * to be updated together under atomic context. */ - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->state = state; refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); @@ -353,7 +353,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) return err; - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; err = check_zero_holes(t, uvalue->data); if (err) return err; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 65877967f414..c68fbebc8c00 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -738,7 +738,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) */ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { - struct bpf_htab *htab = (struct bpf_htab *)arg; + struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; -- cgit v1.2.3 From 5d79fa0d33258d8e79064316ce8fae37e27bd34b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 7 Apr 2022 15:46:12 +0800 Subject: ftrace: Fix build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SYSCTL and CONFIG_DYNAMIC_FTRACE is n, build warns: kernel/trace/ftrace.c:7912:13: error: ‘is_permanent_ops_registered’ defined but not used [-Werror=unused-function] static bool is_permanent_ops_registered(void) ^~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/trace/ftrace.c:89:12: error: ‘last_ftrace_enabled’ defined but not used [-Werror=unused-variable] static int last_ftrace_enabled; ^~~~~~~~~~~~~~~~~~~ Move is_permanent_ops_registered() to ifdef block and mark last_ftrace_enabled as __maybe_unused to fix this. Fixes: 7cde53da38a3 ("ftrace: move sysctl_ftrace_enabled to ftrace.c") Signed-off-by: YueHaibing Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a5efbbc289b4..db8d553728b6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -86,7 +86,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = { /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; -static int last_ftrace_enabled; +static int __maybe_unused last_ftrace_enabled; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; @@ -7909,6 +7909,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +#ifdef CONFIG_SYSCTL static bool is_permanent_ops_registered(void) { struct ftrace_ops *op; @@ -7921,7 +7922,6 @@ static bool is_permanent_ops_registered(void) return false; } -#ifdef CONFIG_SYSCTL static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From 07410559f38360885e91cff1b800168681ac515c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 09:44:32 +0100 Subject: dma-direct: use is_swiotlb_active in dma_direct_map_page Use the more specific is_swiotlb_active check instead of checking the global swiotlb_force variable. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- kernel/dma/direct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 8a6cd53dbe8c..a78c0ba70645 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -91,7 +91,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, return swiotlb_map(dev, phys, size, dir, attrs); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) + if (is_swiotlb_active(dev)) return swiotlb_map(dev, phys, size, dir, attrs); dev_WARN_ONCE(dev, 1, -- cgit v1.2.3 From 3469d36d470df148e8d940c1a6399510562bf3b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 10:01:26 +0100 Subject: swiotlb: make swiotlb_exit a no-op if SWIOTLB_FORCE is set If force bouncing is enabled we can't release the buffers. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- kernel/dma/swiotlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 73a41cec9e38..98bb0eb44a7b 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -369,6 +369,9 @@ void __init swiotlb_exit(void) unsigned long tbl_vaddr; size_t tbl_size, slots_size; + if (swiotlb_force == SWIOTLB_FORCE) + return; + if (!mem->nslabs) return; -- cgit v1.2.3 From a2daa27c0c6137481226aee5b3136e453c642929 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:44:42 +0100 Subject: swiotlb: simplify swiotlb_max_segment Remove the bogus Xen override that was usually larger than the actual size and just calculate the value on demand. Note that swiotlb_max_segment still doesn't make sense as an interface and should eventually be removed. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- drivers/xen/swiotlb-xen.c | 2 -- include/linux/swiotlb.h | 1 - kernel/dma/swiotlb.c | 20 +++----------------- 3 files changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 47aebd98f52f..485cd06ed39e 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -202,7 +202,6 @@ retry: rc = swiotlb_late_init_with_tbl(start, nslabs); if (rc) return rc; - swiotlb_set_max_segment(PAGE_SIZE); return 0; error: if (nslabs > 1024 && repeat--) { @@ -254,7 +253,6 @@ retry: if (swiotlb_init_with_tbl(start, nslabs, true)) panic("Cannot allocate SWIOTLB buffer"); - swiotlb_set_max_segment(PAGE_SIZE); } #endif /* CONFIG_X86 */ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index f6c3638255d5..9fb3a568f0c5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -164,7 +164,6 @@ static inline void swiotlb_adjust_size(unsigned long size) #endif /* CONFIG_SWIOTLB */ extern void swiotlb_print_info(void); -extern void swiotlb_set_max_segment(unsigned int); #ifdef CONFIG_DMA_RESTRICTED_POOL struct page *swiotlb_alloc(struct device *dev, size_t size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 98bb0eb44a7b..e0127e397335 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -68,12 +68,6 @@ struct io_tlb_mem io_tlb_default_mem; phys_addr_t swiotlb_unencrypted_base; -/* - * Max segment that we can provide which (if pages are contingous) will - * not be bounced (unless SWIOTLB_FORCE is set). - */ -static unsigned int max_segment; - static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static int __init @@ -97,18 +91,12 @@ early_param("swiotlb", setup_io_tlb_npages); unsigned int swiotlb_max_segment(void) { - return io_tlb_default_mem.nslabs ? max_segment : 0; + if (!io_tlb_default_mem.nslabs) + return 0; + return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE); } EXPORT_SYMBOL_GPL(swiotlb_max_segment); -void swiotlb_set_max_segment(unsigned int val) -{ - if (swiotlb_force == SWIOTLB_FORCE) - max_segment = 1; - else - max_segment = rounddown(val, PAGE_SIZE); -} - unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; @@ -258,7 +246,6 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (verbose) swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } @@ -359,7 +346,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } -- cgit v1.2.3 From 0d5ffd9a256d8995764f9d4a35a8c3917839d169 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:07:28 +0100 Subject: swiotlb: rename swiotlb_late_init_with_default_size swiotlb_late_init_with_default_size is an overly verbose name that doesn't even catch what the function is doing, given that the size is not just a default but the actual requested size. Rename it to swiotlb_init_late. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/pci/sta2x11-fixup.c | 2 +- include/linux/swiotlb.h | 2 +- kernel/dma/swiotlb.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 101081ad64b6..e0c039a75b2d 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev) int size = STA2X11_SWIOTLB_SIZE; /* First instance: register your own swiotlb area */ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_late_init_with_default_size(size)) + if (swiotlb_init_late(size)) dev_emerg(&pdev->dev, "init swiotlb failed\n"); } list_add(&instance->list, &sta2x11_instance_list); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 9fb3a568f0c5..b48b26bfa0ed 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -40,7 +40,7 @@ extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); -extern int swiotlb_late_init_with_default_size(size_t default_size); +int swiotlb_init_late(size_t size); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e0127e397335..9a4fe6e48a07 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -281,11 +281,9 @@ fail: * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ -int -swiotlb_late_init_with_default_size(size_t default_size) +int swiotlb_init_late(size_t size) { - unsigned long nslabs = - ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned long bytes; unsigned char *vstart = NULL; unsigned int order; -- cgit v1.2.3 From c6af2aa9ffc9763826607bc2664ef3ea4475ed18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Mar 2022 17:27:33 +0200 Subject: swiotlb: make the swiotlb_init interface more useful Pass a boolean flag to indicate if swiotlb needs to be enabled based on the addressing needs, and replace the verbose argument with a set of flags, including one to force enable bounce buffering. Note that this patch removes the possibility to force xen-swiotlb use with the swiotlb=force parameter on the command line on x86 (arm and arm64 never supported that), but this interface will be restored shortly. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/arm/mm/init.c | 6 +----- arch/arm64/mm/init.c | 6 +----- arch/ia64/mm/init.c | 4 +--- arch/mips/cavium-octeon/dma-octeon.c | 2 +- arch/mips/loongson64/dma.c | 2 +- arch/mips/sibyte/common/dma.c | 2 +- arch/powerpc/mm/mem.c | 3 ++- arch/powerpc/platforms/pseries/setup.c | 3 --- arch/riscv/mm/init.c | 8 +------- arch/s390/mm/init.c | 3 +-- arch/x86/kernel/pci-dma.c | 15 +++++++-------- drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/swiotlb.h | 15 +++++++-------- include/trace/events/swiotlb.h | 29 ++++++++++------------------ kernel/dma/swiotlb.c | 35 ++++++++++++++++++---------------- 15 files changed, 55 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index fe249ea91908..ce64bdb55a16 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -271,11 +271,7 @@ static void __init free_highpages(void) void __init mem_init(void) { #ifdef CONFIG_ARM_LPAE - if (swiotlb_force == SWIOTLB_FORCE || - max_pfn > arm_dma_pfn_limit) - swiotlb_init(1); - else - swiotlb_force = SWIOTLB_NO_FORCE; + swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE); #endif set_max_mapnr(pfn_to_page(max_pfn) - mem_map); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1e7b1550e2fc..bd4095b7fb40 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -398,11 +398,7 @@ void __init bootmem_init(void) */ void __init mem_init(void) { - if (swiotlb_force == SWIOTLB_FORCE || - max_pfn > PFN_DOWN(arm64_dma_phys_limit)) - swiotlb_init(1); - else if (!xen_swiotlb_detect()) - swiotlb_force = SWIOTLB_NO_FORCE; + swiotlb_init(max_pfn > PFN_DOWN(arm64_dma_phys_limit), SWIOTLB_VERBOSE); /* this will put all unused low memory onto the freelists */ memblock_free_all(); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 5d165607bf35..3c3e15b22608 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -437,9 +437,7 @@ mem_init (void) if (iommu_detected) break; #endif -#ifdef CONFIG_SWIOTLB - swiotlb_init(1); -#endif + swiotlb_init(true, SWIOTLB_VERBOSE); } while (0); #ifdef CONFIG_FLATMEM diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index fb7547e21726..9fbba6a8fa4c 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -235,5 +235,5 @@ void __init plat_swiotlb_setup(void) #endif swiotlb_adjust_size(swiotlbsize); - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_VERBOSE); } diff --git a/arch/mips/loongson64/dma.c b/arch/mips/loongson64/dma.c index 364f2f27c872..8220a1bc0db6 100644 --- a/arch/mips/loongson64/dma.c +++ b/arch/mips/loongson64/dma.c @@ -24,5 +24,5 @@ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) void __init plat_swiotlb_setup(void) { - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_VERBOSE); } diff --git a/arch/mips/sibyte/common/dma.c b/arch/mips/sibyte/common/dma.c index eb47a94f3583..c5c2c782aff6 100644 --- a/arch/mips/sibyte/common/dma.c +++ b/arch/mips/sibyte/common/dma.c @@ -10,5 +10,5 @@ void __init plat_swiotlb_setup(void) { - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_VERBOSE); } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 4d221d033804..74ca516c3e7e 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -251,7 +252,7 @@ void __init mem_init(void) if (is_secure_guest()) svm_swiotlb_init(); else - swiotlb_init(0); + swiotlb_init(ppc_swiotlb_enable, 0); #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 955ff8aa1644..0f74b2284773 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -849,9 +849,6 @@ static void __init pSeries_setup_arch(void) } ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare; - - if (swiotlb_force == SWIOTLB_FORCE) - ppc_swiotlb_enable = 1; } static void pseries_panic(char *str) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9535bea8688c..181ffd322eaf 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -120,13 +120,7 @@ void __init mem_init(void) BUG_ON(!mem_map); #endif /* CONFIG_FLATMEM */ -#ifdef CONFIG_SWIOTLB - if (swiotlb_force == SWIOTLB_FORCE || - max_pfn > PFN_DOWN(dma32_phys_limit)) - swiotlb_init(1); - else - swiotlb_force = SWIOTLB_NO_FORCE; -#endif + swiotlb_init(max_pfn > PFN_DOWN(dma32_phys_limit), SWIOTLB_VERBOSE); memblock_free_all(); print_vm_layout(); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 86ffd0d51fd5..6fb6bf64326f 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -185,8 +185,7 @@ static void pv_init(void) return; /* make sure bounce buffers are shared */ - swiotlb_force = SWIOTLB_FORCE; - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); swiotlb_update_mem_attributes(); } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 04140e20ef1a..a705a199bf8a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -39,6 +39,7 @@ int iommu_detected __read_mostly = 0; #ifdef CONFIG_SWIOTLB bool x86_swiotlb_enable; +static unsigned int x86_swiotlb_flags; static void __init pci_swiotlb_detect(void) { @@ -58,16 +59,16 @@ static void __init pci_swiotlb_detect(void) * bounce buffers as the hypervisor can't access arbitrary VM memory * that is not explicitly shared with it. */ - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - swiotlb_force = SWIOTLB_FORCE; - - if (swiotlb_force == SWIOTLB_FORCE) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_FORCE; + } } #else static inline void __init pci_swiotlb_detect(void) { } +#define x86_swiotlb_flags 0 #endif /* CONFIG_SWIOTLB */ #ifdef CONFIG_SWIOTLB_XEN @@ -75,8 +76,7 @@ static bool xen_swiotlb; static void __init pci_xen_swiotlb_init(void) { - if (!xen_initial_domain() && !x86_swiotlb_enable && - swiotlb_force != SWIOTLB_FORCE) + if (!xen_initial_domain() && !x86_swiotlb_enable) return; x86_swiotlb_enable = true; xen_swiotlb = true; @@ -120,8 +120,7 @@ void __init pci_iommu_alloc(void) gart_iommu_hole_init(); amd_iommu_detect(); detect_intel_iommu(); - if (x86_swiotlb_enable) - swiotlb_init(0); + swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } /* diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 485cd06ed39e..c2da3eb4826e 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -251,7 +251,7 @@ retry: panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc); } - if (swiotlb_init_with_tbl(start, nslabs, true)) + if (swiotlb_init_with_tbl(start, nslabs, SWIOTLB_VERBOSE)) panic("Cannot allocate SWIOTLB buffer"); } #endif /* CONFIG_X86 */ @@ -376,7 +376,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, /* * Oh well, have to allocate and map a bounce buffer. */ - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + trace_swiotlb_bounced(dev, dev_addr, size); map = swiotlb_tbl_map_single(dev, phys, size, size, 0, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b48b26bfa0ed..ae0407173e84 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -13,11 +13,8 @@ struct device; struct page; struct scatterlist; -enum swiotlb_force { - SWIOTLB_NORMAL, /* Default - depending on HW DMA mask etc. */ - SWIOTLB_FORCE, /* swiotlb=force */ - SWIOTLB_NO_FORCE, /* swiotlb=noforce */ -}; +#define SWIOTLB_VERBOSE (1 << 0) /* verbose initialization */ +#define SWIOTLB_FORCE (1 << 1) /* force bounce buffering */ /* * Maximum allowable number of contiguous slabs to map, @@ -36,8 +33,7 @@ enum swiotlb_force { /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) -extern void swiotlb_init(int verbose); -int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); +int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); int swiotlb_init_late(size_t size); @@ -126,13 +122,16 @@ static inline bool is_swiotlb_force_bounce(struct device *dev) return mem && mem->force_bounce; } +void swiotlb_init(bool addressing_limited, unsigned int flags); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(struct device *dev); void __init swiotlb_adjust_size(unsigned long size); #else -#define swiotlb_force SWIOTLB_NO_FORCE +static inline void swiotlb_init(bool addressing_limited, unsigned int flags) +{ +} static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; diff --git a/include/trace/events/swiotlb.h b/include/trace/events/swiotlb.h index 705be43b71ab..da05c9ebd224 100644 --- a/include/trace/events/swiotlb.h +++ b/include/trace/events/swiotlb.h @@ -8,20 +8,15 @@ #include TRACE_EVENT(swiotlb_bounced, - - TP_PROTO(struct device *dev, - dma_addr_t dev_addr, - size_t size, - enum swiotlb_force swiotlb_force), - - TP_ARGS(dev, dev_addr, size, swiotlb_force), + TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size), + TP_ARGS(dev, dev_addr, size), TP_STRUCT__entry( - __string( dev_name, dev_name(dev) ) - __field( u64, dma_mask ) - __field( dma_addr_t, dev_addr ) - __field( size_t, size ) - __field( enum swiotlb_force, swiotlb_force ) + __string(dev_name, dev_name(dev)) + __field(u64, dma_mask) + __field(dma_addr_t, dev_addr) + __field(size_t, size) + __field(bool, force) ), TP_fast_assign( @@ -29,19 +24,15 @@ TRACE_EVENT(swiotlb_bounced, __entry->dma_mask = (dev->dma_mask ? *dev->dma_mask : 0); __entry->dev_addr = dev_addr; __entry->size = size; - __entry->swiotlb_force = swiotlb_force; + __entry->force = is_swiotlb_force_bounce(dev); ), - TP_printk("dev_name: %s dma_mask=%llx dev_addr=%llx " - "size=%zu %s", + TP_printk("dev_name: %s dma_mask=%llx dev_addr=%llx size=%zu %s", __get_str(dev_name), __entry->dma_mask, (unsigned long long)__entry->dev_addr, __entry->size, - __print_symbolic(__entry->swiotlb_force, - { SWIOTLB_NORMAL, "NORMAL" }, - { SWIOTLB_FORCE, "FORCE" }, - { SWIOTLB_NO_FORCE, "NO_FORCE" })) + __entry->force ? "FORCE" : "NORMAL") ); #endif /* _TRACE_SWIOTLB_H */ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9a4fe6e48a07..86e877a96b82 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -62,7 +62,8 @@ #define INVALID_PHYS_ADDR (~(phys_addr_t)0) -enum swiotlb_force swiotlb_force; +static bool swiotlb_force_bounce; +static bool swiotlb_force_disable; struct io_tlb_mem io_tlb_default_mem; @@ -81,9 +82,9 @@ setup_io_tlb_npages(char *str) if (*str == ',') ++str; if (!strcmp(str, "force")) - swiotlb_force = SWIOTLB_FORCE; + swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) - swiotlb_force = SWIOTLB_NO_FORCE; + swiotlb_force_disable = true; return 0; } @@ -202,7 +203,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->index = 0; mem->late_alloc = late_alloc; - if (swiotlb_force == SWIOTLB_FORCE) + if (swiotlb_force_bounce) mem->force_bounce = true; spin_lock_init(&mem->lock); @@ -224,12 +225,13 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, + unsigned int flags) { struct io_tlb_mem *mem = &io_tlb_default_mem; size_t alloc_size; - if (swiotlb_force == SWIOTLB_NO_FORCE) + if (swiotlb_force_disable) return 0; /* protect against double initialization */ @@ -243,8 +245,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); + mem->force_bounce = flags & SWIOTLB_FORCE; - if (verbose) + if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); return 0; } @@ -253,20 +256,21 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ -void __init -swiotlb_init(int verbose) +void __init swiotlb_init(bool addressing_limit, unsigned int flags) { size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); void *tlb; - if (swiotlb_force == SWIOTLB_NO_FORCE) + if (!addressing_limit && !swiotlb_force_bounce) + return; + if (swiotlb_force_disable) return; /* Get IO TLB memory from the low pages */ tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) goto fail; - if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose)) + if (swiotlb_init_with_tbl(tlb, default_nslabs, flags)) goto fail_free_mem; return; @@ -289,7 +293,7 @@ int swiotlb_init_late(size_t size) unsigned int order; int rc = 0; - if (swiotlb_force == SWIOTLB_NO_FORCE) + if (swiotlb_force_disable) return 0; /* @@ -328,7 +332,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long bytes = nslabs << IO_TLB_SHIFT; - if (swiotlb_force == SWIOTLB_NO_FORCE) + if (swiotlb_force_disable) return 0; /* protect against double initialization */ @@ -353,7 +357,7 @@ void __init swiotlb_exit(void) unsigned long tbl_vaddr; size_t tbl_size, slots_size; - if (swiotlb_force == SWIOTLB_FORCE) + if (swiotlb_force_bounce) return; if (!mem->nslabs) @@ -704,8 +708,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, phys_addr_t swiotlb_addr; dma_addr_t dma_addr; - trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, - swiotlb_force); + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, attrs); -- cgit v1.2.3 From 8ba2ed1be90fc210126f68186564707478552c95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:36:57 +0200 Subject: swiotlb: add a SWIOTLB_ANY flag to lift the low memory restriction Power SVM wants to allocate a swiotlb buffer that is not restricted to low memory for the trusted hypervisor scheme. Consolidate the support for this into the swiotlb_init interface by adding a new flag. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/powerpc/include/asm/svm.h | 4 ---- arch/powerpc/include/asm/swiotlb.h | 1 + arch/powerpc/kernel/dma-swiotlb.c | 1 + arch/powerpc/mm/mem.c | 5 +---- arch/powerpc/platforms/pseries/svm.c | 26 +------------------------- include/linux/swiotlb.h | 1 + kernel/dma/swiotlb.c | 11 +++++++++-- 7 files changed, 14 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index 7546402d796a..85580b30aba4 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -15,8 +15,6 @@ static inline bool is_secure_guest(void) return mfmsr() & MSR_S; } -void __init svm_swiotlb_init(void); - void dtl_cache_ctor(void *addr); #define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL) @@ -27,8 +25,6 @@ static inline bool is_secure_guest(void) return false; } -static inline void svm_swiotlb_init(void) {} - #define get_dtl_cache_ctor() NULL #endif /* CONFIG_PPC_SVM */ diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 3c1a1cd16128..4203b5e0a88e 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -9,6 +9,7 @@ #include extern unsigned int ppc_swiotlb_enable; +extern unsigned int ppc_swiotlb_flags; #ifdef CONFIG_SWIOTLB void swiotlb_detect_4g(void); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index fc7816126a40..ba256c37bcc0 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -10,6 +10,7 @@ #include unsigned int ppc_swiotlb_enable; +unsigned int ppc_swiotlb_flags; void __init swiotlb_detect_4g(void) { diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 74ca516c3e7e..46fb78e3bb36 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -249,10 +249,7 @@ void __init mem_init(void) * back to to-down. */ memblock_set_bottom_up(true); - if (is_secure_guest()) - svm_swiotlb_init(); - else - swiotlb_init(ppc_swiotlb_enable, 0); + swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index c5228f4969eb..3b4045d508ec 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -28,7 +28,7 @@ static int __init init_svm(void) * need to use the SWIOTLB buffer for DMA even if dma_capable() says * otherwise. */ - swiotlb_force = SWIOTLB_FORCE; + ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE; /* Share the SWIOTLB buffer with the host. */ swiotlb_update_mem_attributes(); @@ -37,30 +37,6 @@ static int __init init_svm(void) } machine_early_initcall(pseries, init_svm); -/* - * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it - * can allocate the buffer anywhere in memory. Since the hypervisor doesn't have - * any addressing limitation, we don't need to allocate it in low addresses. - */ -void __init svm_swiotlb_init(void) -{ - unsigned char *vstart; - unsigned long bytes, io_tlb_nslabs; - - io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false)) - return; - - - memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - panic("SVM: Cannot allocate SWIOTLB buffer"); -} - int set_memory_encrypted(unsigned long addr, int numpages) { if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ae0407173e84..eabdd8998702 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -15,6 +15,7 @@ struct scatterlist; #define SWIOTLB_VERBOSE (1 << 0) /* verbose initialization */ #define SWIOTLB_FORCE (1 << 1) /* force bounce buffering */ +#define SWIOTLB_ANY (1 << 2) /* allow any memory for the buffer */ /* * Maximum allowable number of contiguous slabs to map, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 86e877a96b82..f6e091424af3 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -266,8 +266,15 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags) if (swiotlb_force_disable) return; - /* Get IO TLB memory from the low pages */ - tlb = memblock_alloc_low(bytes, PAGE_SIZE); + /* + * By default allocate the bounce buffer memory from low memory, but + * allow to pick a location everywhere for hypervisors with guest + * memory encryption. + */ + if (flags & SWIOTLB_ANY) + tlb = memblock_alloc(bytes, PAGE_SIZE); + else + tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) goto fail; if (swiotlb_init_with_tbl(tlb, default_nslabs, flags)) -- cgit v1.2.3 From 742519538e6b07250c8085bbff4bd358bc03bf16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:12:59 +0100 Subject: swiotlb: pass a gfp_mask argument to swiotlb_init_late Let the caller chose a zone to allocate from. This will be used later on by the xen-swiotlb initialization on arm. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/pci/sta2x11-fixup.c | 2 +- include/linux/swiotlb.h | 2 +- kernel/dma/swiotlb.c | 7 ++----- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index e0c039a75b2d..c7e6faf59a86 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev) int size = STA2X11_SWIOTLB_SIZE; /* First instance: register your own swiotlb area */ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size)) + if (swiotlb_init_late(size, GFP_DMA)) dev_emerg(&pdev->dev, "init swiotlb failed\n"); } list_add(&instance->list, &sta2x11_instance_list); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index eabdd8998702..ee655f2e4d28 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -37,7 +37,7 @@ struct scatterlist; int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); -int swiotlb_init_late(size_t size); +int swiotlb_init_late(size_t size, gfp_t gfp_mask); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f6e091424af3..119187afc65e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -292,7 +292,7 @@ fail: * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ -int swiotlb_init_late(size_t size) +int swiotlb_init_late(size_t size, gfp_t gfp_mask) { unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned long bytes; @@ -303,15 +303,12 @@ int swiotlb_init_late(size_t size) if (swiotlb_force_disable) return 0; - /* - * Get IO TLB memory from the low pages - */ order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; -- cgit v1.2.3 From 7374153d294eb51de5a81ac38ff1c4fef8927bec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Mar 2022 08:02:57 +0100 Subject: swiotlb: provide swiotlb_init variants that remap the buffer To shared more code between swiotlb and xen-swiotlb, offer a swiotlb_init_remap interface and add a remap callback to swiotlb_init_late that will allow Xen to remap the buffer without duplicating much of the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/pci/sta2x11-fixup.c | 2 +- include/linux/swiotlb.h | 5 ++++- kernel/dma/swiotlb.c | 36 +++++++++++++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index c7e6faf59a86..7368afc03998 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev) int size = STA2X11_SWIOTLB_SIZE; /* First instance: register your own swiotlb area */ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size, GFP_DMA)) + if (swiotlb_init_late(size, GFP_DMA, NULL)) dev_emerg(&pdev->dev, "init swiotlb failed\n"); } list_add(&instance->list, &sta2x11_instance_list); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ee655f2e4d28..7b50c82f84ce 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -36,8 +36,11 @@ struct scatterlist; int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); +void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)); +int swiotlb_init_late(size_t size, gfp_t gfp_mask, + int (*remap)(void *tlb, unsigned long nslabs)); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); -int swiotlb_init_late(size_t size, gfp_t gfp_mask); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 119187afc65e..f6acfc7a41bf 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -256,9 +256,11 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ -void __init swiotlb_init(bool addressing_limit, unsigned int flags) +void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)) { - size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); + unsigned long nslabs = default_nslabs; + size_t bytes; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) @@ -271,12 +273,23 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags) * allow to pick a location everywhere for hypervisors with guest * memory encryption. */ +retry: + bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); if (flags & SWIOTLB_ANY) tlb = memblock_alloc(bytes, PAGE_SIZE); else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) goto fail; + if (remap && remap(tlb, nslabs) < 0) { + memblock_free(tlb, PAGE_ALIGN(bytes)); + + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + if (nslabs < IO_TLB_MIN_SLABS) + panic("%s: Failed to remap %zu bytes\n", + __func__, bytes); + goto retry; + } if (swiotlb_init_with_tbl(tlb, default_nslabs, flags)) goto fail_free_mem; return; @@ -287,12 +300,18 @@ fail: pr_warn("Cannot allocate buffer"); } +void __init swiotlb_init(bool addressing_limit, unsigned int flags) +{ + return swiotlb_init_remap(addressing_limit, flags, NULL); +} + /* * Systems with larger DMA zones (those that don't support ISA) can * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ -int swiotlb_init_late(size_t size, gfp_t gfp_mask) +int swiotlb_init_late(size_t size, gfp_t gfp_mask, + int (*remap)(void *tlb, unsigned long nslabs)) { unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned long bytes; @@ -303,6 +322,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask) if (swiotlb_force_disable) return 0; +retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; bytes = nslabs << IO_TLB_SHIFT; @@ -323,6 +343,16 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask) (PAGE_SIZE << order) >> 20); nslabs = SLABS_PER_PAGE << order; } + if (remap) + rc = remap(vstart, nslabs); + if (rc) { + free_pages((unsigned long)vstart, order); + + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + if (nslabs < IO_TLB_MIN_SLABS) + return rc; + goto retry; + } rc = swiotlb_late_init_with_tbl(vstart, nslabs); if (rc) free_pages((unsigned long)vstart, order); -- cgit v1.2.3 From 6424e31b1c050a25aea033206d5f626f3523448c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2022 07:41:04 +0100 Subject: swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl No users left. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 2 -- kernel/dma/swiotlb.c | 77 +++++++++++++------------------------------------ 2 files changed, 20 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 7b50c82f84ce..7ed35dd3de6e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -34,13 +34,11 @@ struct scatterlist; /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) -int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)); int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)); -extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f6acfc7a41bf..e2ef0864eb1e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -225,33 +225,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, - unsigned int flags) -{ - struct io_tlb_mem *mem = &io_tlb_default_mem; - size_t alloc_size; - - if (swiotlb_force_disable) - return 0; - - /* protect against double initialization */ - if (WARN_ON_ONCE(mem->nslabs)) - return -ENOMEM; - - alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); - mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem->slots) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); - - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); - mem->force_bounce = flags & SWIOTLB_FORCE; - - if (flags & SWIOTLB_VERBOSE) - swiotlb_print_info(); - return 0; -} - /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. @@ -259,7 +232,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = default_nslabs; + size_t alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); size_t bytes; void *tlb; @@ -280,7 +255,8 @@ retry: else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) - goto fail; + panic("%s: failed to allocate tlb structure\n", __func__); + if (remap && remap(tlb, nslabs) < 0) { memblock_free(tlb, PAGE_ALIGN(bytes)); @@ -290,14 +266,17 @@ retry: __func__, bytes); goto retry; } - if (swiotlb_init_with_tbl(tlb, default_nslabs, flags)) - goto fail_free_mem; - return; -fail_free_mem: - memblock_free(tlb, bytes); -fail: - pr_warn("Cannot allocate buffer"); + mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem->slots) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), default_nslabs, false); + mem->force_bounce = flags & SWIOTLB_FORCE; + + if (flags & SWIOTLB_VERBOSE) + swiotlb_print_info(); } void __init swiotlb_init(bool addressing_limit, unsigned int flags) @@ -313,6 +292,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags) int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned long bytes; unsigned char *vstart = NULL; @@ -353,33 +333,16 @@ retry: return rc; goto retry; } - rc = swiotlb_late_init_with_tbl(vstart, nslabs); - if (rc) - free_pages((unsigned long)vstart, order); - - return rc; -} - -int -swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) -{ - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long bytes = nslabs << IO_TLB_SHIFT; - - if (swiotlb_force_disable) - return 0; - - /* protect against double initialization */ - if (WARN_ON_ONCE(mem->nslabs)) - return -ENOMEM; mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); - if (!mem->slots) + if (!mem->slots) { + free_pages((unsigned long)vstart, order); return -ENOMEM; + } - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); + set_memory_decrypted((unsigned long)vstart, bytes >> PAGE_SHIFT); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true); swiotlb_print_info(); return 0; -- cgit v1.2.3 From 055eb95533273bc334794dbc598400d10800528f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 14 Apr 2022 09:12:33 -0700 Subject: bpf: Move rcu lock management out of BPF_PROG_RUN routines Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") switched a bunch of BPF_PROG_RUN macros to inline routines. This changed the semantic a bit. Due to arguments expansion of macros, it used to be: rcu_read_lock(); array = rcu_dereference(cgrp->bpf.effective[atype]); ... Now, with with inline routines, we have: array_rcu = rcu_dereference(cgrp->bpf.effective[atype]); /* array_rcu can be kfree'd here */ rcu_read_lock(); array = rcu_dereference(array_rcu); I'm assuming in practice rcu subsystem isn't fast enough to trigger this but let's use rcu API properly. Also, rename to lower caps to not confuse with macros. Additionally, drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY. See [1] for more context. [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u v2 - keep rcu locks inside by passing cgroup_bpf Fixes: 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220414161233.170780-1-sdf@google.com --- drivers/media/rc/bpf-lirc.c | 8 ++- include/linux/bpf.h | 115 +++------------------------------------- kernel/bpf/cgroup.c | 124 ++++++++++++++++++++++++++++++++++++++------ kernel/trace/bpf_trace.c | 5 +- 4 files changed, 124 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 3eff08d7b8e5..fe17c7f98e81 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -216,8 +216,12 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) raw->bpf_sample = sample; - if (raw->progs) - BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run); + if (raw->progs) { + rcu_read_lock(); + bpf_prog_run_array(rcu_dereference(raw->progs), + &raw->bpf_sample, bpf_prog_run); + rcu_read_unlock(); + } } /* diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb5298735ce..7bf441563ffc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1221,7 +1221,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); + * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. @@ -1315,83 +1315,22 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline int -BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval, u32 *ret_flags) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - u32 func_ret; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - func_ret = run_prog(prog, ctx); - if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - *(ret_flags) |= (func_ret >> 1); - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - -static __always_inline int -BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - static __always_inline u32 -BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); + if (unlikely(!array)) - goto out; + return ret; + + migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -1400,50 +1339,10 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, item++; } bpf_reset_run_ctx(old_run_ctx); -out: - rcu_read_unlock(); migrate_enable(); return ret; } -/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs - * so BPF programs can request cwr for TCP packets. - * - * Current cgroup skb programs can only return 0 or 1 (0 to drop the - * packet. This macro changes the behavior so the low order bit - * indicates whether the packet should be dropped (0) or not (1) - * and the next bit is a congestion notification bit. This could be - * used by TCP to call tcp_enter_cwr() - * - * Hence, new allowed return values of CGROUP EGRESS BPF programs are: - * 0: drop packet - * 1: keep packet - * 2: drop packet and cn - * 3: keep packet and cn - * - * This macro then converts it to one of the NET_XMIT or an error - * code that is then interpreted as drop packet (and no cn): - * 0: NET_XMIT_SUCCESS skb should be transmitted - * 1: NET_XMIT_DROP skb should be dropped and cn - * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -err skb should be dropped - */ -#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ - ({ \ - u32 _flags = 0; \ - bool _cn; \ - u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ - _cn = _flags & BPF_RET_SET_CN; \ - if (_ret && !IS_ERR_VALUE((long)_ret)) \ - _ret = -EFAULT; \ - if (!_ret) \ - _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ - else \ - _ret = (_cn ? NET_XMIT_DROP : _ret); \ - _ret; \ - }) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 128028efda64..0cb6211fcb58 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,6 +22,72 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* __always_inline is necessary to prevent indirect call through run_prog + * function pointer. + */ +static __always_inline int +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval, u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 func_ret; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + *(ret_flags) |= (func_ret >> 1); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + +static __always_inline int +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -1075,11 +1141,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); if (atype == CGROUP_INET_EGRESS) { - ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); + u32 flags = 0; + bool cn; + + ret = bpf_prog_run_array_cg_flags( + &cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0, &flags); + + /* Return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * The returned value is then converted to one of the NET_XMIT + * or an error code that is then interpreted as drop packet + * (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -err skb should be dropped + */ + + cn = flags & BPF_RET_SET_CN; + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; + if (!ret) + ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); + else + ret = (cn ? NET_XMIT_DROP : ret); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0); if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; } @@ -1109,8 +1202,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1155,8 +1247,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0, flags); + return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype, + &ctx, bpf_prog_run, 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1182,8 +1274,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, + 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1200,8 +1292,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0); rcu_read_unlock(); return ret; @@ -1366,8 +1457,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1459,7 +1549,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, &ctx, bpf_prog_run, 0); release_sock(sk); @@ -1559,7 +1649,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, &ctx, bpf_prog_run, retval); release_sock(sk); @@ -1608,7 +1698,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, &ctx, bpf_prog_run, retval); if (ret < 0) return ret; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b26f3da943de..f15b826f9899 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -129,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); + rcu_read_lock(); + ret = bpf_prog_run_array(rcu_dereference(call->prog_array), + ctx, bpf_prog_run); + rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3 From eb596b0905584a9389585b0f437cf8a2faeb14d0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 19 Apr 2022 22:16:07 +0530 Subject: bpf: Ensure type tags precede modifiers in BTF It is guaranteed that for modifiers, clang always places type tags before other modifiers, and then the base type. We would like to rely on this guarantee inside the kernel to make it simple to parse type tags from BTF. However, a user would be allowed to construct a BTF without such guarantees. Hence, add a pass to check that in modifier chains, type tags only occur at the head of the chain, and then don't occur later in the chain. If we see a type tag, we can have one or more type tags preceding other modifiers that then never have another type tag. If we see other modifiers, all modifiers following them should never be a type tag. Instead of having to walk chains we verified previously, we can remember the last good modifier type ID which headed a good chain. At that point, we must have verified all other chains headed by type IDs less than it. This makes the verification process less costly, and it becomes a simple O(n) pass. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220419164608.1990559-2-memxor@gmail.com --- kernel/bpf/btf.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0918a39279f6..7906b9bf7ff8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4541,6 +4541,48 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return 0; } +static int btf_check_type_tags(struct btf_verifier_env *env, + struct btf *btf, int start_id) +{ + int i, n, good_id = start_id - 1; + bool in_tags; + + n = btf_nr_types(btf); + for (i = start_id; i < n; i++) { + const struct btf_type *t; + u32 cur_id = i; + + t = btf_type_by_id(btf, i); + if (!t) + return -EINVAL; + if (!btf_type_is_modifier(t)) + continue; + + cond_resched(); + + in_tags = btf_type_is_type_tag(t); + while (btf_type_is_modifier(t)) { + if (btf_type_is_type_tag(t)) { + if (!in_tags) { + btf_verifier_log(env, "Type tags don't precede modifiers"); + return -EINVAL; + } + } else if (in_tags) { + in_tags = false; + } + if (cur_id <= good_id) + break; + /* Move to next type */ + cur_id = t->type; + t = btf_type_by_id(btf, cur_id); + if (!t) + return -EINVAL; + } + good_id = i; + } + return 0; +} + static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { @@ -4608,6 +4650,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; @@ -4809,6 +4855,10 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); @@ -4894,6 +4944,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u if (err) goto errout; + err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); + if (err) + goto errout; + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; -- cgit v1.2.3 From dcf456c9a095a6e71f53d6f6f004133ee851ee70 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 18 Apr 2022 15:51:58 +0000 Subject: bpf: Fix usage of trace RCU in local storage. bpf_{sk,task,inode}_storage_free() do not need to use call_rcu_tasks_trace as no BPF program should be accessing the owner as it's being destroyed. The only other reader at this point is bpf_local_storage_map_free() which uses normal RCU. The only path that needs trace RCU are: * bpf_local_storage_{delete,update} helpers * map_{delete,update}_elem() syscalls Fixes: 0fe4b381a59e ("bpf: Allow bpf_local_storage to be used by sleepable programs") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220418155158.2865678-1-kpsingh@kernel.org --- include/linux/bpf_local_storage.h | 4 ++-- kernel/bpf/bpf_inode_storage.c | 4 ++-- kernel/bpf/bpf_local_storage.c | 29 +++++++++++++++++++---------- kernel/bpf/bpf_task_storage.c | 4 ++-- net/core/bpf_sk_storage.c | 6 +++--- 5 files changed, 28 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 493e63258497..7ea18d4da84b 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,9 +143,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_omem); + bool uncharge_omem, bool use_trace_rcu); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 96be8d518885..10424a1cda51 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -90,7 +90,7 @@ void bpf_inode_storage_free(struct inode *inode) */ bpf_selem_unlink_map(selem); free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); @@ -149,7 +149,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 01aa2b51ec4d..8ce40fd869f6 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -106,7 +106,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) */ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem) + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -150,11 +150,16 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + if (use_trace_rcu) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + else + kfree_rcu(selem, rcu); + return free_local_storage; } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, + bool use_trace_rcu) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -169,12 +174,16 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true); + local_storage, selem, true, use_trace_rcu); raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (free_local_storage) - call_rcu_tasks_trace(&local_storage->rcu, + if (free_local_storage) { + if (use_trace_rcu) + call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_rcu); + else + kfree_rcu(local_storage, rcu); + } } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -214,14 +223,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); - __bpf_selem_unlink_storage(selem); + __bpf_selem_unlink_storage(selem, use_trace_rcu); } struct bpf_local_storage_data * @@ -466,7 +475,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false); + false, true); } unlock: @@ -548,7 +557,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, migrate_disable(); __this_cpu_inc(*busy_counter); } - bpf_selem_unlink(selem); + bpf_selem_unlink(selem, false); if (busy_counter) { __this_cpu_dec(*busy_counter); migrate_enable(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 6638a0ecc3d2..57904263a710 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -102,7 +102,7 @@ void bpf_task_storage_free(struct task_struct *task) */ bpf_selem_unlink_map(selem); free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); @@ -192,7 +192,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index e3ac36380520..83d7641ef67b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -75,8 +75,8 @@ void bpf_sk_storage_free(struct sock *sk) * sk_storage. */ bpf_selem_unlink_map(selem); - free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage, - selem, true); + free_sk_storage = bpf_selem_unlink_storage_nolock( + sk_storage, selem, true, false); } raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); -- cgit v1.2.3 From 42ba1308074d9046386d58b56e793604be48ce22 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 15 Apr 2022 21:33:42 +0530 Subject: bpf: Make btf_find_field more generic Next commit introduces field type 'kptr' whose kind will not be struct, but pointer, and it will not be limited to one offset, but multiple ones. Make existing btf_find_struct_field and btf_find_datasec_var functions amenable to use for finding kptrs in map value, by moving spin_lock and timer specific checks into their own function. The alignment, and name are checked before the function is called, so it is the last point where we can skip field or return an error before the next loop iteration happens. Size of the field and type is meant to be checked inside the function. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220415160354.1050687-2-memxor@gmail.com --- kernel/bpf/btf.c | 120 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7906b9bf7ff8..0493310d981f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3163,24 +3163,44 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +enum btf_field_type { + BTF_FIELD_SPIN_LOCK, + BTF_FIELD_TIMER, +}; + +struct btf_field_info { + u32 off; +}; + +static int btf_find_struct(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + if (!__btf_type_is_struct(t)) + return 0; + if (t->size != sz) + return 0; + if (info->off != -ENOENT) + /* only one such field is allowed */ + return -E2BIG; + info->off = off; + return 0; +} + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info) { const struct btf_member *member; - u32 i, off = -ENOENT; + u32 i, off; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (!__btf_type_is_struct(member_type)) - continue; - if (member_type->size != sz) - continue; + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ @@ -3188,46 +3208,76 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t off /= 8; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + return btf_find_struct(btf, member_type, off, sz, info); + default: + return -EFAULT; + } } - return off; + return 0; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info) { const struct btf_var_secinfo *vsi; - u32 i, off = -ENOENT; + u32 i, off; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - if (!__btf_type_is_struct(var_type)) - continue; - if (var_type->size != sz) + off = vsi->offset; + + if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) continue; if (vsi->size != sz) continue; - if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) - continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; - off = vsi->offset; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + return btf_find_struct(btf, var_type, off, sz, info); + default: + return -EFAULT; + } } - return off; + return 0; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + enum btf_field_type field_type, + struct btf_field_info *info) { + const char *name; + int sz, align; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + name = "bpf_spin_lock"; + sz = sizeof(struct bpf_spin_lock); + align = __alignof__(struct bpf_spin_lock); + break; + case BTF_FIELD_TIMER: + name = "bpf_timer"; + sz = sizeof(struct bpf_timer); + align = __alignof__(struct bpf_timer); + break; + default: + return -EFAULT; + } if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align); + return btf_find_struct_field(btf, t, name, sz, align, field_type, info); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align); + return btf_find_datasec_var(btf, t, name, sz, align, field_type, info); return -EINVAL; } @@ -3237,16 +3287,24 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, */ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_spin_lock", - sizeof(struct bpf_spin_lock), - __alignof__(struct bpf_spin_lock)); + struct btf_field_info info = { .off = -ENOENT }; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info); + if (ret < 0) + return ret; + return info.off; } int btf_find_timer(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_timer", - sizeof(struct bpf_timer), - __alignof__(struct bpf_timer)); + struct btf_field_info info = { .off = -ENOENT }; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info); + if (ret < 0) + return ret; + return info.off; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, -- cgit v1.2.3 From e9147b4422e1f35b9c229c980c596ccf03d61562 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 15 Apr 2022 21:33:43 +0530 Subject: bpf: Move check_ptr_off_reg before check_map_access Some functions in next patch want to use this function, and those functions will be called by check_map_access, hence move it before check_map_access. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Joanne Koong Link: https://lore.kernel.org/bpf/20220415160354.1050687-3-memxor@gmail.com --- kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c1a02b82ecd..71827d14724a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3469,6 +3469,44 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, return 0; } +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) +{ + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. + */ + + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); + return -EACCES; + } + + return 0; +} + +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -3980,44 +4018,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - bool fixed_off_ok) -{ - /* Access to this pointer-typed register or passing it to a helper - * is only allowed in its original, unmodified form. - */ - - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); - return -EACCES; - } - - return 0; -} - -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) -{ - return __check_ptr_off_reg(env, reg, regno, false); -} - static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, -- cgit v1.2.3 From f8b7d2b4c192118c37ab24c0540d1134dd0104d8 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 15 Apr 2022 14:29:57 -0700 Subject: ftrace: fix building with SYSCTL=n but DYNAMIC_FTRACE=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One can enable dyanmic tracing but disable sysctls. When this is doen we get the compile kernel warning: CC kernel/trace/ftrace.o kernel/trace/ftrace.c:3086:13: warning: ‘ftrace_shutdown_sysctl’ defined but not used [-Wunused-function] 3086 | static void ftrace_shutdown_sysctl(void) | ^~~~~~~~~~~~~~~~~~~~~~ kernel/trace/ftrace.c:3068:13: warning: ‘ftrace_startup_sysctl’ defined but not used [-Wunused-function] 3068 | static void ftrace_startup_sysctl(void) When CONFIG_DYNAMIC_FTRACE=n the ftrace_startup_sysctl() and routines ftrace_shutdown_sysctl() still compiles, so these are actually more just used for when SYSCTL=y. Fix this then by just moving these routines to when sysctls are enabled. Fixes: 7cde53da38a3 ("ftrace: move sysctl_ftrace_enabled to ftrace.c") Reported-by: kernel test robot Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/trace/ftrace.c | 71 ++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index db8d553728b6..d9424fd9a183 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3065,40 +3065,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_startup_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* Force update next time */ - saved_ftrace_func = NULL; - /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) { - command = FTRACE_UPDATE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_START_FUNC_RET; - ftrace_startup_enable(command); - } -} - -static void ftrace_shutdown_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) { - command = FTRACE_DISABLE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_STOP_FUNC_RET; - ftrace_run_update_code(command); - } -} - static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; unsigned long ftrace_number_of_pages; @@ -7267,9 +7233,6 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_all(int command) { } -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) - static void ftrace_update_trampoline(struct ftrace_ops *ops) { } @@ -7910,6 +7873,40 @@ int unregister_ftrace_function(struct ftrace_ops *ops) EXPORT_SYMBOL_GPL(unregister_ftrace_function); #ifdef CONFIG_SYSCTL +static void ftrace_startup_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } +} + +static void ftrace_shutdown_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } +} + static bool is_permanent_ops_registered(void) { struct ftrace_ops *op; -- cgit v1.2.3 From 988f11e046401f8561c4afefa506a50f0203de40 Mon Sep 17 00:00:00 2001 From: liaohua Date: Thu, 7 Apr 2022 15:29:48 +0800 Subject: latencytop: move sysctl to its own file This moves latencytop sysctl to kernel/latencytop.c Signed-off-by: liaohua Signed-off-by: Luis Chamberlain --- include/linux/latencytop.h | 3 --- kernel/latencytop.c | 41 +++++++++++++++++++++++++++++------------ kernel/sysctl.c | 10 ---------- 3 files changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index abe3d95f795b..84f1053cf2a8 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,9 +38,6 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - #else static inline void diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 166d7bf49666..76166df011a4 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -55,6 +55,7 @@ #include #include #include +#include static DEFINE_RAW_SPINLOCK(latency_lock); @@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; +#ifdef CONFIG_SYSCTL +static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} + +static struct ctl_table latencytop_sysctl[] = { + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, + {} +}; +#endif + void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; @@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = { static int __init init_lstats_procfs(void) { proc_create("latency_stats", 0644, NULL, &lstats_proc_ops); +#ifdef CONFIG_SYSCTL + register_sysctl_init("kernel", latencytop_sysctl); +#endif return 0; } - -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - int err; - - err = proc_dointvec(table, write, buffer, lenp, ppos); - if (latencytop_enabled) - force_schedstat_enabled(); - - return err; -} device_initcall(init_lstats_procfs); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0fdf465a93d..b60345cbadf0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include "../lib/kstrtox.h" @@ -1623,15 +1622,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, #endif { .procname = "print-fatal-signals", -- cgit v1.2.3 From 8fd7c2144d1292f15c901211750dee021ed5079a Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 21 Apr 2022 11:38:43 -0700 Subject: ftrace: fix building with SYSCTL=y but DYNAMIC_FTRACE=n Ok so hopefully this is the last of it. 0day picked up a build failure [0] when SYSCTL=y but DYNAMIC_FTRACE=n. This can be fixed by just declaring an empty routine for the calls moved just recently. [0] https://lkml.kernel.org/r/202204161203.6dSlgKJX-lkp@intel.com Reported-by: kernel test robot Fixes: f8b7d2b4c192 ("ftrace: fix building with SYSCTL=n but DYNAMIC_FTRACE=y") Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d9424fd9a183..1f89039f0feb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7873,6 +7873,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops) EXPORT_SYMBOL_GPL(unregister_ftrace_function); #ifdef CONFIG_SYSCTL + +#ifdef CONFIG_DYNAMIC_FTRACE static void ftrace_startup_sysctl(void) { int command; @@ -7906,6 +7908,10 @@ static void ftrace_shutdown_sysctl(void) ftrace_run_update_code(command); } } +#else +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ static bool is_permanent_ops_registered(void) { -- cgit v1.2.3 From 10a5a651e3afc9b0b381f47e8930972e4e918397 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 31 Mar 2022 13:57:17 +0800 Subject: workqueue: Restrict kworker in the offline CPU pool running on housekeeping CPUs When a CPU is going offline, all workers on the CPU's pool will have their cpus_allowed cleared to cpu_possible_mask and can run on any CPUs including the isolated ones. Instead, set cpus_allowed to wq_unbound_cpumask so that the can avoid isolated CPUs. Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0d2514b4ff0d..4056f2a3f9d5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5001,7 +5001,7 @@ static void unbind_workers(int cpu) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); } mutex_unlock(&wq_pool_attach_mutex); -- cgit v1.2.3 From faebd693c59387b7b765fab64b543855e15a91b4 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:36 +0206 Subject: printk: rename cpulock functions Since the printk cpulock is CPU-reentrant and since it is used in all contexts, its usage must be carefully considered and most likely will require programming locklessly. To avoid mistaking the printk cpulock as a typical lock, rename it to cpu_sync. The main functions then become: printk_cpu_sync_get_irqsave(flags); printk_cpu_sync_put_irqrestore(flags); Add extra notes of caution in the function description to help developers understand the requirements for correct usage. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-2-john.ogness@linutronix.de --- include/linux/printk.h | 54 +++++++++++++++++++++++--------------- kernel/printk/printk.c | 71 +++++++++++++++++++++++++------------------------- lib/dump_stack.c | 4 +-- lib/nmi_backtrace.c | 4 +-- 4 files changed, 73 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 1522df223c0f..859323a52985 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -277,43 +277,55 @@ static inline void printk_trigger_flush(void) #endif #ifdef CONFIG_SMP -extern int __printk_cpu_trylock(void); -extern void __printk_wait_on_cpu_lock(void); -extern void __printk_cpu_unlock(void); +extern int __printk_cpu_sync_try_get(void); +extern void __printk_cpu_sync_wait(void); +extern void __printk_cpu_sync_put(void); /** - * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. + * printk_cpu_sync_get_irqsave() - Acquire the printk cpu-reentrant spinning + * lock and disable interrupts. * @flags: Stack-allocated storage for saving local interrupt state, - * to be passed to printk_cpu_unlock_irqrestore(). + * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. + * + * CAUTION: This function must be used carefully. It does not behave like a + * typical lock. Here are important things to watch out for... + * + * * This function is reentrant on the same CPU. Therefore the calling + * code must not assume exclusive access to data if code accessing the + * data can run reentrant or within NMI context on the same CPU. + * + * * If there exists usage of this function from NMI context, it becomes + * unsafe to perform any type of locking or spinning to wait for other + * CPUs after calling this function from any context. This includes + * using spinlocks or any other busy-waiting synchronization methods. */ -#define printk_cpu_lock_irqsave(flags) \ - for (;;) { \ - local_irq_save(flags); \ - if (__printk_cpu_trylock()) \ - break; \ - local_irq_restore(flags); \ - __printk_wait_on_cpu_lock(); \ +#define printk_cpu_sync_get_irqsave(flags) \ + for (;;) { \ + local_irq_save(flags); \ + if (__printk_cpu_sync_try_get()) \ + break; \ + local_irq_restore(flags); \ + __printk_cpu_sync_wait(); \ } /** - * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning - * lock and restore interrupts. - * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). + * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning + * lock and restore interrupts. + * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ -#define printk_cpu_unlock_irqrestore(flags) \ +#define printk_cpu_sync_put_irqrestore(flags) \ do { \ - __printk_cpu_unlock(); \ + __printk_cpu_sync_put(); \ local_irq_restore(flags); \ - } while (0) \ + } while (0) #else -#define printk_cpu_lock_irqsave(flags) ((void)flags) -#define printk_cpu_unlock_irqrestore(flags) ((void)flags) +#define printk_cpu_sync_get_irqsave(flags) ((void)flags) +#define printk_cpu_sync_put_irqrestore(flags) ((void)flags) #endif /* CONFIG_SMP */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index da03c15ecc89..13a1eebe72af 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3667,26 +3667,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif #ifdef CONFIG_SMP -static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); -static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); +static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); /** - * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant - * spinning lock is not owned by any CPU. + * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant + * spinning lock is not owned by any CPU. * * Context: Any context. */ -void __printk_wait_on_cpu_lock(void) +void __printk_cpu_sync_wait(void) { do { cpu_relax(); - } while (atomic_read(&printk_cpulock_owner) != -1); + } while (atomic_read(&printk_cpu_sync_owner) != -1); } -EXPORT_SYMBOL(__printk_wait_on_cpu_lock); +EXPORT_SYMBOL(__printk_cpu_sync_wait); /** - * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant - * spinning lock. + * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant + * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the @@ -3695,7 +3695,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock); * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ -int __printk_cpu_trylock(void) +int __printk_cpu_sync_try_get(void) { int cpu; int old; @@ -3705,79 +3705,80 @@ int __printk_cpu_trylock(void) /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with - * __printk_cpu_unlock:B. + * __printk_cpu_sync_put:B. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then - * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_put:A can never read from + * __printk_cpu_sync_try_get:B. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of this CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of this CPU */ - old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, - cpu); /* LMM(__printk_cpu_trylock:A) */ + old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, + cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing - * data: LMM(__printk_cpu_trylock:B) + * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ - atomic_inc(&printk_cpulock_nested); + atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } -EXPORT_SYMBOL(__printk_cpu_trylock); +EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** - * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. + * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ -void __printk_cpu_unlock(void) +void __printk_cpu_sync_put(void) { - if (atomic_read(&printk_cpulock_nested)) { - atomic_dec(&printk_cpulock_nested); + if (atomic_read(&printk_cpu_sync_nested)) { + atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: - * LMM(__printk_cpu_unlock:A) + * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs - * with __printk_cpu_trylock:A. + * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, - * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of the next CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of the next CPU */ - atomic_set_release(&printk_cpulock_owner, - -1); /* LMM(__printk_cpu_unlock:B) */ + atomic_set_release(&printk_cpu_sync_owner, + -1); /* LMM(__printk_cpu_sync_put:B) */ } -EXPORT_SYMBOL(__printk_cpu_unlock); +EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 6b7f1bf6715d..83471e81501a 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,9 +102,9 @@ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) * Permit this cpu to perform nested stack dumps while serialising * against other CPUs */ - printk_cpu_lock_irqsave(flags); + printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); - printk_cpu_unlock_irqrestore(flags); + printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 199ab201d501..d01aec6ae15c 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -99,7 +99,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) * Allow nested NMI backtraces while serializing * against other CPUs. */ - printk_cpu_lock_irqsave(flags); + printk_cpu_sync_get_irqsave(flags); if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); @@ -110,7 +110,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) else dump_stack(); } - printk_cpu_unlock_irqrestore(flags); + printk_cpu_sync_put_irqrestore(flags); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit v1.2.3 From 1f5d783094cf28b4905f51cad846eb5d1db6673e Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:38 +0206 Subject: printk: add missing memory barrier to wake_up_klogd() It is important that any new records are visible to preparing waiters before the waker checks if the wait queue is empty. Otherwise it is possible that: - there are new records available - the waker sees an empty wait queue and does not wake - the preparing waiter sees no new records and begins to wait This is exactly the problem that the function description of waitqueue_active() warns about. Use wq_has_sleeper() instead of waitqueue_active() because it includes the necessary full memory barrier. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13a1eebe72af..f817dfb4852d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -746,8 +746,19 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, goto out; } + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with wake_up_klogd:A. + */ ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, atomic64_read(&user->seq), r)); + prb_read_valid(prb, + atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } @@ -1513,7 +1524,18 @@ static int syslog_print(char __user *buf, int size) seq = syslog_seq; mutex_unlock(&syslog_lock); - len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with wake_up_klogd:A. + */ + len = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) @@ -3316,7 +3338,18 @@ void wake_up_klogd(void) return; preempt_disable(); - if (waitqueue_active(&log_wait)) { + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the wait queue is empty. + * + * The full memory barrier within wq_has_sleeper() pairs with the full + * memory barrier within set_current_state() of + * prepare_to_wait_event(), which is called after ___wait_event() adds + * the waiter but before it has checked the wait condition. + * + * This pairs with devkmsg_read:A and syslog_print:A. + */ + if (wq_has_sleeper(&log_wait)) { /* LMM(wake_up_klogd:A) */ this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } -- cgit v1.2.3 From 938ba4084abcf6fdd21d9078513c52f8fb9b00d0 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:39 +0206 Subject: printk: wake up all waiters There can be multiple tasks waiting for new records. They should all be woken. Use wake_up_interruptible_all() instead of wake_up_interruptible(). Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-5-john.ogness@linutronix.de --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f817dfb4852d..e23357002648 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3326,7 +3326,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) } if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + wake_up_interruptible_all(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = -- cgit v1.2.3 From 5341b93dea8c39d7612f7a227015d4b1d5cf30db Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:40 +0206 Subject: printk: wake waiters for safe and NMI contexts When printk() is called from safe or NMI contexts, it will directly store the record (vprintk_store()) and then defer the console output. However, defer_console_output() only causes console printing and does not wake any waiters of new records. Wake waiters from defer_console_output() so that they also are aware of the new records from safe and NMI contexts. Fixes: 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when accessing the main log buffer in NMI") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-6-john.ogness@linutronix.de --- kernel/printk/printk.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e23357002648..7bb148a1debb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -754,7 +754,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * - * This pairs with wake_up_klogd:A. + * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, prb_read_valid(prb, @@ -1532,7 +1532,7 @@ static int syslog_print(char __user *buf, int size) * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * - * This pairs with wake_up_klogd:A. + * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ @@ -3332,7 +3332,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); -void wake_up_klogd(void) +static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; @@ -3349,22 +3349,26 @@ void wake_up_klogd(void) * * This pairs with devkmsg_read:A and syslog_print:A. */ - if (wq_has_sleeper(&log_wait)) { /* LMM(wake_up_klogd:A) */ - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ + (val & PRINTK_PENDING_OUTPUT)) { + this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } -void defer_console_output(void) +void wake_up_klogd(void) { - if (!printk_percpu_data_ready()) - return; + __wake_up_klogd(PRINTK_PENDING_WAKEUP); +} - preempt_disable(); - this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); - preempt_enable(); +void defer_console_output(void) +{ + /* + * New messages may have been added directly to the ringbuffer + * using vprintk_store(), so wake any waiters as well. + */ + __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) -- cgit v1.2.3 From 9f0844de49cf0557d5c359131004acbb179c174e Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:41 +0206 Subject: printk: get caller_id/timestamp after migration disable Currently the local CPU timestamp and caller_id for the record are collected while migration is enabled. Since this information is CPU-specific, it should be collected with migration disabled. Migration is disabled immediately after collecting this information anyway, so just move the information collection to after the migration disabling. Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-7-john.ogness@linutronix.de --- kernel/printk/printk.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7bb148a1debb..82ad3d3d0d4a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2063,7 +2063,7 @@ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + raw_smp_processor_id(); + 0x80000000 + smp_processor_id(); } /** @@ -2145,7 +2145,6 @@ int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { - const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; @@ -2155,10 +2154,14 @@ int vprintk_store(int facility, int level, u8 *recursion_ptr; u16 reserve_size; va_list args2; + u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is @@ -2167,8 +2170,7 @@ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); - if (!printk_enter_irqsave(recursion_ptr, irqflags)) - return 0; + caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be -- cgit v1.2.3 From 1f47e8af45fdd754c5538f8aeb7857a6c087fd6a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:42 +0206 Subject: printk: call boot_delay_msec() in printk_delay() boot_delay_msec() is always called immediately before printk_delay() so just call it from within printk_delay(). Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-8-john.ogness@linutronix.de --- kernel/printk/printk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 82ad3d3d0d4a..2f99e0b383b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2048,8 +2048,10 @@ static u8 *__printk_recursion_counter(void) int printk_delay_msec __read_mostly; -static inline void printk_delay(void) +static inline void printk_delay(int level) { + boot_delay_msec(level); + if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; @@ -2274,8 +2276,7 @@ asmlinkage int vprintk_emit(int facility, int level, in_sched = true; } - boot_delay_msec(level); - printk_delay(); + printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); -- cgit v1.2.3 From 1fc0ca9e0db61882208650b3603071e9f4b5cfee Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:43 +0206 Subject: printk: add con_printk() macro for console details It is useful to generate log messages that include details about the related console. Rather than duplicate the code to assemble the details, put that code into a macro con_printk(). Once console printers become threaded, this macro will find more users. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-9-john.ogness@linutronix.de --- kernel/printk/printk.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2f99e0b383b9..e36d3ed41afa 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3015,6 +3015,11 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } +#define con_printk(lvl, con, fmt, ...) \ + printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \ + (con->flags & CON_BOOT) ? "boot" : "", \ + con->name, con->index, ##__VA_ARGS__) + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -3153,9 +3158,7 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - pr_info("%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); + con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { @@ -3174,9 +3177,7 @@ int unregister_console(struct console *console) struct console *con; int res; - pr_info("%sconsole [%s%d] disabled\n", - (console->flags & CON_BOOT) ? "boot" : "" , - console->name, console->index); + con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) -- cgit v1.2.3 From a699449bb13b70b8bd10dc03ad7327ea3993221e Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:44 +0206 Subject: printk: refactor and rework printing logic Refactor/rework printing logic in order to prepare for moving to threaded console printing. - Move @console_seq into struct console so that the current "position" of each console can be tracked individually. - Move @console_dropped into struct console so that the current drop count of each console can be tracked individually. - Modify printing logic so that each console independently loads, prepares, and prints its next record. - Remove exclusive_console logic. Since console positions are handled independently, replaying past records occurs naturally. - Update the comments explaining why preemption is disabled while printing from printk() context. With these changes, there is a change in behavior: the console replaying the log (formerly exclusive console) will no longer block other consoles. New messages appear on the other consoles while the newly added console is still replaying. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-10-john.ogness@linutronix.de --- include/linux/console.h | 2 + kernel/printk/printk.c | 441 +++++++++++++++++++++++++----------------------- 2 files changed, 230 insertions(+), 213 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 7cd758a4f44e..8c1686e2c233 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -151,6 +151,8 @@ struct console { int cflag; uint ispeed; uint ospeed; + u64 seq; + unsigned long dropped; void *data; struct console *next; }; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e36d3ed41afa..3dea8bbaf402 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,11 +280,6 @@ static bool panic_in_progress(void) */ static int console_locked, console_suspended; -/* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - /* * Array of consoles built from command line options (console=) */ @@ -374,12 +369,6 @@ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - struct latched_seq { seqcount_latch_t latch; u64 val[2]; @@ -1933,47 +1922,26 @@ static int console_trylock_spinning(void) } /* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. + * Call the specified console driver, asking it to write out the specified + * text and length. For non-extended consoles, if any records have been + * dropped, a dropped message will be written out first. */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static void call_console_driver(struct console *con, const char *text, size_t len) { static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; + size_t dropped_len; trace_console_rcuidle(text, len); - if (!console_drivers) - return; - - if (console_dropped) { + if (con->dropped && !(con->flags & CON_EXTENDED)) { dropped_len = snprintf(dropped_text, sizeof(dropped_text), "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; + con->dropped); + con->dropped = 0; + con->write(con, dropped_text, dropped_len); } - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } - } + con->write(con, text, len); } /* @@ -2283,15 +2251,18 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (!in_sched) { /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console + * The caller may be holding system-critical or + * timing-sensitive locks. Disable preemption during + * printing of all remaining records to all consoles so that + * this context can return as soon as possible. Hopefully + * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. + * semaphore. The release will print out buffers. With the + * spinning variant, this context tries to take over the + * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); @@ -2329,11 +2300,9 @@ EXPORT_SYMBOL(_printk); #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 +#define prb_next_seq(rb) 0 static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; static size_t record_print_text(const struct printk_record *r, bool syslog, bool time) @@ -2350,8 +2319,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} +static void call_console_driver(struct console *con, const char *text, size_t len) { } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2621,22 +2589,6 @@ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; - - return 0; -} - /* * Return true when this CPU should unlock console_sem without pushing all * messages to the console. This reduces the chance that the console is @@ -2657,15 +2609,182 @@ static bool abandon_console_lock_in_panic(void) } /* - * Can we actually use the console at this time on this cpu? + * Check if the given console is currently capable and allowed to print + * records. + * + * Requires the console_lock. + */ +static inline bool console_is_usable(struct console *con) +{ + if (!(con->flags & CON_ENABLED)) + return false; + + if (!con->write) + return false; + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + return false; + + return true; +} + +static void __console_unlock(void) +{ + console_locked = 0; + up_console_sem(); +} + +/* + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns false if the given console has no next record to print, otherwise + * true. * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * Requires the console_lock. */ -static inline int can_use_console(void) +static bool console_emit_next_record(struct console *con, bool *handover) { - return cpu_online(raw_smp_processor_id()) || have_callable_console(); + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; + static int panic_console_dropped; + struct printk_info info; + struct printk_record r; + unsigned long flags; + char *write_text; + size_t len; + + prb_rec_init_rd(&r, &info, text, sizeof(text)); + + *handover = false; + + if (!prb_read_valid(prb, con->seq, &r)) + return false; + + if (con->seq != r.info->seq) { + con->dropped += r.info->seq - con->seq; + con->seq = r.info->seq; + if (panic_in_progress() && panic_console_dropped++ > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); + } + } + + /* Skip record that has level above the console loglevel. */ + if (suppress_message_printing(r.info->level)) { + con->seq++; + goto skip; + } + + if (con->flags & CON_EXTENDED) { + write_text = &ext_text[0]; + len = info_print_ext_header(ext_text, sizeof(ext_text), r.info); + len += msg_print_ext_body(ext_text + len, sizeof(ext_text) - len, + &r.text_buf[0], r.info->text_len, &r.info->dev_info); + } else { + write_text = &text[0]; + len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + } + + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + + stop_critical_timings(); /* don't trace print latency */ + call_console_driver(con, write_text, len); + start_critical_timings(); + + con->seq++; + + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); +skip: + return true; +} + +/* + * Print out all remaining records to all consoles. + * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + bool any_usable = false; + struct console *con; + bool any_progress; + + *next_seq = 0; + *handover = false; + + do { + any_progress = false; + + for_each_console(con) { + bool progress; + + if (!console_is_usable(con)) + continue; + any_usable = true; + + progress = console_emit_next_record(con, handover); + if (*handover) + return false; + + /* Track the next of the highest seq flushed. */ + if (con->seq > *next_seq) + *next_seq = con->seq; + + if (!progress) + continue; + any_progress = true; + + /* Allow panic_cpu to take over the consoles safely. */ + if (abandon_console_lock_in_panic()) + return false; + + if (do_cond_resched) + cond_resched(); + } + } while (any_progress); + + return any_usable; } /** @@ -2678,28 +2797,20 @@ static inline int can_use_console(void) * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * - * If there is output waiting, we wake /dev/kmsg and syslog() users. - * * console_unlock(); may be called from any context. */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - static int panic_console_dropped; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; + bool do_cond_resched; + bool handover; + bool flushed; + u64 next_seq; if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2708,125 +2819,34 @@ void console_unlock(void) * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. Therefore the value must be stored before - * and cleared after the "again" goto label. + * messages practically incapacitating the system. Therefore, create + * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; - -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; - - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - if (panic_in_progress() && panic_console_dropped++ > 10) { - suppress_panic_printk = 1; - pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); - } - } - - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } + do { + console_may_schedule = 0; - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } + flushed = console_flush_all(do_cond_resched, &next_seq, &handover); + if (!handover) + __console_unlock(); /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. + * Abort if there was a failure to flush all messages to all + * usable consoles. Either it is not possible to flush (in + * which case it would be an infinite loop of retrying) or + * another context has taken over printing. */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; + if (!flushed) + break; /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). + * Some context may have added new records after + * console_flush_all() but before unlocking the console. + * Re-check if there is a new record to flush. If the trylock + * fails, another context is already handling the printing. */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; - - /* Allow panic_cpu to take over the consoles safely */ - if (abandon_console_lock_in_panic()) - break; - - if (do_cond_resched) - cond_resched(); - } - - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - - console_locked = 0; - up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && !abandon_console_lock_in_panic() && console_trylock()) - goto again; + } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } EXPORT_SYMBOL(console_unlock); @@ -2886,8 +2906,14 @@ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + c->seq = seq; + } console_unlock(); } @@ -3127,26 +3153,15 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; + newcon->dropped = 0; if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; - /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - console_seq = syslog_seq; + newcon->seq = syslog_seq; mutex_unlock(&syslog_lock); + } else { + /* Begin with next message. */ + newcon->seq = prb_next_seq(prb); } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3 From 03a749e628fdbc665d7f9712637f880a79da8b78 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:45 +0206 Subject: printk: move buffer definitions into console_emit_next_record() caller Extended consoles print extended messages and do not print messages about dropped records. Non-extended consoles print "normal" messages as well as extra messages about dropped records. Currently the buffers for these various message types are defined within the functions that might use them and their usage is based upon the CON_EXTENDED flag. This will be a problem when moving to kthread printers because each printer must be able to provide its own buffers. Move all the message buffer definitions outside of console_emit_next_record(). The caller knows if extended or dropped messages should be printed and can specify the appropriate buffers to use. The console_emit_next_record() and call_console_driver() functions can know what to print based on whether specified buffers are non-NULL. With this change, buffer definition/allocation/specification is separated from the code that does the various types of string printing. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-11-john.ogness@linutronix.de --- kernel/printk/printk.c | 60 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3dea8bbaf402..dec5355c5b5b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -394,6 +394,9 @@ static struct latched_seq clear_seq = { /* the maximum size of a formatted record (i.e. with prefix added per line) */ #define CONSOLE_LOG_MAX 1024 +/* the maximum size for a dropped text message */ +#define DROPPED_TEXT_MAX 64 + /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @@ -1923,18 +1926,18 @@ static int console_trylock_spinning(void) /* * Call the specified console driver, asking it to write out the specified - * text and length. For non-extended consoles, if any records have been + * text and length. If @dropped_text is non-NULL and any records have been * dropped, a dropped message will be written out first. */ -static void call_console_driver(struct console *con, const char *text, size_t len) +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) { - static char dropped_text[64]; size_t dropped_len; trace_console_rcuidle(text, len); - if (con->dropped && !(con->flags & CON_EXTENDED)) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), + if (con->dropped && dropped_text) { + dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, "** %lu printk messages dropped **\n", con->dropped); con->dropped = 0; @@ -2296,6 +2299,7 @@ EXPORT_SYMBOL(_printk); #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 +#define DROPPED_TEXT_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false @@ -2319,7 +2323,10 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_driver(struct console *con, const char *text, size_t len) { } +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) +{ +} static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2644,6 +2651,14 @@ static void __console_unlock(void) * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. * + * @text is a buffer of size CONSOLE_LOG_MAX. + * + * If extended messages should be printed, @ext_text is a buffer of size + * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL. + * + * If dropped messages should be printed, @dropped_text is a buffer of size + * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. + * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. @@ -2653,10 +2668,9 @@ static void __console_unlock(void) * * Requires the console_lock. */ -static bool console_emit_next_record(struct console *con, bool *handover) +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; static int panic_console_dropped; struct printk_info info; struct printk_record r; @@ -2664,7 +2678,7 @@ static bool console_emit_next_record(struct console *con, bool *handover) char *write_text; size_t len; - prb_rec_init_rd(&r, &info, text, sizeof(text)); + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); *handover = false; @@ -2686,13 +2700,13 @@ static bool console_emit_next_record(struct console *con, bool *handover) goto skip; } - if (con->flags & CON_EXTENDED) { - write_text = &ext_text[0]; - len = info_print_ext_header(ext_text, sizeof(ext_text), r.info); - len += msg_print_ext_body(ext_text + len, sizeof(ext_text) - len, + if (ext_text) { + write_text = ext_text; + len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); + len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { - write_text = &text[0]; + write_text = text; len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } @@ -2710,7 +2724,7 @@ static bool console_emit_next_record(struct console *con, bool *handover) console_lock_spinning_enable(); stop_critical_timings(); /* don't trace print latency */ - call_console_driver(con, write_text, len); + call_console_driver(con, write_text, len, dropped_text); start_critical_timings(); con->seq++; @@ -2746,6 +2760,9 @@ skip: */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { + static char dropped_text[DROPPED_TEXT_MAX]; + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; bool any_usable = false; struct console *con; bool any_progress; @@ -2763,7 +2780,16 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove continue; any_usable = true; - progress = console_emit_next_record(con, handover); + if (con->flags & CON_EXTENDED) { + /* Extended consoles do not print "dropped messages". */ + progress = console_emit_next_record(con, &text[0], + &ext_text[0], NULL, + handover); + } else { + progress = console_emit_next_record(con, &text[0], + NULL, &dropped_text[0], + handover); + } if (*handover) return false; -- cgit v1.2.3 From 3b604ca81202eea2a917eb6491e90f610fba0ec7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:46 +0206 Subject: printk: add pr_flush() Provide a might-sleep function to allow waiting for console printers to catch up to the latest logged message. Use pr_flush() whenever it is desirable to get buffered messages printed before continuing: suspend_console(), resume_console(), console_stop(), console_start(), console_unblank(). Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-12-john.ogness@linutronix.de --- include/linux/printk.h | 7 +++++ kernel/printk/printk.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index b70a42f94031..091fba7283e1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,6 +170,8 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit +extern bool pr_flush(int timeout_ms, bool reset_on_progress); + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -220,6 +222,11 @@ static inline void printk_deferred_exit(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dec5355c5b5b..a06999d55278 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2296,6 +2296,8 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); + #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 @@ -2328,6 +2330,7 @@ static void call_console_driver(struct console *con, const char *text, size_t le { } static bool suppress_message_printing(int level) { return false; } +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -2515,6 +2518,7 @@ void suspend_console(void) if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_flush(1000, true); console_lock(); console_suspended = 1; up_console_sem(); @@ -2527,6 +2531,7 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + pr_flush(1000, true); } /** @@ -2912,6 +2917,9 @@ void console_unblank(void) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); console_unlock(); + + if (!oops_in_progress) + pr_flush(1000, true); } /** @@ -2970,6 +2978,7 @@ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { + __pr_flush(console, 1000, true); console_lock(); console->flags &= ~CON_ENABLED; console_unlock(); @@ -2981,6 +2990,7 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3352,6 +3362,79 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* If @con is specified, only wait for that console. Otherwise wait for all. */ +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *c; + u64 last_diff = 0; + u64 printk_seq; + u64 diff; + u64 seq; + + might_sleep(); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + console_lock(); + for_each_console(c) { + if (con && con != c) + continue; + if (!console_is_usable(c)) + continue; + printk_seq = c->seq; + if (printk_seq < seq) + diff += seq - printk_seq; + } + console_unlock(); + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + /* no timeout limit */ + msleep(100); + } else if (remaining < 100) { + msleep(remaining); + remaining = 0; + } else { + msleep(100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Process context. May sleep while acquiring console lock. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return __pr_flush(NULL, timeout_ms, reset_on_progress); +} +EXPORT_SYMBOL(pr_flush); + /* * Delayed printk version, for scheduler-internal messages: */ -- cgit v1.2.3 From 2bb2b7b57f81255c13f4395ea911d6bdc70c9fe2 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:47 +0206 Subject: printk: add functions to prefer direct printing Once kthread printing is available, console printing will no longer occur in the context of the printk caller. However, there are some special contexts where it is desirable for the printk caller to directly print out kernel messages. Using pr_flush() to wait for threaded printers is only possible if the caller is in a sleepable context and the kthreads are active. That is not always the case. Introduce printk_prefer_direct_enter() and printk_prefer_direct_exit() functions to explicitly (and globally) activate/deactivate preferred direct console printing. The term "direct console printing" refers to printing to all enabled consoles from the context of the printk caller. The term "prefer" is used because this type of printing is only best effort. If the console is currently locked or other printers are already actively printing, the printk caller will need to rely on the other contexts to handle the printing. This preferred direct printing is how all printing has been handled until now (unless it was explicitly deferred). When kthread printing is introduced, there may be some unanticipated problems due to kthreads being unable to flush important messages. In order to minimize such risks, preferred direct printing is activated for the primary important messages when the system experiences general types of major errors. These are: - emergency reboot/shutdown - cpu and rcu stalls - hard and soft lockups - hung tasks - warn - sysrq Note that since kthread printing does not yet exist, no behavior changes result from this commit. This is only implementing the counter and marking the various places where preferred direct printing is active. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Paul E. McKenney # for RCU Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-13-john.ogness@linutronix.de --- drivers/tty/sysrq.c | 2 ++ include/linux/printk.h | 11 +++++++++++ kernel/hung_task.c | 11 ++++++++++- kernel/panic.c | 4 ++++ kernel/printk/printk.c | 28 ++++++++++++++++++++++++++++ kernel/rcu/tree_stall.h | 2 ++ kernel/reboot.c | 14 +++++++++++++- kernel/watchdog.c | 4 ++++ kernel/watchdog_hld.c | 4 ++++ 9 files changed, 78 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index bbfd004449b5..2884cd638d64 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -578,6 +578,7 @@ void __handle_sysrq(int key, bool check_mask) rcu_sysrq_start(); rcu_read_lock(); + printk_prefer_direct_enter(); /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. We do not @@ -619,6 +620,7 @@ void __handle_sysrq(int key, bool check_mask) pr_cont("\n"); console_loglevel = orig_log_level; } + printk_prefer_direct_exit(); rcu_read_unlock(); rcu_sysrq_end(); diff --git a/include/linux/printk.h b/include/linux/printk.h index 091fba7283e1..cd26aab0ab2a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,6 +170,9 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit +extern void printk_prefer_direct_enter(void); +extern void printk_prefer_direct_exit(void); + extern bool pr_flush(int timeout_ms, bool reset_on_progress); /* @@ -222,6 +225,14 @@ static inline void printk_deferred_exit(void) { } +static inline void printk_prefer_direct_enter(void) +{ +} + +static inline void printk_prefer_direct_exit(void) +{ +} + static inline bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f7655..02a65d554340 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { + printk_prefer_direct_enter(); + if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", @@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; + + printk_prefer_direct_exit(); } touch_nmi_watchdog(); @@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); - if (hung_task_show_lock) + if (hung_task_show_lock) { + printk_prefer_direct_enter(); debug_show_all_locks(); + printk_prefer_direct_exit(); + } if (hung_task_show_all_bt) { hung_task_show_all_bt = false; + printk_prefer_direct_enter(); trigger_all_cpu_backtrace(); + printk_prefer_direct_exit(); } if (hung_task_call_panic) diff --git a/kernel/panic.c b/kernel/panic.c index 55b50e052ec3..7d422597403f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -560,6 +560,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); + printk_prefer_direct_enter(); + if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -597,6 +599,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); + + printk_prefer_direct_exit(); } #ifndef __WARN_FLAGS diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a06999d55278..ed7f738261cc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -362,6 +362,34 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK +static atomic_t printk_prefer_direct = ATOMIC_INIT(0); + +/** + * printk_prefer_direct_enter - cause printk() calls to attempt direct + * printing to all enabled consoles + * + * Since it is not possible to call into the console printing code from any + * context, there is no guarantee that direct printing will occur. + * + * This globally effects all printk() callers. + * + * Context: Any context. + */ +void printk_prefer_direct_enter(void) +{ + atomic_inc(&printk_prefer_direct); +} + +/** + * printk_prefer_direct_exit - restore printk() behavior + * + * Context: Any context. + */ +void printk_prefer_direct_exit(void) +{ + WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); +} + DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..d612707c2ed0 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -619,6 +619,7 @@ static void print_cpu_stall(unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ + printk_prefer_direct_enter(); trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); @@ -656,6 +657,7 @@ static void print_cpu_stall(unsigned long gps) */ set_tsk_need_resched(current); set_preempt_need_resched(); + printk_prefer_direct_exit(); } static void check_cpu_stall(struct rcu_data *rdp) diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..4177645e74d6 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -447,9 +447,11 @@ static int __orderly_reboot(void) ret = run_cmd(reboot_cmd); if (ret) { + printk_prefer_direct_enter(); pr_warn("Failed to start orderly reboot: forcing the issue\n"); emergency_sync(); kernel_restart(NULL); + printk_prefer_direct_exit(); } return ret; @@ -462,6 +464,7 @@ static int __orderly_poweroff(bool force) ret = run_cmd(poweroff_cmd); if (ret && force) { + printk_prefer_direct_enter(); pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* @@ -471,6 +474,7 @@ static int __orderly_poweroff(bool force) */ emergency_sync(); kernel_power_off(); + printk_prefer_direct_exit(); } return ret; @@ -528,6 +532,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); */ static void hw_failure_emergency_poweroff_func(struct work_struct *work) { + printk_prefer_direct_enter(); + /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off @@ -544,6 +550,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) */ pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); emergency_restart(); + + printk_prefer_direct_exit(); } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, @@ -582,11 +590,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) { static atomic_t allow_proceed = ATOMIC_INIT(1); + printk_prefer_direct_enter(); + pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); /* Shutdown should be initiated only once. */ if (!atomic_dec_and_test(&allow_proceed)) - return; + goto out; /* * Queue a backup emergency shutdown in the event of @@ -594,6 +604,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); +out: + printk_prefer_direct_exit(); } EXPORT_SYMBOL_GPL(hw_protection_shutdown); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9166220457bc..40024e03d422 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); + printk_prefer_direct_enter(); + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); + + printk_prefer_direct_exit(); } return HRTIMER_RESTART; diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 247bf0b1582c..701f35f0e2d4 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; + printk_prefer_direct_enter(); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); @@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event, if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + printk_prefer_direct_exit(); + __this_cpu_write(hard_watchdog_warn, true); return; } -- cgit v1.2.3 From 09c5ba0aa2fcfdadb17d045c3ee6f86d69270df7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:48 +0206 Subject: printk: add kthread console printers Create a kthread for each console to perform console printing. During normal operation (@system_state == SYSTEM_RUNNING), the kthread printers are responsible for all printing on their respective consoles. During non-normal operation, console printing is done as it has been: within the context of the printk caller or within irqwork triggered by the printk caller, referred to as direct printing. Since threaded console printers are responsible for all printing during normal operation, this also includes messages generated via deferred printk calls. If direct printing is in effect during a deferred printk call, the queued irqwork will perform the direct printing. To make it clear that this is the only time that the irqwork will perform direct printing, rename the flag PRINTK_PENDING_OUTPUT to PRINTK_PENDING_DIRECT_OUTPUT. Threaded console printers synchronize against each other and against console lockers by taking the console lock for each message that is printed. Note that the kthread printers do not care about direct printing. They will always try to print if new records are available. They can be blocked by direct printing, but will be woken again once direct printing is finished. Console unregistration is a bit tricky because the associated kthread printer cannot be stopped while the console lock is held. A policy is implemented that states: whichever task clears con->thread (under the console lock) is responsible for stopping the kthread. unregister_console() will clear con->thread while the console lock is held and then stop the kthread after releasing the console lock. For consoles that have implemented the exit() callback, the kthread is stopped before exit() is called. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-14-john.ogness@linutronix.de --- include/linux/console.h | 2 + kernel/printk/printk.c | 329 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 309 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 8c1686e2c233..9a251e70c090 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,6 +153,8 @@ struct console { uint ospeed; u64 seq; unsigned long dropped; + struct task_struct *thread; + void *data; struct console *next; }; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ed7f738261cc..e4cdc424c826 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -361,6 +361,13 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * A flag to signify if printk_activate_kthreads() has already started the + * kthread printers. If true, any later registered consoles must start their + * own kthread directly. The flag is write protected by the console_lock. + */ +static bool printk_kthreads_available; + #ifdef CONFIG_PRINTK static atomic_t printk_prefer_direct = ATOMIC_INIT(0); @@ -390,6 +397,39 @@ void printk_prefer_direct_exit(void) WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); } +/* + * Calling printk() always wakes kthread printers so that they can + * flush the new message to their respective consoles. Also, if direct + * printing is allowed, printk() tries to flush the messages directly. + * + * Direct printing is allowed in situations when the kthreads + * are not available or the system is in a problematic state. + * + * See the implementation about possible races. + */ +static inline bool allow_direct_printing(void) +{ + /* + * Checking kthread availability is a possible race because the + * kthread printers can become permanently disabled during runtime. + * However, doing that requires holding the console_lock, so any + * pending messages will be direct printed by console_unlock(). + */ + if (!printk_kthreads_available) + return true; + + /* + * Prefer direct printing when the system is in a problematic state. + * The context that sets this state will always see the updated value. + * The other contexts do not care. Anyway, direct printing is just a + * best effort. The direct output is only possible when console_lock + * is not already taken and no kthread printers are actively printing. + */ + return (system_state > SYSTEM_RUNNING || + oops_in_progress || + atomic_read(&printk_prefer_direct)); +} + DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -2280,10 +2320,10 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (!in_sched && allow_direct_printing()) { /* * The caller may be holding system-critical or - * timing-sensitive locks. Disable preemption during + * timing-sensitive locks. Disable preemption during direct * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. @@ -2326,6 +2366,8 @@ EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); +static void printk_start_kthread(struct console *con); + #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 @@ -2359,6 +2401,8 @@ static void call_console_driver(struct console *con, const char *text, size_t le } static bool suppress_message_printing(int level) { return false; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } +static void printk_start_kthread(struct console *con) { } +static bool allow_direct_printing(void) { return true; } #endif /* CONFIG_PRINTK */ @@ -2559,6 +2603,13 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + + /* + * While suspended, new records may have been added to the + * ringbuffer. Wake up the kthread printers to print them. + */ + wake_up_klogd(); + pr_flush(1000, true); } @@ -2577,6 +2628,9 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); + + /* Wake kthread printers. Some may have become usable. */ + wake_up_klogd(); } return 0; } @@ -2648,18 +2702,9 @@ static bool abandon_console_lock_in_panic(void) return atomic_read(&panic_cpu) != raw_smp_processor_id(); } -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_lock. - */ -static inline bool console_is_usable(struct console *con) +static inline bool __console_is_usable(short flags) { - if (!(con->flags & CON_ENABLED)) - return false; - - if (!con->write) + if (!(flags & CON_ENABLED)) return false; /* @@ -2668,12 +2713,26 @@ static inline bool console_is_usable(struct console *con) * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && - !(con->flags & CON_ANYTIME)) + !(flags & CON_ANYTIME)) return false; return true; } +/* + * Check if the given console is currently capable and allowed to print + * records. + * + * Requires the console_lock. + */ +static inline bool console_is_usable(struct console *con) +{ + if (!con->write) + return false; + + return __console_is_usable(con->flags); +} + static void __console_unlock(void) { console_locked = 0; @@ -2786,8 +2845,8 @@ skip: * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this - * is not the panic CPU). Regardless the reason, the caller should assume it - * is not useful to immediately try again. + * is not the panic CPU or direct printing is not preferred). Regardless the + * reason, the caller should assume it is not useful to immediately try again. * * Requires the console_lock. */ @@ -2804,6 +2863,10 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove *handover = false; do { + /* Let the kthread printers do the work if they can. */ + if (!allow_direct_printing()) + return false; + any_progress = false; for_each_console(con) { @@ -3018,6 +3081,10 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + + /* Wake the newly enabled kthread printer. */ + wake_up_klogd(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3218,6 +3285,8 @@ void register_console(struct console *newcon) nr_ext_console_drivers++; newcon->dropped = 0; + newcon->thread = NULL; + if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); @@ -3227,6 +3296,10 @@ void register_console(struct console *newcon) /* Begin with next message. */ newcon->seq = prb_next_seq(prb); } + + if (printk_kthreads_available) + printk_start_kthread(newcon); + console_unlock(); console_sysfs_notify(); @@ -3253,6 +3326,7 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { + struct task_struct *thd; struct console *con; int res; @@ -3293,7 +3367,20 @@ int unregister_console(struct console *console) console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; + + /* + * console->thread can only be cleared under the console lock. But + * stopping the thread must be done without the console lock. The + * task that clears @thread is the task that stops the kthread. + */ + thd = console->thread; + console->thread = NULL; + console_unlock(); + + if (thd) + kthread_stop(thd); + console_sysfs_notify(); if (console->exit) @@ -3389,6 +3476,20 @@ static int __init printk_late_init(void) } late_initcall(printk_late_init); +static int __init printk_activate_kthreads(void) +{ + struct console *con; + + console_lock(); + printk_kthreads_available = true; + for_each_console(con) + printk_start_kthread(con); + console_unlock(); + + return 0; +} +early_initcall(printk_activate_kthreads); + #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) @@ -3463,11 +3564,180 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) } EXPORT_SYMBOL(pr_flush); +static void __printk_fallback_preferred_direct(void) +{ + printk_prefer_direct_enter(); + pr_err("falling back to preferred direct printing\n"); + printk_kthreads_available = false; +} + +/* + * Enter preferred direct printing, but never exit. Mark console threads as + * unavailable. The system is then forever in preferred direct printing and + * any printing threads will exit. + * + * Must *not* be called under console_lock. Use + * __printk_fallback_preferred_direct() if already holding console_lock. + */ +static void printk_fallback_preferred_direct(void) +{ + console_lock(); + __printk_fallback_preferred_direct(); + console_unlock(); +} + +static bool printer_should_wake(struct console *con, u64 seq) +{ + short flags; + + if (kthread_should_stop() || !printk_kthreads_available) + return true; + + if (console_suspended) + return false; + + /* + * This is an unsafe read from con->flags, but a false positive is + * not a problem. Worst case it would allow the printer to wake up + * although it is disabled. But the printer will notice that when + * attempting to print and instead go back to sleep. + */ + flags = data_race(READ_ONCE(con->flags)); + + if (!__console_is_usable(flags)) + return false; + + return prb_read_valid(prb, seq, NULL); +} + +static int printk_kthread_func(void *data) +{ + struct console *con = data; + char *dropped_text = NULL; + char *ext_text = NULL; + bool handover; + u64 seq = 0; + char *text; + int error; + + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) { + con_printk(KERN_ERR, con, "failed to allocate text buffer\n"); + printk_fallback_preferred_direct(); + goto out; + } + + if (con->flags & CON_EXTENDED) { + ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!ext_text) { + con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n"); + printk_fallback_preferred_direct(); + goto out; + } + } else { + dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); + if (!dropped_text) { + con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n"); + printk_fallback_preferred_direct(); + goto out; + } + } + + con_printk(KERN_INFO, con, "printing thread started\n"); + + for (;;) { + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ + error = wait_event_interruptible(log_wait, + printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */ + + if (kthread_should_stop() || !printk_kthreads_available) + break; + + if (error) + continue; + + console_lock(); + + if (console_suspended) { + up_console_sem(); + continue; + } + + if (!console_is_usable(con)) { + __console_unlock(); + continue; + } + + /* + * Even though the printk kthread is always preemptible, it is + * still not allowed to call cond_resched() from within + * console drivers. The task may become non-preemptible in the + * console driver call chain. For example, vt_console_print() + * takes a spinlock and then can call into fbcon_redraw(), + * which can conditionally invoke cond_resched(). + */ + console_may_schedule = 0; + console_emit_next_record(con, text, ext_text, dropped_text, &handover); + if (handover) + continue; + + seq = con->seq; + + __console_unlock(); + } + + con_printk(KERN_INFO, con, "printing thread stopped\n"); +out: + kfree(dropped_text); + kfree(ext_text); + kfree(text); + + console_lock(); + /* + * If this kthread is being stopped by another task, con->thread will + * already be NULL. That is fine. The important thing is that it is + * NULL after the kthread exits. + */ + con->thread = NULL; + console_unlock(); + + return 0; +} + +/* Must be called under console_lock. */ +static void printk_start_kthread(struct console *con) +{ + /* + * Do not start a kthread if there is no write() callback. The + * kthreads assume the write() callback exists. + */ + if (!con->write) + return; + + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { + con->thread = NULL; + con_printk(KERN_ERR, con, "unable to start printing thread\n"); + __printk_fallback_preferred_direct(); + return; + } +} + /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_OUTPUT 0x02 +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_DIRECT_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); @@ -3475,10 +3745,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_OUTPUT) { + if (pending & PRINTK_PENDING_DIRECT_OUTPUT) { + printk_prefer_direct_enter(); + /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); + + printk_prefer_direct_exit(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -3503,10 +3777,11 @@ static void __wake_up_klogd(int val) * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * - * This pairs with devkmsg_read:A and syslog_print:A. + * This pairs with devkmsg_read:A, syslog_print:A, and + * printk_kthread_func:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ - (val & PRINTK_PENDING_OUTPUT)) { + (val & PRINTK_PENDING_DIRECT_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } @@ -3524,7 +3799,17 @@ void defer_console_output(void) * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ - __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); + int val = PRINTK_PENDING_WAKEUP; + + /* + * Make sure that some context will print the messages when direct + * printing is allowed. This happens in situations when the kthreads + * may not be as reliable or perhaps unusable. + */ + if (allow_direct_printing()) + val |= PRINTK_PENDING_DIRECT_OUTPUT; + + __wake_up_klogd(val); } void printk_trigger_flush(void) -- cgit v1.2.3 From df86ca0d2f0fa6be525a25b0b3d836d361f85754 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Apr 2022 20:39:43 -0700 Subject: bpf: Allow attach TRACING programs through LINK_CREATE command Allow attaching BTF-aware TRACING programs, previously attachable only through BPF_RAW_TRACEPOINT_OPEN command, through LINK_CREATE command: - BTF-aware raw tracepoints (tp_btf in libbpf lingo); - fentry/fexit/fmod_ret programs; - BPF LSM programs. This change converges all bpf_link-based attachments under LINK_CREATE command allowing to further extend the API with features like BPF cookie under "multiplexed" link_create section of bpf_attr. Non-BTF-aware raw tracepoints are left under BPF_RAW_TRACEPOINT_OPEN, but there is nothing preventing opening them up to LINK_CREATE as well. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Kuifeng Lee Link: https://lore.kernel.org/bpf/20220421033945.3602803-2-andrii@kernel.org --- kernel/bpf/syscall.c | 110 ++++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e9621cfa09f2..e9e3e49c0eb7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3030,66 +3030,45 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } #endif /* CONFIG_PERF_EVENTS */ -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd - -static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +static int bpf_raw_tp_link_attach(struct bpf_prog *prog, + const char __user *user_tp_name) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; const char *tp_name; char buf[128]; int err; - if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) - return -EINVAL; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: - if (attr->raw_tracepoint.name) { + if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. */ - err = -EINVAL; - goto out_put_prog; - } + return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } - err = bpf_tracing_prog_attach(prog, 0, 0); - if (err >= 0) - return err; - goto out_put_prog; + return bpf_tracing_prog_attach(prog, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: - if (strncpy_from_user(buf, - u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(buf) - 1) < 0) { - err = -EFAULT; - goto out_put_prog; - } + if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) + return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: - err = -EINVAL; - goto out_put_prog; + return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); - if (!btp) { - err = -ENOENT; - goto out_put_prog; - } + if (!btp) + return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER); if (!link) { @@ -3116,11 +3095,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) out_put_btp: bpf_put_raw_tracepoint(btp); -out_put_prog: - bpf_prog_put(prog); return err; } +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + int fd; + + if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) + return -EINVAL; + + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + if (fd < 0) + bpf_prog_put(prog); + return fd; +} + static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -3189,7 +3186,13 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; + case BPF_LSM_MAC: + return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: @@ -4246,21 +4249,6 @@ err_put: return err; } -static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, - struct bpf_prog *prog) -{ - if (attr->link_create.attach_type != prog->expected_attach_type) - return -EINVAL; - - if (prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, uattr, prog); - else if (prog->type == BPF_PROG_TYPE_EXT) - return bpf_tracing_prog_attach(prog, - attr->link_create.target_fd, - attr->link_create.target_btf_id); - return -EINVAL; -} - #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { @@ -4282,15 +4270,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: - ret = tracing_bpf_link_attach(attr, uattr, prog); - goto out; + break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; goto out; } - ptype = prog->type; break; case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type != BPF_PERF_EVENT && @@ -4298,7 +4284,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = -EINVAL; goto out; } - ptype = prog->type; break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); @@ -4309,7 +4294,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) break; } - switch (ptype) { + switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -4319,8 +4304,25 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_EXT: + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); + break; + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: - ret = tracing_bpf_link_attach(attr, uattr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + ret = bpf_raw_tp_link_attach(prog, NULL); + else if (prog->expected_attach_type == BPF_TRACE_ITER) + ret = bpf_iter_link_attach(attr, uattr, prog); + else + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: -- cgit v1.2.3 From 9ea4dcf49878bb9546b8fa9319dcbdc9b7ee20f8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Apr 2022 15:58:11 -0700 Subject: PM: CXL: Disable suspend The CXL specification claims S3 support at a hardware level, but at a system software level there are some missing pieces. Section 9.4 (CXL 2.0) rightly claims that "CXL mem adapters may need aux power to retain memory context across S3", but there is no enumeration mechanism for the OS to determine if a given adapter has that support. Moreover the save state and resume image for the system may inadvertantly end up in a CXL device that needs to be restored before the save state is recoverable. I.e. a circular dependency that is not resolvable without a third party save-area. Arrange for the cxl_mem driver to fail S3 attempts. This still nominaly allows for suspend, but requires unbinding all CXL memory devices before the suspend to ensure the typical DRAM flow is taken. The cxl_mem unbind flow is intended to also tear down all CXL memory regions associated with a given cxl_memdev. It is reasonable to assume that any device participating in a System RAM range published in the EFI memory map is covered by aux power and save-area outside the device itself. So this restriction can be minimized in the future once pre-existing region enumeration support arrives, and perhaps a spec update to clarify if the EFI memory map is sufficent for determining the range of devices managed by platform-firmware for S3 support. Per Rafael, if the CXL configuration prevents suspend then it should fail early before tasks are frozen, and mem_sleep should stop showing 'mem' as an option [1]. Effectively CXL augments the platform suspend ->valid() op since, for example, the ACPI ops are not aware of the CXL / PCI dependencies. Given the split role of platform firmware vs OS provisioned CXL memory it is up to the cxl_mem driver to determine if the CXL configuration has elements that platform firmware may not be prepared to restore. Link: https://lore.kernel.org/r/CAJZ5v0hGVN_=3iU8OLpHY3Ak35T5+JcBM-qs8SbojKrpd0VXsA@mail.gmail.com [1] Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/165066828317.3907920.5690432272182042556.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/Makefile | 2 +- drivers/cxl/Kconfig | 4 ++++ drivers/cxl/Makefile | 2 +- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/suspend.c | 24 ++++++++++++++++++++++++ drivers/cxl/cxlmem.h | 11 +++++++++++ drivers/cxl/mem.c | 22 +++++++++++++++++++++- include/linux/pm.h | 9 +++++++++ kernel/power/hibernate.c | 2 +- kernel/power/main.c | 5 ++++- kernel/power/suspend.c | 3 ++- 11 files changed, 79 insertions(+), 6 deletions(-) create mode 100644 drivers/cxl/core/suspend.c (limited to 'kernel') diff --git a/drivers/Makefile b/drivers/Makefile index 020780b6b4d2..f735c4955143 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -72,9 +72,9 @@ obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DAX) += dax/ -obj-$(CONFIG_CXL_BUS) += cxl/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ +obj-y += cxl/ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index b88ab956bb7c..f64e3984689f 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -98,4 +98,8 @@ config CXL_PORT default CXL_BUS tristate +config CXL_SUSPEND + def_bool y + depends on SUSPEND && CXL_MEM + endif diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile index ce267ef11d93..a78270794150 100644 --- a/drivers/cxl/Makefile +++ b/drivers/cxl/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CXL_BUS) += core/ +obj-y += core/ obj-$(CONFIG_CXL_PCI) += cxl_pci.o obj-$(CONFIG_CXL_MEM) += cxl_mem.o obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 6d37cd78b151..9d35085d25af 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CXL_BUS) += cxl_core.o +obj-$(CONFIG_CXL_SUSPEND) += suspend.o ccflags-y += -I$(srctree)/drivers/cxl cxl_core-y := port.o diff --git a/drivers/cxl/core/suspend.c b/drivers/cxl/core/suspend.c new file mode 100644 index 000000000000..a5984d96ea1d --- /dev/null +++ b/drivers/cxl/core/suspend.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include "cxlmem.h" + +static atomic_t mem_active; + +bool cxl_mem_active(void) +{ + return atomic_read(&mem_active) != 0; +} + +void cxl_mem_active_inc(void) +{ + atomic_inc(&mem_active); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, CXL); + +void cxl_mem_active_dec(void) +{ + atomic_dec(&mem_active); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_active_dec, CXL); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 243dd86a8b46..7235d2f976e5 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -353,6 +353,17 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); struct cxl_dev_state *cxl_dev_state_create(struct device *dev); void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); +#ifdef CONFIG_CXL_SUSPEND +void cxl_mem_active_inc(void); +void cxl_mem_active_dec(void); +#else +static inline void cxl_mem_active_inc(void) +{ +} +static inline void cxl_mem_active_dec(void) +{ +} +#endif struct cxl_hdm { struct cxl_component_regs regs; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 43e73d259207..1bd2e0b67f59 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -138,6 +138,11 @@ out: return retval; } +static void enable_suspend(void *data) +{ + cxl_mem_active_dec(); +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -194,7 +199,22 @@ static int cxl_mem_probe(struct device *dev) out: cxl_device_unlock(&parent_port->dev); put_device(&parent_port->dev); - return rc; + + /* + * The kernel may be operating out of CXL memory on this device, + * there is no spec defined way to determine whether this device + * preserves contents over suspend, and there is no simple way + * to arrange for the suspend image to avoid CXL memory which + * would setup a circular dependency between PCI resume and save + * state restoration. + * + * TODO: support suspend when all the regions this device is + * hosting are locked and covered by the system address map, + * i.e. platform firmware owns restoring the HDM configuration + * that it locked. + */ + cxl_mem_active_inc(); + return devm_add_action_or_reset(dev, enable_suspend, NULL); } static struct cxl_driver cxl_mem_driver = { diff --git a/include/linux/pm.h b/include/linux/pm.h index e65b3ab28377..7911c4c9a7be 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -36,6 +36,15 @@ static inline void pm_vt_switch_unregister(struct device *dev) } #endif /* CONFIG_VT_CONSOLE_SLEEP */ +#ifdef CONFIG_CXL_SUSPEND +bool cxl_mem_active(void); +#else +static inline bool cxl_mem_active(void) +{ + return false; +} +#endif + /* * Device power management */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 938d5c78b421..20a66bf9f465 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -83,7 +83,7 @@ bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION) && - !secretmem_active(); + !secretmem_active() && !cxl_mem_active(); } /** diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..3e6be1c33e0b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -127,7 +127,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *s = buf; suspend_state_t i; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { + if (i >= PM_SUSPEND_MEM && cxl_mem_active()) + continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; @@ -136,6 +138,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, else s += sprintf(s, "%s ", label); } + } /* Convert the last space to a newline if needed. */ if (s != buf) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6fcdee7e87a5..827075944d28 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -236,7 +236,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE || valid_state(state); + return state == PM_SUSPEND_TO_IDLE || + (valid_state(state) && !cxl_mem_active()); } static int platform_suspend_prepare(suspend_state_t state) -- cgit v1.2.3 From 867a448d587e7fa845bceaf4ee1c632448f2a9fa Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:15 +0300 Subject: fsnotify: pass flags argument to fsnotify_alloc_group() Add flags argument to fsnotify_alloc_group(), define and use the flag FSNOTIFY_GROUP_USER in inotify and fanotify instead of the helper fsnotify_alloc_user_group() to indicate user allocation. Although the flag FSNOTIFY_GROUP_USER is currently not used after group allocation, we store the flags argument in the group struct for future use of other group flags. Link: https://lore.kernel.org/r/20220422120327.3459282-5-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 3 ++- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify_user.c | 3 ++- fs/notify/group.c | 21 +++++++++------------ fs/notify/inotify/inotify_user.c | 3 ++- include/linux/fsnotify_backend.h | 8 ++++++-- kernel/audit_fsnotify.c | 3 ++- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 9 files changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2c1b027774d4..74ddb828fd75 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -678,7 +678,8 @@ nfsd_file_cache_init(void) goto out_shrinker; } - nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops, + 0); if (IS_ERR(nfsd_file_fsnotify_group)) { pr_err("nfsd: unable to create fsnotify group: %ld\n", PTR_ERR(nfsd_file_fsnotify_group)); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 829dd4a61b66..e4779926edf4 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -401,7 +401,7 @@ static int __init dnotify_init(void) SLAB_PANIC|SLAB_ACCOUNT); dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); - dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); + dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, 0); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); dnotify_sysctl_init(); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9b32b76a9c30..3649c99b3e45 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1355,7 +1355,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) f_flags |= O_NONBLOCK; /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ - group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); + group = fsnotify_alloc_group(&fanotify_fsnotify_ops, + FSNOTIFY_GROUP_USER); if (IS_ERR(group)) { return PTR_ERR(group); } diff --git a/fs/notify/group.c b/fs/notify/group.c index b7d4d64f87c2..18446b7b0d49 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -112,7 +112,8 @@ void fsnotify_put_group(struct fsnotify_group *group) EXPORT_SYMBOL_GPL(fsnotify_put_group); static struct fsnotify_group *__fsnotify_alloc_group( - const struct fsnotify_ops *ops, gfp_t gfp) + const struct fsnotify_ops *ops, + int flags, gfp_t gfp) { struct fsnotify_group *group; @@ -133,6 +134,7 @@ static struct fsnotify_group *__fsnotify_alloc_group( INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + group->flags = flags; return group; } @@ -140,20 +142,15 @@ static struct fsnotify_group *__fsnotify_alloc_group( /* * Create a new fsnotify_group and hold a reference for the group returned. */ -struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops, + int flags) { - return __fsnotify_alloc_group(ops, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(fsnotify_alloc_group); + gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT : + GFP_KERNEL; -/* - * Create a new fsnotify_group and hold a reference for the group returned. - */ -struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops) -{ - return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT); + return __fsnotify_alloc_group(ops, flags, gfp); } -EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group); +EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index d8907d32a05b..146890ecd93a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -656,7 +656,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) struct fsnotify_group *group; struct inotify_event_info *oevent; - group = fsnotify_alloc_user_group(&inotify_fsnotify_ops); + group = fsnotify_alloc_group(&inotify_fsnotify_ops, + FSNOTIFY_GROUP_USER); if (IS_ERR(group)) return group; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b1c72edd9784..f0bf557af009 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -210,6 +210,9 @@ struct fsnotify_group { unsigned int priority; bool shutdown; /* group is being shut down, don't queue more events */ +#define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ + int flags; + /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user @@ -543,8 +546,9 @@ static inline void fsnotify_update_flags(struct dentry *dentry) /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ -extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops); -extern struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops); +extern struct fsnotify_group *fsnotify_alloc_group( + const struct fsnotify_ops *ops, + int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 02348b48447c..35fe149586c8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -181,7 +181,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = { static int __init audit_fsnotify_init(void) { - audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops, + 0); if (IS_ERR(audit_fsnotify_group)) { audit_fsnotify_group = NULL; audit_panic("cannot create audit fsnotify group"); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e7315d487163..b5c02f8573fe 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1074,7 +1074,7 @@ static int __init audit_tree_init(void) audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC); - audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0); if (IS_ERR(audit_tree_group)) audit_panic("cannot initialize fsnotify group for rectree watches"); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 713b256be944..4b0957aa2cd4 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -493,7 +493,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = { static int __init audit_watch_init(void) { - audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0); if (IS_ERR(audit_watch_group)) { audit_watch_group = NULL; audit_panic("cannot create audit fsnotify group"); -- cgit v1.2.3 From f3010343d9e119da35ee864b3a28993bb5c78ed7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:16 +0300 Subject: fsnotify: make allow_dups a property of the group Instead of passing the allow_dups argument to fsnotify_add_mark() as an argument, define the group flag FSNOTIFY_GROUP_DUPS to express the allow_dups behavior and set this behavior at group creation time for all calls of fsnotify_add_mark(). Rename the allow_dups argument to generic add_flags argument for future use. Link: https://lore.kernel.org/r/20220422120327.3459282-6-amir73il@gmail.com Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/mark.c | 12 ++++++------ include/linux/fsnotify_backend.h | 13 +++++++------ kernel/audit_fsnotify.c | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c86982be2d50..1fb246ea6175 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -574,7 +574,7 @@ out: static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int allow_dups, __kernel_fsid_t *fsid) + int add_flags, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -633,7 +633,7 @@ restart: if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && - !allow_dups) { + !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } @@ -668,7 +668,7 @@ out_err: */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int allow_dups, __kernel_fsid_t *fsid) + int add_flags, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -688,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); if (ret) goto err; @@ -708,14 +708,14 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int allow_dups, + unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); mutex_unlock(&group->mark_mutex); return ret; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index f0bf557af009..dd440e6ff528 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -211,6 +211,7 @@ struct fsnotify_group { bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ +#define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ int flags; /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ @@ -641,26 +642,26 @@ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn, /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int allow_dups, __kernel_fsid_t *fsid); + int add_flags, __kernel_fsid_t *fsid); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int allow_dups, + unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, - int allow_dups) + int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, - int allow_dups) + int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, allow_dups, + FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); } diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 35fe149586c8..6432a37ac1c9 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -100,7 +100,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true); + ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); @@ -182,7 +182,7 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = { static int __init audit_fsnotify_init(void) { audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops, - 0); + FSNOTIFY_GROUP_DUPS); if (IS_ERR(audit_fsnotify_group)) { audit_fsnotify_group = NULL; audit_panic("cannot create audit fsnotify group"); -- cgit v1.2.3 From 960bdff24ce802e38df918ebfcbfa62744d6ae22 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:19 +0300 Subject: audit: use fsnotify group lock helpers audit inode marks pin the inode so there is no need to set the FSNOTIFY_GROUP_NOFS flag. Link: https://lore.kernel.org/r/20220422120327.3459282-9-amir73il@gmail.com Suggested-by: Jan Kara Link: https://lore.kernel.org/r/20220321112310.vpr7oxro2xkz5llh@quack3.lan/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/audit_tree.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b5c02f8573fe..e867c17d3f84 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -351,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) struct audit_chunk *new; int size; - mutex_lock(&audit_tree_group->mark_mutex); + fsnotify_group_lock(audit_tree_group); /* * mark_mutex stabilizes chunk attached to the mark so we can check * whether it didn't change while we've dropped hash_lock. @@ -368,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); fsnotify_detach_mark(mark); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); audit_mark_put_chunk(chunk); fsnotify_free_mark(mark); return; @@ -385,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) */ replace_chunk(new, chunk); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); audit_mark_put_chunk(chunk); return; out_mutex: - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); } /* Call with group->mark_mutex held, releases it */ @@ -400,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); return -ENOMEM; } mark = alloc_mark(); if (!mark) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); kfree(chunk); return -ENOMEM; } if (fsnotify_add_inode_mark_locked(mark, inode, 0)) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); kfree(chunk); return -ENOSPC; @@ -422,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); fsnotify_detach_mark(mark); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); kfree(chunk); @@ -444,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) */ insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); /* * Drop our initial reference. When mark we point to is getting freed, * we get notification through ->freeing_mark callback and cleanup @@ -462,7 +462,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct audit_node *p; int n; - mutex_lock(&audit_tree_group->mark_mutex); + fsnotify_group_lock(audit_tree_group); mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); if (!mark) return create_chunk(inode, tree); @@ -478,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); return 0; } @@ -487,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); return -ENOMEM; } @@ -495,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); kfree(chunk); return 0; @@ -515,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) */ replace_chunk(chunk, old); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */ audit_mark_put_chunk(old); @@ -1044,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark, { struct audit_chunk *chunk; - mutex_lock(&mark->group->mark_mutex); + fsnotify_group_lock(mark->group); spin_lock(&hash_lock); chunk = mark_chunk(mark); replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); - mutex_unlock(&mark->group->mark_mutex); + fsnotify_group_unlock(mark->group); if (chunk) { evict_chunk(chunk); audit_mark_put_chunk(chunk); -- cgit v1.2.3 From a467257ffe4bdb13eacddec0137013f6a1140b81 Mon Sep 17 00:00:00 2001 From: yingelin Date: Sun, 24 Apr 2022 10:57:40 +0800 Subject: kernel/kexec_core: move kexec_core sysctls into its own file This move the kernel/kexec_core.c respective sysctls to its own file. kernel/sysctl.c has grown to an insane mess, We move sysctls to places where features actually belong to improve the readability and reduce merge conflicts. At the same time, the proc-sysctl maintainers can easily care about the core logic other than the sysctl knobs added for some feature. We already moved all filesystem sysctls out. This patch is part of the effort to move kexec related sysctls out. Signed-off-by: yingelin Acked-by: Baoquan He Signed-off-by: Luis Chamberlain --- kernel/kexec_core.c | 22 ++++++++++++++++++++++ kernel/sysctl.c | 13 ------------- 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..a0456baf52cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -936,6 +936,28 @@ int kimage_load_segment(struct kimage *image, struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; +#ifdef CONFIG_SYSCTL +static struct ctl_table kexec_core_sysctls[] = { + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init kexec_core_sysctl_init(void) +{ + register_sysctl_init("kernel", kexec_core_sysctls); + return 0; +} +late_initcall(kexec_core_sysctl_init); +#endif /* * No panic_cpu check version of crash_kexec(). This function is called diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b60345cbadf0..0f3cb61a2e39 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -1712,18 +1711,6 @@ static struct ctl_table kern_table[] = { .proc_handler = tracepoint_printk_sysctl, }, #endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", -- cgit v1.2.3 From d9d31cf88702ae071bec033e5c8714048aa71285 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 25 Apr 2022 15:04:48 -0700 Subject: bpf: Use bpf_prog_run_array_cg_flags everywhere Rename bpf_prog_run_array_cg_flags to bpf_prog_run_array_cg and use it everywhere. check_return_code already enforces sane return ranges for all cgroup types. (only egress and bind hooks have uncanonical return ranges, the rest is using [0, 1]) No functional changes. v2: - 'func_ret & 1' under explicit test (Andrii & Martin) Suggested-by: Alexei Starovoitov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220425220448.3669032-1-sdf@google.com --- include/linux/bpf-cgroup.h | 8 ++---- kernel/bpf/cgroup.c | 72 ++++++++++++++++------------------------------ 2 files changed, 26 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 88a51b242adc..669d96d074ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -225,24 +225,20 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ - u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ - NULL, \ - &__unused_flags); \ + NULL, NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ - u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ - t_ctx, \ - &__unused_flags); \ + t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0cb6211fcb58..afb414b26d01 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -25,50 +25,18 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ -static __always_inline int -bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp, - enum cgroup_bpf_attach_type atype, - const void *ctx, bpf_prog_run_fn run_prog, - int retval, u32 *ret_flags) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - u32 func_ret; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(cgrp->effective[atype]); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - func_ret = run_prog(prog, ctx); - if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - *(ret_flags) |= (func_ret >> 1); - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - static __always_inline int bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, enum cgroup_bpf_attach_type atype, const void *ctx, bpf_prog_run_fn run_prog, - int retval) + int retval, u32 *ret_flags) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; + u32 func_ret; run_ctx.retval = retval; migrate_disable(); @@ -78,7 +46,12 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) + func_ret = run_prog(prog, ctx); + if (ret_flags) { + *(ret_flags) |= (func_ret >> 1); + func_ret &= 1; + } + if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -1144,9 +1117,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, u32 flags = 0; bool cn; - ret = bpf_prog_run_array_cg_flags( - &cgrp->bpf, atype, - skb, __bpf_prog_run_save_cb, 0, &flags); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, + __bpf_prog_run_save_cb, 0, &flags); /* Return values of CGROUP EGRESS BPF programs are: * 0: drop packet @@ -1172,7 +1144,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, ret = (cn ? NET_XMIT_DROP : ret); } else { ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, - skb, __bpf_prog_run_save_cb, 0); + skb, __bpf_prog_run_save_cb, 0, + NULL); if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; } @@ -1202,7 +1175,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, + NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1247,8 +1221,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype, - &ctx, bpf_prog_run, 0, flags); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, + 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1275,7 +1249,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, - 0); + 0, NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1292,7 +1266,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); return ret; @@ -1457,7 +1432,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1550,7 +1526,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, - &ctx, bpf_prog_run, 0); + &ctx, bpf_prog_run, 0, NULL); release_sock(sk); if (ret) @@ -1650,7 +1626,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, - &ctx, bpf_prog_run, retval); + &ctx, bpf_prog_run, retval, NULL); release_sock(sk); if (ret < 0) @@ -1699,7 +1675,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, - &ctx, bpf_prog_run, retval); + &ctx, bpf_prog_run, retval, NULL); if (ret < 0) return ret; -- cgit v1.2.3 From 61df10c7799e27807ad5e459eec9d77cddf8bf45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:49 +0530 Subject: bpf: Allow storing unreferenced kptr in map This commit introduces a new pointer type 'kptr' which can be embedded in a map value to hold a PTR_TO_BTF_ID stored by a BPF program during its invocation. When storing such a kptr, BPF program's PTR_TO_BTF_ID register must have the same type as in the map value's BTF, and loading a kptr marks the destination register as PTR_TO_BTF_ID with the correct kernel BTF and BTF ID. Such kptr are unreferenced, i.e. by the time another invocation of the BPF program loads this pointer, the object which the pointer points to may not longer exist. Since PTR_TO_BTF_ID loads (using BPF_LDX) are patched to PROBE_MEM loads by the verifier, it would safe to allow user to still access such invalid pointer, but passing such pointers into BPF helpers and kfuncs should not be permitted. A future patch in this series will close this gap. The flexibility offered by allowing programs to dereference such invalid pointers while being safe at runtime frees the verifier from doing complex lifetime tracking. As long as the user may ensure that the object remains valid, it can ensure data read by it from the kernel object is valid. The user indicates that a certain pointer must be treated as kptr capable of accepting stores of PTR_TO_BTF_ID of a certain type, by using a BTF type tag 'kptr' on the pointed to type of the pointer. Then, this information is recorded in the object BTF which will be passed into the kernel by way of map's BTF information. The name and kind from the map value BTF is used to look up the in-kernel type, and the actual BTF and BTF ID is recorded in the map struct in a new kptr_off_tab member. For now, only storing pointers to structs is permitted. An example of this specification is shown below: #define __kptr __attribute__((btf_type_tag("kptr"))) struct map_value { ... struct task_struct __kptr *task; ... }; Then, in a BPF program, user may store PTR_TO_BTF_ID with the type task_struct into the map, and then load it later. Note that the destination register is marked PTR_TO_BTF_ID_OR_NULL, as the verifier cannot know whether the value is NULL or not statically, it must treat all potential loads at that map value offset as loading a possibly NULL pointer. Only BPF_LDX, BPF_STX, and BPF_ST (with insn->imm = 0 to denote NULL) are allowed instructions that can access such a pointer. On BPF_LDX, the destination register is updated to be a PTR_TO_BTF_ID, and on BPF_STX, it is checked whether the source register type is a PTR_TO_BTF_ID with same BTF type as specified in the map BTF. The access size must always be BPF_DW. For the map in map support, the kptr_off_tab for outer map is copied from the inner map's kptr_off_tab. It was chosen to do a deep copy instead of introducing a refcount to kptr_off_tab, because the copy only needs to be done when paramterizing using inner_map_fd in the map in map case, hence would be unnecessary for all other users. It is not permitted to use MAP_FREEZE command and mmap for BPF map having kptrs, similar to the bpf_timer case. A kptr also requires that BPF program has both read and write access to the map (hence both BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG are disallowed). Note that check_map_access must be called from both check_helper_mem_access and for the BPF instructions, hence the kptr check must distinguish between ACCESS_DIRECT and ACCESS_HELPER, and reject ACCESS_HELPER cases. We rename stack_access_src to bpf_access_src and reuse it for this purpose. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-2-memxor@gmail.com --- include/linux/bpf.h | 31 ++++++++- include/linux/btf.h | 2 + kernel/bpf/btf.c | 167 ++++++++++++++++++++++++++++++++++++++++++------ kernel/bpf/map_in_map.c | 5 +- kernel/bpf/syscall.c | 103 ++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 161 +++++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 433 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7bf441563ffc..ce12124048c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,24 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; +enum { + /* Support at most 8 pointers in a BPF map value */ + BPF_MAP_VALUE_OFF_MAX = 8, +}; + +struct bpf_map_value_off_desc { + u32 offset; + struct { + struct btf *btf; + u32 btf_id; + } kptr; +}; + +struct bpf_map_value_off { + u32 nr_off; + struct bpf_map_value_off_desc off[]; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -171,6 +189,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + struct bpf_map_value_off *kptr_off_tab; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -184,7 +203,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 14 bytes hole */ + /* 6 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -217,6 +236,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } +static inline bool map_value_has_kptrs(const struct bpf_map *map) +{ + return !IS_ERR_OR_NULL(map->kptr_off_tab); +} + static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) @@ -1396,6 +1420,11 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); +void bpf_map_free_kptr_off_tab(struct bpf_map *map); +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); + struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); diff --git a/include/linux/btf.h b/include/linux/btf.h index 36bc09b8e890..19c297f9a52f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -123,6 +123,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0493310d981f..563ac61e6d6b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3166,9 +3166,16 @@ static void btf_struct_log(struct btf_verifier_env *env, enum btf_field_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, + BTF_FIELD_KPTR, +}; + +enum { + BTF_FIELD_IGNORE = 0, + BTF_FIELD_FOUND = 1, }; struct btf_field_info { + u32 type_id; u32 off; }; @@ -3176,29 +3183,57 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { if (!__btf_type_is_struct(t)) - return 0; + return BTF_FIELD_IGNORE; if (t->size != sz) - return 0; - if (info->off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; + return BTF_FIELD_IGNORE; info->off = off; - return 0; + return BTF_FIELD_FOUND; +} + +static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + u32 res_id; + + /* For PTR, sz is always == 8 */ + if (!btf_type_is_ptr(t)) + return BTF_FIELD_IGNORE; + t = btf_type_by_id(btf, t->type); + + if (!btf_type_is_type_tag(t)) + return BTF_FIELD_IGNORE; + /* Reject extra tags */ + if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) + return -EINVAL; + if (strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + return -EINVAL; + + /* Get the base type */ + t = btf_type_skip_modifiers(btf, t->type, &res_id); + /* Only pointer to struct is allowed */ + if (!__btf_type_is_struct(t)) + return -EINVAL; + + info->type_id = res_id; + info->off = off; + return BTF_FIELD_FOUND; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, const char *name, int sz, int align, enum btf_field_type field_type, - struct btf_field_info *info) + struct btf_field_info *info, int info_cnt) { const struct btf_member *member; + struct btf_field_info tmp; + int ret, idx = 0; u32 i, off; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; off = __btf_member_bit_offset(t, member); @@ -3212,20 +3247,38 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t switch (field_type) { case BTF_FIELD_SPIN_LOCK: case BTF_FIELD_TIMER: - return btf_find_struct(btf, member_type, off, sz, info); + ret = btf_find_struct(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return 0; + return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, const char *name, int sz, int align, enum btf_field_type field_type, - struct btf_field_info *info) + struct btf_field_info *info, int info_cnt) { const struct btf_var_secinfo *vsi; + struct btf_field_info tmp; + int ret, idx = 0; u32 i, off; for_each_vsi(i, t, vsi) { @@ -3234,7 +3287,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, off = vsi->offset; - if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) continue; if (vsi->size != sz) continue; @@ -3244,17 +3297,33 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BTF_FIELD_SPIN_LOCK: case BTF_FIELD_TIMER: - return btf_find_struct(btf, var_type, off, sz, info); + ret = btf_find_struct(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return 0; + return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, enum btf_field_type field_type, - struct btf_field_info *info) + struct btf_field_info *info, int info_cnt) { const char *name; int sz, align; @@ -3270,14 +3339,19 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, sz = sizeof(struct bpf_timer); align = __alignof__(struct bpf_timer); break; + case BTF_FIELD_KPTR: + name = NULL; + sz = sizeof(u64); + align = 8; + break; default: return -EFAULT; } if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info); + return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info); + return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); return -EINVAL; } @@ -3287,26 +3361,77 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, */ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) { - struct btf_field_info info = { .off = -ENOENT }; + struct btf_field_info info; int ret; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info); + ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); if (ret < 0) return ret; + if (!ret) + return -ENOENT; return info.off; } int btf_find_timer(const struct btf *btf, const struct btf_type *t) { - struct btf_field_info info = { .off = -ENOENT }; + struct btf_field_info info; int ret; - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info); + ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); if (ret < 0) return ret; + if (!ret) + return -ENOENT; return info.off; } +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t) +{ + struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; + struct bpf_map_value_off *tab; + struct btf *kernel_btf = NULL; + int ret, i, nr_off; + + ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + if (ret < 0) + return ERR_PTR(ret); + if (!ret) + return NULL; + + nr_off = ret; + tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_off; i++) { + const struct btf_type *t; + s32 id; + + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info_arr[i].type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) { + ret = id; + goto end; + } + + tab->off[i].offset = info_arr[i].off; + tab->off[i].kptr.btf_id = id; + tab->off[i].kptr.btf = kernel_btf; + } + tab->nr_off = nr_off; + return tab; +end: + while (i--) + btf_put(tab->off[i].kptr.btf); + kfree(tab); + return ERR_PTR(ret); +} + static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5cd8f5277279..135205d0d560 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { + bpf_map_free_kptr_off_tab(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && - meta0->map_flags == meta1->map_flags; + meta0->map_flags == meta1->map_flags && + bpf_map_equal_kptr_off_tab(meta0, meta1); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e9e3e49c0eb7..575b09339360 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -473,12 +474,84 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +static int bpf_map_kptr_off_cmp(const void *a, const void *b) +{ + const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + + if (off_desc1->offset < off_desc2->offset) + return -1; + else if (off_desc1->offset > off_desc2->offset) + return 1; + return 0; +} + +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +{ + /* Since members are iterated in btf_find_field in increasing order, + * offsets appended to kptr_off_tab are in increasing order, so we can + * do bsearch to find exact match. + */ + struct bpf_map_value_off *tab; + + if (!map_value_has_kptrs(map)) + return NULL; + tab = map->kptr_off_tab; + return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); +} + +void bpf_map_free_kptr_off_tab(struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + if (!map_value_has_kptrs(map)) + return; + for (i = 0; i < tab->nr_off; i++) + btf_put(tab->off[i].kptr.btf); + kfree(tab); + map->kptr_off_tab = NULL; +} + +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; + int size, i; + + if (!map_value_has_kptrs(map)) + return ERR_PTR(-ENOENT); + size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); + new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) + return ERR_PTR(-ENOMEM); + /* Do a deep copy of the kptr_off_tab */ + for (i = 0; i < tab->nr_off; i++) + btf_get(tab->off[i].kptr.btf); + return new_tab; +} + +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +{ + struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; + bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + int size; + + if (!a_has_kptr && !b_has_kptr) + return true; + if (a_has_kptr != b_has_kptr) + return false; + if (tab_a->nr_off != tab_b->nr_off) + return false; + size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); + return !memcmp(tab_a, tab_b, size); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); + bpf_map_free_kptr_off_tab(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -640,7 +713,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) int err; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map)) + map_value_has_timer(map) || map_value_has_kptrs(map)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -820,9 +893,33 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } - if (map->ops->map_check_btf) + map->kptr_off_tab = btf_parse_kptrs(btf, value_type); + if (map_value_has_kptrs(map)) { + if (!bpf_capable()) { + ret = -EPERM; + goto free_map_tab; + } + if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { + ret = -EACCES; + goto free_map_tab; + } + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + } + + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); + if (ret < 0) + goto free_map_tab; + } + return ret; +free_map_tab: + bpf_map_free_kptr_off_tab(map); return ret; } @@ -1639,7 +1736,7 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map)) { + map_value_has_timer(map) || map_value_has_kptrs(map)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71827d14724a..17ca586e0585 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3211,7 +3211,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return 0; } -enum stack_access_src { +enum bpf_access_src { ACCESS_DIRECT = 1, /* the access is performed by an instruction */ ACCESS_HELPER = 2, /* the access is performed by a helper */ }; @@ -3219,7 +3219,7 @@ enum stack_access_src { static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, + enum bpf_access_src type, struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) @@ -3507,9 +3507,109 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, false); } +static int map_kptr_match_type(struct bpf_verifier_env *env, + struct bpf_map_value_off_desc *off_desc, + struct bpf_reg_state *reg, u32 regno) +{ + const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + const char *reg_name = ""; + + if (base_type(reg->type) != PTR_TO_BTF_ID || type_flag(reg->type) != PTR_MAYBE_NULL) + goto bad_type; + + if (!btf_is_kernel(reg->btf)) { + verbose(env, "R%d must point to kernel BTF\n", regno); + return -EINVAL; + } + /* We need to verify reg->type and reg->btf, before accessing reg->btf */ + reg_name = kernel_type_name(reg->btf, reg->btf_id); + + if (__check_ptr_off_reg(env, reg, regno, true)) + return -EACCES; + + /* A full type match is needed, as BTF can be vmlinux or module BTF, and + * we also need to take into account the reg->off. + * + * We want to support cases like: + * + * struct foo { + * struct bar br; + * struct baz bz; + * }; + * + * struct foo *v; + * v = func(); // PTR_TO_BTF_ID + * val->foo = v; // reg->off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * // first member type of struct after comparison fails + * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * // to match type + * + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * is zero. + */ + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + off_desc->kptr.btf, off_desc->kptr.btf_id)) + goto bad_type; + return 0; +bad_type: + verbose(env, "invalid kptr access, R%d type=%s%s ", regno, + reg_type_str(env, reg->type), reg_name); + verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name); + return -EINVAL; +} + +static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, + int value_regno, int insn_idx, + struct bpf_map_value_off_desc *off_desc) +{ + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; + int class = BPF_CLASS(insn->code); + struct bpf_reg_state *val_reg; + + /* Things we already checked for in check_map_access and caller: + * - Reject cases where variable offset may touch kptr + * - size of access (must be BPF_DW) + * - tnum_is_const(reg->var_off) + * - off_desc->offset == off + reg->var_off.value + */ + /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n"); + return -EACCES; + } + + if (class == BPF_LDX) { + val_reg = reg_state(env, value_regno); + /* We can simply mark the value_regno receiving the pointer + * value from map as PTR_TO_BTF_ID, with the correct type. + */ + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, + off_desc->kptr.btf_id, PTR_MAYBE_NULL); + /* For mark_ptr_or_null_reg */ + val_reg->id = ++env->id_gen; + } else if (class == BPF_STX) { + val_reg = reg_state(env, value_regno); + if (!register_is_null(val_reg) && + map_kptr_match_type(env, off_desc, val_reg, value_regno)) + return -EACCES; + } else if (class == BPF_ST) { + if (insn->imm) { + verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", + off_desc->offset); + return -EACCES; + } + } else { + verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n"); + return -EACCES; + } + return 0; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) + int off, int size, bool zero_size_allowed, + enum bpf_access_src src) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -3545,6 +3645,36 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_kptrs(map)) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) { + u32 p = tab->off[i].offset; + + if (reg->smin_value + off < p + sizeof(u64) && + p < reg->umax_value + off + size) { + if (src != ACCESS_DIRECT) { + verbose(env, "kptr cannot be accessed indirectly by helper\n"); + return -EACCES; + } + if (!tnum_is_const(reg->var_off)) { + verbose(env, "kptr access cannot have variable offset\n"); + return -EACCES; + } + if (p != off + reg->var_off.value) { + verbose(env, "kptr access misaligned expected=%u off=%llu\n", + p, off + reg->var_off.value); + return -EACCES; + } + if (size != bpf_size_to_bytes(BPF_DW)) { + verbose(env, "kptr access size must be BPF_DW\n"); + return -EACCES; + } + break; + } + } + } return err; } @@ -4316,7 +4446,7 @@ static int check_stack_slot_within_bounds(int off, static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, - enum stack_access_src src, enum bpf_access_type type) + enum bpf_access_src src, enum bpf_access_type type) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -4412,6 +4542,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { + struct bpf_map_value_off_desc *kptr_off_desc = NULL; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4420,8 +4552,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access_type(env, regno, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) { + err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + if (err) + return err; + if (tnum_is_const(reg->var_off)) + kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, + off + reg->var_off.value); + if (kptr_off_desc) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, + kptr_off_desc); + } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; /* if map is read-only, track its contents as scalars */ @@ -4724,7 +4864,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i static int check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, struct bpf_call_arg_meta *meta) + enum bpf_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); @@ -4874,7 +5014,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, BPF_READ)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, - zero_size_allowed); + zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (meta && meta->raw_mode) { @@ -5642,7 +5782,8 @@ skip_type_check: } err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false); + map->value_size - reg->off, false, + ACCESS_HELPER); if (err) return err; @@ -7462,7 +7603,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false)) { + if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; -- cgit v1.2.3 From 8f14852e89113d738c99c375b4c8b8b7e1073df1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:50 +0530 Subject: bpf: Tag argument to be released in bpf_func_proto Add a new type flag for bpf_arg_type that when set tells verifier that for a release function, that argument's register will be the one for which meta.ref_obj_id will be set, and which will then be released using release_reference. To capture the regno, introduce a new field release_regno in bpf_call_arg_meta. This would be required in the next patch, where we may either pass NULL or a refcounted pointer as an argument to the release function bpf_kptr_xchg. Just releasing only when meta.ref_obj_id is set is not enough, as there is a case where the type of argument needed matches, but the ref_obj_id is set to 0. Hence, we must enforce that whenever meta.ref_obj_id is zero, the register that is to be released can only be NULL for a release function. Since we now indicate whether an argument is to be released in bpf_func_proto itself, is_release_function helper has lost its utitlity, hence refactor code to work without it, and just rely on meta.release_regno to know when to release state for a ref_obj_id. Still, the restriction of one release argument and only one ref_obj_id passed to BPF helper or kfunc remains. This may be lifted in the future. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-3-memxor@gmail.com --- include/linux/bpf.h | 5 +- include/linux/bpf_verifier.h | 3 +- kernel/bpf/btf.c | 11 ++-- kernel/bpf/ringbuf.c | 4 +- kernel/bpf/verifier.c | 76 ++++++++++++---------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/ref_tracking.c | 2 +- tools/testing/selftests/bpf/verifier/sock.c | 6 +- 8 files changed, 60 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ce12124048c0..492edd2c5713 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -366,7 +366,10 @@ enum bpf_type_flag { */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_PERCPU, + /* Indicates that the argument will be released. */ + OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = OBJ_RELEASE, }; /* Max number of base types. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3a9d2d7cc6b7..1f1e7f2ea967 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -523,8 +523,7 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type, - bool is_release_func); + enum bpf_arg_type arg_type); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 563ac61e6d6b..f0287342204f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6047,6 +6047,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, * verifier sees. */ for (i = 0; i < nargs; i++) { + enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; @@ -6067,7 +6068,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); + if (rel && reg->ref_obj_id) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; @@ -6099,11 +6102,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced - * PTR_TO_BTF_ID, check_func_arg_reg_off relies - * on only one referenced register being allowed - * for kfuncs. - */ + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ if (reg->ref_obj_id) { if (ref_obj_id) { bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 710ba9de12ce..5173fd37590f 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -404,7 +404,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -417,7 +417,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17ca586e0585..5426bab7f02c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -245,6 +245,7 @@ struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; bool pkt_access; + u8 release_regno; int regno; int access_size; int mem_size; @@ -471,17 +472,6 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -/* Determine whether the function releases some resources allocated by another - * function call. The first reference type argument will be assumed to be - * released by release_reference(). - */ -static bool is_release_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_release || - func_id == BPF_FUNC_ringbuf_submit || - func_id == BPF_FUNC_ringbuf_discard; -} - static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || @@ -5326,6 +5316,11 @@ static bool arg_type_is_int_ptr(enum bpf_arg_type type) type == ARG_PTR_TO_LONG; } +static bool arg_type_is_release(enum bpf_arg_type type) +{ + return type & OBJ_RELEASE; +} + static int int_ptr_type_to_size(enum bpf_arg_type type) { if (type == ARG_PTR_TO_INT) @@ -5536,11 +5531,10 @@ found: int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type, - bool is_release_func) + enum bpf_arg_type arg_type) { - bool fixed_off_ok = false, release_reg; enum bpf_reg_type type = reg->type; + bool fixed_off_ok = false; switch ((u32)type) { case SCALAR_VALUE: @@ -5558,7 +5552,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, /* Some of the argument types nevertheless require a * zero register offset. */ - if (arg_type != ARG_PTR_TO_ALLOC_MEM) + if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) return 0; break; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -5566,19 +5560,17 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ case PTR_TO_BTF_ID: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. We rely on the property that - * only one referenced register can be passed to BPF helpers and - * kfuncs. In the other cases, fixed offset can be non-zero. + * it's fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. */ - release_reg = is_release_func && reg->ref_obj_id; - if (release_reg && reg->off) { + if (arg_type_is_release(arg_type) && reg->off) { verbose(env, "R%d must have zero offset when passed to release func\n", regno); return -EINVAL; } - /* For release_reg == true, fixed_off_ok must be false, but we - * already checked and rejected reg->off != 0 above, so set to - * true to allow fixed offset for all other cases. + /* For arg is release pointer, fixed_off_ok must be false, but + * we already checked and rejected reg->off != 0 above, so set + * to true to allow fixed offset for all other cases. */ fixed_off_ok = true; break; @@ -5637,14 +5629,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; skip_type_check: - /* check_func_arg_reg_off relies on only one referenced register being - * allowed for BPF helpers. - */ + if (arg_type_is_release(arg_type)) { + if (!reg->ref_obj_id && !register_is_null(reg)) { + verbose(env, "R%d must be referenced when passed to release function\n", + regno); + return -EINVAL; + } + if (meta->release_regno) { + verbose(env, "verifier internal error: more than one release argument\n"); + return -EFAULT; + } + meta->release_regno = regno; + } + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -6151,7 +6153,8 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id, + struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && @@ -6835,7 +6838,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn, func_id, &meta); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -6868,8 +6871,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (is_release_function(func_id)) { - err = release_reference(env, meta.ref_obj_id); + regs = cur_regs(env); + + if (meta.release_regno) { + err = -EINVAL; + if (meta.ref_obj_id) + err = release_reference(env, meta.ref_obj_id); + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ + else if (register_is_null(®s[meta.release_regno])) + err = 0; if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -6877,8 +6889,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } - regs = cur_regs(env); - switch (func_id) { case BPF_FUNC_tail_call: err = check_reference_leak(env); diff --git a/net/core/filter.c b/net/core/filter.c index 8847316ee20e..da04ad179fda 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6621,7 +6621,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index fbd682520e47..57a83d763ec1 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -796,7 +796,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "reference has not been acquired before", + .errstr = "R1 must be referenced when passed to release function", }, { /* !bpf_sk_fullsock(sk) is checked but !bpf_tcp_sock(sk) is not checked */ diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 86b24cad27a7..d11d0b28be41 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -417,7 +417,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "reference has not been acquired before", + .errstr = "R1 must be referenced when passed to release function", }, { "bpf_sk_release(bpf_sk_fullsock(skb->sk))", @@ -436,7 +436,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "reference has not been acquired before", + .errstr = "R1 must be referenced when passed to release function", }, { "bpf_sk_release(bpf_tcp_sock(skb->sk))", @@ -455,7 +455,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "reference has not been acquired before", + .errstr = "R1 must be referenced when passed to release function", }, { "sk_storage_get(map, skb->sk, NULL, 0): value == NULL", -- cgit v1.2.3 From c0a5a21c25f37c9fd7b36072f9968cdff1e4aa13 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:51 +0530 Subject: bpf: Allow storing referenced kptr in map Extending the code in previous commits, introduce referenced kptr support, which needs to be tagged using 'kptr_ref' tag instead. Unlike unreferenced kptr, referenced kptr have a lot more restrictions. In addition to the type matching, only a newly introduced bpf_kptr_xchg helper is allowed to modify the map value at that offset. This transfers the referenced pointer being stored into the map, releasing the references state for the program, and returning the old value and creating new reference state for the returned pointer. Similar to unreferenced pointer case, return value for this case will also be PTR_TO_BTF_ID_OR_NULL. The reference for the returned pointer must either be eventually released by calling the corresponding release function, otherwise it must be transferred into another map. It is also allowed to call bpf_kptr_xchg with a NULL pointer, to clear the value, and obtain the old value if any. BPF_LDX, BPF_STX, and BPF_ST cannot access referenced kptr. A future commit will permit using BPF_LDX for such pointers, but attempt at making it safe, since the lifetime of object won't be guaranteed. There are valid reasons to enforce the restriction of permitting only bpf_kptr_xchg to operate on referenced kptr. The pointer value must be consistent in face of concurrent modification, and any prior values contained in the map must also be released before a new one is moved into the map. To ensure proper transfer of this ownership, bpf_kptr_xchg returns the old value, which the verifier would require the user to either free or move into another map, and releases the reference held for the pointer being moved in. In the future, direct BPF_XCHG instruction may also be permitted to work like bpf_kptr_xchg helper. Note that process_kptr_func doesn't have to call check_helper_mem_access, since we already disallow rdonly/wronly flags for map, which is what check_map_access_type checks, and we already ensure the PTR_TO_MAP_VALUE refers to kptr by obtaining its off_desc, so check_map_access is also not required. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-4-memxor@gmail.com --- include/linux/bpf.h | 8 ++++ include/uapi/linux/bpf.h | 12 ++++++ kernel/bpf/btf.c | 10 ++++- kernel/bpf/helpers.c | 24 +++++++++++ kernel/bpf/verifier.c | 98 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 12 ++++++ 6 files changed, 151 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 492edd2c5713..24310837bafc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -160,8 +160,14 @@ enum { BPF_MAP_VALUE_OFF_MAX = 8, }; +enum bpf_kptr_type { + BPF_KPTR_UNREF, + BPF_KPTR_REF, +}; + struct bpf_map_value_off_desc { u32 offset; + enum bpf_kptr_type type; struct { struct btf *btf; u32 btf_id; @@ -418,6 +424,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ + ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -427,6 +434,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d14b10b85e51..444fe6f1cf35 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5143,6 +5143,17 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5339,6 +5350,7 @@ union bpf_attr { FN(copy_from_user_task), \ FN(skb_set_tstamp), \ FN(ima_file_hash), \ + FN(kptr_xchg), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f0287342204f..4138c51728dd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3177,6 +3177,7 @@ enum { struct btf_field_info { u32 type_id; u32 off; + enum bpf_kptr_type type; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, @@ -3193,6 +3194,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { + enum bpf_kptr_type type; u32 res_id; /* For PTR, sz is always == 8 */ @@ -3205,7 +3207,11 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_UNREF; + else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_REF; + else return -EINVAL; /* Get the base type */ @@ -3216,6 +3222,7 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, info->type_id = res_id; info->off = off; + info->type = type; return BTF_FIELD_FOUND; } @@ -3420,6 +3427,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, } tab->off[i].offset = info_arr[i].off; + tab->off[i].type = info_arr[i].type; tab->off[i].kptr.btf_id = id; tab->off[i].kptr.btf = kernel_btf; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 315053ef6a75..3e709fed5306 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1374,6 +1374,28 @@ out: kfree(t); } +BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +{ + unsigned long *kptr = map_value; + + return xchg(kptr, (unsigned long)ptr); +} + +/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() + * helper is determined dynamically by the verifier. + */ +#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) + +const struct bpf_func_proto bpf_kptr_xchg_proto = { + .func = bpf_kptr_xchg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = BPF_PTR_POISON, + .arg1_type = ARG_PTR_TO_KPTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, + .arg2_btf_id = BPF_PTR_POISON, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1452,6 +1474,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; + case BPF_FUNC_kptr_xchg: + return &bpf_kptr_xchg_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5426bab7f02c..c9ee44efed89 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -258,6 +258,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; + struct bpf_map_value_off_desc *kptr_off_desc; }; struct btf *btf_vmlinux; @@ -489,7 +490,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_ringbuf_reserve) + func_id == BPF_FUNC_ringbuf_reserve || + func_id == BPF_FUNC_kptr_xchg) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -3514,6 +3516,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, /* We need to verify reg->type and reg->btf, before accessing reg->btf */ reg_name = kernel_type_name(reg->btf, reg->btf_id); + /* For ref_ptr case, release function check should ensure we get one + * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the + * normal store of unreferenced kptr, we must ensure var_off is zero. + * Since ref_ptr cannot be accessed directly by BPF insns, checks for + * reg->off and reg->ref_obj_id are not needed here. + */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; @@ -3569,6 +3577,12 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } + /* We cannot directly access kptr_ref */ + if (off_desc->type == BPF_KPTR_REF) { + verbose(env, "accessing referenced kptr disallowed\n"); + return -EACCES; + } + if (class == BPF_LDX) { val_reg = reg_state(env, value_regno); /* We can simply mark the value_regno receiving the pointer @@ -5293,6 +5307,53 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_kptr_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map_value_off_desc *off_desc; + struct bpf_map *map_ptr = reg->map_ptr; + u32 kptr_off; + int ret; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. kptr has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + if (!map_value_has_kptrs(map_ptr)) { + ret = PTR_ERR(map_ptr->kptr_off_tab); + if (ret == -E2BIG) + verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, + BPF_MAP_VALUE_OFF_MAX); + else if (ret == -EEXIST) + verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); + else + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + return -EINVAL; + } + + meta->map_ptr = map_ptr; + kptr_off = reg->off + reg->var_off.value; + off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); + if (!off_desc) { + verbose(env, "off=%d doesn't point to kptr\n", kptr_off); + return -EACCES; + } + if (off_desc->type != BPF_KPTR_REF) { + verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); + return -EACCES; + } + meta->kptr_off_desc = off_desc; + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return base_type(type) == ARG_PTR_TO_MEM || @@ -5433,6 +5494,7 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5460,11 +5522,13 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, + [ARG_PTR_TO_KPTR] = &kptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, - const u32 *arg_btf_id) + const u32 *arg_btf_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected, type = reg->type; @@ -5517,8 +5581,11 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id)) { + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + return -EACCES; + } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, kernel_type_name(reg->btf, reg->btf_id), kernel_type_name(btf_vmlinux, *arg_btf_id)); @@ -5625,7 +5692,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); if (err) return err; @@ -5801,6 +5868,9 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } + } else if (arg_type == ARG_PTR_TO_KPTR) { + if (process_kptr_func(env, regno, meta)) + return -EACCES; } return err; @@ -6143,10 +6213,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } @@ -7012,21 +7082,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; - ret_btf_id = *fn->ret_btf_id; + if (func_id == BPF_FUNC_kptr_xchg) { + ret_btf = meta.kptr_off_desc->kptr.btf; + ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + } else { + ret_btf = btf_vmlinux; + ret_btf_id = *fn->ret_btf_id; + } if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - /* current BPF helper definitions are only coming from - * built-in code with type IDs from vmlinux BTF - */ - regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %u of func %s#%d\n", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d14b10b85e51..444fe6f1cf35 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5143,6 +5143,17 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5339,6 +5350,7 @@ union bpf_attr { FN(copy_from_user_task), \ FN(skb_set_tstamp), \ FN(ima_file_hash), \ + FN(kptr_xchg), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 6efe152d4061a83a2c5db6a0e5b58f9345c9742f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:52 +0530 Subject: bpf: Prevent escaping of kptr loaded from maps While we can guarantee that even for unreferenced kptr, the object pointer points to being freed etc. can be handled by the verifier's exception handling (normal load patching to PROBE_MEM loads), we still cannot allow the user to pass these pointers to BPF helpers and kfunc, because the same exception handling won't be done for accesses inside the kernel. The same is true if a referenced pointer is loaded using normal load instruction. Since the reference is not guaranteed to be held while the pointer is used, it must be marked as untrusted. Hence introduce a new type flag, PTR_UNTRUSTED, which is used to mark all registers loading unreferenced and referenced kptr from BPF maps, and ensure they can never escape the BPF program and into the kernel by way of calling stable/unstable helpers. In check_ptr_to_btf_access, the !type_may_be_null check to reject type flags is still correct, as apart from PTR_MAYBE_NULL, only MEM_USER, MEM_PERCPU, and PTR_UNTRUSTED may be set for PTR_TO_BTF_ID. The first two are checked inside the function and rejected using a proper error message, but we still want to allow dereference of untrusted case. Also, we make sure to inherit PTR_UNTRUSTED when chain of pointers are walked, so that this flag is never dropped once it has been set on a PTR_TO_BTF_ID (i.e. trusted to untrusted transition can only be in one direction). In convert_ctx_accesses, extend the switch case to consider untrusted PTR_TO_BTF_ID in addition to normal PTR_TO_BTF_ID for PROBE_MEM conversion for BPF_LDX. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-5-memxor@gmail.com --- include/linux/bpf.h | 10 +++++++++- kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 24310837bafc..e13a5cbd4ebb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -375,7 +375,15 @@ enum bpf_type_flag { /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = OBJ_RELEASE, + /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark + * unreferenced and referenced kptr loaded from map value using a load + * instruction, so that they can only be dereferenced but not escape the + * BPF program into the kernel (i.e. cannot be passed as arguments to + * kfunc or bpf helpers). + */ + PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_UNTRUSTED, }; /* Max number of base types. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c9ee44efed89..955c3125576a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -567,6 +567,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(prefix, "user_", 32); if (type & MEM_PERCPU) strncpy(prefix, "percpu_", 32); + if (type & PTR_UNTRUSTED) + strncpy(prefix, "untrusted_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -3504,9 +3506,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + int perm_flags = PTR_MAYBE_NULL; const char *reg_name = ""; - if (base_type(reg->type) != PTR_TO_BTF_ID || type_flag(reg->type) != PTR_MAYBE_NULL) + /* Only unreferenced case accepts untrusted pointers */ + if (off_desc->type == BPF_KPTR_UNREF) + perm_flags |= PTR_UNTRUSTED; + + if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) goto bad_type; if (!btf_is_kernel(reg->btf)) { @@ -3553,7 +3560,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); - verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name); + verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); + if (off_desc->type == BPF_KPTR_UNREF) + verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), + targ_name); + else + verbose(env, "\n"); return -EINVAL; } @@ -3577,9 +3589,11 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } - /* We cannot directly access kptr_ref */ - if (off_desc->type == BPF_KPTR_REF) { - verbose(env, "accessing referenced kptr disallowed\n"); + /* We only allow loading referenced kptr, since it will be marked as + * untrusted, similar to unreferenced kptr. + */ + if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -3589,7 +3603,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, - off_desc->kptr.btf_id, PTR_MAYBE_NULL); + off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -4358,6 +4372,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; + /* If this is an untrusted pointer, all pointers formed by walking it + * also inherit the untrusted flag. + */ + if (type_flag(reg->type) & PTR_UNTRUSTED) + flag |= PTR_UNTRUSTED; + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -13076,7 +13096,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (!ctx_access) continue; - switch (env->insn_aux_data[i + delta].ptr_type) { + switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) continue; @@ -13093,6 +13113,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); -- cgit v1.2.3 From 4d7d7f69f4b104b2ddeec6a1e7fcfd2d044ed8c4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:53 +0530 Subject: bpf: Adapt copy_map_value for multiple offset case Since now there might be at most 10 offsets that need handling in copy_map_value, the manual shuffling and special case is no longer going to work. Hence, let's generalise the copy_map_value function by using a sorted array of offsets to skip regions that must be avoided while copying into and out of a map value. When the map is created, we populate the offset array in struct map, Then, copy_map_value uses this sorted offset array is used to memcpy while skipping timer, spin lock, and kptr. The array is allocated as in most cases none of these special fields would be present in map value, hence we can save on space for the common case by not embedding the entire object inside bpf_map struct. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-6-memxor@gmail.com --- include/linux/bpf.h | 56 +++++++++++++++++---------------- kernel/bpf/syscall.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e13a5cbd4ebb..4e12b27a5de6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -158,6 +158,9 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BPF map value */ BPF_MAP_VALUE_OFF_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + 1 + /* for bpf_spin_lock */ + 1, /* for bpf_timer */ }; enum bpf_kptr_type { @@ -179,6 +182,12 @@ struct bpf_map_value_off { struct bpf_map_value_off_desc off[]; }; +struct bpf_map_off_arr { + u32 cnt; + u32 field_off[BPF_MAP_OFF_ARR_MAX]; + u8 field_sz[BPF_MAP_OFF_ARR_MAX]; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -207,10 +216,7 @@ struct bpf_map { struct mem_cgroup *memcg; #endif char name[BPF_OBJ_NAME_LEN]; - bool bypass_spec_v1; - bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 6 bytes hole */ - + struct bpf_map_off_arr *off_arr; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -230,6 +236,8 @@ struct bpf_map { bool jited; bool xdp_has_frags; } owner; + bool bypass_spec_v1; + bool frozen; /* write-once; write-protected by freeze_mutex */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -253,37 +261,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); + if (unlikely(map_value_has_kptrs(map))) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) + *(u64 *)(dst + tab->off[i].offset) = 0; + } } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + u32 curr_off = 0; + int i; - if (unlikely(map_value_has_spin_lock(map))) { - s_off = map->spin_lock_off; - s_sz = sizeof(struct bpf_spin_lock); - } - if (unlikely(map_value_has_timer(map))) { - t_off = map->timer_off; - t_sz = sizeof(struct bpf_timer); + if (likely(!map->off_arr)) { + memcpy(dst, src, map->value_size); + return; } - if (unlikely(s_sz || t_sz)) { - if (s_off < t_off || !s_sz) { - swap(s_off, t_off); - swap(s_sz, t_sz); - } - memcpy(dst, src, t_off); - memcpy(dst + t_off + t_sz, - src + t_off + t_sz, - s_off - t_off - t_sz); - memcpy(dst + s_off + s_sz, - src + s_off + s_sz, - map->value_size - s_off - s_sz); - } else { - memcpy(dst, src, map->value_size); + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memcpy(dst + curr_off, src + curr_off, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; } + memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 575b09339360..4d419a3effc4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -551,6 +552,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); + kfree(map->off_arr); bpf_map_free_kptr_off_tab(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ @@ -840,6 +842,84 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } +static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) +{ + struct bpf_map *map = (struct bpf_map *)priv; + u32 *off_base = map->off_arr->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = map->off_arr->field_sz + (a - off_base); + sz_b = map->off_arr->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +static int bpf_map_alloc_off_arr(struct bpf_map *map) +{ + bool has_spin_lock = map_value_has_spin_lock(map); + bool has_timer = map_value_has_timer(map); + bool has_kptrs = map_value_has_kptrs(map); + struct bpf_map_off_arr *off_arr; + u32 i; + + if (!has_spin_lock && !has_timer && !has_kptrs) { + map->off_arr = NULL; + return 0; + } + + off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); + if (!off_arr) + return -ENOMEM; + map->off_arr = off_arr; + + off_arr->cnt = 0; + if (has_spin_lock) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->spin_lock_off; + off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); + off_arr->cnt++; + } + if (has_timer) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->timer_off; + off_arr->field_sz[i] = sizeof(struct bpf_timer); + off_arr->cnt++; + } + if (has_kptrs) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + u32 *off = &off_arr->field_off[off_arr->cnt]; + u8 *sz = &off_arr->field_sz[off_arr->cnt]; + + for (i = 0; i < tab->nr_off; i++) { + *off++ = tab->off[i].offset; + *sz++ = sizeof(u64); + } + off_arr->cnt += tab->nr_off; + } + + if (off_arr->cnt == 1) + return 0; + sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), + map_off_arr_cmp, map_off_arr_swap, map); + return 0; +} + static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -1009,10 +1089,14 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = bpf_map_alloc_off_arr(map); if (err) goto free_map; + err = security_bpf_map_alloc(map); + if (err) + goto free_map_off_arr; + err = bpf_map_alloc_id(map); if (err) goto free_map_sec; @@ -1035,6 +1119,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); +free_map_off_arr: + kfree(map->off_arr); free_map: btf_put(map->btf); map->ops->map_free(map); -- cgit v1.2.3 From 5ce937d613a423ca3102f53d9f3daf4210c1b6e2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:54 +0530 Subject: bpf: Populate pairs of btf_id and destructor kfunc in btf To support storing referenced PTR_TO_BTF_ID in maps, we require associating a specific BTF ID with a 'destructor' kfunc. This is because we need to release a live referenced pointer at a certain offset in map value from the map destruction path, otherwise we end up leaking resources. Hence, introduce support for passing an array of btf_id, kfunc_btf_id pairs that denote a BTF ID and its associated release function. Then, add an accessor 'btf_find_dtor_kfunc' which can be used to look up the destructor kfunc of a certain BTF ID. If found, we can use it to free the object from the map free path. The registration of these pairs also serve as a whitelist of structures which are allowed as referenced PTR_TO_BTF_ID in a BPF map, because without finding the destructor kfunc, we will bail and return an error. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-7-memxor@gmail.com --- include/linux/btf.h | 17 +++++++++ kernel/bpf/btf.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 19c297f9a52f..fea424681d66 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -40,6 +40,11 @@ struct btf_kfunc_id_set { }; }; +struct btf_id_dtor_kfunc { + u32 btf_id; + u32 kfunc_btf_id; +}; + extern const struct file_operations btf_fops; void btf_get(struct btf *btf); @@ -346,6 +351,9 @@ bool btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_type type, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -369,6 +377,15 @@ static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, { return 0; } +static inline s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + return -ENOENT; +} +static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, + u32 add_cnt, struct module *owner) +{ + return 0; +} #endif #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4138c51728dd..a0b1665a4a48 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -207,12 +207,18 @@ enum btf_kfunc_hook { enum { BTF_KFUNC_SET_MAX_CNT = 32, + BTF_DTOR_KFUNC_MAX_CNT = 256, }; struct btf_kfunc_set_tab { struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; }; +struct btf_id_dtor_kfunc_tab { + u32 cnt; + struct btf_id_dtor_kfunc dtors[]; +}; + struct btf { void *data; struct btf_type **types; @@ -228,6 +234,7 @@ struct btf { u32 id; struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; + struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; /* split BTF support */ struct btf *base_btf; @@ -1616,8 +1623,19 @@ free_tab: btf->kfunc_set_tab = NULL; } +static void btf_free_dtor_kfunc_tab(struct btf *btf) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + + if (!tab) + return; + kfree(tab); + btf->dtor_kfunc_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); @@ -7076,6 +7094,96 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + struct btf_id_dtor_kfunc *dtor; + + if (!tab) + return -ENOENT; + /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need + * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. + */ + BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); + dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); + if (!dtor) + return -ENOENT; + return dtor->kfunc_btf_id; +} + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner) +{ + struct btf_id_dtor_kfunc_tab *tab; + struct btf *btf; + u32 tab_cnt; + int ret; + + btf = btf_get_module_btf(owner); + if (!btf) { + if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + return 0; + } + if (IS_ERR(btf)) + return PTR_ERR(btf); + + if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + tab = btf->dtor_kfunc_tab; + /* Only one call allowed for modules */ + if (WARN_ON_ONCE(tab && btf_is_module(btf))) { + ret = -EINVAL; + goto end; + } + + tab_cnt = tab ? tab->cnt : 0; + if (tab_cnt > U32_MAX - add_cnt) { + ret = -EOVERFLOW; + goto end; + } + if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + tab = krealloc(btf->dtor_kfunc_tab, + offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!tab) { + ret = -ENOMEM; + goto end; + } + + if (!btf->dtor_kfunc_tab) + tab->cnt = 0; + btf->dtor_kfunc_tab = tab; + + memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); + tab->cnt += add_cnt; + + sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_dtor_kfunc_tab(btf); + btf_put(btf); + return ret; +} +EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); + #define MAX_TYPES_ARE_COMPAT_DEPTH 2 static -- cgit v1.2.3 From 14a324f6a67ef6a53e04362a70160a47eb8afffa Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:55 +0530 Subject: bpf: Wire up freeing of referenced kptr A destructor kfunc can be defined as void func(type *), where type may be void or any other pointer type as per convenience. In this patch, we ensure that the type is sane and capture the function pointer into off_desc of ptr_off_tab for the specific pointer offset, with the invariant that the dtor pointer is always set when 'kptr_ref' tag is applied to the pointer's pointee type, which is indicated by the flag BPF_MAP_VALUE_OFF_F_REF. Note that only BTF IDs whose destructor kfunc is registered, thus become the allowed BTF IDs for embedding as referenced kptr. Hence it serves the purpose of finding dtor kfunc BTF ID, as well acting as a check against the whitelist of allowed BTF IDs for this purpose. Finally, wire up the actual freeing of the referenced pointer if any at all available offsets, so that no references are leaked after the BPF map goes away and the BPF program previously moved the ownership a referenced pointer into it. The behavior is similar to BPF timers, where bpf_map_{update,delete}_elem will free any existing referenced kptr. The same case is with LRU map's bpf_lru_push_free/htab_lru_push_free functions, which are extended to reset unreferenced and free referenced kptr. Note that unlike BPF timers, kptr is not reset or freed when map uref drops to zero. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-8-memxor@gmail.com --- include/linux/bpf.h | 4 +++ include/linux/btf.h | 2 ++ kernel/bpf/arraymap.c | 18 +++++++--- kernel/bpf/btf.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/hashtab.c | 64 ++++++++++++++++++++++++--------- kernel/bpf/syscall.c | 49 +++++++++++++++++++++++--- 6 files changed, 210 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e12b27a5de6..6141564c76c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -173,6 +174,8 @@ struct bpf_map_value_off_desc { enum bpf_kptr_type type; struct { struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; u32 btf_id; } kptr; }; @@ -1447,6 +1450,7 @@ struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u3 void bpf_map_free_kptr_off_tab(struct bpf_map *map); struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index fea424681d66..f70625dd5bb4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -45,6 +45,8 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +typedef void (*btf_dtor_kfunc_t)(void *); + extern const struct file_operations btf_fops; void btf_get(struct btf *btf); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7f145aefbff8..c3de63ce574e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -287,10 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_timer_in_array(struct bpf_array *arr, void *val) +static void check_and_free_fields(struct bpf_array *arr, void *val) { - if (unlikely(map_value_has_timer(&arr->map))) + if (map_value_has_timer(&arr->map)) bpf_timer_cancel_and_free(val + arr->map.timer_off); + if (map_value_has_kptrs(&arr->map)) + bpf_map_free_kptrs(&arr->map, val); } /* Called from syscall or from eBPF program */ @@ -327,7 +329,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_timer_in_array(array, val); + check_and_free_fields(array, val); } return 0; } @@ -386,7 +388,8 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (likely(!map_value_has_timer(map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(map)) return; for (i = 0; i < array->map.max_entries; i++) @@ -398,6 +401,13 @@ static void array_map_free_timers(struct bpf_map *map) static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + if (map_value_has_kptrs(map)) { + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_kptrs(map, array->value + array->elem_size * i); + bpf_map_free_kptr_off_tab(map); + } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a0b1665a4a48..1f2012fd89fb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3416,6 +3416,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; struct bpf_map_value_off *tab; struct btf *kernel_btf = NULL; + struct module *mod = NULL; int ret, i, nr_off; ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); @@ -3444,16 +3445,69 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, goto end; } + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info_arr[i].type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } + + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } + + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + tab->off[i].kptr.dtor = (void *)addr; + } + tab->off[i].offset = info_arr[i].off; tab->off[i].type = info_arr[i].type; tab->off[i].kptr.btf_id = id; tab->off[i].kptr.btf = kernel_btf; + tab->off[i].kptr.module = mod; } tab->nr_off = nr_off; return tab; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); end: - while (i--) + while (i--) { btf_put(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + } kfree(tab); return ERR_PTR(ret); } @@ -7111,6 +7165,43 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) return dtor->kfunc_btf_id; } +static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) +{ + const struct btf_type *dtor_func, *dtor_func_proto, *t; + const struct btf_param *args; + s32 dtor_btf_id; + u32 nr_args, i; + + for (i = 0; i < cnt; i++) { + dtor_btf_id = dtors[i].kfunc_btf_id; + + dtor_func = btf_type_by_id(btf, dtor_btf_id); + if (!dtor_func || !btf_type_is_func(dtor_func)) + return -EINVAL; + + dtor_func_proto = btf_type_by_id(btf, dtor_func->type); + if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) + return -EINVAL; + + /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ + t = btf_type_by_id(btf, dtor_func_proto->type); + if (!t || !btf_type_is_void(t)) + return -EINVAL; + + nr_args = btf_type_vlen(dtor_func_proto); + if (nr_args != 1) + return -EINVAL; + args = btf_params(dtor_func_proto); + t = btf_type_by_id(btf, args[0].type); + /* Allow any pointer type, as width on targets Linux supports + * will be same for all pointer types (i.e. sizeof(void *)) + */ + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + } + return 0; +} + /* This function must be invoked only from initcalls/module init functions */ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner) @@ -7141,6 +7232,11 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c goto end; } + /* Ensure that the prototype of dtor kfuncs being registered is sane */ + ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt); + if (ret < 0) + goto end; + tab = btf->dtor_kfunc_tab; /* Only one call allowed for modules */ if (WARN_ON_ONCE(tab && btf_is_module(btf))) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c68fbebc8c00..7351e61bd683 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -238,7 +238,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (likely(!map_value_has_timer(&htab->map))) + if (!map_value_has_timer(&htab->map)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -254,6 +254,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!map_value_has_kptrs(&htab->map)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_elems(struct bpf_htab *htab) { int i; @@ -725,12 +744,15 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) +static void check_and_free_fields(struct bpf_htab *htab, + struct htab_elem *elem) { - if (unlikely(map_value_has_timer(&htab->map))) - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + void *map_value = elem->key + round_up(htab->map.key_size, 8); + + if (map_value_has_timer(&htab->map)) + bpf_timer_cancel_and_free(map_value + htab->map.timer_off); + if (map_value_has_kptrs(&htab->map)) + bpf_map_free_kptrs(&htab->map, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -757,7 +779,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); break; } @@ -829,7 +851,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); kfree(l); } @@ -857,7 +879,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -1104,7 +1126,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); else - check_and_free_timer(htab, l_old); + check_and_free_fields(htab, l_old); } ret = 0; err: @@ -1114,7 +1136,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_timer(htab, elem); + check_and_free_fields(htab, elem); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -1419,8 +1441,14 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct hlist_nulls_node *n; struct htab_elem *l; - hlist_nulls_for_each_entry(l, n, head, hash_node) - check_and_free_timer(htab, l); + hlist_nulls_for_each_entry(l, n, head, hash_node) { + /* We don't reset or free kptr on uref dropping to zero, + * hence just free timer. + */ + bpf_timer_cancel_and_free(l->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); + } cond_resched_rcu(); } rcu_read_unlock(); @@ -1430,7 +1458,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - if (likely(!map_value_has_timer(&htab->map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(&htab->map)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1453,11 +1482,14 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (!htab_is_prealloc(htab)) + if (!htab_is_prealloc(htab)) { delete_all_elements(htab); - else + } else { + htab_free_prealloced_kptrs(htab); prealloc_destroy(htab); + } + bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d419a3effc4..e0aead17dff4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -507,8 +507,11 @@ void bpf_map_free_kptr_off_tab(struct bpf_map *map) if (!map_value_has_kptrs(map)) return; - for (i = 0; i < tab->nr_off; i++) + for (i = 0; i < tab->nr_off; i++) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); btf_put(tab->off[i].kptr.btf); + } kfree(tab); map->kptr_off_tab = NULL; } @@ -525,8 +528,18 @@ struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) if (!new_tab) return ERR_PTR(-ENOMEM); /* Do a deep copy of the kptr_off_tab */ - for (i = 0; i < tab->nr_off; i++) + for (i = 0; i < tab->nr_off; i++) { btf_get(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { + while (i--) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(new_tab); + return ERR_PTR(-ENXIO); + } + } return new_tab; } @@ -546,6 +559,33 @@ bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_ma return !memcmp(tab_a, tab_b, size); } +/* Caller must ensure map_value_has_kptrs is true. Note that this function can + * be called on a map value while the map_value is visible to BPF programs, as + * it ensures the correct synchronization, and we already enforce the same using + * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. + */ +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + unsigned long *btf_id_ptr; + int i; + + for (i = 0; i < tab->nr_off; i++) { + struct bpf_map_value_off_desc *off_desc = &tab->off[i]; + unsigned long old_ptr; + + btf_id_ptr = map_value + off_desc->offset; + if (off_desc->type == BPF_KPTR_UNREF) { + u64 *p = (u64 *)btf_id_ptr; + + WRITE_ONCE(p, 0); + continue; + } + old_ptr = xchg(btf_id_ptr, 0); + off_desc->kptr.dtor((void *)old_ptr); + } +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -553,9 +593,10 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); kfree(map->off_arr); - bpf_map_free_kptr_off_tab(map); bpf_map_release_memcg(map); - /* implementation dependent freeing */ + /* implementation dependent freeing, map_free callback also does + * bpf_map_free_kptr_off_tab, if needed. + */ map->ops->map_free(map); } -- cgit v1.2.3 From a1ef195996526da45bbc9710849254023df75aea Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:56 +0530 Subject: bpf: Teach verifier about kptr_get kfunc helpers We introduce a new style of kfunc helpers, namely *_kptr_get, where they take pointer to the map value which points to a referenced kernel pointer contained in the map. Since this is referenced, only bpf_kptr_xchg from BPF side and xchg from kernel side is allowed to change the current value, and each pointer that resides in that location would be referenced, and RCU protected (this must be kept in mind while adding kernel types embeddable as reference kptr in BPF maps). This means that if do the load of the pointer value in an RCU read section, and find a live pointer, then as long as we hold RCU read lock, it won't be freed by a parallel xchg + release operation. This allows us to implement a safe refcount increment scheme. Hence, enforce that first argument of all such kfunc is a proper PTR_TO_MAP_VALUE pointing at the right offset to referenced pointer. For the rest of the arguments, they are subjected to typical kfunc argument checks, hence allowing some flexibility in passing more intent into how the reference should be taken. For instance, in case of struct nf_conn, it is not freed until RCU grace period ends, but can still be reused for another tuple once refcount has dropped to zero. Hence, a bpf_ct_kptr_get helper not only needs to call refcount_inc_not_zero, but also do a tuple match after incrementing the reference, and when it fails to match it, put the reference again and return NULL. This can be implemented easily if we allow passing additional parameters to the bpf_ct_kptr_get kfunc, like a struct bpf_sock_tuple * and a tuple__sz pair. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-9-memxor@gmail.com --- include/linux/btf.h | 2 ++ kernel/bpf/btf.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index f70625dd5bb4..2611cea2c2b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -17,6 +17,7 @@ enum btf_kfunc_type { BTF_KFUNC_TYPE_ACQUIRE, BTF_KFUNC_TYPE_RELEASE, BTF_KFUNC_TYPE_RET_NULL, + BTF_KFUNC_TYPE_KPTR_ACQUIRE, BTF_KFUNC_TYPE_MAX, }; @@ -35,6 +36,7 @@ struct btf_kfunc_id_set { struct btf_id_set *acquire_set; struct btf_id_set *release_set; struct btf_id_set *ret_null_set; + struct btf_id_set *kptr_acquire_set; }; struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1f2012fd89fb..494437fb40b7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6089,11 +6089,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); + bool rel = false, kptr_get = false; const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; int ref_regno = 0, ret; - bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -6119,10 +6119,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - /* Only kfunc can be release func */ - if (is_kfunc) + if (is_kfunc) { + /* Only kfunc can be release func */ rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), BTF_KFUNC_TYPE_RELEASE, func_id); + kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id); + } + /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -6154,8 +6158,52 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (ret < 0) return ret; - if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { + /* kptr_get is only true for kfunc */ + if (i == 0 && kptr_get) { + struct bpf_map_value_off_desc *off_desc; + + if (reg->type != PTR_TO_MAP_VALUE) { + bpf_log(log, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. + */ + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); + if (!off_desc || off_desc->type != BPF_KPTR_REF) { + bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_type_is_ptr(ref_t)) { + bpf_log(log, "arg#0 BTF type must be a double pointer\n"); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, + off_desc->kptr.btf_id)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + /* rest of the arguments can be anything, like normal kfunc */ + } else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ -- cgit v1.2.3 From 2ab3b3808eb17f729edfd69e061667ca0a427195 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:57 +0530 Subject: bpf: Make BTF type match stricter for release arguments The current of behavior of btf_struct_ids_match for release arguments is that when type match fails, it retries with first member type again (recursively). Since the offset is already 0, this is akin to just casting the pointer in normal C, since if type matches it was just embedded inside parent sturct as an object. However, we want to reject cases for release function type matching, be it kfunc or BPF helpers. An example is the following: struct foo { struct bar b; }; struct foo *v = acq_foo(); rel_bar(&v->b); // btf_struct_ids_match fails btf_types_are_same, then // retries with first member type and succeeds, while // it should fail. Hence, don't walk the struct and only rely on btf_types_are_same for strict mode. All users of strict mode must be dealing with zero offset anyway, since otherwise they would want the struct to be walked. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-10-memxor@gmail.com --- include/linux/bpf.h | 3 ++- kernel/bpf/btf.c | 14 ++++++++++---- kernel/bpf/verifier.c | 18 +++++++++++++++--- 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6141564c76c8..0af5793ba417 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1748,7 +1748,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id); + const struct btf *need_btf, u32 need_type_id, + bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 494437fb40b7..4cfaf5eebecd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5746,7 +5746,8 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1, bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id) + const struct btf *need_btf, u32 need_type_id, + bool strict) { const struct btf_type *type; enum bpf_type_flag flag; @@ -5755,7 +5756,12 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; - + /* In case of strict type match, we do not walk struct, the top level + * type match must succeed. When strict is true, off should have already + * been 0. + */ + if (strict) + return false; again: type = btf_type_by_id(btf, id); if (!type) @@ -6197,7 +6203,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, - off_desc->kptr.btf_id)) { + off_desc->kptr.btf_id, true)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; @@ -6250,7 +6256,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id)) { + reg->off, btf, ref_id, rel && reg->ref_obj_id)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 955c3125576a..813f6ee80419 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3551,10 +3551,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * // to match type * * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off - * is zero. + * is zero. We must also ensure that btf_struct_ids_match does not walk + * the struct to match type against first member of struct, i.e. reject + * second case from above. Hence, when type is BPF_KPTR_REF, we set + * strict mode to true for type match. */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - off_desc->kptr.btf, off_desc->kptr.btf_id)) + off_desc->kptr.btf, off_desc->kptr.btf_id, + off_desc->type == BPF_KPTR_REF)) goto bad_type; return 0; bad_type: @@ -5593,6 +5597,13 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, found: if (reg->type == PTR_TO_BTF_ID) { + /* For bpf_sk_release, it needs to match against first member + * 'struct sock_common', hence make an exception for it. This + * allows bpf_sk_release to work for multiple socket types. + */ + bool strict_type_match = arg_type_is_release(arg_type) && + meta->func_id != BPF_FUNC_sk_release; + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -5605,7 +5616,8 @@ found: if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) return -EACCES; } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id)) { + btf_vmlinux, *arg_btf_id, + strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, kernel_type_name(reg->btf, reg->btf_id), kernel_type_name(btf_vmlinux, *arg_btf_id)); -- cgit v1.2.3 From 8e274732115f63c1d09136284431b3555bd5cc56 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 25 Apr 2022 23:04:28 +0206 Subject: printk: extend console_lock for per-console locking Currently threaded console printers synchronize against each other using console_lock(). However, different console drivers are unrelated and do not require any synchronization between each other. Removing the synchronization between the threaded console printers will allow each console to print at its own speed. But the threaded consoles printers do still need to synchronize against console_lock() callers. Introduce a per-console mutex and a new console boolean field @blocked to provide this synchronization. console_lock() is modified so that it must acquire the mutex of each console in order to set the @blocked field. Console printing threads will acquire their mutex while printing a record. If @blocked was set, the thread will go back to sleep instead of printing. The reason for the @blocked boolean field is so that console_lock() callers do not need to acquire multiple console mutexes simultaneously, which would introduce unnecessary complexity due to nested mutex locking. Also, a new field was chosen instead of adding a new @flags value so that the blocked status could be checked without concern of reading inconsistent values due to @flags updates from other contexts. Threaded console printers also need to synchronize against console_trylock() callers. Since console_trylock() may be called from any context, the per-console mutex cannot be used for this synchronization. (mutex_trylock() cannot be called from atomic contexts.) Introduce a global atomic counter to identify if any threaded printers are active. The threaded printers will also check the atomic counter to identify if the console has been locked by another task via console_trylock(). Note that @console_sem is still used to provide synchronization between console_lock() and console_trylock() callers. A locking overview for console_lock(), console_trylock(), and the threaded printers is as follows (pseudo code): console_lock() { down(&console_sem); for_each_console(con) { mutex_lock(&con->lock); con->blocked = true; mutex_unlock(&con->lock); } /* console_lock acquired */ } console_trylock() { if (down_trylock(&console_sem) == 0) { if (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) { /* console_lock acquired */ } } } threaded_printer() { mutex_lock(&con->lock); if (!con->blocked) { /* console_lock() callers blocked */ if (atomic_inc_unless_negative(&console_kthreads_active)) { /* console_trylock() callers blocked */ con->write(); atomic_dec(&console_lock_count); } } mutex_unlock(&con->lock); } The console owner and waiter logic now only applies between contexts that have taken the console_lock via console_trylock(). Threaded printers never take the console_lock, so they do not have a console_lock to handover. Tasks that have used console_lock() will block the threaded printers using a mutex and if the console_lock is handed over to an atomic context, it would be unable to unblock the threaded printers. However, the console_trylock() case is really the only scenario that is interesting for handovers anyway. @panic_console_dropped must change to atomic_t since it is no longer protected exclusively by the console_lock. Since threaded printers remain asleep if they see that the console is locked, they now must be explicitly woken in __console_unlock(). This means wake_up_klogd() calls following a console_unlock() are no longer necessary and are removed. Also note that threaded printers no longer need to check @console_suspended. The check for the @blocked field implicitly covers the suspended console case. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/878rrs6ft7.fsf@jogness.linutronix.de --- include/linux/console.h | 15 +++ kernel/printk/printk.c | 261 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 220 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 9a251e70c090..143653090c48 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,6 +16,7 @@ #include #include +#include struct vc_data; struct console_font_op; @@ -154,6 +155,20 @@ struct console { u64 seq; unsigned long dropped; struct task_struct *thread; + bool blocked; + + /* + * The per-console lock is used by printing kthreads to synchronize + * this console with callers of console_lock(). This is necessary in + * order to allow printing kthreads to run in parallel to each other, + * while each safely accessing the @blocked field and synchronizing + * against direct printing via console_lock/console_unlock. + * + * Note: For synchronizing against direct printing via + * console_trylock/console_unlock, see the static global + * variable @console_kthreads_active. + */ + struct mutex lock; void *data; struct console *next; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e4cdc424c826..750d1229cc11 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -223,6 +223,33 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; +/* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * + * Values: + * -1 = console kthreads atomically blocked (via global trylock) + * 0 = no kthread printing, console not locked (via trylock) + * >0 = kthread(s) actively printing + * + * Note: For synchronizing against direct printing via + * console_lock/console_unlock, see the @lock variable in + * struct console. + */ +static atomic_t console_kthreads_active = ATOMIC_INIT(0); + +#define console_kthreads_atomic_tryblock() \ + (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) +#define console_kthreads_atomic_unblock() \ + atomic_cmpxchg(&console_kthreads_active, -1, 0) +#define console_kthreads_atomically_blocked() \ + (atomic_read(&console_kthreads_active) == -1) + +#define console_kthread_printing_tryenter() \ + atomic_inc_unless_negative(&console_kthreads_active) +#define console_kthread_printing_exit() \ + atomic_dec(&console_kthreads_active) + /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -270,6 +297,49 @@ static bool panic_in_progress(void) return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } +/* + * Tracks whether kthread printers are all blocked. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Writing to this variable requires holding @console_sem. + */ +static bool console_kthreads_blocked; + +/* + * Block all kthread printers from a schedulable context. + * + * Requires holding @console_sem. + */ +static void console_kthreads_block(void) +{ + struct console *con; + + for_each_console(con) { + mutex_lock(&con->lock); + con->blocked = true; + mutex_unlock(&con->lock); + } + + console_kthreads_blocked = true; +} + +/* + * Unblock all kthread printers from a schedulable context. + * + * Requires holding @console_sem. + */ +static void console_kthreads_unblock(void) +{ + struct console *con; + + for_each_console(con) { + mutex_lock(&con->lock); + con->blocked = false; + mutex_unlock(&con->lock); + } + + console_kthreads_blocked = false; +} + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2603,13 +2673,6 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); - - /* - * While suspended, new records may have been added to the - * ringbuffer. Wake up the kthread printers to print them. - */ - wake_up_klogd(); - pr_flush(1000, true); } @@ -2628,9 +2691,14 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - - /* Wake kthread printers. Some may have become usable. */ - wake_up_klogd(); + else { + /* + * If a new CPU comes online, the conditions for + * printer_should_wake() may have changed for some + * kthread printer with !CON_ANYTIME. + */ + wake_up_klogd(); + } } return 0; } @@ -2650,6 +2718,7 @@ void console_lock(void) down_console_sem(); if (console_suspended) return; + console_kthreads_block(); console_locked = 1; console_may_schedule = 1; } @@ -2671,6 +2740,10 @@ int console_trylock(void) up_console_sem(); return 0; } + if (!console_kthreads_atomic_tryblock()) { + up_console_sem(); + return 0; + } console_locked = 1; console_may_schedule = 0; return 1; @@ -2679,7 +2752,7 @@ EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { - return console_locked; + return (console_locked || atomic_read(&console_kthreads_active)); } EXPORT_SYMBOL(is_console_locked); @@ -2723,7 +2796,7 @@ static inline bool __console_is_usable(short flags) * Check if the given console is currently capable and allowed to print * records. * - * Requires the console_lock. + * Requires holding the console_lock. */ static inline bool console_is_usable(struct console *con) { @@ -2736,6 +2809,22 @@ static inline bool console_is_usable(struct console *con) static void __console_unlock(void) { console_locked = 0; + + /* + * Depending on whether console_lock() or console_trylock() was used, + * appropriately allow the kthread printers to continue. + */ + if (console_kthreads_blocked) + console_kthreads_unblock(); + else + console_kthreads_atomic_unblock(); + + /* + * New records may have arrived while the console was locked. + * Wake the kthread printers to print them. + */ + wake_up_klogd(); + up_console_sem(); } @@ -2753,17 +2842,19 @@ static void __console_unlock(void) * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the - * console_lock. Otherwise it is set to false. + * console_lock. Otherwise it is set to false. A NULL pointer may be provided + * to disable allowing the console_lock to be taken over by a printk waiter. * * Returns false if the given console has no next record to print, otherwise * true. * - * Requires the console_lock. + * Requires the console_lock if @handover is non-NULL. + * Requires con->lock otherwise. */ -static bool console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) +static bool __console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) { - static int panic_console_dropped; + static atomic_t panic_console_dropped = ATOMIC_INIT(0); struct printk_info info; struct printk_record r; unsigned long flags; @@ -2772,7 +2863,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); - *handover = false; + if (handover) + *handover = false; if (!prb_read_valid(prb, con->seq, &r)) return false; @@ -2780,7 +2872,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ if (con->seq != r.info->seq) { con->dropped += r.info->seq - con->seq; con->seq = r.info->seq; - if (panic_in_progress() && panic_console_dropped++ > 10) { + if (panic_in_progress() && + atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) { suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } @@ -2802,31 +2895,61 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); + if (handover) { + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + + /* don't trace irqsoff print latency */ + stop_critical_timings(); + } - stop_critical_timings(); /* don't trace print latency */ call_console_driver(con, write_text, len, dropped_text); - start_critical_timings(); con->seq++; - *handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); + if (handover) { + start_critical_timings(); + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); + } skip: return true; } +/* + * Print a record for a given console, but allow another printk() caller to + * take over the console_lock and continue printing. + * + * Requires the console_lock, but depending on @handover after the call, the + * caller may no longer have the console_lock. + * + * See __console_emit_next_record() for argument and return details. + */ +static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) +{ + /* + * Handovers are only supported if threaded printers are atomically + * blocked. The context taking over the console_lock may be atomic. + */ + if (!console_kthreads_atomically_blocked()) { + *handover = false; + handover = NULL; + } + + return __console_emit_next_record(con, text, ext_text, dropped_text, handover); +} + /* * Print out all remaining records to all consoles. * @@ -2878,13 +3001,11 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove if (con->flags & CON_EXTENDED) { /* Extended consoles do not print "dropped messages". */ - progress = console_emit_next_record(con, &text[0], - &ext_text[0], NULL, - handover); + progress = console_emit_next_record_transferable(con, &text[0], + &ext_text[0], NULL, handover); } else { - progress = console_emit_next_record(con, &text[0], - NULL, &dropped_text[0], - handover); + progress = console_emit_next_record_transferable(con, &text[0], + NULL, &dropped_text[0], handover); } if (*handover) return false; @@ -2999,6 +3120,10 @@ void console_unblank(void) if (oops_in_progress) { if (down_trylock_console_sem() != 0) return; + if (!console_kthreads_atomic_tryblock()) { + up_console_sem(); + return; + } } else console_lock(); @@ -3081,10 +3206,6 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); - - /* Wake the newly enabled kthread printer. */ - wake_up_klogd(); - __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3286,6 +3407,8 @@ void register_console(struct console *newcon) newcon->dropped = 0; newcon->thread = NULL; + newcon->blocked = true; + mutex_init(&newcon->lock); if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ @@ -3586,6 +3709,19 @@ static void printk_fallback_preferred_direct(void) console_unlock(); } +/* + * Print a record for a given console, not allowing another printk() caller + * to take over. This is appropriate for contexts that do not have the + * console_lock. + * + * See __console_emit_next_record() for argument and return details. + */ +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text) +{ + return __console_emit_next_record(con, text, ext_text, dropped_text, NULL); +} + static bool printer_should_wake(struct console *con, u64 seq) { short flags; @@ -3593,8 +3729,10 @@ static bool printer_should_wake(struct console *con, u64 seq) if (kthread_should_stop() || !printk_kthreads_available) return true; - if (console_suspended) + if (con->blocked || + console_kthreads_atomically_blocked()) { return false; + } /* * This is an unsafe read from con->flags, but a false positive is @@ -3615,7 +3753,6 @@ static int printk_kthread_func(void *data) struct console *con = data; char *dropped_text = NULL; char *ext_text = NULL; - bool handover; u64 seq = 0; char *text; int error; @@ -3665,15 +3802,27 @@ static int printk_kthread_func(void *data) if (error) continue; - console_lock(); + error = mutex_lock_interruptible(&con->lock); + if (error) + continue; - if (console_suspended) { - up_console_sem(); + if (con->blocked || + !console_kthread_printing_tryenter()) { + /* Another context has locked the console_lock. */ + mutex_unlock(&con->lock); continue; } - if (!console_is_usable(con)) { - __console_unlock(); + /* + * Although this context has not locked the console_lock, it + * is known that the console_lock is not locked and it is not + * possible for any other context to lock the console_lock. + * Therefore it is safe to read con->flags. + */ + + if (!__console_is_usable(con->flags)) { + console_kthread_printing_exit(); + mutex_unlock(&con->lock); continue; } @@ -3686,13 +3835,13 @@ static int printk_kthread_func(void *data) * which can conditionally invoke cond_resched(). */ console_may_schedule = 0; - console_emit_next_record(con, text, ext_text, dropped_text, &handover); - if (handover) - continue; + console_emit_next_record(con, text, ext_text, dropped_text); seq = con->seq; - __console_unlock(); + console_kthread_printing_exit(); + + mutex_unlock(&con->lock); } con_printk(KERN_INFO, con, "printing thread stopped\n"); -- cgit v1.2.3 From ab406816fca009349b89cbde885daf68a8c77e33 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:50 +0206 Subject: printk: remove @console_locked The static global variable @console_locked is used to help debug VT code to make sure that certain code paths are running with the console_lock held. However, this information is also available with the static global variable @console_kthreads_blocked (for locking via console_lock()), and the static global variable @console_kthreads_active (for locking via console_trylock()). Remove @console_locked and update is_console_locked() to use the alternative variables. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-16-john.ogness@linutronix.de --- kernel/printk/printk.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 750d1229cc11..f66d6e72a642 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -340,15 +340,7 @@ static void console_kthreads_unblock(void) console_kthreads_blocked = false; } -/* - * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. It's - * definitely not the perfect debug tool (we don't know if _WE_ - * hold it and are racing, but it helps tracking those weird code - * paths in the console code where we end up in places I want - * locked without the console semaphore held). - */ -static int console_locked, console_suspended; +static int console_suspended; /* * Array of consoles built from command line options (console=) @@ -2719,7 +2711,6 @@ void console_lock(void) if (console_suspended) return; console_kthreads_block(); - console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); @@ -2744,15 +2735,26 @@ int console_trylock(void) up_console_sem(); return 0; } - console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); +/* + * This is used to help to make sure that certain paths within the VT code are + * running with the console lock held. It is definitely not the perfect debug + * tool (it is not known if the VT code is the task holding the console lock), + * but it helps tracking those weird code paths in the console code such as + * when the console is suspended: where the console is not locked but no + * console printing may occur. + * + * Note: This returns true when the console is suspended but is not locked. + * This is intentional because the VT code must consider that situation + * the same as if the console was locked. + */ int is_console_locked(void) { - return (console_locked || atomic_read(&console_kthreads_active)); + return (console_kthreads_blocked || atomic_read(&console_kthreads_active)); } EXPORT_SYMBOL(is_console_locked); @@ -2808,8 +2810,6 @@ static inline bool console_is_usable(struct console *con) static void __console_unlock(void) { - console_locked = 0; - /* * Depending on whether console_lock() or console_trylock() was used, * appropriately allow the kthread printers to continue. @@ -3127,7 +3127,6 @@ void console_unblank(void) } else console_lock(); - console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) -- cgit v1.2.3 From c317ab71facc2cd0a94145973318a4c914e11acc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 25 Apr 2022 21:32:47 +0800 Subject: bpf: Compute map_btf_id during build time For now, the field 'map_btf_id' in 'struct bpf_map_ops' for all map types are computed during vmlinux-btf init: btf_parse_vmlinux() -> btf_vmlinux_map_ids_init() It will lookup the btf_type according to the 'map_btf_name' field in 'struct bpf_map_ops'. This process can be done during build time, thanks to Jiri's resolve_btfids. selftest of map_ptr has passed: $96 map_ptr:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Reported-by: kernel test robot Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +-- kernel/bpf/arraymap.c | 26 ++++++++------------------ kernel/bpf/bloom_filter.c | 6 +++--- kernel/bpf/bpf_inode_storage.c | 6 +++--- kernel/bpf/bpf_struct_ops.c | 6 +++--- kernel/bpf/bpf_task_storage.c | 5 ++--- kernel/bpf/btf.c | 40 ---------------------------------------- kernel/bpf/cpumap.c | 6 +++--- kernel/bpf/devmap.c | 10 ++++------ kernel/bpf/hashtab.c | 22 +++++++--------------- kernel/bpf/local_storage.c | 7 ++++--- kernel/bpf/lpm_trie.c | 6 +++--- kernel/bpf/queue_stack_maps.c | 10 ++++------ kernel/bpf/reuseport_array.c | 6 +++--- kernel/bpf/ringbuf.c | 6 +++--- kernel/bpf/stackmap.c | 5 ++--- net/core/bpf_sk_storage.c | 5 ++--- net/core/sock_map.c | 10 ++++------ net/xdp/xskmap.c | 6 +++--- 19 files changed, 62 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0af5793ba417..be94833d390a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -148,8 +148,7 @@ struct bpf_map_ops { bpf_callback_t callback_fn, void *callback_ctx, u64 flags); - /* BTF name and id of struct allocated by map_alloc */ - const char * const map_btf_name; + /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c3de63ce574e..b3bf31fd9458 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -690,7 +691,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ return num_elems; } -static int array_map_btf_id; +BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -711,12 +712,10 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -732,8 +731,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &percpu_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -1112,7 +1110,6 @@ static void prog_array_map_free(struct bpf_map *map) * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. */ -static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -1128,8 +1125,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &prog_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -1218,7 +1214,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1231,8 +1226,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &perf_event_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS @@ -1255,7 +1249,6 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1267,8 +1260,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &cgroup_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #endif @@ -1342,7 +1334,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1355,6 +1346,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &array_of_maps_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b141a1346f72..b9ea539a5561 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -7,6 +7,7 @@ #include #include #include +#include #define BLOOM_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) @@ -192,7 +193,7 @@ static int bloom_map_check_btf(const struct bpf_map *map, return btf_type_is_void(key_type) ? 0 : -EINVAL; } -static int bpf_bloom_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, @@ -205,6 +206,5 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, - .map_btf_name = "bpf_bloom_filter", - .map_btf_id = &bpf_bloom_map_btf_id, + .map_btf_id = &bpf_bloom_map_btf_ids[0], }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 10424a1cda51..5f7683b19199 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -245,7 +245,8 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, NULL); } -static int inode_storage_map_btf_id; +BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, + bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -256,8 +257,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &inode_storage_map_btf_id, + .map_btf_id = &inode_storage_map_btf_ids[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index de01d37c2d3b..3a0103ad97bc 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -10,6 +10,7 @@ #include #include #include +#include enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, @@ -612,7 +613,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } -static int bpf_struct_ops_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -622,8 +623,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, - .map_btf_name = "bpf_struct_ops_map", - .map_btf_id = &bpf_struct_ops_map_btf_id, + .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 57904263a710..e9014dc62682 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -307,7 +307,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } -static int task_storage_map_btf_id; +BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -318,8 +318,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &task_storage_map_btf_id, + .map_btf_id = &task_storage_map_btf_ids[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4cfaf5eebecd..2f0b0440131c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5025,41 +5025,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, return ctx_type; } -static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { -#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) -#define BPF_LINK_TYPE(_id, _name) -#define BPF_MAP_TYPE(_id, _ops) \ - [_id] = &_ops, -#include -#undef BPF_PROG_TYPE -#undef BPF_LINK_TYPE -#undef BPF_MAP_TYPE -}; - -static int btf_vmlinux_map_ids_init(const struct btf *btf, - struct bpf_verifier_log *log) -{ - const struct bpf_map_ops *ops; - int i, btf_id; - - for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { - ops = btf_vmlinux_map_ops[i]; - if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) - continue; - if (!ops->map_btf_name || !ops->map_btf_id) { - bpf_log(log, "map type %d is misconfigured\n", i); - return -EINVAL; - } - btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, - BTF_KIND_STRUCT); - if (btf_id < 0) - return btf_id; - *ops->map_btf_id = btf_id; - } - - return 0; -} - static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -5125,11 +5090,6 @@ struct btf *btf_parse_vmlinux(void) /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - /* find bpf map structs for map_ptr access checking */ - err = btf_vmlinux_map_ids_init(btf, log); - if (err < 0) - goto errout; - bpf_struct_ops_init(btf, log); refcount_set(&btf->refcnt, 1); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 650e5d21f90d..f4860ac756cd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -27,6 +27,7 @@ #include #include #include +#include #include /* netif_receive_skb_list */ #include /* eth_type_trans */ @@ -673,7 +674,7 @@ static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __cpu_map_lookup_elem); } -static int cpu_map_btf_id; +BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, @@ -683,8 +684,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_cpu_map", - .map_btf_id = &cpu_map_btf_id, + .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 038f6d7a83e4..c2867068e5bd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,6 +48,7 @@ #include #include #include +#include #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -1005,7 +1006,7 @@ static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __dev_map_hash_lookup_elem); } -static int dev_map_btf_id; +BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1015,12 +1016,10 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; -static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1030,8 +1029,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_hash_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7351e61bd683..3e00e62b2218 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -2137,7 +2138,7 @@ out: return num_elems; } -static int htab_map_btf_id; +BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2154,12 +2155,10 @@ const struct bpf_map_ops htab_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2177,8 +2176,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2284,7 +2282,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2299,12 +2296,10 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2319,8 +2314,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2444,7 +2438,6 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } -static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -2457,6 +2450,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_of_maps_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], }; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 497916060ac7..8654fc97f5fe 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUP_BPF @@ -446,7 +447,8 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int cgroup_storage_map_btf_id; +BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, + bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -456,8 +458,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, - .map_btf_name = "bpf_cgroup_storage_map", - .map_btf_id = &cgroup_storage_map_btf_id, + .map_btf_id = &cgroup_storage_map_btf_ids[0], }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 5763cc7ac4f1..f0d05a3cc4b9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -14,6 +14,7 @@ #include #include #include +#include /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -719,7 +720,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } -static int trie_map_btf_id; +BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie) const struct bpf_map_ops trie_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, @@ -732,6 +733,5 @@ const struct bpf_map_ops trie_map_ops = { .map_update_batch = generic_map_update_batch, .map_delete_batch = generic_map_delete_batch, .map_check_btf = trie_check_btf, - .map_btf_name = "lpm_trie", - .map_btf_id = &trie_map_btf_id, + .map_btf_id = &trie_map_btf_ids[0], }; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f9c734aaa990..a1c0794ae49d 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -247,7 +248,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } -static int queue_map_btf_id; +BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -260,11 +261,9 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &queue_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; -static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -277,6 +276,5 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &stack_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 8251243022a2..e2618fb5870e 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -6,6 +6,7 @@ #include #include #include +#include struct reuseport_array { struct bpf_map map; @@ -337,7 +338,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } -static int reuseport_array_map_btf_id; +BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, @@ -346,6 +347,5 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, - .map_btf_name = "reuseport_array", - .map_btf_id = &reuseport_array_map_btf_id, + .map_btf_id = &reuseport_array_map_btf_ids[0], }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 5173fd37590f..311264ab80c4 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -10,6 +10,7 @@ #include #include #include +#include #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -263,7 +264,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } -static int ringbuf_map_btf_id; +BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, @@ -274,8 +275,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, - .map_btf_name = "bpf_ringbuf_map", - .map_btf_id = &ringbuf_map_btf_id, + .map_btf_id = &ringbuf_map_btf_ids[0], }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1dd5266fbebb..1adbe67cdb95 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -654,7 +654,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static int stack_trace_map_btf_id; +BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, @@ -664,6 +664,5 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_stack_map", - .map_btf_id = &stack_trace_map_btf_id, + .map_btf_id = &stack_trace_map_btf_ids[0], }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 83d7641ef67b..a25ec93729b9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -338,7 +338,7 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -static int sk_storage_map_btf_id; +BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -349,8 +349,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &sk_storage_map_btf_id, + .map_btf_id = &sk_storage_map_btf_ids[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2d213c4011db..81d4b4756a02 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -793,7 +793,7 @@ static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_priv_size = sizeof(struct sock_map_seq_info), }; -static int sock_map_btf_id; +BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, @@ -805,8 +805,7 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_stab", - .map_btf_id = &sock_map_btf_id, + .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; @@ -1385,7 +1384,7 @@ static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_priv_size = sizeof(struct sock_hash_seq_info), }; -static int sock_hash_map_btf_id; +BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, @@ -1397,8 +1396,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_shtab", - .map_btf_id = &sock_hash_map_btf_id, + .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 65b53fb3de13..acc8e52a4f5f 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "xsk.h" @@ -254,7 +255,7 @@ static bool xsk_map_meta_equal(const struct bpf_map *meta0, bpf_map_meta_equal(meta0, meta1); } -static int xsk_map_btf_id; +BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map) const struct bpf_map_ops xsk_map_ops = { .map_meta_equal = xsk_map_meta_equal, .map_alloc = xsk_map_alloc, @@ -266,7 +267,6 @@ const struct bpf_map_ops xsk_map_ops = { .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "xsk_map", - .map_btf_id = &xsk_map_btf_id, + .map_btf_id = &xsk_map_btf_ids[0], .map_redirect = xsk_map_redirect, }; -- cgit v1.2.3 From 217d8c05ec6295947b0b71bc784012d112f0032e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 7 Feb 2022 05:12:16 -0800 Subject: tracing: Cleanup double word in comment Remove the second 'is' and 'to'. Link: https://lkml.kernel.org/r/20220207131216.2059997-1-trix@redhat.com Signed-off-by: Tom Rix Signed-off-by: Steven Rostedt (Google) --- kernel/trace/pid_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index a2ef1d18126a..95106d02b32d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1, /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test - * @pid: The pid to to see if set in the list. + * @pid: The pid to see if set in the list. * - * Tests if @pid is is set in the @pid_list. This is usually called + * Tests if @pid is set in the @pid_list. This is usually called * from the scheduler when a task is scheduled. Its pid is checked * if it should be traced or not. * -- cgit v1.2.3 From b8cc44a4d3c19296dfd1be1a018a8523e09ab919 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:04 -0600 Subject: tracing: Remove logic for registering multiple event triggers at a time Code for registering triggers assumes it's possible to register more than one trigger at a time. In fact, it's unimplemented and there doesn't seem to be a reason to do that. Remove the n_registered param from event_trigger_register() and fix up callers. Doing so simplifies the logic in event_trigger_register to the point that it just becomes a wrapper calling event_command.reg(). It also removes the problematic call to event_command.unreg() in case of failure. A new function, event_trigger_unregister() is also added for callers to call themselves. The changes to trace_events_hist.c simply allow compilation; a separate patch follows which updates the hist triggers to work correctly with the new changes. Link: https://lkml.kernel.org/r/6149fec7a139d93e84fa4535672fb5bef88006b0.1644010575.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 9 ++-- kernel/trace/trace_events_hist.c | 17 +++---- kernel/trace/trace_events_trigger.c | 96 +++++++++++++------------------------ 3 files changed, 45 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 07d990270e2a..2bc8036b0659 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1629,10 +1629,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops, extern int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *trigger, - struct event_trigger_data *trigger_data, - int *n_registered); + struct event_trigger_data *trigger_data); +extern void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 44db5ba9cabb..b18d00905eae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6287,7 +6287,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6296,13 +6296,10 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of triggers registered, - * but if it didn't register any it returns zero. Consider no - * triggers registered a failure too. - */ - if (!ret) { + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; + if (ret == 0) { if (!(attrs->pause || attrs->cont || attrs->clear)) ret = -ENOENT; goto out_free; @@ -6331,15 +6328,13 @@ enable: se = find_synth_event(se_name); if (se) se->ref++; - /* Just return zero, not the number of registered triggers */ - ret = 0; out: if (ret == 0) hist_err_clear(); return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 7eb9d04f1c2e..ea13679054a8 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -587,13 +587,12 @@ static int register_trigger(char *glob, } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; @@ -927,48 +926,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops, * @cmd_ops: The event_command operations for the trigger * @file: The event file for the trigger's event * @glob: The trigger command string, with optional remove(!) operator - * @cmd: The cmd string - * @param: The param string * @trigger_data: The trigger_data for the trigger - * @n_registered: optional outparam, the number of triggers registered * * Register an event trigger. The @cmd_ops are used to call the - * cmd_ops->reg() function which actually does the registration. The - * cmd_ops->reg() function returns the number of triggers registered, - * which is assigned to n_registered, if n_registered is non-NULL. + * cmd_ops->reg() function which actually does the registration. * * Return: 0 on success, errno otherwise */ int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *param, - struct event_trigger_data *trigger_data, - int *n_registered) + struct event_trigger_data *trigger_data) { - int ret; - - if (n_registered) - *n_registered = 0; - - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) { - if (n_registered) - *n_registered = ret; - /* Just return zero, not the number of enabled functions */ - ret = 0; - } + return cmd_ops->reg(glob, trigger_data, file); +} - return ret; +/** + * event_trigger_unregister - unregister an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @trigger_data: The trigger_data for the trigger + * + * Unregister an event trigger. The @cmd_ops are used to call the + * cmd_ops->unreg() function which actually does the unregistration. + */ +void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data) +{ + cmd_ops->unreg(glob, trigger_data, file); } /* @@ -1027,7 +1015,7 @@ event_trigger_parse(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); ret = 0; goto out; @@ -1062,17 +1050,10 @@ event_trigger_parse(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) - ret = 0; + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_ops, trigger_data); @@ -1854,7 +1835,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, trigger_data->private_data = enable_data; if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1901,19 +1882,11 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) goto out_disable; - /* Just return zero, not the number of enabled functions */ - ret = 0; + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1959,13 +1932,12 @@ int event_enable_register_trigger(char *glob, } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; -- cgit v1.2.3 From 476705419518a2f383b6695782ba1f285a2959b9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:05 -0600 Subject: tracing: Remove redundant trigger_ops params Since event_trigger_data contains the .ops trigger_ops field, there's no reason to pass the trigger_ops separately. Remove it as a param from functions whenever event_trigger_data is passed. Link: https://lkml.kernel.org/r/9856c9bc81bde57077f5b8d6f8faa47156c6354a.1644010575.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 14 ++++------- kernel/trace/trace_eprobe.c | 7 ++---- kernel/trace/trace_events_hist.c | 29 ++++++++++------------- kernel/trace/trace_events_trigger.c | 46 +++++++++++++++---------------------- 4 files changed, 36 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2bc8036b0659..f3e4cd1bb60e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1573,10 +1573,8 @@ struct enable_trigger_data { }; extern int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, - struct event_trigger_data *data); -extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param); @@ -1587,8 +1585,7 @@ extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); -extern int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); @@ -1687,12 +1684,9 @@ struct event_trigger_ops { struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_ops *ops, - struct event_trigger_data *data); - void (*free)(struct event_trigger_ops *ops, - struct event_trigger_data *data); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); int (*print)(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data); }; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 541aa13581b9..9b3dfc9da067 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -511,20 +511,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) * functions are just stubs to fulfill what is needed to use the trigger * infrastructure. */ -static int eprobe_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int eprobe_trigger_init(struct event_trigger_data *data) { return 0; } -static void eprobe_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void eprobe_trigger_free(struct event_trigger_data *data) { } static int eprobe_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { /* Do not print eprobe event triggers */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b18d00905eae..40a7e5582dfb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5252,7 +5252,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5484,7 +5484,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5621,7 +5621,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) } static int event_hist_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5729,8 +5728,7 @@ static int event_hist_trigger_print(struct seq_file *m, return 0; } -static int event_hist_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5758,8 +5756,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) } } -static void event_hist_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_free(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5788,25 +5785,23 @@ static struct event_trigger_ops event_hist_trigger_ops = { .free = event_hist_trigger_free, }; -static int event_hist_trigger_named_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_named_init(struct event_trigger_data *data) { data->ref++; save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(ops, data->named_data); + event_hist_trigger_init(data->named_data); return 0; } -static void event_hist_trigger_named_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_named_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; - event_hist_trigger_free(ops, data->named_data); + event_hist_trigger_free(data->named_data); data->ref--; if (!data->ref) { @@ -5993,7 +5988,7 @@ static int hist_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -6111,7 +6106,7 @@ static void hist_unregister_trigger(char *glob, } if (unregistered && test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); if (hist_data->enable_timestamps) { if (!hist_data->remove || unregistered) @@ -6164,7 +6159,7 @@ static void hist_unreg_all(struct trace_event_file *file) if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } @@ -6458,7 +6453,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index ea13679054a8..32202175875d 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -188,7 +188,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); return 0; } @@ -432,7 +432,6 @@ event_trigger_print(const char *name, struct seq_file *m, /** * event_trigger_init - Generic event_trigger_ops @init implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -442,8 +441,7 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_data *data) { data->ref++; return 0; @@ -451,7 +449,6 @@ int event_trigger_init(struct event_trigger_ops *ops, /** * event_trigger_free - Generic event_trigger_ops @free implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. @@ -460,8 +457,7 @@ int event_trigger_init(struct event_trigger_ops *ops, * implementations. */ static void -event_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +event_trigger_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; @@ -515,7 +511,7 @@ clear_event_triggers(struct trace_array *tr) trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); if (data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } } } @@ -581,7 +577,7 @@ static int register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -629,7 +625,7 @@ static void unregister_trigger(char *glob, } if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } /* @@ -1049,14 +1045,14 @@ event_trigger_parse(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ - event_trigger_init(trigger_ops, trigger_data); + event_trigger_init(trigger_data); ret = event_trigger_register(cmd_ops, file, glob, trigger_data); if (ret) goto out_free; /* Down the counter of trigger_data or free it if not used anymore */ - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); out: return ret; @@ -1382,16 +1378,14 @@ traceoff_count_trigger(struct event_trigger_data *data, } static int -traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceon", m, (void *)data->count, data->filter_str); } static int -traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceoff", m, (void *)data->count, data->filter_str); @@ -1502,8 +1496,7 @@ register_snapshot_trigger(char *glob, } static int -snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("snapshot", m, (void *)data->count, data->filter_str); @@ -1598,8 +1591,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, } static int -stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("stacktrace", m, (void *)data->count, data->filter_str); @@ -1689,7 +1681,6 @@ event_enable_count_trigger(struct event_trigger_data *data, } int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1714,8 +1705,7 @@ int event_enable_trigger_print(struct seq_file *m, return 0; } -void event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1843,7 +1833,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, } /* Up the trigger_data count to make sure nothing frees it on failure */ - event_trigger_init(trigger_ops, trigger_data); + event_trigger_init(trigger_data); if (trigger) { number = strsep(&trigger, ":"); @@ -1887,7 +1877,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, if (ret) goto out_disable; - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); out: return ret; @@ -1898,7 +1888,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); kfree(enable_data); goto out; } @@ -1926,7 +1916,7 @@ int event_enable_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -1969,7 +1959,7 @@ void event_enable_unregister_trigger(char *glob, } if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } static struct event_trigger_ops * -- cgit v1.2.3 From e1f187d09e11f50a50cbb02ae277d4e8bdfc7db8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:06 -0600 Subject: tracing: Have existing event_command.parse() implementations use helpers Simplify the existing event_command.parse() implementations by having them make use of the helper functions previously introduced. Link: https://lkml.kernel.org/r/b353e3427a81f9d3adafd98fd7d73e78a8209f43.1644010576.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 3 +- kernel/trace/trace_eprobe.c | 3 +- kernel/trace/trace_events_hist.c | 64 ++++++--------- kernel/trace/trace_events_trigger.c | 150 ++++++++++-------------------------- 4 files changed, 69 insertions(+), 151 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3e4cd1bb60e..ff816fb41e48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1577,7 +1577,8 @@ extern int event_enable_trigger_print(struct seq_file *m, extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); extern int event_enable_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 9b3dfc9da067..b045fa9f276c 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -546,7 +546,8 @@ static struct event_trigger_ops eprobe_trigger_ops = { static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { return -1; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 40a7e5582dfb..1b71bdb0bcb1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2785,7 +2785,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, static struct event_command trigger_hist_cmd; static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -6166,17 +6167,17 @@ static void hist_unreg_all(struct trace_event_file *file) static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; struct hist_trigger_attrs *attrs; - struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; + char *param, *filter, *p, *start; struct synth_event *se; const char *se_name; - bool remove = false; - char *trigger, *p, *start; + bool remove; int ret = 0; lockdep_assert_held(&event_mutex); @@ -6185,31 +6186,30 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (strlen(glob)) { hist_err_clear(); - last_cmd_set(file, param); + last_cmd_set(file, param_and_filter); } - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - if (glob[0] == '!') - remove = true; + if (event_trigger_empty_param(param_and_filter)) + return -EINVAL; /* * separate the trigger from the filter (k:v [if filter]) * allowing for whitespace in the trigger */ - p = trigger = param; + p = param = param_and_filter; do { p = strstr(p, "if"); if (!p) break; - if (p == param) + if (p == param_and_filter) return -EINVAL; if (*(p - 1) != ' ' && *(p - 1) != '\t') { p++; continue; } - if (p >= param + strlen(param) - (sizeof("if") - 1) - 1) + if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1) return -EINVAL; if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { p++; @@ -6219,24 +6219,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, } while (1); if (!p) - param = NULL; + filter = NULL; else { *(p - 1) = '\0'; - param = strstrip(p); - trigger = strstrip(trigger); + filter = strstrip(p); + param = strstrip(param); } /* * To simplify arithmetic expression parsing, replace occurrences of * '.sym-offset' modifier with '.symXoffset' */ - start = strstr(trigger, ".sym-offset"); + start = strstr(param, ".sym-offset"); while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); } - attrs = parse_hist_trigger_attrs(file->tr, trigger); + attrs = parse_hist_trigger_attrs(file->tr, param); if (IS_ERR(attrs)) return PTR_ERR(attrs); @@ -6249,29 +6249,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; } - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); - - trigger_data->private_data = hist_data; - - /* if param is non-empty, it's supposed to be a filter */ - if (param && cmd_ops->set_filter) { - ret = cmd_ops->set_filter(param, trigger_data, file); - if (ret < 0) - goto out_free; - } + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); + if (ret < 0) + goto out_free; if (remove) { if (!have_hist_trigger_match(trigger_data, file)) @@ -6298,8 +6284,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (!(attrs->pause || attrs->cont || attrs->clear)) ret = -ENOENT; goto out_free; - } else if (ret < 0) - goto out_free; + } if (get_named_trigger_data(trigger_data)) goto enable; @@ -6331,8 +6316,7 @@ enable: out_unreg: event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); remove_hist_vars(hist_data); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 32202175875d..62c44dab8f46 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -965,7 +965,7 @@ void event_trigger_unregister(struct event_command *cmd_ops, * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger * @cmd: The cmd portion of the string used to register the trigger - * @param: The params portion of the string used to register the trigger + * @param_and_filter: The param and filter portion of the string used to register the trigger * * Common implementation for event command parsing and trigger * instantiation. @@ -978,72 +978,39 @@ void event_trigger_unregister(struct event_command *cmd_ops, static int event_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; - char *trigger = NULL; - char *number; + char *param, *filter; + bool remove; int ret; - /* separate the trigger from the filter (t:n [if filter]) */ - if (param && isdigit(param[0])) { - trigger = strsep(¶m, " \t"); - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - } + remove = event_trigger_check_remove(glob); - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, false); + if (ret) + return ret; ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); if (!trigger_data) goto out; - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - trigger_data->private_data = file; - INIT_LIST_HEAD(&trigger_data->list); - INIT_LIST_HEAD(&trigger_data->named_list); - - if (glob[0] == '!') { + if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); ret = 0; goto out; } - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_data); @@ -1057,8 +1024,7 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); kfree(trigger_data); goto out; } @@ -1752,39 +1718,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; struct trace_array *tr = file->tr; + char *param, *filter; + bool enable, remove; const char *system; const char *event; bool hist = false; - char *trigger; - char *number; - bool enable; int ret; - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - /* separate the trigger from the filter (s:e:n [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) + if (event_trigger_empty_param(param_and_filter)) return -EINVAL; - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - system = strsep(&trigger, ":"); - if (!trigger) + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, true); + if (ret) + return ret; + + system = strsep(¶m, ":"); + if (!param) return -EINVAL; - event = strsep(&trigger, ":"); + event = strsep(¶m, ":"); ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); @@ -1800,31 +1760,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, #else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; #endif - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) - goto out; enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); - if (!enable_data) { - kfree(trigger_data); + if (!enable_data) goto out; - } - - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data->private_data = enable_data; - if (glob[0] == '!') { + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + if (!trigger_data) { + kfree(enable_data); + goto out; + } + + if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); kfree(enable_data); @@ -1835,33 +1787,14 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, /* Up the trigger_data count to make sure nothing frees it on failure */ event_trigger_init(trigger_data); - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { @@ -1880,16 +1813,15 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); out: return ret; - out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: trace_event_put_ref(event_enable_file->event_call); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); event_trigger_free(trigger_data); kfree(enable_data); + goto out; } -- cgit v1.2.3 From a7e6b7dcfb19988ad2968a1fafd29b600abbf133 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:07 -0600 Subject: tracing: Separate hist state updates from hist registration hist_register_trigger() handles both new hist registration as well as existing hist registration through event_command.reg(). Adding a new function, existing_hist_update_only(), that checks and updates existing histograms and exits after doing so allows the confusing logic in event_hist_trigger_parse() to be simplified. Link: https://lkml.kernel.org/r/211b2cd3e3d7e00f4f8ad45ef8b33063da6a7e05.1644010576.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 66 +++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b71bdb0bcb1..998dfe2162fc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5929,6 +5929,48 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } +static bool existing_hist_update_only(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool updated = false; + + if (!hist_data->attrs->pause && !hist_data->attrs->cont && + !hist_data->attrs->clear) + goto out; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) + goto out; + } + } + + if (hist_data->attrs->name && !named_data) + goto out; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + updated = true; + goto out; + } + } + out: + return updated; +} + static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) @@ -5957,19 +5999,11 @@ static int hist_register_trigger(char *glob, list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) - continue; - if (hist_data->attrs->pause) - test->paused = true; - else if (hist_data->attrs->cont) - test->paused = false; - else if (hist_data->attrs->clear) - hist_clear(test); - else { + if (hist_trigger_match(data, test, named_data, false)) { hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; + goto out; } - goto out; } } new: @@ -6008,8 +6042,6 @@ static int hist_register_trigger(char *glob, if (named_data) destroy_hist_data(hist_data); - - ret++; out: return ret; } @@ -6277,14 +6309,12 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret) + if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - if (ret == 0) { - if (!(attrs->pause || attrs->cont || attrs->clear)) - ret = -ENOENT; + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) goto out_free; - } if (get_named_trigger_data(trigger_data)) goto enable; -- cgit v1.2.3 From cf2adec7479d875973fe10782f8bf20377628ef2 Mon Sep 17 00:00:00 2001 From: Oscar Shiang Date: Thu, 17 Feb 2022 00:50:06 +0800 Subject: tracing: Fix inconsistent style of mini-HOWTO Each description should start with a hyphen and a space. Insert spaces to fix it. Link: https://lkml.kernel.org/r/TYCP286MB19130AA4A9C6FC5A8793DED2A1359@TYCP286MB1913.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Oscar Shiang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4de111fa18f..4f9c499368c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5475,7 +5475,7 @@ static const char readme_msg[] = " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" - " trace_clock\t\t-change the clock used to order events\n" + " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" @@ -5484,7 +5484,7 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif - "\n timestamp_mode\t-view the mode used to timestamp events\n" + "\n timestamp_mode\t- view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" -- cgit v1.2.3 From 3b57d8477cd0f427198930706958f1312cc3594b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 18 Feb 2022 18:08:49 +0800 Subject: tracing: Fix kernel-doc Fix the following W=1 kernel warnings: kernel/trace/trace.c:1181: warning: expecting prototype for tracing_snapshot_cond_data(). Prototype was for tracing_cond_snapshot_data() instead. Link: https://lkml.kernel.org/r/20220218100849.122038-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f9c499368c4..aceeeea21c11 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1174,7 +1174,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** - * tracing_snapshot_cond_data - get the user data associated with a snapshot + * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using -- cgit v1.2.3 From adaa0a9f06d127b6a0a1b929ec0971df40993a9b Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sat, 2 Apr 2022 15:20:15 +0800 Subject: tracing: Fix tracing_map_sort_entries() kernel-doc comment Add the description of @n_sort_keys and make @sort_key -> @sort_keys in tracing_map_sort_entries() kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. kernel/trace/tracing_map.c:1073: warning: Function parameter or member 'sort_keys' not described in 'tracing_map_sort_entries' kernel/trace/tracing_map.c:1073: warning: Function parameter or member 'n_sort_keys' not described in 'tracing_map_sort_entries' kernel/trace/tracing_map.c:1073: warning: Excess function parameter 'sort_key' description in 'tracing_map_sort_entries' Link: https://lkml.kernel.org/r/20220402072015.45864-1-yang.lee@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9628b5571846..9901708ce6b8 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1045,7 +1045,8 @@ static void sort_secondary(struct tracing_map *map, /** * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map * @map: The tracing_map - * @sort_key: The sort key to use for sorting + * @sort_keys: The sort key to use for sorting + * @n_sort_keys: hitcount, always have at least one * @sort_entries: outval: pointer to allocated and sorted array of entries * * tracing_map_sort_entries() sorts the current set of entries in the -- cgit v1.2.3 From cb1c45fb68b8a4285ccf750842b1136f26cfe267 Mon Sep 17 00:00:00 2001 From: Jeff Xie Date: Sun, 10 Apr 2022 22:50:25 +0800 Subject: tracing: Make tp_printk work on syscall tracepoints Currently the tp_printk option has no effect on syscall tracepoint. When adding the kernel option parameter tp_printk, then: echo 1 > /sys/kernel/debug/tracing/events/syscalls/enable When running any application, no trace information is printed on the terminal. Now added printk for syscall tracepoints. Link: https://lkml.kernel.org/r/20220410145025.681144-1-xiehuan09@gmail.com Signed-off-by: Jeff Xie Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f755bde42fd0..b69e207012c9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ @@ -296,9 +296,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct trace_event_file *trace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; unsigned long args[6]; int syscall_nr; int size; @@ -321,20 +319,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->enter_event->event.type, size, trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; syscall_get_arguments(current, regs, args); memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,9 +337,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_file *trace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -364,20 +356,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->exit_event->event.type, sizeof(*entry), - trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry)); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static int reg_event_syscall_enter(struct trace_event_file *file, -- cgit v1.2.3 From 97a5d2e5e35f119b6d7a37c7538773ae7af82dd5 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Mon, 18 Apr 2022 01:56:29 +0700 Subject: tracing: Return -EINVAL if WARN_ON(!glob) triggered in event_hist_trigger_parse() If `WARN_ON(!glob)` is ever triggered, we will still continue executing the next lines. This will trigger the more serious problem, a NULL pointer dereference bug. Just return -EINVAL if @glob is NULL. Link: https://lkml.kernel.org/r/20220417185630.199062-2-ammarfaizi2@gnuweeb.org Cc: Ingo Molnar Cc: GNU/Weeb Mailing List Signed-off-by: Ammar Faizi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 998dfe2162fc..80c25be23c45 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6214,7 +6214,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); - WARN_ON(!glob); + if (WARN_ON(!glob)) + return -EINVAL; if (strlen(glob)) { hist_err_clear(); -- cgit v1.2.3 From 69686fcbdcc0b6fed3b8d0907e641f282df2f827 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Mon, 18 Apr 2022 01:56:30 +0700 Subject: tracing: Change `if (strlen(glob))` to `if (glob[0])` No need to traverse to the end of string. If the first byte is not a NUL char, it's guaranteed `if (strlen(glob))` is true. Link: https://lkml.kernel.org/r/20220417185630.199062-3-ammarfaizi2@gnuweeb.org Cc: Ingo Molnar Cc: GNU/Weeb Mailing List Signed-off-by: Ammar Faizi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 80c25be23c45..fe10179893c1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6217,7 +6217,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (WARN_ON(!glob)) return -EINVAL; - if (strlen(glob)) { + if (glob[0]) { hist_err_clear(); last_cmd_set(file, param_and_filter); } -- cgit v1.2.3 From 12025abdc8539ed9d5014e2d647a3fd1bd3de5cd Mon Sep 17 00:00:00 2001 From: Jun Miao Date: Tue, 19 Apr 2022 09:39:10 +0800 Subject: tracing: Fix sleeping function called from invalid context on RT kernel When setting bootparams="trace_event=initcall:initcall_start tp_printk=1" in the cmdline, the output_printk() was called, and the spin_lock_irqsave() was called in the atomic and irq disable interrupt context suitation. On the PREEMPT_RT kernel, these locks are replaced with sleepable rt-spinlock, so the stack calltrace will be triggered. Fix it by raw_spin_lock_irqsave when PREEMPT_RT and "trace_event=initcall:initcall_start tp_printk=1" enabled. BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0 preempt_count: 2, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: [] try_to_wake_up+0x7e/0xba0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt17+ #19 34c5812404187a875f32bee7977f7367f9679ea7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl+0x60/0x8c dump_stack+0x10/0x12 __might_resched.cold+0x11d/0x155 rt_spin_lock+0x40/0x70 trace_event_buffer_commit+0x2fa/0x4c0 ? map_vsyscall+0x93/0x93 trace_event_raw_event_initcall_start+0xbe/0x110 ? perf_trace_initcall_finish+0x210/0x210 ? probe_sched_wakeup+0x34/0x40 ? ttwu_do_wakeup+0xda/0x310 ? trace_hardirqs_on+0x35/0x170 ? map_vsyscall+0x93/0x93 do_one_initcall+0x217/0x3c0 ? trace_event_raw_event_initcall_level+0x170/0x170 ? push_cpu_stop+0x400/0x400 ? cblist_init_generic+0x241/0x290 kernel_init_freeable+0x1ac/0x347 ? _raw_spin_unlock_irq+0x65/0x80 ? rest_init+0xf0/0xf0 kernel_init+0x1e/0x150 ret_from_fork+0x22/0x30 Link: https://lkml.kernel.org/r/20220419013910.894370-1-jun.miao@intel.com Signed-off-by: Jun Miao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aceeeea21c11..27bb486c3f97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2835,7 +2835,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) @@ -2863,14 +2863,14 @@ static void output_printk(struct trace_event_buffer *fbuffer) event = &fbuffer->trace_file->event_call->event; - spin_lock_irqsave(&tracepoint_iter_lock, flags); + raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); + raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(struct ctl_table *table, int write, -- cgit v1.2.3 From 4ee51101e93f0c8d83876ae5c0c26ca14934629e Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Sun, 24 Apr 2022 21:19:32 +0800 Subject: tracing: Use WARN instead of printk and WARN_ON Use `WARN(cond, ...)` instead of `if (cond)` + `printk(...)` + `WARN_ON(1)`. Link: https://lkml.kernel.org/r/20220424131932.3606-1-guozhengkui@vivo.com Suggested-by: Steven Rostedt Signed-off-by: Guo Zhengkui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8aa493d25c73..d89e3f7e26eb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -778,9 +778,8 @@ int register_trace_event(struct trace_event *event) list_add_tail(&event->list, list); - } else if (event->type > __TRACE_LAST_TYPE) { - printk(KERN_WARNING "Need to add type to trace.h\n"); - WARN_ON(1); + } else if (WARN(event->type > __TRACE_LAST_TYPE, + "Need to add type to trace.h")) { goto out; } else { /* Is this event already used */ @@ -1571,13 +1570,8 @@ __init static int init_events(void) for (i = 0; events[i]; i++) { event = events[i]; - ret = register_trace_event(event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - event->type); - WARN_ON_ONCE(1); - } + WARN_ONCE(!ret, "event %d failed to register", event->type); } return 0; -- cgit v1.2.3 From ed888241a0ab9a3606bd986be6f0abafa1afdf19 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 26 Apr 2022 15:06:28 +0800 Subject: ring-buffer: Simplify if-if to if-else Use if and else instead of if(A) and if (!A). Link: https://lkml.kernel.org/r/20220426070628.167565-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05dfc7a12d3d..655d6db3e3c3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6011,10 +6011,10 @@ static __init int test_ringbuffer(void) pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); - if (total_lost) + if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); - if (!total_lost) { + } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; -- cgit v1.2.3 From ef9188bcc6ca1d8a2ad83e826b548e6820721061 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Tue, 26 Apr 2022 20:24:06 +0800 Subject: tracing: Avoid adding tracer option before update_tracer_options To prepare for support asynchronous tracer_init_tracefs initcall, avoid calling create_trace_option_files before __update_tracer_options. Otherwise, create_trace_option_files will show warning because some tracers in trace_types list are already in tr->topts. For example, hwlat_tracer call register_tracer in late_initcall, and global_trace.dir is already created in tracing_init_dentry, hwlat_tracer will be put into tr->topts. Then if the __update_tracer_options is executed after hwlat_tracer registered, create_trace_option_files find that hwlat_tracer is already in tr->topts. Link: https://lkml.kernel.org/r/20220426122407.17042-2-mark-pk.tsai@mediatek.com Link: https://lore.kernel.org/lkml/20220322133339.GA32582@xsang-OptiPlex-9020/ Reported-by: kernel test robot Signed-off-by: Mark-PK Tsai Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 27bb486c3f97..7275173c55d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6332,12 +6332,18 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } +static bool tracer_options_updated; + static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ if (!tr->dir) return; + /* Only create trace option files after update_tracer_options finish */ + if (!tracer_options_updated) + return; + create_trace_option_files(tr, t); } @@ -9176,6 +9182,7 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { mutex_lock(&trace_types_lock); + tracer_options_updated = true; __update_tracer_options(tr); mutex_unlock(&trace_types_lock); } -- cgit v1.2.3 From 6621a7004684bfcff5af4c8e4d37989941f42a6b Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Tue, 26 Apr 2022 20:24:07 +0800 Subject: tracing: make tracer_init_tracefs initcall asynchronous Move trace_eval_init() to subsys_initcall to make it start earlier. And to avoid tracer_init_tracefs being blocked by trace_event_sem which trace_eval_init() hold [1], queue tracer_init_tracefs() to eval_map_wq to let the two works being executed sequentially. It can speed up the initialization of kernel as result of making tracer_init_tracefs asynchronous. On my arm64 platform, it reduce ~20ms of 125ms which total time do_initcalls spend. Link: https://lkml.kernel.org/r/20220426122407.17042-3-mark-pk.tsai@mediatek.com [1]: https://lore.kernel.org/r/68d7b3327052757d0cd6359a6c9015a85b437232.camel@pengutronix.de Signed-off-by: Mark-PK Tsai Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7275173c55d0..400d3e9fe9ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9615,6 +9615,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static struct workqueue_struct *eval_map_wq __initdata; static struct work_struct eval_map_work __initdata; +static struct work_struct tracerfs_init_work __initdata; static void __init eval_map_work_func(struct work_struct *work) { @@ -9640,6 +9641,8 @@ static int __init trace_eval_init(void) return 0; } +subsys_initcall(trace_eval_init); + static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ @@ -9722,15 +9725,8 @@ static struct notifier_block trace_module_nb = { }; #endif /* CONFIG_MODULES */ -static __init int tracer_init_tracefs(void) +static __init void tracer_init_tracefs_work_func(struct work_struct *work) { - int ret; - - trace_access_lock_init(); - - ret = tracing_init_dentry(); - if (ret) - return 0; event_trace_init(); @@ -9752,8 +9748,6 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); - trace_eval_init(); - trace_create_eval_file(NULL); #ifdef CONFIG_MODULES @@ -9768,6 +9762,24 @@ static __init int tracer_init_tracefs(void) create_trace_instances(NULL); update_tracer_options(&global_trace); +} + +static __init int tracer_init_tracefs(void) +{ + int ret; + + trace_access_lock_init(); + + ret = tracing_init_dentry(); + if (ret) + return 0; + + if (eval_map_wq) { + INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); + queue_work(eval_map_wq, &tracerfs_init_work); + } else { + tracer_init_tracefs_work_func(NULL); + } return 0; } -- cgit v1.2.3 From 6695da58f9445f386516ed53a3e870f534d2645d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 27 Apr 2022 15:33:39 -0400 Subject: ring-buffer: Have absolute time stamps handle large numbers There's an absolute timestamp event in the ring buffer, but this only saves 59 bits of the timestamp, as the 5 MSB is used for meta data (stating it is an absolute time stamp). This was never an issue as all the clocks currently in use never used those 5 MSB. But now there's a new clock (TAI) that does. To handle this case, when reading an absolute timestamp, a previous full timestamp is passed in, and the 5 MSB of that timestamp is OR'd to the absolute timestamp (if any of the 5 MSB are set), and then to test for overflow, if the new result is smaller than the passed in previous timestamp, then 1 << 59 is added to it. All the extra processing is done on the reader "slow" path, with the exception of the "too big delta" check, and the reading of timestamps for histograms. Note, libtraceevent will need to be updated to handle this case as well. But this is not a user space regression, as user space was never able to handle any timestamps that used more than 59 bits. Link: https://lore.kernel.org/all/20220426175338.3807ca4f@gandalf.local.home/ Link: https://lkml.kernel.org/r/20220427153339.16c33f75@gandalf.local.home Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Kurt Kanzenbach Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 49 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 655d6db3e3c3..3a0c7ed0e93f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -29,6 +29,14 @@ #include +/* + * The "absolute" timestamp in the buffer is only 59 bits. + * If a clock has the 5 MSBs set, it needs to be saved and + * reinserted. + */ +#define TS_MSB (0xf8ULL << 56) +#define ABS_TS_MASK (~TS_MSB) + static void update_pages_handler(struct work_struct *work); /* @@ -783,6 +791,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, } #endif +/* + * The absolute time stamp drops the 5 MSBs and some clocks may + * require them. The rb_fix_abs_ts() will take a previous full + * time stamp, and add the 5 MSB of that time stamp on to the + * saved absolute time stamp. Then they are compared in case of + * the unlikely event that the latest time stamp incremented + * the 5 MSB. + */ +static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) +{ + if (save_ts & TS_MSB) { + abs |= save_ts & TS_MSB; + /* Check for overflow */ + if (unlikely(abs < save_ts)) + abs += 1ULL << 59; + } + return abs; +} static inline u64 rb_time_stamp(struct trace_buffer *buffer); @@ -811,8 +837,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, u64 ts; /* If the event includes an absolute time, then just use that */ - if (event->type_len == RINGBUF_TYPE_TIME_STAMP) - return rb_event_time_stamp(event); + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + ts = rb_event_time_stamp(event); + return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); + } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); @@ -2754,8 +2782,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { + /* + * Some timers can use more than 59 bits, and when a timestamp + * is added to the buffer, it will lose those bits. + */ + if (abs && (info->ts & TS_MSB)) { + info->delta &= ABS_TS_MASK; + /* did the clock go backwards */ - if (info->before == info->after && info->before > info->ts) { + } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; @@ -3304,7 +3339,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3380,7 +3415,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); break; case RINGBUF_TYPE_PADDING: @@ -4367,6 +4402,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; @@ -4397,6 +4433,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; @@ -4650,6 +4687,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4741,6 +4779,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } -- cgit v1.2.3 From f03f2abce4f3944b9576bc4ad28b75890e907d7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 27 Apr 2022 17:08:12 -0400 Subject: ring-buffer: Have 32 bit time stamps use all 64 bits When the new logic was made to handle deltas of events from interrupts that interrupted other events, it required 64 bit local atomics. Unfortunately, 64 bit local atomics are expensive on 32 bit architectures. Thus, commit 10464b4aa605e ("ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit") created a type of seq lock timer for 32 bits. It used two 32 bit local atomics, but required 2 bits from them each for synchronization, making it only 60 bits. Add a new "msb" field to hold the extra 4 bits that are cut off. Link: https://lore.kernel.org/all/20220426175338.3807ca4f@gandalf.local.home/ Link: https://lkml.kernel.org/r/20220427170812.53cc7139@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3a0c7ed0e93f..d59b6a328b7f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -476,6 +476,7 @@ struct rb_time_struct { local_t cnt; local_t top; local_t bottom; + local_t msb; }; #else #include @@ -577,7 +578,6 @@ struct ring_buffer_iter { * For the ring buffer, 64 bit required operations for the time is * the following: * - * - Only need 59 bits (uses 60 to make it even). * - Reads may fail if it interrupted a modification of the time stamp. * It will succeed if it did not interrupt another write even if * the read itself is interrupted by a write. @@ -602,6 +602,7 @@ struct ring_buffer_iter { */ #define RB_TIME_SHIFT 30 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) +#define RB_TIME_MSB_SHIFT 60 static inline int rb_time_cnt(unsigned long val) { @@ -621,7 +622,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom) static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) { - unsigned long top, bottom; + unsigned long top, bottom, msb; unsigned long c; /* @@ -633,6 +634,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); + msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); @@ -641,7 +643,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) if (*cnt != rb_time_cnt(bottom)) return false; - *ret = rb_time_val(top, bottom); + /* The shift to msb will lose its cnt bits */ + *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); return true; } @@ -657,10 +660,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); } -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, + unsigned long *msb) { *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); + *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); } static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) @@ -671,15 +676,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long static void rb_time_set(rb_time_t *t, u64 val) { - unsigned long cnt, top, bottom; + unsigned long cnt, top, bottom, msb; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); /* Writes always succeed with a valid number even if it gets interrupted. */ do { cnt = local_inc_return(&t->cnt); rb_time_val_set(&t->top, top, cnt); rb_time_val_set(&t->bottom, bottom, cnt); + rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); } while (cnt != local_read(&t->cnt)); } @@ -694,8 +700,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - unsigned long cnt, top, bottom; - unsigned long cnt2, top2, bottom2; + unsigned long cnt, top, bottom, msb; + unsigned long cnt2, top2, bottom2, msb2; u64 val; /* The cmpxchg always fails if it interrupted an update */ @@ -711,16 +717,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) cnt2 = cnt + 1; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); top = rb_time_val_cnt(top, cnt); bottom = rb_time_val_cnt(bottom, cnt); - rb_time_split(set, &top2, &bottom2); + rb_time_split(set, &top2, &bottom2, &msb2); top2 = rb_time_val_cnt(top2, cnt2); bottom2 = rb_time_val_cnt(bottom2, cnt2); if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) return false; + if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) + return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) -- cgit v1.2.3 From c575afe21ccc47c5561d11493ede81ab7d072267 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Thu, 14 Apr 2022 11:18:04 +0200 Subject: tracing: Introduce trace clock tai A fast/NMI safe accessor for CLOCK_TAI has been introduced. Use it for adding the additional trace clock "tai". Link: https://lkml.kernel.org/r/20220414091805.89667-3-kurt@linutronix.de Signed-off-by: Kurt Kanzenbach Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 400d3e9fe9ff..087740f63d92 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1542,6 +1542,7 @@ static struct { { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3 From 1da27a25054f7660f7bb27ad4ccf036ee6ba51e9 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:31 +0200 Subject: tracing: Remove usage of list iterator after the loop body In preparation to limit the scope of the list iterator variable to the traversal loop, use a dedicated pointer to point to the found element [1]. Before, the code implicitly used the head when no element was found when using &pos->list. Since the new variable is only set if an element was found, the head needs to be used explicitly if the variable is NULL. Link: https://lkml.kernel.org/r/20220427170734.819891-2-jakobkoschel@gmail.com Cc: Ingo Molnar Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d89e3f7e26eb..67f47ea27921 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { - struct trace_event *e; + struct trace_event *e = NULL, *iter; int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { @@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list) * We used up all possible max events, * lets see if somebody freed one. */ - list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != next) + list_for_each_entry(iter, &ftrace_event_list, list) { + if (iter->type != next) { + e = iter; break; + } next++; } @@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list) if (next > TRACE_EVENT_TYPE_MAX) return 0; - *list = &e->list; + if (e) + *list = &e->list; + else + *list = &ftrace_event_list; return next; } -- cgit v1.2.3 From 99d8ae4ec8a90ecf6c2a6a141bd4db4a3bc5146c Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:32 +0200 Subject: tracing: Remove usage of list iterator variable after the loop In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to point to the found element [1]. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220427170734.819891-3-jakobkoschel@gmail.com Cc: Ingo Molnar Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e11e167b7809..e4a442060707 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1723,9 +1723,9 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { + struct trace_subsystem_dir *dir = NULL, *iter_dir; + struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; - struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ - struct trace_array *tr; int ret; if (tracing_is_disabled()) @@ -1734,10 +1734,12 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - list_for_each_entry(dir, &tr->systems, list) { - if (dir == inode->i_private) { + list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { + list_for_each_entry(iter_dir, &iter_tr->systems, list) { + if (iter_dir == inode->i_private) { /* Don't open systems with no events */ + tr = iter_tr; + dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; @@ -1753,9 +1755,6 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (!system) return -ENODEV; - /* Some versions of gcc think dir can be uninitialized here */ - WARN_ON(!dir); - /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); -- cgit v1.2.3 From 45e333ce2ad5cbb0ee05686336de09058c6af8ca Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:33 +0200 Subject: tracing: Replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. Link: https://lkml.kernel.org/r/20220427170734.819891-4-jakobkoschel@gmail.com Cc: Ingo Molnar Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 15 +++++++-------- kernel/trace/trace_events_trigger.c | 24 +++++++++++------------- 2 files changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fe10179893c1..038dc591545d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6117,20 +6117,19 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { + struct event_trigger_data *test = NULL, *iter, *named_data = NULL; struct hist_trigger_data *hist_data = data->private_data; - struct event_trigger_data *test, *named_data = NULL; - bool unregistered = false; lockdep_assert_held(&event_mutex); if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, iter, named_data, false)) continue; - unregistered = true; + test = iter; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -6138,11 +6137,11 @@ static void hist_unregister_trigger(char *glob, } } - if (unregistered && test->ops->free) + if (test && test->ops->free) test->ops->free(test); if (hist_data->enable_timestamps) { - if (!hist_data->remove || unregistered) + if (!hist_data->remove || test) tracing_set_filter_buffering(file->tr, false); } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 62c44dab8f46..21592bed2e89 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -609,14 +609,13 @@ static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { - struct event_trigger_data *data; - bool unregistered = false; + struct event_trigger_data *data = NULL, *iter; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { - unregistered = true; + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -624,7 +623,7 @@ static void unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) + if (data && data->ops->free) data->ops->free(data); } @@ -1870,19 +1869,18 @@ void event_enable_unregister_trigger(char *glob, struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; + struct event_trigger_data *data = NULL, *iter; struct enable_trigger_data *enable_data; - struct event_trigger_data *data; - bool unregistered = false; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - enable_data = data->private_data; + list_for_each_entry(iter, &file->triggers, list) { + enable_data = iter->private_data; if (enable_data && - (data->cmd_ops->trigger_type == + (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { - unregistered = true; + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -1890,7 +1888,7 @@ void event_enable_unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) + if (data && data->ops->free) data->ops->free(data); } -- cgit v1.2.3 From ba27d8555867b0e02e15709f4ddb79aec5cf2efc Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:34 +0200 Subject: tracing: Remove check of list iterator against head past the loop body When list_for_each_entry() completes the iteration over the whole list without breaking the loop, the iterator value will be a bogus pointer computed based on the head element. While it is safe to use the pointer to determine if it was computed based on the head element, either with list_entry_is_head() or &pos->member == head, using the iterator variable after the loop should be avoided. In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to point to the found element [1]. Link: https://lkml.kernel.org/r/20220427170734.819891-5-jakobkoschel@gmail.com Cc: Ingo Molnar Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 20 ++++++++++++-------- kernel/trace/trace_eprobe.c | 14 ++++++++------ kernel/trace/trace_events.c | 12 ++++++------ 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..5c465e70d146 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4560,8 +4560,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops, void *data) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; @@ -4580,11 +4580,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) { + if (!probe) { probe = kzalloc(sizeof(*probe), GFP_KERNEL); if (!probe) { mutex_unlock(&ftrace_lock); @@ -4702,9 +4704,9 @@ int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; @@ -4732,11 +4734,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) + if (!probe) goto err_unlock_ftrace; ret = -EINVAL; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b045fa9f276c..7d4478525c66 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -648,7 +648,7 @@ static struct trace_event_functions eprobe_funcs = { static int disable_eprobe(struct trace_eprobe *ep, struct trace_array *tr) { - struct event_trigger_data *trigger; + struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; struct eprobe_data *edata; @@ -656,14 +656,16 @@ static int disable_eprobe(struct trace_eprobe *ep, if (!file) return -ENOENT; - list_for_each_entry(trigger, &file->triggers, list) { - if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + list_for_each_entry(iter, &file->triggers, list) { + if (!(iter->flags & EVENT_TRIGGER_FL_PROBE)) continue; - edata = trigger->private_data; - if (edata->ep == ep) + edata = iter->private_data; + if (edata->ep == ep) { + trigger = iter; break; + } } - if (list_entry_is_head(trigger, &file->triggers, list)) + if (!trigger) return -ENODEV; list_del_rcu(&trigger->list); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e4a442060707..78f313b7b315 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2279,8 +2279,8 @@ static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { + struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ @@ -2294,13 +2294,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } /* Now see if the system itself exists. */ - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + system = NULL; + list_for_each_entry(iter, &event_subsystems, list) { + if (strcmp(iter->name, name) == 0) { + system = iter; break; + } } - /* Reset system variable when not found */ - if (&system->list == &event_subsystems) - system = NULL; dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) -- cgit v1.2.3 From c06d7aaf2951ce7f986a879127995728d63d8577 Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: kernel: pid_namespace: use NULL instead of using plain integer as pointer This fixes the following sparse warnings: kernel/pid_namespace.c:55:77: warning: Using plain integer as NULL pointer Link: https://lkml.kernel.org/r/1647944288-2806-1-git-send-email-baihaowen@meizu.com Signed-off-by: Haowen Bai Signed-off-by: Andrew Morton --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a46a3723bc66..f4f8cb0435b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); -- cgit v1.2.3 From 67fca000e1e173fe2c539a127ccf1bc338d5ff37 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: lib/Kconfig.debug: remove more CONFIG_..._VALUE indirections As in "kernel/panic.c: remove CONFIG_PANIC_ON_OOPS_VALUE indirection", use the IS_ENABLED() helper rather than having a hidden config option. Link: https://lkml.kernel.org/r/20220321121301.1389693-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Masahiro Yamada Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/hung_task.c | 2 +- kernel/watchdog.c | 4 ++-- lib/Kconfig.debug | 21 --------------------- 3 files changed, 3 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f7655..cff3ae8c818f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ unsigned int __read_mostly sysctl_hung_task_panic = - CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9166220457bc..ecb0e8346e65 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 075cd25363ac..8fa08100dbd8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1071,13 +1071,6 @@ config BOOTPARAM_SOFTLOCKUP_PANIC Say N if unsure. -config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE - int - depends on SOFTLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC - default 1 if BOOTPARAM_SOFTLOCKUP_PANIC - config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR @@ -1119,13 +1112,6 @@ config BOOTPARAM_HARDLOCKUP_PANIC Say N if unsure. -config BOOTPARAM_HARDLOCKUP_PANIC_VALUE - int - depends on HARDLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_HARDLOCKUP_PANIC - default 1 if BOOTPARAM_HARDLOCKUP_PANIC - config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL @@ -1173,13 +1159,6 @@ config BOOTPARAM_HUNG_TASK_PANIC Say N if unsure. -config BOOTPARAM_HUNG_TASK_PANIC_VALUE - int - depends on DETECT_HUNG_TASK - range 0 1 - default 0 if !BOOTPARAM_HUNG_TASK_PANIC - default 1 if BOOTPARAM_HUNG_TASK_PANIC - config WQ_WATCHDOG bool "Detect Workqueue Stalls" depends on DEBUG_KERNEL -- cgit v1.2.3 From f26b2afd53e70db67be8252d340b4a1387ec8b55 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: ptrace: remove redudant check of #ifdef PTRACE_SINGLESTEP Patch series "ptrace: do some cleanup". This patch (of 3): PTRACE_SINGLESTEP is always defined as 9 in include/uapi/linux/ptrace.h, remove redudant check of #ifdef PTRACE_SINGLESTEP. Link: https://lkml.kernel.org/r/1649240981-11024-2-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/ptrace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ccc4b465775b..49c29baf9907 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -829,11 +829,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, } #endif -#ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif #ifdef PTRACE_SINGLEBLOCK #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) @@ -1221,9 +1217,7 @@ int ptrace_request(struct task_struct *child, long request, } #endif -#ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: -#endif #ifdef PTRACE_SINGLEBLOCK case PTRACE_SINGLEBLOCK: #endif -- cgit v1.2.3 From 16b0b7adabfb5564a77fa35917afe08decd55b29 Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: kexec: remove redundant assignments Get rid of redundant assignments which end up in values not being read either because they are overwritten or the function ends. Reported by clang-tidy [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220326180948.192154-1-michalorzel.eng@gmail.com Signed-off-by: Michal Orzel Acked-by: Baoquan He Cc: Eric Biederman Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Michal Orzel Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..d08904a27362 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -768,7 +768,6 @@ static struct page *kimage_alloc_page(struct kimage *image, kimage_free_pages(old_page); continue; } - addr = old_addr; page = old_page; break; } @@ -788,7 +787,6 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; - result = 0; if (image->file_mode) kbuf = segment->kbuf; else -- cgit v1.2.3 From 0e0af57e0e91b304f36b7d1dba859e3c04094273 Mon Sep 17 00:00:00 2001 From: "Dr. Thomas Orgis" Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: taskstats: version 12 with thread group and exe info The task exit struct needs some crucial information to be able to provide an enhanced version of process and thread accounting. This change provides: 1. ac_tgid in additon to ac_pid 2. thread group execution walltime in ac_tgetime 3. flag AGROUP in ac_flag to indicate the last task in a thread group / process 4. device ID and inode of task's /proc/self/exe in ac_exe_dev and ac_exe_inode 5. tools/accounting/procacct as demonstrator When a task exits, taskstats are reported to userspace including the task's pid and ppid, but without the id of the thread group this task is part of. Without the tgid, the stats of single tasks cannot be correlated to each other as a thread group (process). The taskstats documentation suggests that on process exit a data set consisting of accumulated stats for the whole group is produced. But such an additional set of stats is only produced for actually multithreaded processes, not groups that had only one thread, and also those stats only contain data about delay accounting and not the more basic information about CPU and memory resource usage. Adding the AGROUP flag to be set when the last task of a group exited enables determination of process end also for single-threaded processes. My applicaton basically does enhanced process accounting with summed cputime, biggest maxrss, tasks per process. The data is not available with the traditional BSD process accounting (which is not designed to be extensible) and the taskstats interface allows more efficient on-the-fly grouping and summing of the stats, anyway, without intermediate disk writes. Furthermore, I do carry statistics on which exact program binary is used how often with associated resources, getting a picture on how important which parts of a collection of installed scientific software in different versions are, and how well they put load on the machine. This is enabled by providing information on /proc/self/exe for each task. I assume the two 64-bit fields for device ID and inode are more appropriate than the possibly large resolved path to keep the data volume down. Add the tgid to the stats to complete task identification, the flag AGROUP to mark the last task of a group, the group wallclock time, and inode-based identification of the associated executable file. Add tools/accounting/procacct.c as a simplified fork of getdelays.c to demonstrate process and thread accounting. [thomas.orgis@uni-hamburg.de: fix version number in comment] Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster Signed-off-by: Dr. Thomas Orgis Reviewed-by: Ismael Luceno Cc: Balbir Singh Cc: Eric W. Biederman Cc: xu xin Cc: Yang Yang Signed-off-by: Andrew Morton --- include/uapi/linux/acct.h | 3 +- include/uapi/linux/taskstats.h | 24 ++- kernel/taskstats.c | 23 +++ kernel/tsacct.c | 10 +- tools/accounting/.gitignore | 1 + tools/accounting/Makefile | 2 +- tools/accounting/procacct.c | 417 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 473 insertions(+), 7 deletions(-) create mode 100644 tools/accounting/procacct.c (limited to 'kernel') diff --git a/include/uapi/linux/acct.h b/include/uapi/linux/acct.h index 985b89068591..0e591152aa8a 100644 --- a/include/uapi/linux/acct.h +++ b/include/uapi/linux/acct.h @@ -103,12 +103,13 @@ struct acct_v3 /* * accounting flags */ - /* bit set when the process ... */ + /* bit set when the process/task ... */ #define AFORK 0x01 /* ... executed fork, but did not exec */ #define ASU 0x02 /* ... used super-user privileges */ #define ACOMPAT 0x04 /* ... used compatibility mode (VAX only not used) */ #define ACORE 0x08 /* ... dumped core */ #define AXSIG 0x10 /* ... was killed by a signal */ +#define AGROUP 0x20 /* ... was the last task of the process (task group) */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) #define ACCT_BYTEORDER 0x80 /* accounting file is big endian */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 12327d32378f..736154171489 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 11 +#define TASKSTATS_VERSION 12 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -48,7 +48,8 @@ struct taskstats { __u32 ac_exitcode; /* Exit status */ /* The accounting flags of a task as defined in - * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG. + * Defined values are AFORK, ASU, ACOMPAT, ACORE, AXSIG, and AGROUP. + * (AGROUP since version 12). */ __u8 ac_flag; /* Record flags */ __u8 ac_nice; /* task_nice */ @@ -173,9 +174,26 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ - /* Delay waiting for memory compact */ + /* v11: Delay waiting for memory compact */ __u64 compact_count; __u64 compact_delay_total; + + /* v12 begin */ + __u32 ac_tgid; /* thread group ID */ + /* Thread group walltime up to now. This is total process walltime if + * AGROUP flag is set. + */ + __u64 ac_tgetime __attribute__((aligned(8))); + /* Lightweight information to identify process binary files. + * This leaves userspace to match this to a file system path, using + * MAJOR() and MINOR() macros to identify a device and mount point, + * the inode to identify the executable file. This is /proc/self/exe + * at the end, so matching the most recent exec(). Values are zero + * for kernel threads. + */ + __u64 ac_exe_dev; /* program binary device ID */ + __u64 ac_exe_inode; /* program binary inode number */ + /* v12 end */ }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bcac5a9043aa..72415e22342b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } +static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + /* No idea if I'm allowed to access that here, now. */ + struct file *exe_file = get_task_exe_file(tsk); + + if (exe_file) { + /* Following cp_new_stat64() in stat.c . */ + stats->ac_exe_dev = + huge_encode_dev(exe_file->f_inode->i_sb->s_dev); + stats->ac_exe_inode = exe_file->f_inode->i_ino; + fput(exe_file); + } else { + stats->ac_exe_dev = 0; + stats->ac_exe_inode = 0; + } +} + static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) @@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); + + /* add executable info */ + exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) @@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); + if (group_dead) + stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1d261fbe367b..4252f0645b9e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; - u64 delta; + u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + now_ns = ktime_get_ns(); + /* store whole group time first */ + delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); + stats->ac_tgetime = delta; + delta = now_ns - tsk->start_time; + do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); @@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); + stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); diff --git a/tools/accounting/.gitignore b/tools/accounting/.gitignore index c45fb4ed4309..522a690aaf3d 100644 --- a/tools/accounting/.gitignore +++ b/tools/accounting/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only getdelays +procacct diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 03687f19cbb1..11def1ad046c 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays +PROGS := getdelays procacct all: $(PROGS) diff --git a/tools/accounting/procacct.c b/tools/accounting/procacct.c new file mode 100644 index 000000000000..8353d3237e50 --- /dev/null +++ b/tools/accounting/procacct.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0 +/* procacct.c + * + * Demonstrator of fetching resource data on task exit, as a way + * to accumulate accurate program resource usage statistics, without + * prior identification of the programs. For that, the fields for + * device and inode of the program executable binary file are also + * extracted in addition to the command string. + * + * The TGID together with the PID and the AGROUP flag allow + * identification of threads in a process and single-threaded processes. + * The ac_tgetime field gives proper whole-process walltime. + * + * Written (changed) by Thomas Orgis, University of Hamburg in 2022 + * + * This is a cheap derivation (inheriting the style) of getdelays.c: + * + * Utility to get per-pid and per-tgid delay accounting statistics + * Also illustrates usage of the taskstats interface + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2005 + * Copyright (C) Balbir Singh, IBM Corp. 2006 + * Copyright (c) Jay Lan, SGI. 2006 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Generic macros for dealing with netlink sockets. Might be duplicated + * elsewhere. It is recommended that commercial grade applications use + * libnl or libnetlink and use the interfaces provided by the library + */ +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define err(code, fmt, arg...) \ + do { \ + fprintf(stderr, fmt, ##arg); \ + exit(code); \ + } while (0) + +int rcvbufsz; +char name[100]; +int dbg; +int print_delays; +int print_io_accounting; +int print_task_context_switch_counts; + +#define PRINTF(fmt, arg...) { \ + if (dbg) { \ + printf(fmt, ##arg); \ + } \ + } + +/* Maximum size of response requested or message sent */ +#define MAX_MSG_SIZE 1024 +/* Maximum number of cpus expected to be specified in a cpumask */ +#define MAX_CPUS 32 + +struct msgtemplate { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; +}; + +char cpumask[100+6*MAX_CPUS]; + +static void usage(void) +{ + fprintf(stderr, "procacct [-v] [-w logfile] [-r bufsize] [-m cpumask]\n"); + fprintf(stderr, " -v: debug on\n"); +} + +/* + * Create a raw netlink socket and bind + */ +static int create_nl_socket(int protocol) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, protocol); + if (fd < 0) + return -1; + + if (rcvbufsz) + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &rcvbufsz, sizeof(rcvbufsz)) < 0) { + fprintf(stderr, "Unable to set socket rcv buf size to %d\n", + rcvbufsz); + goto error; + } + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) + goto error; + + return fd; +error: + close(fd); + return -1; +} + + +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct nlattr *na; + struct sockaddr_nl nladdr; + int r, buflen; + char *buf; + + struct msgtemplate msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + 1 + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + + +/* + * Probe the controller in genetlink to find the family id + * for the TASKSTATS family + */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + + strcpy(name, TASKSTATS_GENL_NAME); + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; /* sendto() failure? */ + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) + return 0; + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + + return id; +} + +#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) + +static void print_procacct(struct taskstats *t) +{ + /* First letter: T is a mere thread, G the last in a group, U unknown. */ + printf( + "%c pid=%lu tgid=%lu uid=%lu wall=%llu gwall=%llu cpu=%llu vmpeak=%llu rsspeak=%llu dev=%lu:%lu inode=%llu comm=%s\n" + , t->version >= 12 ? (t->ac_flag & AGROUP ? 'P' : 'T') : '?' + , (unsigned long)t->ac_pid + , (unsigned long)(t->version >= 12 ? t->ac_tgid : 0) + , (unsigned long)t->ac_uid + , (unsigned long long)t->ac_etime + , (unsigned long long)(t->version >= 12 ? t->ac_tgetime : 0) + , (unsigned long long)(t->ac_utime+t->ac_stime) + , (unsigned long long)t->hiwater_vm + , (unsigned long long)t->hiwater_rss + , (unsigned long)(t->version >= 12 ? MAJOR(t->ac_exe_dev) : 0) + , (unsigned long)(t->version >= 12 ? MINOR(t->ac_exe_dev) : 0) + , (unsigned long long)(t->version >= 12 ? t->ac_exe_inode : 0) + , t->ac_comm + ); +} + +void handle_aggr(int mother, struct nlattr *na, int fd) +{ + int aggr_len = NLA_PAYLOAD(na->nla_len); + int len2 = 0; + pid_t rtid = 0; + + na = (struct nlattr *) NLA_DATA(na); + while (len2 < aggr_len) { + switch (na->nla_type) { + case TASKSTATS_TYPE_PID: + rtid = *(int *) NLA_DATA(na); + PRINTF("PID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_TGID: + rtid = *(int *) NLA_DATA(na); + PRINTF("TGID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_STATS: + if (mother == TASKSTATS_TYPE_AGGR_PID) + print_procacct((struct taskstats *) NLA_DATA(na)); + if (fd) { + if (write(fd, NLA_DATA(na), na->nla_len) < 0) + err(1, "write error\n"); + } + break; + case TASKSTATS_TYPE_NULL: + break; + default: + fprintf(stderr, "Unknown nested nla_type %d\n", + na->nla_type); + break; + } + len2 += NLA_ALIGN(na->nla_len); + na = (struct nlattr *)((char *)na + + NLA_ALIGN(na->nla_len)); + } +} + +int main(int argc, char *argv[]) +{ + int c, rc, rep_len, aggr_len, len2; + int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; + __u16 id; + __u32 mypid; + + struct nlattr *na; + int nl_sd = -1; + int len = 0; + pid_t tid = 0; + + int fd = 0; + int write_file = 0; + int maskset = 0; + char *logfile = NULL; + int containerset = 0; + char *containerpath = NULL; + int cfd = 0; + int forking = 0; + sigset_t sigset; + + struct msgtemplate msg; + + while (!forking) { + c = getopt(argc, argv, "m:vr:"); + if (c < 0) + break; + + switch (c) { + case 'w': + logfile = strdup(optarg); + printf("write to file %s\n", logfile); + write_file = 1; + break; + case 'r': + rcvbufsz = atoi(optarg); + printf("receive buf size %d\n", rcvbufsz); + if (rcvbufsz < 0) + err(1, "Invalid rcv buf size\n"); + break; + case 'm': + strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + maskset = 1; + break; + case 'v': + printf("debug on\n"); + dbg = 1; + break; + default: + usage(); + exit(-1); + } + } + if (!maskset) { + maskset = 1; + strncpy(cpumask, "1", sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + } + printf("cpumask %s maskset %d\n", cpumask, maskset); + + if (write_file) { + fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd == -1) { + perror("Cannot open output file\n"); + exit(1); + } + } + + nl_sd = create_nl_socket(NETLINK_GENERIC); + if (nl_sd < 0) + err(1, "error creating Netlink socket\n"); + + mypid = getpid(); + id = get_family_id(nl_sd); + if (!id) { + fprintf(stderr, "Error getting family id, errno %d\n", errno); + goto err; + } + PRINTF("family id %d\n", id); + + if (maskset) { + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, + &cpumask, strlen(cpumask) + 1); + PRINTF("Sent register cpumask, retval %d\n", rc); + if (rc < 0) { + fprintf(stderr, "error sending register cpumask\n"); + goto err; + } + } + + do { + rep_len = recv(nl_sd, &msg, sizeof(msg), 0); + PRINTF("received %d bytes\n", rep_len); + + if (rep_len < 0) { + fprintf(stderr, "nonfatal reply error: errno %d\n", + errno); + continue; + } + if (msg.n.nlmsg_type == NLMSG_ERROR || + !NLMSG_OK((&msg.n), rep_len)) { + struct nlmsgerr *err = NLMSG_DATA(&msg); + + fprintf(stderr, "fatal reply error, errno %d\n", + err->error); + goto done; + } + + PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n", + sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len); + + + rep_len = GENLMSG_PAYLOAD(&msg.n); + + na = (struct nlattr *) GENLMSG_DATA(&msg); + len = 0; + while (len < rep_len) { + len += NLA_ALIGN(na->nla_len); + int mother = na->nla_type; + + PRINTF("mother=%i\n", mother); + switch (na->nla_type) { + case TASKSTATS_TYPE_AGGR_PID: + case TASKSTATS_TYPE_AGGR_TGID: + /* For nested attributes, na follows */ + handle_aggr(mother, na, fd); + break; + default: + fprintf(stderr, "Unexpected nla_type %d\n", + na->nla_type); + case TASKSTATS_TYPE_NULL: + break; + } + na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); + } + } while (1); +done: + if (maskset) { + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, + &cpumask, strlen(cpumask) + 1); + printf("Sent deregister mask, retval %d\n", rc); + if (rc < 0) + err(rc, "error sending deregister cpumask\n"); + } +err: + close(nl_sd); + if (fd) + close(fd); + if (cfd) + close(cfd); + return 0; +} -- cgit v1.2.3 From edc73c7261ca3ea79867437bb0b9dab0e232436c Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: kernel: make taskstats available from all net namespaces If getdelays runs in a non-init network namespace, it will fail in getting delayacct stats even if it has privilege of root user, which seems to be not very reasonable. We can simply reproduce this by executing commands: unshare -n getdelays -d -p I don't think net namespace should be an obstacle to the normal execution of getdelay function. So let's make it available from all net namespaces. Link: https://lkml.kernel.org/r/20220412071946.2532318-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Cc: Balbir Singh Cc: Yang Yang Cc: "Dr. Thomas Orgis" Cc: Eric W. Biederman Cc: Ismael Luceno Signed-off-by: Andrew Morton --- kernel/taskstats.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 72415e22342b..f7e246336218 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -688,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .netnsok = true, }; /* Needed early in initialization */ -- cgit v1.2.3 From 701850dc0c31bfadf75a0a74af7d2c97859945ec Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 3 May 2022 09:38:44 +0200 Subject: printk, tracing: fix console tracepoint The original intent of the 'console' tracepoint per the commit 95100358491a ("printk/tracing: Add console output tracing") had been to "[...] record any printk messages into the trace, regardless of the current console loglevel. This can help correlate (existing) printk debugging with other tracing." Petr points out [1] that calling trace_console_rcuidle() in call_console_driver() had been the wrong thing for a while, because "printk() always used console_trylock() and the message was flushed to the console only when the trylock succeeded. And it was always deferred in NMI or when printed via printk_deferred()." With the commit 09c5ba0aa2fc ("printk: add kthread console printers"), things only got worse, and calls to call_console_driver() no longer happen with typical printk() calls but always appear deferred [2]. As such, the tracepoint can no longer serve its purpose to clearly correlate printk() calls and other tracing, as well as breaks usecases that expect every printk() call to result in a callback of the console tracepoint. Notably, the KFENCE and KCSAN test suites, which want to capture console output and assume a printk() immediately gives us a callback to the console tracepoint. Fix the console tracepoint by moving it into printk_sprint() [3]. One notable difference is that by moving tracing into printk_sprint(), the 'text' will no longer include the "header" (loglevel and timestamp), but only the raw message. Arguably this is less of a problem now that the console tracepoint happens on the printk() call and isn't delayed. Link: https://lore.kernel.org/all/Ym+WqKStCg%2FEHfh3@alley/ [1] Link: https://lore.kernel.org/all/CA+G9fYu2kS0wR4WqMRsj2rePKV9XLgOU1PiXnMvpT+Z=c2ucHA@mail.gmail.com/ [2] Link: https://lore.kernel.org/all/87fslup9dx.fsf@jogness.linutronix.de/ [3] Reported-by: Linux Kernel Functional Testing Signed-off-by: Marco Elver Cc: John Ogness Cc: Petr Mladek Reviewed-by: Petr Mladek Acked-by: John Ogness Acked-by: Steven Rostedt (Google) Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220503073844.4148944-1-elver@google.com --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f66d6e72a642..a3e1035929b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2064,8 +2064,6 @@ static void call_console_driver(struct console *con, const char *text, size_t le { size_t dropped_len; - trace_console_rcuidle(text, len); - if (con->dropped && dropped_text) { dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, "** %lu printk messages dropped **\n", @@ -2240,6 +2238,8 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } + trace_console_rcuidle(text, text_len); + return text_len; } -- cgit v1.2.3 From 343f4c49f2438d8920f1f76fa823ee59b91f02e4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Apr 2022 11:40:14 -0500 Subject: kthread: Don't allocate kthread_struct for init and umh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If kthread_is_per_cpu runs concurrently with free_kthread_struct the kthread_struct that was just freed may be read from. This bug was introduced by commit 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads"). When kthread_struct started to be allocated for all tasks that have PF_KTHREAD set. This in turn required the kthread_struct to be freed in kernel_execve and violated the assumption that kthread_struct will have the same lifetime as the task. Looking a bit deeper this only applies to callers of kernel_execve which is just the init process and the user mode helper processes. These processes really don't want to be kernel threads but are for historical reasons. Mostly that copy_thread does not know how to take a kernel mode function to the process with for processes without PF_KTHREAD or PF_IO_WORKER set. Solve this by not allocating kthread_struct for the init process and the user mode helper processes. This is done by adding a kthread member to struct kernel_clone_args. Setting kthread in fork_idle and kernel_thread. Adding user_mode_thread that works like kernel_thread except it does not set kthread. In fork only allocating the kthread_struct if .kthread is set. I have looked at kernel/kthread.c and since commit 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads") there have been no assumptions added that to_kthread or __to_kthread will not return NULL. There are a few callers of to_kthread or __to_kthread that assume a non-NULL struct kthread pointer will be returned. These functions are kthread_data(), kthread_parmme(), kthread_exit(), kthread(), kthread_park(), kthread_unpark(), kthread_stop(). All of those functions can reasonably expected to be called when it is know that a task is a kthread so that assumption seems reasonable. Cc: stable@vger.kernel.org Fixes: 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads") Reported-by: Максим Кутявин Link: https://lkml.kernel.org/r/20220506141512.516114-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 6 ++++-- include/linux/sched/task.h | 2 ++ init/main.c | 2 +- kernel/fork.c | 22 ++++++++++++++++++++-- kernel/umh.c | 6 +++--- 5 files changed, 30 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index e3e55d5e0be1..75eb6e0ee7b2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1308,8 +1308,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - if (me->flags & PF_KTHREAD) - free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); @@ -1955,6 +1953,10 @@ int kernel_execve(const char *kernel_filename, int fd = AT_FDCWD; int retval; + if (WARN_ON_ONCE((current->flags & PF_KTHREAD) && + (current->worker_private))) + return -EINVAL; + filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 719c9a6cac8d..4492266935dd 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -32,6 +32,7 @@ struct kernel_clone_args { size_t set_tid_size; int cgroup; int io_thread; + int kthread; struct cgroup *cgrp; struct css_set *cset; }; @@ -89,6 +90,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); diff --git a/init/main.c b/init/main.c index 98182c3c2c4b..39baac0211c6 100644 --- a/init/main.c +++ b/init/main.c @@ -688,7 +688,7 @@ noinline void __ref rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - pid = kernel_thread(kernel_init, NULL, CLONE_FS); + pid = user_mode_thread(kernel_init, NULL, CLONE_FS); /* * Pin init on the boot CPU. Task migration is not properly working * until sched_init_smp() has been run. It will set the allowed diff --git a/kernel/fork.c b/kernel/fork.c index 9796897560ab..27c5203750b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2157,7 +2157,7 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); - if (p->flags & PF_KTHREAD) { + if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } @@ -2548,7 +2548,8 @@ struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { - .flags = CLONE_VM, + .flags = CLONE_VM, + .kthread = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); @@ -2679,6 +2680,23 @@ pid_t kernel_clone(struct kernel_clone_args *args) * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + .kthread = 1, + }; + + return kernel_clone(&args); +} + +/* + * Create a user mode thread. + */ +pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | diff --git a/kernel/umh.c b/kernel/umh.c index 36c123360ab8..b989736e8707 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -132,7 +132,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else @@ -171,8 +171,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work) * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping. */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); + pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); -- cgit v1.2.3 From c5febea0956fd3874e8fb59c6f84d68f128d68f8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Apr 2022 18:07:50 -0500 Subject: fork: Pass struct kernel_clone_args into copy_thread With io_uring we have started supporting tasks that are for most purposes user space tasks that exclusively run code in kernel mode. The kernel task that exec's init and tasks that exec user mode helpers are also user mode tasks that just run kernel code until they call kernel execve. Pass kernel_clone_args into copy_thread so these oddball tasks can be supported more cleanly and easily. v2: Fix spelling of kenrel_clone_args on h8300 Link: https://lkml.kernel.org/r/20220506141512.516114-2-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/process.c | 8 +++++--- arch/arc/kernel/process.c | 8 +++++--- arch/arm/kernel/process.c | 7 +++++-- arch/arm64/kernel/process.c | 7 +++++-- arch/csky/kernel/process.c | 10 +++++----- arch/h8300/kernel/process.c | 5 +++-- arch/hexagon/kernel/process.c | 7 +++++-- arch/ia64/kernel/process.c | 7 +++++-- arch/m68k/kernel/process.c | 7 +++++-- arch/microblaze/kernel/process.c | 7 +++++-- arch/mips/kernel/process.c | 8 +++++--- arch/nios2/kernel/process.c | 7 +++++-- arch/openrisc/kernel/process.c | 7 +++++-- arch/parisc/kernel/process.c | 7 +++++-- arch/powerpc/kernel/process.c | 8 +++++--- arch/riscv/kernel/process.c | 7 +++++-- arch/s390/kernel/process.c | 7 +++++-- arch/sh/kernel/process_32.c | 7 +++++-- arch/sparc/kernel/process_32.c | 7 +++++-- arch/sparc/kernel/process_64.c | 7 +++++-- arch/um/kernel/process.c | 7 +++++-- arch/x86/kernel/process.c | 7 +++++-- arch/xtensa/kernel/process.c | 8 +++++--- include/linux/sched/task.h | 3 +-- kernel/fork.c | 4 ++-- 25 files changed, 116 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 5f8527081da9..732e39217c7f 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -233,10 +233,12 @@ release_thread(struct task_struct *dead_task) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long kthread_arg = args->stack_size; + unsigned long tls = args->tls; extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 5f7f5aab361f..caf948ba647c 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -162,10 +162,12 @@ asmlinkage void ret_from_fork(void); * | user_r25 | * ------------------ <===== END of PAGE */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long kthread_arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *c_regs; /* child's pt_regs */ unsigned long *childksp; /* to unwind out of __switch_to() */ struct callee_regs *c_callee; /* child's callee regs */ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0617af11377f..8e13b426dd26 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -238,9 +238,12 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long stk_sz = args->stack_size; + unsigned long tls = args->tls; struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 7fa97df55e3a..e002f6681c8d 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -316,9 +316,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) asmlinkage void ret_from_fork(void) asm("ret_from_fork"); -int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long stk_sz = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 3d0ca22cd0e2..7dba33d37e1a 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -30,12 +30,12 @@ asmlinkage void ret_from_kernel_thread(void); */ void flush_thread(void){} -int copy_thread(unsigned long clone_flags, - unsigned long usp, - unsigned long kthread_arg, - struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long kthread_arg = args->stack_size; + unsigned long tls = args->tls; struct switch_stack *childstack; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 8833fa4f5d51..752cbd9b0bf6 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -105,9 +105,10 @@ void flush_thread(void) { } -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long topstk, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long usp = args->stack; + unsigned long topstk = args->stack_size; struct pt_regs *childregs; childregs = (struct pt_regs *) (THREAD_SIZE + task_stack_page(p)) - 1; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index eab03c691f53..f1c1f6f21941 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -50,9 +50,12 @@ void arch_cpu_idle(void) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct hexagon_switch_stack *ss; struct pt_regs *childregs; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index d7a256bd9d6b..10d41ded05a5 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -295,9 +295,12 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. */ int -copy_thread(unsigned long clone_flags, unsigned long user_stack_base, - unsigned long user_stack_size, struct task_struct *p, unsigned long tls) +copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long user_stack_base = args->stack; + unsigned long user_stack_size = args->stack_size; + unsigned long tls = args->tls; extern char ia64_ret_from_clone; struct switch_stack *child_stack, *stack; unsigned long rbs, child_rbs, rbs_size; diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index a6030dbaa089..8ac575656fc4 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -138,9 +138,12 @@ asmlinkage int m68k_clone3(struct pt_regs *regs) return sys_clone3((struct clone_args __user *)regs->d1, regs->d2); } -int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct fork_frame { struct switch_stack sw; struct pt_regs regs; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 1b944d319d73..b5f549125c6a 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -52,9 +52,12 @@ void flush_thread(void) { } -int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index c2d5f4bfe1f3..a572d097b16b 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -105,10 +105,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long kthread_arg = args->stack_size; + unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); unsigned long childksp; diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index f8ea522a1588..98c4bfe972e0 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -100,9 +100,12 @@ void flush_thread(void) { } -int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); struct pt_regs *regs; struct switch_stack *stack; diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 3c0c91bcdcba..486e46dd5883 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -152,9 +152,12 @@ extern asmlinkage void ret_from_fork(void); */ int -copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *userregs; struct pt_regs *kregs; unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 28b6a2a5574c..129c17de45ba 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -206,9 +206,12 @@ arch_initcall(parisc_idle_init); * Copy architecture-specific thread state */ int -copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, unsigned long tls) +copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long kthread_arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 984813a4d5dc..3fd67c861d54 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1716,10 +1716,12 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long kthread_arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); extern void ret_from_fork_scv(void); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 504b496787aa..334382731725 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -120,9 +120,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); /* p->thread holds context to be restored by __switch_to() */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 71d86f73b02c..bb5daec39516 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -94,9 +94,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -int copy_thread(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long new_stackp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct fake_frame { struct stack_frame sf; diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ca01286a0610..6023399b1892 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -92,9 +92,12 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); -int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs; diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 88c0c14aaff0..80e6775e18c0 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -259,9 +259,12 @@ clone_stackframe(struct sparc_stackf __user *dst, extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); -int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); char *new_stack; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 9a2ceb080ac9..38c46ca826d9 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -564,9 +564,12 @@ barf: * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct thread_info *t = task_thread_info(p); struct pt_regs *regs = current_pt_regs(); struct sparc_stackf *parent_sf; diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 80504680be08..fd2d2361484d 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -154,9 +154,12 @@ void fork_handler(void) userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct * p, unsigned long tls) +int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; void (*handler)(void); int kthread = current->flags & (PF_KTHREAD | PF_IO_WORKER); int ret = 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..0fce52b10dc4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -130,9 +130,12 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e8bfbca5f001..15ce25073142 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -201,10 +201,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) * involved. Much simpler to just not copy those live frames across. */ -int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, - unsigned long thread_fn_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp_thread_fn = args->stack; + unsigned long thread_fn_arg = args->stack_size; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); #if (XTENSA_HAVE_COPROCESSORS || XTENSA_HAVE_IO_PORTS) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4492266935dd..fcdcba231aac 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -68,8 +68,7 @@ extern void fork_init(void); extern void release_task(struct task_struct * p); -extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *, unsigned long); +extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 27c5203750b4..d39a248a8d8d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1979,7 +1979,7 @@ static __latent_entropy struct task_struct *copy_process( struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; - u64 clone_flags = args->flags; + const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* @@ -2240,7 +2240,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); + retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3 From 36cb0e1cda645ee645b85a6ce652cb46a16e14e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Apr 2022 16:17:28 -0500 Subject: fork: Explicity test for idle tasks in copy_thread The architectures ia64 and parisc have special handling for the idle thread in copy_process. Add a flag named idle to kernel_clone_args and use it to explicity test if an idle process is being created. Fullfill the expectations of the rest of the copy_thread implemetations and pass a function pointer in .stack from fork_idle(). This makes what is happening in copy_thread better defined, and is useful to make idle threads less special. Link: https://lkml.kernel.org/r/20220506141512.516114-3-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- include/linux/sched/task.h | 1 + kernel/fork.c | 9 +++++++++ 4 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 10d41ded05a5..8f010ae818bc 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -342,7 +342,7 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { - if (unlikely(!user_stack_base)) { + if (unlikely(args->idle)) { /* fork_idle() called us */ return 0; } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 129c17de45ba..30a5874ca845 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -224,7 +224,7 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ memset(cregs, 0, sizeof(struct pt_regs)); - if (!usp) /* idle thread */ + if (args->idle) /* idle thread */ return 0; /* Must exit via ret_from_kernel_thread in order * to call schedule_tail() diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index fcdcba231aac..3d6b99ce5408 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -33,6 +33,7 @@ struct kernel_clone_args { int cgroup; int io_thread; int kthread; + int idle; struct cgroup *cgrp; struct css_set *cset; }; diff --git a/kernel/fork.c b/kernel/fork.c index d39a248a8d8d..93d77ee921ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2544,12 +2544,21 @@ static inline void init_idle_pids(struct task_struct *idle) } } +static int idle_dummy(void *dummy) +{ + /* This function is never called */ + return 0; +} + struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, + .stack = (unsigned long)&idle_dummy, + .stack_size = (unsigned long)NULL, .kthread = 1, + .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); -- cgit v1.2.3 From 5bd2e97c868a8a44470950ed01846cab6328e540 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 12 Apr 2022 10:18:48 -0500 Subject: fork: Generalize PF_IO_WORKER handling Add fn and fn_arg members into struct kernel_clone_args and test for them in copy_thread (instead of testing for PF_KTHREAD | PF_IO_WORKER). This allows any task that wants to be a user space task that only runs in kernel mode to use this functionality. The code on x86 is an exception and still retains a PF_KTHREAD test because x86 unlikely everything else handles kthreads slightly differently than user space tasks that start with a function. The functions that created tasks that start with a function have been updated to set ".fn" and ".fn_arg" instead of ".stack" and ".stack_size". These functions are fork_idle(), create_io_thread(), kernel_thread(), and user_mode_thread(). Link: https://lkml.kernel.org/r/20220506141512.516114-4-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/process.c | 7 +++---- arch/arc/kernel/process.c | 7 +++---- arch/arm/kernel/process.c | 7 +++---- arch/arm64/kernel/process.c | 7 +++---- arch/csky/kernel/process.c | 7 +++---- arch/h8300/kernel/process.c | 7 +++---- arch/hexagon/kernel/process.c | 7 +++---- arch/ia64/kernel/process.c | 6 +++--- arch/m68k/kernel/process.c | 7 +++---- arch/microblaze/kernel/process.c | 7 +++---- arch/mips/kernel/process.c | 7 +++---- arch/nios2/kernel/process.c | 7 +++---- arch/openrisc/kernel/process.c | 7 +++---- arch/parisc/kernel/process.c | 11 +++++------ arch/powerpc/kernel/process.c | 9 ++++----- arch/riscv/kernel/process.c | 7 +++---- arch/s390/kernel/process.c | 7 +++---- arch/sh/kernel/process_32.c | 7 +++---- arch/sparc/kernel/process_32.c | 7 +++---- arch/sparc/kernel/process_64.c | 7 +++---- arch/um/kernel/process.c | 10 ++++------ arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/include/asm/switch_to.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/process.c | 13 ++++++------- arch/xtensa/kernel/process.c | 11 +++++------ include/linux/sched/task.h | 2 ++ kernel/fork.c | 16 ++++++++-------- 28 files changed, 95 insertions(+), 116 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 732e39217c7f..6cbba7370b4e 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -237,7 +237,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); @@ -251,13 +250,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childti->pcb.ksp = (unsigned long) childstack; childti->pcb.flags = 1; /* set FEN, clear everything else */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); childstack->r26 = (unsigned long) ret_from_kernel_thread; - childstack->r9 = usp; /* function */ - childstack->r10 = kthread_arg; + childstack->r9 = (unsigned long) args->fn; + childstack->r10 = (unsigned long) args->fn_arg; childregs->hae = alpha_mv.hae_cache; childti->pcb.usp = 0; return 0; diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index caf948ba647c..3369f0700702 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -166,7 +166,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *c_regs; /* child's pt_regs */ unsigned long *childksp; /* to unwind out of __switch_to() */ @@ -193,11 +192,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childksp[0] = 0; /* fp */ childksp[1] = (unsigned long)ret_from_fork; /* blink */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(c_regs, 0, sizeof(struct pt_regs)); - c_callee->r13 = kthread_arg; - c_callee->r14 = usp; /* function */ + c_callee->r13 = (unsigned long)args->fn_arg; + c_callee->r14 = (unsigned long)args->fn; return 0; } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8e13b426dd26..3d9cace63884 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -242,7 +242,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long stack_start = args->stack; - unsigned long stk_sz = args->stack_size; unsigned long tls = args->tls; struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); @@ -259,15 +258,15 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) thread->cpu_domain = get_domain(); #endif - if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->ARM_r0 = 0; if (stack_start) childregs->ARM_sp = stack_start; } else { memset(childregs, 0, sizeof(struct pt_regs)); - thread->cpu_context.r4 = stk_sz; - thread->cpu_context.r5 = stack_start; + thread->cpu_context.r4 = (unsigned long)args->fn_arg; + thread->cpu_context.r5 = (unsigned long)args->fn; childregs->ARM_cpsr = SVC_MODE; } thread->cpu_context.pc = (unsigned long)ret_from_fork; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e002f6681c8d..d0ef05c661b0 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -320,7 +320,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long stack_start = args->stack; - unsigned long stk_sz = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); @@ -337,7 +336,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ptrauth_thread_init_kernel(p); - if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@ -371,8 +370,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 7dba33d37e1a..9af49aea1c3b 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -34,7 +34,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct switch_stack *childstack; struct pt_regs *childregs = task_pt_regs(p); @@ -49,11 +48,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* setup thread.sp for switch_to !!! */ p->thread.sp = (unsigned long)childstack; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; - childstack->r10 = kthread_arg; - childstack->r9 = usp; + childstack->r10 = (unsigned long) args->fn_arg; + childstack->r9 = (unsigned long) args->fn; childregs->sr = mfcr("psr"); } else { *childregs = *(current_pt_regs()); diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 752cbd9b0bf6..9028262c96a9 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -108,16 +108,15 @@ void flush_thread(void) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long usp = args->stack; - unsigned long topstk = args->stack_size; struct pt_regs *childregs; childregs = (struct pt_regs *) (THREAD_SIZE + task_stack_page(p)) - 1; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); childregs->retpc = (unsigned long) ret_from_kernel_thread; - childregs->er4 = topstk; /* arg */ - childregs->er5 = usp; /* fn */ + childregs->er4 = (unsigned long) args->fn_arg; + childregs->er5 = (unsigned long) args->fn; } else { *childregs = *current_pt_regs(); childregs->er0 = 0; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index f1c1f6f21941..f0552f98a7ba 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -54,7 +54,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct hexagon_switch_stack *ss; @@ -76,11 +75,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) sizeof(*ss)); ss->lr = (unsigned long)ret_from_fork; p->thread.switch_sp = ss; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); /* r24 <- fn, r25 <- arg */ - ss->r24 = usp; - ss->r25 = arg; + ss->r24 = (unsigned long)args->fn; + ss->r25 = (unsigned long)args->fn_arg; pt_set_kmode(childregs); return 0; } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 8f010ae818bc..167b1765bea1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -341,14 +341,14 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { if (unlikely(args->idle)) { /* fork_idle() called us */ return 0; } memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack)); - child_stack->r4 = user_stack_base; /* payload */ - child_stack->r5 = user_stack_size; /* argument */ + child_stack->r4 = (unsigned long) args->fn; + child_stack->r5 = (unsigned long) args->fn_arg; /* * Preserve PSR bits, except for bits 32-34 and 37-45, * which we can't read. diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 8ac575656fc4..221feb0269f1 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -142,7 +142,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct fork_frame { struct switch_stack sw; @@ -160,12 +159,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ p->thread.fc = USER_DATA; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(frame, 0, sizeof(struct fork_frame)); frame->regs.sr = PS_S; - frame->sw.a3 = usp; /* function */ - frame->sw.d7 = arg; + frame->sw.a3 = (unsigned long)args->fn; + frame->sw.d7 = (unsigned long)args->fn_arg; frame->sw.retpc = (unsigned long)ret_from_kernel_thread; p->thread.usp = 0; return 0; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index b5f549125c6a..3c6241bcaea8 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -56,19 +56,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* if we're creating a new kernel thread then just zeroing all * the registers. That's OK for a brand new thread.*/ memset(childregs, 0, sizeof(struct pt_regs)); memset(&ti->cpu_context, 0, sizeof(struct cpu_context)); ti->cpu_context.r1 = (unsigned long)childregs; - ti->cpu_context.r20 = (unsigned long)usp; /* fn */ - ti->cpu_context.r19 = (unsigned long)arg; + ti->cpu_context.r20 = (unsigned long)args->fn; + ti->cpu_context.r19 = (unsigned long)args->fn_arg; childregs->pt_mode = 1; local_save_flags(childregs->msr); ti->cpu_context.msr = childregs->msr & ~MSR_IE; diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index a572d097b16b..35b912bce429 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -109,7 +109,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); @@ -122,12 +121,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Put the stack after the struct pt_regs. */ childksp = (unsigned long) childregs; p->thread.cp0_status = (read_c0_status() & ~(ST0_CU2|ST0_CU1)) | ST0_KERNEL_CUMASK; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ unsigned long status = p->thread.cp0_status; memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.reg16 = usp; /* fn */ - p->thread.reg17 = kthread_arg; + p->thread.reg16 = (unsigned long)args->fn; + p->thread.reg17 = (unsigned long)args->fn_arg; p->thread.reg29 = childksp; p->thread.reg31 = (unsigned long) ret_from_kernel_thread; #if defined(CONFIG_CPU_R3000) diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 98c4bfe972e0..29593b98567d 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -104,7 +104,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); struct pt_regs *regs; @@ -112,12 +111,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) struct switch_stack *childstack = ((struct switch_stack *)childregs) - 1; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); - childstack->r16 = usp; /* fn */ - childstack->r17 = arg; + childstack->r16 = (unsigned long) args->fn; + childstack->r17 = (unsigned long) args->fn_arg; childstack->ra = (unsigned long) ret_from_kernel_thread; childregs->estatus = STATUS_PIE; childregs->sp = (unsigned long) childstack; diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 486e46dd5883..d9697cc9bc4d 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -156,7 +156,6 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *userregs; struct pt_regs *kregs; @@ -175,10 +174,10 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *)sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(kregs, 0, sizeof(struct pt_regs)); - kregs->gpr[20] = usp; /* fn, kernel thread */ - kregs->gpr[22] = arg; + kregs->gpr[20] = (unsigned long)args->fn; + kregs->gpr[22] = (unsigned long)args->fn_arg; } else { *userregs = *current_pt_regs(); diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 30a5874ca845..a6a2a558fc5b 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -210,7 +210,6 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); @@ -221,7 +220,7 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) extern void * const ret_from_kernel_thread; extern void * const child_return; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(cregs, 0, sizeof(struct pt_regs)); if (args->idle) /* idle thread */ @@ -236,12 +235,12 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * ret_from_kernel_thread. */ #ifdef CONFIG_64BIT - cregs->gr[27] = ((unsigned long *)usp)[3]; - cregs->gr[26] = ((unsigned long *)usp)[2]; + cregs->gr[27] = ((unsigned long *)args->fn)[3]; + cregs->gr[26] = ((unsigned long *)args->fn)[2]; #else - cregs->gr[26] = usp; + cregs->gr[26] = (unsigned long) args->fn; #endif - cregs->gr[25] = kthread_arg; + cregs->gr[25] = (unsigned long) args->fn_arg; } else { /* user thread */ /* usp must be word aligned. This also prevents users from diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3fd67c861d54..4f367bb68906 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1720,7 +1720,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -1738,18 +1737,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); /* function */ - if (usp) - childregs->gpr[14] = ppc_function_entry((void *)usp); + if (args->fn) + childregs->gpr[14] = ppc_function_entry((void *)args->fn); #ifdef CONFIG_PPC64 clear_tsk_thread_flag(p, TIF_32BIT); childregs->softe = IRQS_ENABLED; #endif - childregs->gpr[15] = kthread_arg; + childregs->gpr[15] = (unsigned long)args->fn_arg; p->thread.regs = NULL; /* no user register state */ ti->flags |= _TIF_RESTOREALL; f = ret_from_kernel_thread; diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 334382731725..24efabdbc551 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -124,12 +124,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); /* p->thread holds context to be restored by __switch_to() */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp_in_global; @@ -137,8 +136,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs->status = SR_PP | SR_PIE; p->thread.ra = (unsigned long)ret_from_kernel_thread; - p->thread.s[0] = usp; /* fn */ - p->thread.s[1] = arg; + p->thread.s[0] = (unsigned long)args->fn; + p->thread.s[1] = (unsigned long)args->fn_arg; } else { *childregs = *(current_pt_regs()); if (usp) /* User fork */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index bb5daec39516..89949b9f3cf8 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -98,7 +98,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long new_stackp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct fake_frame { @@ -133,15 +132,15 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->sf.gprs[9] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = (unsigned long)__ret_from_fork; - frame->childregs.gprs[9] = new_stackp; /* function */ - frame->childregs.gprs[10] = arg; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; frame->childregs.last_break = 1; return 0; diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 6023399b1892..a808843375e7 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -96,7 +96,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs; @@ -117,11 +116,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); p->thread.pc = (unsigned long) ret_from_kernel_thread; - childregs->regs[4] = arg; - childregs->regs[5] = usp; + childregs->regs[4] = (unsigned long) args->fn_arg; + childregs->regs[5] = (unsigned long) args->fn; childregs->sr = SR_MD; #if defined(CONFIG_SH_FPU) childregs->sr |= SR_FD; diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 80e6775e18c0..33b0215a4182 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -263,7 +263,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); @@ -299,13 +298,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ti->ksp = (unsigned long) new_stack; p->thread.kregs = childregs; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { extern int nwindows; unsigned long psr; memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ); ti->kpc = (((unsigned long) ret_from_kernel_thread) - 0x8); - childregs->u_regs[UREG_G1] = sp; /* function */ - childregs->u_regs[UREG_G2] = arg; + childregs->u_regs[UREG_G1] = (unsigned long) args->fn; + childregs->u_regs[UREG_G2] = (unsigned long) args->fn_arg; psr = childregs->psr = get_psr(); ti->kpsr = psr | PSR_PIL; ti->kwim = 1 << (((psr & PSR_CWP) + 1) % nwindows); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 38c46ca826d9..6335b698a4b4 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -568,7 +568,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *t = task_thread_info(p); struct pt_regs *regs = current_pt_regs(); @@ -587,12 +586,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) sizeof(struct sparc_stackf)); t->fpsaved[0] = 0; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(child_trap_frame, 0, child_stack_sz); __thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] = (current_pt_regs()->tstate + 1) & TSTATE_CWP; - t->kregs->u_regs[UREG_G1] = sp; /* function */ - t->kregs->u_regs[UREG_G2] = arg; + t->kregs->u_regs[UREG_G1] = (unsigned long) args->fn; + t->kregs->u_regs[UREG_G2] = (unsigned long) args->fn_arg; return 0; } diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index fd2d2361484d..181cc9aafb25 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -158,15 +158,13 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; void (*handler)(void); - int kthread = current->flags & (PF_KTHREAD | PF_IO_WORKER); int ret = 0; p->thread = (struct thread_struct) INIT_THREAD; - if (!kthread) { + if (!args->fn) { memcpy(&p->thread.regs.regs, current_pt_regs(), sizeof(p->thread.regs.regs)); PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); @@ -178,14 +176,14 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) arch_copy_thread(¤t->thread.arch, &p->thread.arch); } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); - p->thread.request.u.thread.proc = (int (*)(void *))sp; - p->thread.request.u.thread.arg = (void *)arg; + p->thread.request.u.thread.proc = args->fn; + p->thread.request.u.thread.arg = args->fn_arg; handler = new_thread_handler; } new_thread(task_stack_page(p), &p->thread.switch_buf, handler); - if (!kthread) { + if (!args->fn) { clear_flushed_tls(p); /* diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 99a8820e8cc4..b2486b2cbc6e 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index b5f0d2ff47e4..c08eb0fdd11f 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -78,13 +78,13 @@ static inline void update_task_stack(struct task_struct *task) } static inline void kthread_frame_init(struct inactive_task_frame *frame, - unsigned long fun, unsigned long arg) + int (*fun)(void *), void *arg) { - frame->bx = fun; + frame->bx = (unsigned long)fun; #ifdef CONFIG_X86_32 - frame->di = arg; + frame->di = (unsigned long)arg; #else - frame->r12 = arg; + frame->r12 = (unsigned long)arg; #endif } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..fbade5a3975b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -556,7 +556,7 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -579,7 +579,7 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) * No FPU state inheritance for kernel threads and IO * worker threads. */ - if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0fce52b10dc4..d20eaad52a85 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -134,7 +134,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; @@ -172,13 +171,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags); + fpu_clone(p, clone_flags, args->fn); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@ -198,10 +197,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) task_user_gs(p) = get_user_gs(current_pt_regs()); #endif - if (unlikely(p->flags & PF_IO_WORKER)) { + if (unlikely(args->fn)) { /* - * An IO thread is a user space thread, but it doesn't - * return to ret_after_fork(). + * A user space thread, but it doesn't return to + * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. @@ -211,7 +210,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ childregs->sp = 0; childregs->ip = 0; - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 15ce25073142..c3751cc88e5d 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -205,7 +205,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp_thread_fn = args->stack; - unsigned long thread_fn_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); @@ -226,7 +225,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) #error Unsupported Xtensa ABI #endif - if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (!args->fn) { struct pt_regs *regs = current_pt_regs(); unsigned long usp = usp_thread_fn ? usp_thread_fn : regs->areg[1]; @@ -278,15 +277,15 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * Window underflow will load registers from the * spill slots on the stack on return from _switch_to. */ - SPILL_SLOT(childregs, 2) = usp_thread_fn; - SPILL_SLOT(childregs, 3) = thread_fn_arg; + SPILL_SLOT(childregs, 2) = (unsigned long)args->fn; + SPILL_SLOT(childregs, 3) = (unsigned long)args->fn_arg; #elif defined(__XTENSA_CALL0_ABI__) /* * a12 = thread_fn, a13 = thread_fn arg. * _switch_to epilogue will load registers from the stack. */ - ((unsigned long *)p->thread.sp)[0] = usp_thread_fn; - ((unsigned long *)p->thread.sp)[1] = thread_fn_arg; + ((unsigned long *)p->thread.sp)[0] = (unsigned long)args->fn; + ((unsigned long *)p->thread.sp)[1] = (unsigned long)args->fn_arg; #else #error Unsupported Xtensa ABI #endif diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 3d6b99ce5408..505aaf9fe477 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -34,6 +34,8 @@ struct kernel_clone_args { int io_thread; int kthread; int idle; + int (*fn)(void *); + void *fn_arg; struct cgroup *cgrp; struct css_set *cset; }; diff --git a/kernel/fork.c b/kernel/fork.c index 93d77ee921ff..8e17c3fbce42 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2555,8 +2555,8 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, - .stack = (unsigned long)&idle_dummy, - .stack_size = (unsigned long)NULL, + .fn = &idle_dummy, + .fn_arg = NULL, .kthread = 1, .idle = 1, }; @@ -2589,8 +2589,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .io_thread = 1, }; @@ -2694,8 +2694,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .kthread = 1, }; @@ -2711,8 +2711,8 @@ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, }; return kernel_clone(&args); -- cgit v1.2.3 From 753550eb0ce1fea4b5cbd989f2e06ef80b2feb28 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Apr 2022 14:13:56 -0500 Subject: fork: Explicitly set PF_KTHREAD Instead of implicitly inheriting PF_KTHREAD from the parent process examine arguments in kernel_clone_args to see if PF_KTHREAD should be set. This makes knowledge of which new threads are kernel threads explicit. This also makes it so that init and the user mode helper processes no longer have PF_KTHREAD set. Link: https://lkml.kernel.org/r/20220506141512.516114-6-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8e17c3fbce42..35645f57bd2f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2068,6 +2068,9 @@ static __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; + p->flags &= ~PF_KTHREAD; + if (args->kthread) + p->flags |= PF_KTHREAD; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't -- cgit v1.2.3 From 1e8ca62b79dec20aeded4fe283f4890e5016767a Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Wed, 13 Apr 2022 16:36:49 -0700 Subject: kheaders: Have cpio unconditionally replace files For out-of-tree builds, this script invokes cpio twice to copy header files from the srctree and subsequently from the objtree. According to a comment in the script, there might be situations in which certain files already exist in the destination directory when header files are copied from the objtree: "The second CPIO can complain if files already exist which can happen with out of tree builds having stale headers in srctree. Just silence CPIO for now." GNU cpio might simply print a warning like "newer or same age version exists", but toybox cpio exits with a non-zero exit code unless the command line option "-u" is specified. To improve compatibility with toybox cpio, add the command line option "-u" to unconditionally replace existing files in the destination directory. Signed-off-by: Daniel Mentz Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 1966a749e0d9..0c78e64f747d 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -74,7 +74,7 @@ fi # of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 +done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1 # Remove comments except SDPX lines find $cpio_dir -type f -print0 | -- cgit v1.2.3 From 764aaf44cd64dd1f760268ee0b22d2dc53cd5bc0 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 27 Apr 2022 20:54:01 +0800 Subject: reboot: Fix build warning without CONFIG_SYSCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SYSCTL is n, build warn: kernel/reboot.c:443:20: error: ‘kernel_reboot_sysctls_init’ defined but not used [-Werror=unused-function] static void __init kernel_reboot_sysctls_init(void) ^~~~~~~~~~~~~~~~~~~~~~~~~~ Move kernel_reboot_sysctls_init() to #ifdef block to fix this. Fixes: 06d177662fb8 ("kernel/reboot: move reboot sysctls to its own file") Signed-off-by: YueHaibing Signed-off-by: Luis Chamberlain --- kernel/reboot.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index ed4e6dfb7d44..ecbf09ea03c5 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -421,33 +421,6 @@ void ctrl_alt_del(void) static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; static const char reboot_cmd[] = "/sbin/reboot"; -#ifdef CONFIG_SYSCTL -static struct ctl_table kern_reboot_table[] = { - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -static void __init kernel_reboot_sysctls_init(void) -{ - register_sysctl_init("kernel", kern_reboot_table); -} -#else -#define kernel_reboot_sysctls_init() do { } while (0) -#endif /* CONFIG_SYSCTL */ - static int run_cmd(const char *cmd) { char **argv; @@ -895,6 +868,33 @@ static struct attribute *reboot_attrs[] = { NULL, }; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_reboot_table[] = { + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static void __init kernel_reboot_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_reboot_table); +} +#else +#define kernel_reboot_sysctls_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static const struct attribute_group reboot_attr_group = { .attrs = reboot_attrs, }; -- cgit v1.2.3 From 494dcdf46e5cdee926c9f441d37e3ea1db57d1da Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 27 Apr 2022 21:10:02 +0800 Subject: sched: Fix build warning without CONFIG_SYSCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IF CONFIG_SYSCTL is n, build warn: kernel/sched/core.c:1782:12: warning: ‘sysctl_sched_uclamp_handler’ defined but not used [-Wunused-function] static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, ^~~~~~~~~~~~~~~~~~~~~~~~~~~ sysctl_sched_uclamp_handler() is used while CONFIG_SYSCTL enabled, wrap all related code with CONFIG_SYSCTL to fix this. Fixes: 3267e0156c33 ("sched: Move uclamp_util sysctls to core.c") Signed-off-by: YueHaibing Signed-off-by: Luis Chamberlain --- kernel/sched/core.c | 65 +++++++++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ef31751c5799..b8d2ff09b853 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1306,10 +1306,10 @@ static void set_load_weight(struct task_struct *p, bool update_load) static DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -static unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -static unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* * By default RT tasks run at the maximum performance point/capacity of the @@ -1456,33 +1456,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p) task_rq_unlock(rq, p, &rf); } -static void uclamp_sync_util_min_rt_default(void) -{ - struct task_struct *g, *p; - - /* - * copy_process() sysctl_uclamp - * uclamp_min_rt = X; - * write_lock(&tasklist_lock) read_lock(&tasklist_lock) - * // link thread smp_mb__after_spinlock() - * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); - * sched_post_fork() for_each_process_thread() - * __uclamp_sync_rt() __uclamp_sync_rt() - * - * Ensures that either sched_post_fork() will observe the new - * uclamp_min_rt or for_each_process_thread() will observe the new - * task. - */ - read_lock(&tasklist_lock); - smp_mb__after_spinlock(); - read_unlock(&tasklist_lock); - - rcu_read_lock(); - for_each_process_thread(g, p) - uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); -} - static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -1762,6 +1735,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css) } static void cpu_util_update_eff(struct cgroup_subsys_state *css); +#endif + +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_UCLAMP_TASK +#ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) { struct task_group *tg = &root_task_group; @@ -1779,6 +1757,33 @@ static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { } #endif +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -1843,6 +1848,8 @@ done: return result; } +#endif +#endif static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) -- cgit v1.2.3 From 43bf087848ab796fab93c9b4de59a7ed70aab94a Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 5 May 2022 15:01:14 +0800 Subject: bpf: Remove unused parameter from find_kfunc_desc_btf() The func_id parameter in find_kfunc_desc_btf() is not used, get rid of it. Fixes: 2357672c54c3 ("bpf: Introduce BPF support for kernel module function calls") Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20220505070114.3522522-1-ytcoode@gmail.com --- kernel/bpf/verifier.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 813f6ee80419..c27fee73a2cb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1815,8 +1815,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) kfree(tab); } -static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset) +static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { if (offset) { if (offset < 0) { @@ -1891,7 +1890,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset); + desc_btf = find_kfunc_desc_btf(env, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -2360,7 +2359,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); + desc_btf = find_kfunc_desc_btf(data, insn->off); if (IS_ERR(desc_btf)) return ""; @@ -7237,7 +7236,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); + desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); -- cgit v1.2.3 From 40f2bbf71161fa9195c7869004290003af152375 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: drop "compound" parameter from page_add_new_anon_rmap() New anonymous pages are always mapped natively: only THP/khugepaged code maps a new compound anonymous page and passes "true". Otherwise, we're just dealing with simple, non-compound pages. Let's give the interface clearer semantics and document these. Remove the PageTransCompound() sanity check from page_add_new_anon_rmap(). Link: https://lkml.kernel.org/r/20220428083441.37290-9-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 ++- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 2 +- mm/rmap.c | 11 ++++++----- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- 9 files changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 90e1e6925789..e4156921eea9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -185,11 +185,12 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, bool compound); + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); + void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6418083901d4..4ef5385815d3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); + page_add_new_anon_rmap(new_page, vma, addr); lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6232b6817fab..c0365280b481 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -647,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr, true); + page_add_new_anon_rmap(page, vma, haddr); lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ac53ad2c9bb1..a2560f970881 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1183,7 +1183,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); + page_add_new_anon_rmap(new_page, vma, address); lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); diff --git a/mm/memory.c b/mm/memory.c index 8e92010f3d89..3dedb575baef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -893,7 +893,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, dst_vma, addr, false); + page_add_new_anon_rmap(new_page, dst_vma, addr); lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; @@ -3058,7 +3058,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address, false); + page_add_new_anon_rmap(new_page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -3702,7 +3702,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); @@ -3852,7 +3852,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -4039,7 +4039,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 70c7dc05bbfc..fb6d7d5499f5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -610,7 +610,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); if (!is_zone_device_page(page)) lru_cache_add_inactive_or_unevictable(page, vma); get_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index 32630f1b1ee1..90f92c53476f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1226,19 +1226,22 @@ void page_add_anon_rmap(struct page *page, } /** - * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * page_add_new_anon_rmap - add mapping to a new anonymous page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped - * @compound: charge the page as compound or small page + * + * If it's a compound page, it is accounted as a compound page. As the page + * is new, it's assume to get mapped exclusively by a single process. * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. */ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, bool compound) + struct vm_area_struct *vma, unsigned long address) { + const bool compound = PageCompound(page); int nr = compound ? thp_nr_pages(page) : 1; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); @@ -1251,8 +1254,6 @@ void page_add_new_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { - /* Anon THP always mapped first with PMD */ - VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1ba525a2179d..0ad7ed7ded21 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1802,7 +1802,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (page == swapcache) { page_add_anon_rmap(page, vma, addr, RMAP_NONE); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } set_pte_at(vma->vm_mm, addr, pte, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e9bb6db002aa..dae25d985d15 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -104,7 +104,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, lru_cache_add(page); page_add_file_rmap(page, dst_vma, false); } else { - page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr); lru_cache_add_inactive_or_unevictable(page, dst_vma); } -- cgit v1.2.3 From 9263dddc7b6f816fdd327eee435cc54ba51dd095 Mon Sep 17 00:00:00 2001 From: Takshak Chahande Date: Tue, 10 May 2022 01:22:20 -0700 Subject: bpf: Extend batch operations for map-in-map bpf-maps This patch extends batch operations support for map-in-map map-types: BPF_MAP_TYPE_HASH_OF_MAPS and BPF_MAP_TYPE_ARRAY_OF_MAPS A usecase where outer HASH map holds hundred of VIP entries and its associated reuse-ports per VIP stored in REUSEPORT_SOCKARRAY type inner map, needs to do batch operation for performance gain. This patch leverages the exiting generic functions for most of the batch operations. As map-in-map's value contains the actual reference of the inner map, for BPF_MAP_TYPE_HASH_OF_MAPS type, it needed an extra step to fetch the map_id from the reference value. selftests are added in next patch 2/2. Signed-off-by: Takshak Chahande Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220510082221.2390540-1-ctakshak@fb.com --- kernel/bpf/arraymap.c | 2 ++ kernel/bpf/hashtab.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b3bf31fd9458..724613da6576 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1345,6 +1345,8 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, .map_btf_id = &array_map_btf_ids[0], }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3e00e62b2218..705841279d16 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -140,7 +140,7 @@ static inline bool htab_use_raw_lock(const struct bpf_htab *htab) static void htab_init_buckets(struct bpf_htab *htab) { - unsigned i; + unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); @@ -1627,7 +1627,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); - u32 batch, max_count, size, bucket_size; + u32 batch, max_count, size, bucket_size, map_id; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1752,6 +1752,14 @@ again_nocopy: } } else { value = l->key + roundup_key_size; + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + struct bpf_map **inner_map = value; + + /* Actual value is the id of the inner map */ + map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); + value = &map_id; + } + if (elem_map_flags & BPF_F_LOCK) copy_map_value_locked(map, dst_val, value, true); @@ -2450,5 +2458,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; -- cgit v1.2.3 From 9f88361273082825d9f0d13a543d49f9fa0d44a8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 10 May 2022 17:52:30 +0200 Subject: bpf: Add bpf_link iterator Implement bpf_link iterator to traverse links via bpf_seq_file operations. The changeset is mostly shamelessly copied from commit a228a64fc1e4 ("bpf: Add bpf_prog iterator") Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220510155233.9815-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/link_iter.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 +++++++++ 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/link_iter.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be94833d390a..551b7198ae8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1544,6 +1544,7 @@ void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); +struct bpf_link *bpf_link_get_curr_or_next(u32 *id); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c1a9be6a4b9f..057ba8e01e70 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c new file mode 100644 index 000000000000..fec8005a121c --- /dev/null +++ b/kernel/bpf/link_iter.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Red Hat, Inc. */ +#include +#include +#include +#include +#include + +struct bpf_iter_seq_link_info { + u32 link_id; +}; + +static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + struct bpf_link *link; + + link = bpf_link_get_curr_or_next(&info->link_id); + if (!link) + return NULL; + + if (*pos == 0) + ++*pos; + return link; +} + +static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + + ++*pos; + ++info->link_id; + bpf_link_put((struct bpf_link *)v); + return bpf_link_get_curr_or_next(&info->link_id); +} + +struct bpf_iter__bpf_link { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_link *, link); +}; + +DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link) + +static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_link ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.link = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_link_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_link_seq_show(seq, v, false); +} + +static void bpf_link_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_link_seq_show(seq, v, true); + else + bpf_link_put((struct bpf_link *)v); +} + +static const struct seq_operations bpf_link_seq_ops = { + .start = bpf_link_seq_start, + .next = bpf_link_seq_next, + .stop = bpf_link_seq_stop, + .show = bpf_link_seq_show, +}; + +BTF_ID_LIST(btf_bpf_link_id) +BTF_ID(struct, bpf_link) + +static const struct bpf_iter_seq_info bpf_link_seq_info = { + .seq_ops = &bpf_link_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_link_info), +}; + +static struct bpf_iter_reg bpf_link_reg_info = { + .target = "bpf_link", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_link, link), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_link_seq_info, +}; + +static int __init bpf_link_iter_init(void) +{ + bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id; + return bpf_iter_reg_target(&bpf_link_reg_info); +} + +late_initcall(bpf_link_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e0aead17dff4..50164d324eaf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4680,6 +4680,25 @@ struct bpf_link *bpf_link_by_id(u32 id) return link; } +struct bpf_link *bpf_link_get_curr_or_next(u32 *id) +{ + struct bpf_link *link; + + spin_lock_bh(&link_idr_lock); +again: + link = idr_get_next(&link_idr, id); + if (link) { + link = bpf_link_inc_not_zero(link); + if (IS_ERR(link)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&link_idr_lock); + + return link; +} + #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) -- cgit v1.2.3 From d721def7392a7348ffb9f3583b264239cbd3702c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:12 +0200 Subject: kallsyms: Make kallsyms_on_each_symbol generally available Making kallsyms_on_each_symbol generally available, so it can be used outside CONFIG_LIVEPATCH option in following changes. Rather than adding another ifdef option let's make the function generally available (when CONFIG_KALLSYMS option is defined). Cc: Christoph Hellwig Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kallsyms.h | 7 ++++++- kernel/kallsyms.c | 2 -- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ce1bd2fbf23e..ad39636e0c3f 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -65,11 +65,11 @@ static inline void *dereference_symbol_descriptor(void *ptr) return ptr; } +#ifdef CONFIG_KALLSYMS int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -#ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); @@ -163,6 +163,11 @@ static inline bool kallsyms_show_value(const struct cred *cred) return false; } +static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), void *data) +{ + return -EOPNOTSUPP; +} #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 79f2eb617a62..fdfd308bebc4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -228,7 +228,6 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } -#ifdef CONFIG_LIVEPATCH /* * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. @@ -251,7 +250,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, } return 0; } -#endif /* CONFIG_LIVEPATCH */ static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, -- cgit v1.2.3 From bed0d9a50dacee6fcf785c555cfb0d2573355afc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:13 +0200 Subject: ftrace: Add ftrace_lookup_symbols function Adding ftrace_lookup_symbols function that resolves array of symbols with single pass over kallsyms. The user provides array of string pointers with count and pointer to allocated array for resolved values. int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) It iterates all kallsyms symbols and tries to loop up each in provided symbols array with bsearch. The symbols array needs to be sorted by name for this reason. We also check each symbol to pass ftrace_location, because this API will be used for fprobe symbols resolving. Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 6 +++++ kernel/kallsyms.c | 1 + kernel/trace/ftrace.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..820500430eae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -303,6 +303,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct ftrace_regs *fregs); + +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs); #else /* !CONFIG_FUNCTION_TRACER */ /* * (un)register_ftrace_function must be a macro since the ops parameter @@ -313,6 +315,10 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } +static inline int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FUNCTION_TRACER */ struct ftrace_func_entry { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fdfd308bebc4..fbdf8d3279ac 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * These will be re-linked against their real values diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..07d87c7a525d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7964,3 +7964,65 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct kallsyms_data { + unsigned long *addrs; + const char **syms; + size_t cnt; + size_t found; +}; + +static int kallsyms_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct kallsyms_data *args = data; + + if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp)) + return 0; + + addr = ftrace_location(addr); + if (!addr) + return 0; + + args->addrs[args->found++] = addr; + return args->found == args->cnt ? 1 : 0; +} + +/** + * ftrace_lookup_symbols - Lookup addresses for array of symbols + * + * @sorted_syms: array of symbols pointers symbols to resolve, + * must be alphabetically sorted + * @cnt: number of symbols/addresses in @syms/@addrs arrays + * @addrs: array for storing resulting addresses + * + * This function looks up addresses for array of symbols provided in + * @syms array (must be alphabetically sorted) and stores them in + * @addrs array, which needs to be big enough to store at least @cnt + * addresses. + * + * This function returns 0 if all provided symbols are found, + * -ESRCH otherwise. + */ +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + struct kallsyms_data args; + int err; + + args.addrs = addrs; + args.syms = sorted_syms; + args.cnt = cnt; + args.found = 0; + err = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (err < 0) + return err; + return args.found == args.cnt ? 0 : -ESRCH; +} -- cgit v1.2.3 From 8be9253344a1d8cd91b22655e55de41e3288aaf9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:14 +0200 Subject: fprobe: Resolve symbols with ftrace_lookup_symbols Using ftrace_lookup_symbols to speed up symbols lookup in register_fprobe_syms API. Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/fprobe.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 89d9f994ebb0..aac63ca9c3d1 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -85,39 +85,31 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, } NOKPROBE_SYMBOL(fprobe_exit_handler); +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + /* Convert ftrace location address from symbols */ static unsigned long *get_ftrace_locations(const char **syms, int num) { - unsigned long addr, size; unsigned long *addrs; - int i; /* Convert symbols to symbol address */ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) return ERR_PTR(-ENOMEM); - for (i = 0; i < num; i++) { - addr = kallsyms_lookup_name(syms[i]); - if (!addr) /* Maybe wrong symbol */ - goto error; - - /* Convert symbol address to ftrace location. */ - if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size) - goto error; + /* ftrace_lookup_symbols expects sorted symbols */ + sort(syms, num, sizeof(*syms), symbols_cmp, NULL); - addr = ftrace_location_range(addr, addr + size - 1); - if (!addr) /* No dynamic ftrace there. */ - goto error; + if (!ftrace_lookup_symbols(syms, num, addrs)) + return addrs; - addrs[i] = addr; - } - - return addrs; - -error: kfree(addrs); - return ERR_PTR(-ENOENT); } -- cgit v1.2.3 From 0236fec57a15dc2a068dfe4488e0c2ab4559b1ec Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:15 +0200 Subject: bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link Using kallsyms_lookup_names function to speed up symbols lookup in kprobe multi link attachment and replacing with it the current kprobe_multi_resolve_syms function. This speeds up bpftrace kprobe attachment: # perf stat -r 5 -e cycles ./src/bpftrace -e 'kprobe:x* { } i:ms:1 { exit(); }' ... 6.5681 +- 0.0225 seconds time elapsed ( +- 0.34% ) After: # perf stat -r 5 -e cycles ./src/bpftrace -e 'kprobe:x* { } i:ms:1 { exit(); }' ... 0.5661 +- 0.0275 seconds time elapsed ( +- 4.85% ) Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 112 ++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f15b826f9899..7fd11c17558d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2229,6 +2229,59 @@ struct bpf_kprobe_multi_run_ctx { unsigned long entry_ip; }; +struct user_syms { + const char **syms; + char *buf; +}; + +static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) +{ + unsigned long __user usymbol; + const char **syms = NULL; + char *buf = NULL, *p; + int err = -ENOMEM; + unsigned int i; + + syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL); + if (!syms) + goto error; + + buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL); + if (!buf) + goto error; + + for (p = buf, i = 0; i < cnt; i++) { + if (__get_user(usymbol, usyms + i)) { + err = -EFAULT; + goto error; + } + err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + syms[i] = p; + p += err + 1; + } + + us->syms = syms; + us->buf = buf; + return 0; + +error: + if (err) { + kvfree(syms); + kvfree(buf); + } + return err; +} + +static void free_user_syms(struct user_syms *us) +{ + kvfree(us->syms); + kvfree(us->buf); +} + static void bpf_kprobe_multi_link_release(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; @@ -2349,53 +2402,12 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, kprobe_multi_link_prog_run(link, entry_ip, regs); } -static int -kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt, - unsigned long *addrs) +static int symbols_cmp(const void *a, const void *b) { - unsigned long addr, size; - const char __user **syms; - int err = -ENOMEM; - unsigned int i; - char *func; - - size = cnt * sizeof(*syms); - syms = kvzalloc(size, GFP_KERNEL); - if (!syms) - return -ENOMEM; + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; - func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - goto error; - - if (copy_from_user(syms, usyms, size)) { - err = -EFAULT; - goto error; - } - - for (i = 0; i < cnt; i++) { - err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); - if (err == KSYM_NAME_LEN) - err = -E2BIG; - if (err < 0) - goto error; - err = -EINVAL; - addr = kallsyms_lookup_name(func); - if (!addr) - goto error; - if (!kallsyms_lookup_size_offset(addr, &size, NULL)) - goto error; - addr = ftrace_location_range(addr, addr + size - 1); - if (!addr) - goto error; - addrs[i] = addr; - } - - err = 0; -error: - kvfree(syms); - kfree(func); - return err; + return strcmp(*str_a, *str_b); } int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -2441,7 +2453,15 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } } else { - err = kprobe_multi_resolve_syms(usyms, cnt, addrs); + struct user_syms us; + + err = copy_user_syms(&us, usyms, cnt); + if (err) + goto error; + + sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL); + err = ftrace_lookup_symbols(us.syms, cnt, addrs); + free_user_syms(&us); if (err) goto error; } -- cgit v1.2.3 From f7e0beaf39d3868dc700d4954b26cf8443c5d423 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:19 -0700 Subject: bpf, x86: Generate trampolines from bpf_tramp_links Replace struct bpf_tramp_progs with struct bpf_tramp_links to collect struct bpf_tramp_link(s) for a trampoline. struct bpf_tramp_link extends bpf_link to act as a linked list node. arch_prepare_bpf_trampoline() accepts a struct bpf_tramp_links to collects all bpf_tramp_link(s) that a trampoline should call. Change BPF trampoline and bpf_struct_ops to pass bpf_tramp_links instead of bpf_tramp_progs. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-2-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 36 +++++++++++---------- include/linux/bpf.h | 36 ++++++++++++++------- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 71 +++++++++++++++++++++++++++------------- kernel/bpf/syscall.c | 23 +++++-------- kernel/bpf/trampoline.c | 73 ++++++++++++++++++++++++------------------ net/bpf/bpf_dummy_struct_ops.c | 24 +++++++++++--- tools/bpf/bpftool/link.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 10 files changed, 164 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 16b6efacf7c6..38eb43159230 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1762,10 +1762,12 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool save_ret) + struct bpf_tramp_link *l, int stack_size, + bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; + struct bpf_prog *p = l->link.prog; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1850,14 +1852,14 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, + struct bpf_tramp_links *tl, int stack_size, bool save_ret) { int i; u8 *prog = *pprog; - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, save_ret)) return -EINVAL; } @@ -1866,7 +1868,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, } static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, + struct bpf_tramp_links *tl, int stack_size, u8 **branches) { u8 *prog = *pprog; @@ -1877,8 +1879,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, */ emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, true)) return -EINVAL; /* mod_ret prog stored return value into [rbp - 8]. Emit: @@ -1980,14 +1982,14 @@ static bool is_valid_bpf_tramp_flags(unsigned int flags) */ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { int ret, i, nr_args = m->nr_args; int regs_off, ip_off, args_off, stack_size = nr_args * 8; - struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; - struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; - struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; bool save_ret; @@ -2078,13 +2080,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } } - if (fentry->nr_progs) + if (fentry->nr_links) if (invoke_bpf(m, &prog, fentry, regs_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; - if (fmod_ret->nr_progs) { - branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + if (fmod_ret->nr_links) { + branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -2111,7 +2113,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog += X86_PATCH_SIZE; } - if (fmod_ret->nr_progs) { + if (fmod_ret->nr_links) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte @@ -2121,12 +2123,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. */ - for (i = 0; i < fmod_ret->nr_progs; i++) + for (i = 0; i < fmod_ret->nr_links; i++) emit_cond_near_jump(&branches[i], prog, branches[i], X86_JNE); } - if (fexit->nr_progs) + if (fexit->nr_links) if (invoke_bpf(m, &prog, fexit, regs_off, false)) { ret = -EINVAL; goto cleanup; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 551b7198ae8a..75e0110a65e1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -723,11 +723,11 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 38 +#define BPF_MAX_TRAMP_LINKS 38 -struct bpf_tramp_progs { - struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; - int nr_progs; +struct bpf_tramp_links { + struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; + int nr_links; }; /* Different use cases for BPF trampoline: @@ -753,7 +753,7 @@ struct bpf_tramp_progs { struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); @@ -852,9 +852,10 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( { return bpf_func(ctx, insnsi); } + #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -905,12 +906,12 @@ int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; @@ -1009,7 +1010,6 @@ struct bpf_prog_aux { bool tail_call_reachable; bool xdp_has_frags; bool use_bpf_prog_pack; - struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -1096,6 +1096,18 @@ struct bpf_link_ops { struct bpf_link_info *info); }; +struct bpf_tramp_link { + struct bpf_link link; + struct hlist_node tramp_hlist; +}; + +struct bpf_tracing_link { + struct bpf_tramp_link link; + enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; @@ -1133,8 +1145,8 @@ bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 3e24ad0c4b3c..2b9112b80171 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -141,3 +141,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 95a3d1ff6255..3d032ea1b6a3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1013,6 +1013,7 @@ enum bpf_link_type { BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 3a0103ad97bc..d9a3c9207240 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -33,15 +33,15 @@ struct bpf_struct_ops_map { const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; - /* progs has all the bpf_prog that is populated + /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ - struct bpf_prog **progs; + struct bpf_link **links; /* image is a page that has all the trampolines * that stores the func args before calling the bpf_prog. * A PAGE_SIZE "image" is enough to store all trampoline for - * "progs[]". + * "links[]". */ void *image; /* uvalue->data stores the kernel struct @@ -283,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) u32 i; for (i = 0; i < btf_type_vlen(t); i++) { - if (st_map->progs[i]) { - bpf_prog_put(st_map->progs[i]); - st_map->progs[i] = NULL; + if (st_map->links[i]) { + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } } @@ -316,18 +316,34 @@ static int check_zero_holes(const struct btf_type *t, void *data) return 0; } -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +static void bpf_struct_ops_link_release(struct bpf_link *link) +{ +} + +static void bpf_struct_ops_link_dealloc(struct bpf_link *link) +{ + struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); + + kfree(tlink); +} + +const struct bpf_link_ops bpf_struct_ops_link_lops = { + .release = bpf_struct_ops_link_release, + .dealloc = bpf_struct_ops_link_dealloc, +}; + +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end) { u32 flags; - tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; - tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + tlinks[BPF_TRAMP_FENTRY].links[0] = link; + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tprogs, NULL); + model, flags, tlinks, NULL); } static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -338,7 +354,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_progs *tprogs = NULL; + struct bpf_tramp_links *tlinks = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image, *image_end; @@ -362,8 +378,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -386,6 +402,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; + struct bpf_tramp_link *link; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; @@ -439,16 +456,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = PTR_ERR(prog); goto reset_unlock; } - st_map->progs[i] = prog; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops->type_id || prog->expected_attach_type != i) { + bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + bpf_prog_put(prog); + err = -ENOMEM; + goto reset_unlock; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog); + st_map->links[i] = &link->link; + + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], image, image_end); if (err < 0) @@ -491,7 +518,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tprogs); + kfree(tlinks); mutex_unlock(&st_map->lock); return err; } @@ -546,9 +573,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - if (st_map->progs) + if (st_map->links) bpf_struct_ops_map_put_progs(st_map); - bpf_map_area_free(st_map->progs); + bpf_map_area_free(st_map->links); bpf_jit_free_exec(st_map->image); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -597,11 +624,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->progs = - bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + st_map->links = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->progs || !st_map->image) { + if (!st_map->uvalue || !st_map->links || !st_map->image) { bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50164d324eaf..2dc582773344 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2864,19 +2864,12 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) } EXPORT_SYMBOL(bpf_link_get_from_fd); -struct bpf_tracing_link { - struct bpf_link link; - enum bpf_attach_type attach_type; - struct bpf_trampoline *trampoline; - struct bpf_prog *tgt_prog; -}; - static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline)); bpf_trampoline_put(tr_link->trampoline); @@ -2889,7 +2882,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -2898,7 +2891,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); seq_printf(seq, "attach_type:\t%d\n", @@ -2909,7 +2902,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -2990,7 +2983,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; @@ -3060,11 +3053,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tgt_prog = prog->aux->dst_prog; } - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; - err = bpf_trampoline_link_prog(prog, tr); + err = bpf_trampoline_link_prog(&link->link, tr); if (err) { bpf_link_cleanup(&link_primer); link = NULL; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ada97751ae1b..d5e6bc5517cb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -168,30 +168,30 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -static struct bpf_tramp_progs * +static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - const struct bpf_prog_aux *aux; - struct bpf_tramp_progs *tprogs; - struct bpf_prog **progs; + struct bpf_tramp_link *link; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link **links; int kind; *total = 0; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tprogs[kind].nr_progs = tr->progs_cnt[kind]; + tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - progs = tprogs[kind].progs; + links = tlinks[kind].links; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= aux->prog->call_get_func_ip; - *progs++ = aux->prog; + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= link->link.prog->call_get_func_ip; + *links++ = link; } } - return tprogs; + return tlinks; } static void __bpf_tramp_image_put_deferred(struct work_struct *work) @@ -330,14 +330,14 @@ out: static int bpf_trampoline_update(struct bpf_trampoline *tr) { struct bpf_tramp_image *im; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; u32 flags = BPF_TRAMP_F_RESTORE_REGS; bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tprogs)) - return PTR_ERR(tprogs); + tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tlinks)) + return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); @@ -353,15 +353,15 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs || - tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) + if (tlinks[BPF_TRAMP_FEXIT].nr_links || + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; if (ip_arg) flags |= BPF_TRAMP_F_IP_ARG; err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tprogs, + &tr->func.model, flags, tlinks, tr->func.addr); if (err < 0) goto out; @@ -381,7 +381,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) tr->cur_image = im; tr->selector++; out: - kfree(tprogs); + kfree(tlinks); return err; } @@ -407,13 +407,14 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; + struct bpf_tramp_link *link_exiting; int err = 0; int cnt; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -429,25 +430,33 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) err = -EBUSY; goto out; } - tr->extension_prog = prog; + tr->extension_prog = link->link.prog; err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - prog->bpf_func); + link->link.prog->bpf_func); goto out; } - if (cnt >= BPF_MAX_TRAMP_PROGS) { + if (cnt >= BPF_MAX_TRAMP_LINKS) { err = -E2BIG; goto out; } - if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + if (!hlist_unhashed(&link->tramp_hlist)) { /* prog already linked */ err = -EBUSY; goto out; } - hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + if (link_exiting->link.prog != link->link.prog) + continue; + /* prog already linked */ + err = -EBUSY; + goto out; + } + + hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr); if (err) { - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } out: @@ -456,12 +465,12 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); @@ -470,7 +479,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) tr->extension_prog = NULL; goto out; } - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(tr); out: @@ -635,7 +644,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { return -ENOTSUPP; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index d0e54e30658a..e78dadfc5829 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -72,13 +72,16 @@ static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +extern const struct bpf_link_ops bpf_struct_ops_link_lops; + int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; int prog_ret; @@ -92,8 +95,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) { + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) { err = -ENOMEM; goto out; } @@ -105,8 +108,17 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } set_vm_flush_reset_perms(image); + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out; + } + /* prog doesn't take the ownership of the reference from caller */ + bpf_prog_inc(prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); + op_idx = prog->expected_attach_type; - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[op_idx], image, image + PAGE_SIZE); if (err < 0) @@ -124,7 +136,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, out: kfree(args); bpf_jit_free_exec(image); - kfree(tprogs); + if (link) + bpf_link_put(&link->link); + kfree(tlinks); return err; } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 8fb0116f9136..6353a789322b 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -23,6 +23,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_XDP] = "xdp", [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", }; static struct hashmap *link_table; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 95a3d1ff6255..3d032ea1b6a3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1013,6 +1013,7 @@ enum bpf_link_type { BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From e384c7b7b46d0a5f4bf3c554f963e6e9622d0ab1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:20 -0700 Subject: bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on stacks and set/reset the current bpf_run_ctx before/after calling a bpf_prog. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-3-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 41 ++++++++++++++++++++++++++++++++--------- include/linux/bpf.h | 17 +++++++++++++---- kernel/bpf/syscall.c | 7 +++++-- kernel/bpf/trampoline.c | 20 ++++++++++++++++---- 4 files changed, 66 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 38eb43159230..1fbc5cf1c7a7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1763,14 +1763,30 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, - bool save_ret) + int run_ctx_off, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; + int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); struct bpf_prog *p = l->link.prog; + /* mov rdi, 0 */ + emit_mov_imm64(&prog, BPF_REG_1, 0, 0); + + /* Prepare struct bpf_tramp_run_ctx. + * + * bpf_tramp_run_ctx is already preserved by + * arch_prepare_bpf_trampoline(). + * + * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi + */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: lea rsi, [rbp - ctx_cookie_off] */ + EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); + if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter, prog)) @@ -1816,6 +1832,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: mov rsi, rbx <- start time in nsec */ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + /* arg3: lea rdx, [rbp - run_ctx_off] */ + EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit, prog)) @@ -1853,14 +1871,14 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, - bool save_ret) + int run_ctx_off, bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tl->nr_links; i++) { if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, - save_ret)) + run_ctx_off, save_ret)) return -EINVAL; } *pprog = prog; @@ -1869,7 +1887,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, - u8 **branches) + int run_ctx_off, u8 **branches) { u8 *prog = *pprog; int i; @@ -1880,7 +1898,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); for (i = 0; i < tl->nr_links; i++) { - if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, true)) + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true)) return -EINVAL; /* mod_ret prog stored return value into [rbp - 8]. Emit: @@ -1986,7 +2004,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *orig_call) { int ret, i, nr_args = m->nr_args; - int regs_off, ip_off, args_off, stack_size = nr_args * 8; + int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; @@ -2016,6 +2034,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * RBP - args_off [ args count ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * + * RBP - run_ctx_off [ bpf_tramp_run_ctx ] */ /* room for return value of orig_call or fentry prog */ @@ -2034,6 +2054,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ip_off = stack_size; + stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7; + run_ctx_off = stack_size; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip patched call instruction and point orig_call to actual * body of the kernel function. @@ -2081,7 +2104,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_links) - if (invoke_bpf(m, &prog, fentry, regs_off, + if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; @@ -2092,7 +2115,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i return -ENOMEM; if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off, - branches)) { + run_ctx_off, branches)) { ret = -EINVAL; goto cleanup; } @@ -2129,7 +2152,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_links) - if (invoke_bpf(m, &prog, fexit, regs_off, false)) { + if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) { ret = -EINVAL; goto cleanup; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75e0110a65e1..256fb802e580 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -730,6 +730,8 @@ struct bpf_tramp_links { int nr_links; }; +struct bpf_tramp_run_ctx; + /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS @@ -756,10 +758,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); @@ -1351,6 +1354,12 @@ struct bpf_trace_run_ctx { u64 bpf_cookie; }; +struct bpf_tramp_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; + struct bpf_run_ctx *saved_run_ctx; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2dc582773344..d48165fccf49 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5020,6 +5020,7 @@ static bool syscall_prog_is_valid_access(int off, int size, BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { struct bpf_prog * __maybe_unused prog; + struct bpf_tramp_run_ctx __maybe_unused run_ctx; switch (cmd) { case BPF_MAP_CREATE: @@ -5047,13 +5048,15 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) return -EINVAL; } - if (!__bpf_prog_enter_sleepable(prog)) { + run_ctx.bpf_cookie = 0; + run_ctx.saved_run_ctx = NULL; + if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { /* recursion detected */ bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); - __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */); + __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); bpf_prog_put(prog); return 0; #endif diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d5e6bc5517cb..baf1b65d523e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -568,11 +568,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; @@ -602,29 +605,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); __this_cpu_dec(*(prog->active)); migrate_enable(); -- cgit v1.2.3 From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:21 -0700 Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm. Pass a cookie along with BPF_LINK_CREATE requests. Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie. The cookie of a bpf_tracing_link is available by calling bpf_get_attach_cookie when running the BPF program of the attached link. The value of a cookie will be set at bpf_tramp_run_ctx by the trampoline of the link. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 +++-- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/bpf_lsm.c | 17 +++++++++++++++++ kernel/bpf/syscall.c | 12 ++++++++---- kernel/bpf/trampoline.c | 7 +++++-- kernel/trace/bpf_trace.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 8 files changed, 69 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1fbc5cf1c7a7..a2b6d197c226 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1769,9 +1769,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); struct bpf_prog *p = l->link.prog; + u64 cookie = l->cookie; - /* mov rdi, 0 */ - emit_mov_imm64(&prog, BPF_REG_1, 0, 0); + /* mov rdi, cookie */ + emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); /* Prepare struct bpf_tramp_run_ctx. * diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 256fb802e580..aba7ded56436 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,7 @@ struct bpf_link_ops { struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; + u64 cookie; }; struct bpf_tracing_link { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3d032ea1b6a3..bc7f89948f54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1490,6 +1490,15 @@ union bpf_attr { __aligned_u64 addrs; __aligned_u64 cookies; } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; }; } link_create; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 064eccba641d..c1351df9f7ee 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -117,6 +117,21 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto = { + .func = bpf_get_attach_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -141,6 +156,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; case BPF_FUNC_ima_file_hash: return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d48165fccf49..72e53489165d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2921,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, - u32 btf_id) + u32 btf_id, + u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -2986,6 +2987,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -3271,7 +3273,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -4524,7 +4526,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_EXT: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, - attr->link_create.target_btf_id); + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -4539,7 +4542,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, - attr->link_create.target_btf_id); + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index baf1b65d523e..0e9b3aefc34a 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex); bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type ptype = prog->type; - return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN; + return (ptype == BPF_PROG_TYPE_TRACING && + (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN)) || + (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void *bpf_jit_alloc_exec_page(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fd11c17558d..2eaac094caf8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1091,6 +1091,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { + .func = bpf_get_attach_cookie_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { #ifndef CONFIG_X86 @@ -1719,6 +1734,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3d032ea1b6a3..bc7f89948f54 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1490,6 +1490,15 @@ union bpf_attr { __aligned_u64 addrs; __aligned_u64 cookies; } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; }; } link_create; -- cgit v1.2.3 From b3f9916d81e8ffb21cbe7abccf63f86a5a1d598a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 May 2022 12:16:21 -0500 Subject: sched: Update task_tick_numa to ignore tasks without an mm Qian Cai wrote: > Reverting the last 3 commits of the series fixed a boot crash. > > 1b2552cbdbe0 fork: Stop allowing kthreads to call execve > 753550eb0ce1 fork: Explicitly set PF_KTHREAD > 68d85f0a33b0 init: Deal with the init process being a user mode process > > BUG: KASAN: null-ptr-deref in task_nr_scan_windows.isra.0 > arch_atomic_long_read at ./include/linux/atomic/atomic-long.h:29 > (inlined by) atomic_long_read at ./include/linux/atomic/atomic-instrumented.h:1266 > (inlined by) get_mm_counter at ./include/linux/mm.h:1996 > (inlined by) get_mm_rss at ./include/linux/mm.h:2049 > (inlined by) task_nr_scan_windows at kernel/sched/fair.c:1123 > Read of size 8 at addr 00000000000003d0 by task swapper/0/1 With the change to init and the user mode helper processes to not have PF_KTHREAD set before they call kernel_execve the PF_KTHREAD test in task_tick_numa became insufficient to detect all tasks that have "->mm == NULL". Correct that by testing for "->mm == NULL" directly. Reported-by: Qian Cai Tested-by: Qian Cai Fixes: 1b2552cbdbe0 ("fork: Stop allowing kthreads to call execve") Link: https://lkml.kernel.org/r/87r150ug1l.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..db6f0df9d43e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2915,7 +2915,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* -- cgit v1.2.3 From 92826e967535db2eb117db227b1191aaf98e4bb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Apr 2022 19:20:24 +0200 Subject: dma-direct: don't fail on highmem CMA pages in dma_direct_alloc_pages When dma_direct_alloc_pages encounters a highmem page it just gives up currently. But what we really should do is to try memory using the page allocator instead - without this platforms with a global highmem CMA pool will fail all dma_alloc_pages allocations. Fixes: efa70f2fdc84 ("dma-mapping: add a new dma_alloc_pages API") Reported-by: Mark O'Neill Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 9743c6ccce1a..3e7f4aab740e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -115,7 +115,7 @@ static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp) + gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); struct page *page = NULL; @@ -129,9 +129,12 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, size); - page = NULL; + if (page) { + if (!dma_coherent_ok(dev, page_to_phys(page), size) || + (!allow_highmem && PageHighMem(page))) { + dma_free_contiguous(dev, page, size); + page = NULL; + } } again: if (!page) @@ -189,7 +192,7 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, { struct page *page; - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; @@ -262,7 +265,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; @@ -370,19 +373,9 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); - page = __dma_direct_alloc_pages(dev, size, gfp); + page = __dma_direct_alloc_pages(dev, size, gfp, false); if (!page) return NULL; - if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, - * so log an error and fail. - */ - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; - } ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) -- cgit v1.2.3 From 84bc4f1dbbbb5f8aa68706a96711dccb28b518e5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 10 May 2022 13:17:32 -0400 Subject: dma-debug: change allocation mode from GFP_NOWAIT to GFP_ATIOMIC We observed the error "cacheline tracking ENOMEM, dma-debug disabled" during a light system load (copying some files). The reason for this error is that the dma_active_cacheline radix tree uses GFP_NOWAIT allocation - so it can't access the emergency memory reserves and it fails as soon as anybody reaches the watermark. This patch changes GFP_NOWAIT to GFP_ATOMIC, so that it can access the emergency memory reserves. Signed-off-by: Mikulas Patocka Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f8ff598596b8..ac740630c79c 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -448,7 +448,7 @@ void debug_dma_dump_mappings(struct device *dev) * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. */ -static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) -- cgit v1.2.3 From 157cc18122b4a1456d19048e151a164216c4a704 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Apr 2022 09:48:54 -0500 Subject: signal: Rename send_signal send_signal_locked Rename send_signal and __send_signal to send_signal_locked and __send_signal_locked to make send_signal usable outside of signal.c. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 2 ++ kernel/signal.c | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index a6db6f2ae113..55605bdf5ce9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -283,6 +283,8 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info, extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); +extern int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index 30cd1ca43bcd..b0403197b0ad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1071,8 +1071,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, bool force) +static int __send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; @@ -1212,8 +1212,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) return ret; } -static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type) +int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; @@ -1245,7 +1245,7 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct force = true; } } - return __send_signal(sig, info, t, type, force); + return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1284,7 +1284,7 @@ __setup("print-fatal-signals=", setup_print_fatal_signals); int __group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { - return send_signal(sig, info, p, PIDTYPE_TGID); + return send_signal_locked(sig, info, p, PIDTYPE_TGID); } int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, @@ -1294,7 +1294,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, type); + ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -1347,7 +1347,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = send_signal(sig, info, t, PIDTYPE_PID); + ret = send_signal_locked(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; @@ -1567,7 +1567,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); + ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -2103,7 +2103,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * parent's namespaces. */ if (valid_signal(sig) && sig) - __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false); + __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); @@ -2601,7 +2601,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr) || fatal_signal_pending(current)) { - send_signal(signr, info, current, type); + send_signal_locked(signr, info, current, type); signr = 0; } @@ -4793,7 +4793,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); + ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", -- cgit v1.2.3 From e71ba124078e391879e0bf111529fa2d630d106c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Apr 2022 09:28:50 -0500 Subject: signal: Replace __group_send_sig_info with send_signal_locked The function __group_send_sig_info is just a light wrapper around send_signal_locked with one parameter fixed to a constant value. As the wrapper adds no real value update the code to directly call the wrapped function. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-2-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- drivers/tty/tty_jobctrl.c | 4 ++-- include/linux/signal.h | 1 - kernel/signal.c | 8 +------- kernel/time/posix-cpu-timers.c | 6 +++--- 4 files changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c index 80b86a7992b5..0d04287da098 100644 --- a/drivers/tty/tty_jobctrl.c +++ b/drivers/tty/tty_jobctrl.c @@ -215,8 +215,8 @@ int tty_signal_session_leader(struct tty_struct *tty, int exit_session) spin_unlock_irq(&p->sighand->siglock); continue; } - __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); - __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); + send_signal_locked(SIGHUP, SEND_SIG_PRIV, p, PIDTYPE_TGID); + send_signal_locked(SIGCONT, SEND_SIG_PRIV, p, PIDTYPE_TGID); put_pid(p->signal->tty_old_pgrp); /* A noop */ spin_lock(&tty->ctrl.lock); tty_pgrp = get_pid(tty->ctrl.pgrp); diff --git a/include/linux/signal.h b/include/linux/signal.h index 55605bdf5ce9..3b98e7a28538 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -282,7 +282,6 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); -extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index b0403197b0ad..72d96614effc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1281,12 +1281,6 @@ static int __init setup_print_fatal_signals(char *str) __setup("print-fatal-signals=", setup_print_fatal_signals); -int -__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) -{ - return send_signal_locked(sig, info, p, PIDTYPE_TGID); -} - int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { @@ -2173,7 +2167,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - __group_send_sig_info(SIGCHLD, &info, parent); + send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a97193984db..cb925e8ef9a8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -870,7 +870,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { tsk->dl.dl_overrun = 0; - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } } @@ -884,7 +884,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) rt ? "RT" : "CPU", hard ? "hard" : "soft", current->comm, task_pid_nr(current)); } - __group_send_sig_info(signo, SEND_SIG_PRIV, current); + send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID); return true; } @@ -958,7 +958,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, task_tgid(tsk), cur_time); - __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } if (it->expires && it->expires < *expires) -- cgit v1.2.3 From 16cc1bc67de88be19fa595f4645506ea2ac106d2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 11:09:11 -0500 Subject: ptrace: Remove arch_ptrace_attach The last remaining implementation of arch_ptrace_attach is ia64's ptrace_attach_sync_user_rbs which was added at the end of 2007 in commit aa91a2e90044 ("[IA64] Synchronize RBS on PTRACE_ATTACH"). Reading the comments and examining the code ptrace_attach_sync_user_rbs has the sole purpose of saving registers to the stack when ptrace_attach changes TASK_STOPPED to TASK_TRACED. In all other cases arch_ptrace_stop takes care of the register saving. In commit d79fdd6d96f4 ("ptrace: Clean transitions between TASK_STOPPED and TRACED") modified ptrace_attach to wake up the thread and enter ptrace_stop normally even when the thread starts out stopped. This makes ptrace_attach_sync_user_rbs completely unnecessary. So just remove it. I read through the code to verify that ptrace_attach_sync_user_rbs is unnecessary. What I found is that the code is quite dead. Reading ptrace_attach_sync_user_rbs it is easy to see that the it does nothing unless __state == TASK_STOPPED. Calling arch_ptrace_attach (aka ptrace_attach_sync_user_rbs) after ptrace_traceme it is easy to see that because we are talking about the current process the value of __state is TASK_RUNNING. Which means ptrace_attach_sync_user_rbs does nothing. The only other call of arch_ptrace_attach (aka ptrace_attach_sync_user_rbs) is after ptrace_attach. If the task is running (and PTRACE_SEIZE is not specified), a SIGSTOP is sent which results in do_signal_stop setting JOBCTL_TRAP_STOP on the target task (as it is ptraced) and the target task stopping in ptrace_stop with __state == TASK_TRACED. If the task was already stopped then ptrace_attach sets JOBCTL_TRAPPING and JOBCTL_TRAP_STOP, wakes it out of __TASK_STOPPED, and waits until the JOBCTL_TRAPPING_BIT is clear. At which point the task stops in ptrace_stop. In both cases there are a couple of funning excpetions such as if the traced task receiveds a SIGCONT, or is set a fatal signal. However in all of those cases the tracee never stops in __state TASK_STOPPED. Which is a long way of saying that ptrace_attach_sync_user_rbs is guaranteed never to do anything. Cc: linux-ia64@vger.kernel.org Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-4-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/ia64/include/asm/ptrace.h | 4 --- arch/ia64/kernel/ptrace.c | 57 ------------------------------------------ kernel/ptrace.c | 18 ------------- 3 files changed, 79 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index a10a498eede1..402874489890 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -139,10 +139,6 @@ static inline long regs_return_value(struct pt_regs *regs) #define arch_ptrace_stop_needed() \ (!test_thread_flag(TIF_RESTORE_RSE)) - extern void ptrace_attach_sync_user_rbs (struct task_struct *); - #define arch_ptrace_attach(child) \ - ptrace_attach_sync_user_rbs(child) - #define arch_has_single_step() (1) #define arch_has_block_step() (1) diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index a19acd9f5e1f..a45f529046c3 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -617,63 +617,6 @@ void ia64_sync_krbs(void) unw_init_running(do_sync_rbs, ia64_sync_kernel_rbs); } -/* - * After PTRACE_ATTACH, a thread's register backing store area in user - * space is assumed to contain correct data whenever the thread is - * stopped. arch_ptrace_stop takes care of this on tracing stops. - * But if the child was already stopped for job control when we attach - * to it, then it might not ever get into ptrace_stop by the time we - * want to examine the user memory containing the RBS. - */ -void -ptrace_attach_sync_user_rbs (struct task_struct *child) -{ - int stopped = 0; - struct unw_frame_info info; - - /* - * If the child is in TASK_STOPPED, we need to change that to - * TASK_TRACED momentarily while we operate on it. This ensures - * that the child won't be woken up and return to user mode while - * we are doing the sync. (It can only be woken up for SIGKILL.) - */ - - read_lock(&tasklist_lock); - if (child->sighand) { - spin_lock_irq(&child->sighand->siglock); - if (READ_ONCE(child->__state) == TASK_STOPPED && - !test_and_set_tsk_thread_flag(child, TIF_RESTORE_RSE)) { - set_notify_resume(child); - - WRITE_ONCE(child->__state, TASK_TRACED); - stopped = 1; - } - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); - - if (!stopped) - return; - - unw_init_from_blocked_task(&info, child); - do_sync_rbs(&info, ia64_sync_user_rbs); - - /* - * Now move the child back into TASK_STOPPED if it should be in a - * job control stop, so that SIGCONT can be used to wake it up. - */ - read_lock(&tasklist_lock); - if (child->sighand) { - spin_lock_irq(&child->sighand->siglock); - if (READ_ONCE(child->__state) == TASK_TRACED && - (child->signal->flags & SIGNAL_STOP_STOPPED)) { - WRITE_ONCE(child->__state, TASK_STOPPED); - } - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); -} - /* * Write f32-f127 back to task->thread.fph if it has been modified. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ccc4b465775b..da30dcd477a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1285,10 +1285,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef arch_ptrace_attach -#define arch_ptrace_attach(child) do { } while (0) -#endif - SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, unsigned long, data) { @@ -1297,8 +1293,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); - if (!ret) - arch_ptrace_attach(current); goto out; } @@ -1310,12 +1304,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } @@ -1455,12 +1443,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } -- cgit v1.2.3 From cb3c19c93d656caa6fe63d6277aabd7e570f1d03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 09:16:10 -0500 Subject: signal: Use lockdep_assert_held instead of assert_spin_locked The distinction is that assert_spin_locked() checks if the lock is held *by*anyone* whereas lockdep_assert_held() asserts the current context holds the lock. Also, the check goes away if you build without lockdep. Suggested-by: Peter Zijlstra Link: https://lkml.kernel.org/r/Ympr/+PX4XgT/UKU@hirez.programming.kicks-ass.net Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-6-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 72d96614effc..3fd2ce133387 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -884,7 +884,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); @@ -1079,7 +1079,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info, int override_rlimit; int ret = 0, result; - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) -- cgit v1.2.3 From 6a2d90ba027adba528509ffa27097cffd3879257 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 09:23:55 -0500 Subject: ptrace: Reimplement PTRACE_KILL by always sending SIGKILL The current implementation of PTRACE_KILL is buggy and has been for many years as it assumes it's target has stopped in ptrace_stop. At a quick skim it looks like this assumption has existed since ptrace support was added in linux v1.0. While PTRACE_KILL has been deprecated we can not remove it as a quick search with google code search reveals many existing programs calling it. When the ptracee is not stopped at ptrace_stop some fields would be set that are ignored except in ptrace_stop. Making the userspace visible behavior of PTRACE_KILL a noop in those case. As the usual rules are not obeyed it is not clear what the consequences are of calling PTRACE_KILL on a running process. Presumably userspace does not do this as it achieves nothing. Replace the implementation of PTRACE_KILL with a simple send_sig_info(SIGKILL) followed by a return 0. This changes the observable user space behavior only in that PTRACE_KILL on a process not stopped in ptrace_stop will also kill it. As that has always been the intent of the code this seems like a reasonable change. Cc: stable@vger.kernel.org Reported-by: Al Viro Suggested-by: Al Viro Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-7-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/x86/kernel/step.c | 3 +-- kernel/ptrace.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0f3c307b37b3..8e2b2552b5ee 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -180,8 +180,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index da30dcd477a0..7105821595bc 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1236,9 +1236,8 @@ int ptrace_request(struct task_struct *child, long request, return ptrace_resume(child, request, data); case PTRACE_KILL: - if (child->exit_state) /* already dead */ - return 0; - return ptrace_resume(child, request, SIGKILL); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, child); + return 0; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: -- cgit v1.2.3 From 7b0fe1367ef2d2591c20f03c4e64b7230e1ebcd7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 5 May 2022 12:25:36 -0500 Subject: ptrace: Document that wait_task_inactive can't fail After ptrace_freeze_traced succeeds it is known that the tracee has a __state value of __TASK_TRACED and that no __ptrace_unlink will happen because the tracer is waiting for the tracee, and the tracee is in ptrace_stop. The function ptrace_freeze_traced can succeed at any point after ptrace_stop has set TASK_TRACED and dropped siglock. The read_lock on tasklist_lock only excludes ptrace_attach. This means that the !current->ptrace which executes under a read_lock of tasklist_lock will never see a ptrace_freeze_trace as the tracer must have gone away before the tasklist_lock was taken and ptrace_attach can not occur until the read_lock is dropped. As ptrace_freeze_traced depends upon ptrace_attach running before it can run that excludes ptrace_freeze_traced until __state is set to TASK_RUNNING. This means that task_is_traced will fail in ptrace_freeze_attach and ptrace_freeze_attached will fail. On the current->ptrace branch of ptrace_stop which will be reached any time after ptrace_freeze_traced has succeed it is known that __state is __TASK_TRACED and schedule() will be called with that state. Use a WARN_ON_ONCE to document that wait_task_inactive(TASK_TRACED) should never fail. Remove the stale comment about may_ptrace_stop. Strictly speaking this is not true because if PREEMPT_RT is enabled wait_task_inactive can fail because __state can be changed. I don't see this as a problem as the ptrace code is currently broken on PREMPT_RT, and this is one of the issues. Failing and warning when the assumptions of the code are broken is good. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-8-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7105821595bc..05953ac9f7bd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,17 +266,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) { - if (!wait_task_inactive(child, __TASK_TRACED)) { - /* - * This can only happen if may_ptrace_stop() fails and - * ptrace_stop() changes ->state back to TASK_RUNNING, - * so we should not worry about leaking __TASK_TRACED. - */ - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); - ret = -ESRCH; - } - } + if (!ret && !ignore_state && + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + ret = -ESRCH; return ret; } -- cgit v1.2.3 From 57b6de08b5f6586851c2261ef0cc16cd275615e7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 May 2022 13:39:58 -0500 Subject: ptrace: Admit ptrace_stop can generate spuriuos SIGTRAPs Long ago and far away there was a BUG_ON at the start of ptrace_stop that did "BUG_ON(!(current->ptrace & PT_PTRACED));" [1]. The BUG_ON had never triggered but examination of the code showed that the BUG_ON could actually trigger. To complement removing the BUG_ON an attempt to better handle the race was added. The code detected the tracer had gone away and did not call do_notify_parent_cldstop. The code also attempted to prevent ptrace_report_syscall from sending spurious SIGTRAPs when the tracer went away. The code to detect when the tracer had gone away before sending a signal to tracer was a legitimate fix and continues to work to this date. The code to prevent sending spurious SIGTRAPs is a failure. At the time and until today the code only catches it when the tracer goes away after siglock is dropped and before read_lock is acquired. If the tracer goes away after read_lock is dropped a spurious SIGTRAP can still be sent to the tracee. The tracer going away after read_lock is dropped is the far likelier case as it is the bigger window. Given that the attempt to prevent the generation of a SIGTRAP was a failure and continues to be a failure remove the code that attempts to do that. This simplifies the code in ptrace_stop and makes ptrace_stop much easier to reason about. To successfully deal with the tracer going away, all of the tracer's instrumentation of the child would need to be removed, and reliably detecting when the tracer has set a signal to continue with would need to be implemented. [1] 66519f549ae5 ("[PATCH] fix ptracer death race yielding bogus BUG_ON") History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-9-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 92 ++++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 3fd2ce133387..d2d0c753156c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2187,13 +2187,12 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ -static int ptrace_stop(int exit_code, int why, int clear_code, - unsigned long message, kernel_siginfo_t *info) +static int ptrace_stop(int exit_code, int why, unsigned long message, + kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { bool gstop_done = false; - bool read_code = true; if (arch_ptrace_stop_needed()) { /* @@ -2212,7 +2211,14 @@ static int ptrace_stop(int exit_code, int why, int clear_code, /* * schedule() will not sleep if there is a pending signal that * can awaken the task. + * + * After this point ptrace_signal_wake_up will clear TASK_TRACED + * if ptrace_unlink happens. Handle previous ptrace_unlinks + * here to prevent ptrace_stop sleeping in schedule. */ + if (!current->ptrace) + return exit_code; + set_special_state(TASK_TRACED); /* @@ -2259,54 +2265,33 @@ static int ptrace_stop(int exit_code, int why, int clear_code, spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - if (likely(current->ptrace)) { - /* - * Notify parents of the stop. - * - * While ptraced, there are two parents - the ptracer and - * the real_parent of the group_leader. The ptracer should - * know about every stop while the real parent is only - * interested in the completion of group stop. The states - * for the two don't interact with each other. Notify - * separately unless they're gonna be duplicates. - */ + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + if (current->ptrace) do_notify_parent_cldstop(current, true, why); - if (gstop_done && ptrace_reparented(current)) - do_notify_parent_cldstop(current, false, why); + if (gstop_done && (!current->ptrace || ptrace_reparented(current))) + do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); - read_unlock(&tasklist_lock); - cgroup_enter_frozen(); - preempt_enable_no_resched(); - freezable_schedule(); - cgroup_leave_frozen(true); - } else { - /* - * By the time we got the lock, our tracer went away. - * Don't drop the lock yet, another tracer may come. - * - * If @gstop_done, the ptracer went away between group stop - * completion and here. During detach, it would have set - * JOBCTL_STOP_PENDING on us and we'll re-enter - * TASK_STOPPED in do_signal_stop() on return, so notifying - * the real parent of the group stop completion is enough. - */ - if (gstop_done) - do_notify_parent_cldstop(current, false, why); - - /* tasklist protects us from ptrace_freeze_traced() */ - __set_current_state(TASK_RUNNING); - read_code = false; - if (clear_code) - exit_code = 0; - read_unlock(&tasklist_lock); - } + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); + preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching @@ -2314,8 +2299,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code, * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(¤t->sighand->siglock); - if (read_code) - exit_code = current->exit_code; + exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; @@ -2343,7 +2327,7 @@ static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long mes info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ - return ptrace_stop(exit_code, why, 1, message, &info); + return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) @@ -2515,7 +2499,7 @@ static void do_jobctl_trap(void) CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); - ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } @@ -2568,7 +2552,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; - signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info); + signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) -- cgit v1.2.3 From 2500ad1c7fa42ad734677853961a3a8bec0772c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 08:43:34 -0500 Subject: ptrace: Don't change __state Stop playing with tsk->__state to remove TASK_WAKEKILL while a ptrace command is executing. Instead remove TASK_WAKEKILL from the definition of TASK_TRACED, and implement a new jobctl flag TASK_PTRACE_FROZEN. This new flag is set in jobctl_freeze_task and cleared when ptrace_stop is awoken or in jobctl_unfreeze_task (when ptrace_stop remains asleep). In signal_wake_up add __TASK_TRACED to state along with TASK_WAKEKILL when the wake up is for a fatal signal. Skip adding __TASK_TRACED when TASK_PTRACE_FROZEN is not set. This has the same effect as changing TASK_TRACED to __TASK_TRACED as all of the wake_ups that use TASK_KILLABLE go through signal_wake_up. Handle a ptrace_stop being called with a pending fatal signal. Previously it would have been handled by schedule simply failing to sleep. As TASK_WAKEKILL is no longer part of TASK_TRACED schedule will sleep with a fatal_signal_pending. The code in signal_wake_up guarantees that the code will be awaked by any fatal signal that codes after TASK_TRACED is set. Previously the __state value of __TASK_TRACED was changed to TASK_RUNNING when woken up or back to TASK_TRACED when the code was left in ptrace_stop. Now when woken up ptrace_stop now clears JOBCTL_PTRACE_FROZEN and when left sleeping ptrace_unfreezed_traced clears JOBCTL_PTRACE_FROZEN. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-10-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 2 +- include/linux/sched/jobctl.h | 2 ++ include/linux/sched/signal.h | 5 +++-- kernel/ptrace.c | 21 ++++++++------------- kernel/sched/core.c | 5 +---- kernel/signal.c | 14 ++++++-------- 6 files changed, 21 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..610f2fdb1e2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -103,7 +103,7 @@ struct task_group; /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index fa067de9f1a9..d556c3425963 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,6 +19,7 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ +#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -28,6 +29,7 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) +#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..e66948abbee4 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -435,9 +435,10 @@ extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); -static inline void signal_wake_up(struct task_struct *t, bool resume) +static inline void signal_wake_up(struct task_struct *t, bool fatal) { - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); + fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN); + signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 05953ac9f7bd..83ed28262708 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -197,7 +197,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { - WRITE_ONCE(task->__state, __TASK_TRACED); + task->jobctl |= JOBCTL_PTRACE_FROZEN; ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -207,23 +207,19 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) - return; - - WARN_ON(!task->ptrace || task->parent != current); + unsigned long flags; /* - * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. - * Recheck state under the lock to close this race. + * The child may be awake and may have cleared + * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will + * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew. */ - spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { + if (lock_task_sighand(task, &flags)) { + task->jobctl &= ~JOBCTL_PTRACE_FROZEN; if (__fatal_signal_pending(task)) wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); + unlock_task_sighand(task, &flags); } - spin_unlock_irq(&task->sighand->siglock); } /** @@ -256,7 +252,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) */ read_lock(&tasklist_lock); if (child->ptrace && child->parent == current) { - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..3c351707e830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6304,10 +6304,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* * We must load prev->state once (task_struct::state is volatile), such - * that: - * - * - we form a control dependency vs deactivate_task() below. - * - ptrace_{,un}freeze_traced() can change ->state underneath us. + * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { diff --git a/kernel/signal.c b/kernel/signal.c index d2d0c753156c..a58b68a2d3c6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2209,14 +2209,12 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, } /* - * schedule() will not sleep if there is a pending signal that - * can awaken the task. - * - * After this point ptrace_signal_wake_up will clear TASK_TRACED - * if ptrace_unlink happens. Handle previous ptrace_unlinks - * here to prevent ptrace_stop sleeping in schedule. + * After this point ptrace_signal_wake_up or signal_wake_up + * will clear TASK_TRACED if ptrace_unlink happens or a fatal + * signal comes in. Handle previous ptrace_unlinks and fatal + * signals here to prevent ptrace_stop sleeping in schedule. */ - if (!current->ptrace) + if (!current->ptrace || __fatal_signal_pending(current)) return exit_code; set_special_state(TASK_TRACED); @@ -2305,7 +2303,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ - current->jobctl &= ~JOBCTL_LISTENING; + current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. -- cgit v1.2.3 From 5b4197cb287daf3cfd008fbf8682a1d6f4b13c0b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 10:50:17 -0500 Subject: ptrace: Always take siglock in ptrace_resume Make code analysis simpler and future changes easier by always taking siglock in ptrace_resume. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-11-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83ed28262708..36a5b7a00d2f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -837,8 +837,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { - bool need_siglock; - if (!valid_signal(data)) return -EIO; @@ -874,18 +872,11 @@ static int ptrace_resume(struct task_struct *child, long request, * Note that we need siglock even if ->exit_code == data and/or this * status was not reported yet, the new status must not be cleared by * wait_task_stopped() after resume. - * - * If data == 0 we do not care if wait_task_stopped() reports the old - * status and clears the code too; this can't race with the tracee, it - * takes siglock after resume. */ - need_siglock = data && !thread_group_empty(current); - if (need_siglock) - spin_lock_irq(&child->sighand->siglock); + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; wake_up_state(child, __TASK_TRACED); - if (need_siglock) - spin_unlock_irq(&child->sighand->siglock); + spin_unlock_irq(&child->sighand->siglock); return 0; } -- cgit v1.2.3 From 31cae1eaae4fd65095ad6a3659db467bc3c2599e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 May 2022 15:57:47 -0500 Subject: sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state Currently ptrace_stop() / do_signal_stop() rely on the special states TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this state exists only in task->__state and nowhere else. There's two spots of bother with this: - PREEMPT_RT has task->saved_state which complicates matters, meaning task_is_{traced,stopped}() needs to check an additional variable. - An alternative freezer implementation that itself relies on a special TASK state would loose TASK_TRACED/TASK_STOPPED and will result in misbehaviour. As such, add additional state to task->jobctl to track this state outside of task->__state. NOTE: this doesn't actually fix anything yet, just adds extra state. --EWB * didn't add a unnecessary newline in signal.h * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up instead of in signal_wake_up_state. This prevents the clearing of TASK_STOPPED and TASK_TRACED from getting lost. * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220421150654.757693825@infradead.org Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-12-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- include/linux/sched.h | 8 +++----- include/linux/sched/jobctl.h | 6 ++++++ include/linux/sched/signal.h | 19 +++++++++++++++---- kernel/ptrace.c | 16 +++++++++++++--- kernel/signal.c | 10 ++++++++-- 5 files changed, 45 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 610f2fdb1e2c..cbe5c899599c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,11 +118,9 @@ struct task_group; #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) - -#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) - -#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) +#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) +#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d556c3425963..68876d0a7ef9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -21,6 +21,9 @@ struct task_struct; #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ +#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */ +#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */ + #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) #define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) @@ -31,6 +34,9 @@ struct task_struct; #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) +#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT) +#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT) + #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e66948abbee4..07ba3404fcde 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void) static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); - if (current->jobctl & JOBCTL_STOP_DEQUEUED) + if (current->jobctl & JOBCTL_STOP_DEQUEUED) { + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); + } spin_unlock_irq(¤t->sighand->siglock); schedule(); @@ -437,12 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { - fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN); - signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0); + unsigned int state = 0; + if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { + t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); + state = TASK_WAKEKILL | __TASK_TRACED; + } + signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { - signal_wake_up_state(t, resume ? __TASK_TRACED : 0); + unsigned int state = 0; + if (resume) { + t->jobctl &= ~JOBCTL_TRACED; + state = __TASK_TRACED; + } + signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 36a5b7a00d2f..328a34a99124 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task) return true; } -/* Ensure that nothing can wake it up, even SIGKILL */ +/* + * Ensure that nothing can wake it up, even SIGKILL + * + * A task is switched to this state while a ptrace operation is in progress; + * such that the ptrace operation is uninterruptible. + */ static bool ptrace_freeze_traced(struct task_struct *task) { bool ret = false; @@ -216,8 +221,10 @@ static void ptrace_unfreeze_traced(struct task_struct *task) */ if (lock_task_sighand(task, &flags)) { task->jobctl &= ~JOBCTL_PTRACE_FROZEN; - if (__fatal_signal_pending(task)) + if (__fatal_signal_pending(task)) { + task->jobctl &= ~TASK_TRACED; wake_up_state(task, __TASK_TRACED); + } unlock_task_sighand(task, &flags); } } @@ -462,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request, * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_STOPPED; signal_wake_up_state(task, __TASK_STOPPED); + } spin_unlock(&task->sighand->siglock); @@ -875,6 +884,7 @@ static int ptrace_resume(struct task_struct *child, long request, */ spin_lock_irq(&child->sighand->siglock); child->exit_code = data; + child->jobctl &= ~JOBCTL_TRACED; wake_up_state(child, __TASK_TRACED); spin_unlock_irq(&child->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index a58b68a2d3c6..e782c2611b64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -762,7 +762,10 @@ still_pending: */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { + lockdep_assert_held(&t->sighand->siglock); + set_tsk_thread_flag(t, TIF_SIGPENDING); + /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it @@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) for_each_thread(p, t) { flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - if (likely(!(t->ptrace & PT_SEIZED))) + if (likely(!(t->ptrace & PT_SEIZED))) { + t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); - else + } else ptrace_trap_notify(t); } @@ -2218,6 +2222,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, return exit_code; set_special_state(TASK_TRACED); + current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before @@ -2436,6 +2441,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 07343110b293456d30393e89b86c4dee1ac051c8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Wed, 11 May 2022 17:38:53 +0800 Subject: bpf: add bpf_map_lookup_percpu_elem for percpu map Add new ebpf helpers bpf_map_lookup_percpu_elem. The implementation method is relatively simple, refer to the implementation method of map_lookup_elem of percpu map, increase the parameters of cpu, and obtain it according to the specified cpu. Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220511093854.411-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/arraymap.c | 15 +++++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/hashtab.c | 32 ++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 18 ++++++++++++++++++ kernel/bpf/verifier.c | 17 +++++++++++++++-- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 9 files changed, 103 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ded8711457f..5061ccd8b2dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -89,6 +89,7 @@ struct bpf_map_ops { int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); int (*map_pop_elem)(struct bpf_map *map, void *value); int (*map_peek_elem)(struct bpf_map *map, void *value); + void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -2184,6 +2185,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc7f89948f54..0210f85131b3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5164,6 +5164,14 @@ union bpf_attr { * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5361,6 +5369,7 @@ union bpf_attr { FN(skb_set_tstamp), \ FN(ima_file_hash), \ FN(kptr_xchg), \ + FN(map_lookup_percpu_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 724613da6576..fe40d3b9458f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -243,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (cpu >= nr_cpu_ids) + return NULL; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); +} + int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -725,6 +739,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13e9dbeeedf3..76f68d0a7ae8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2619,6 +2619,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 705841279d16..17fb69c0e0dc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2199,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + else + return NULL; +} + static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -2211,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; @@ -2300,6 +2330,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, @@ -2318,6 +2349,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3e709fed5306..d5f104a39092 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -119,6 +119,22 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, }; +BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); +} + +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { + .func = bpf_map_lookup_percpu_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, @@ -1420,6 +1436,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c27fee73a2cb..05c1b6656824 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6137,6 +6137,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; + case BPF_FUNC_map_lookup_percpu_elem: + if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) @@ -6750,7 +6756,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && - func_id != BPF_FUNC_redirect_map) + func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { @@ -13810,7 +13817,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem)) { + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -13859,6 +13867,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { @@ -13886,6 +13896,9 @@ patch_map_ops_generic: case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); continue; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + continue; } goto patch_call_imm; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2eaac094caf8..7141ca8a1c2d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1197,6 +1197,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc7f89948f54..0210f85131b3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5164,6 +5164,14 @@ union bpf_attr { * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5361,6 +5369,7 @@ union bpf_attr { FN(skb_set_tstamp), \ FN(ima_file_hash), \ FN(kptr_xchg), \ + FN(map_lookup_percpu_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From a2aa95b71c9bbec793b5c5fa50f0a80d882b3e8d Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sat, 30 Apr 2022 21:08:03 +0800 Subject: bpf: Fix potential array overflow in bpf_trampoline_get_progs() The cnt value in the 'cnt >= BPF_MAX_TRAMP_PROGS' check does not include BPF_TRAMP_MODIFY_RETURN bpf programs, so the number of the attached BPF_TRAMP_MODIFY_RETURN bpf programs in a trampoline can exceed BPF_MAX_TRAMP_PROGS. When this happens, the assignment '*progs++ = aux->prog' in bpf_trampoline_get_progs() will cause progs array overflow as the progs field in the bpf_tramp_progs struct can only hold at most BPF_MAX_TRAMP_PROGS bpf programs. Fixes: 88fd9e5352fe ("bpf: Refactor trampoline update code") Signed-off-by: Yuntao Wang Link: https://lore.kernel.org/r/20220430130803.210624-1-ytcoode@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 0e9b3aefc34a..93c7675f0c9e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -415,7 +415,7 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; int err = 0; - int cnt; + int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); @@ -426,7 +426,10 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline err = -EBUSY; goto out; } - cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) { @@ -512,16 +515,19 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { + int i; + if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) - goto out; - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) - goto out; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) + goto out; + /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip -- cgit v1.2.3 From c14e522bc76efed6e947cd0ab83a1fac7a7a3ec9 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 2 May 2022 21:51:03 +0100 Subject: module: Make module_flags_taint() accept a module's taints bitmap and usable outside core code No functional change. The purpose of this patch is to modify module_flags_taint() to accept a module's taints bitmap as a parameter and modifies all users accordingly. Furthermore, it is now possible to access a given module's taint flags data outside of non-essential code yet does remain for internal use only. This is in preparation for module unload taint tracking support. Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 1 + kernel/module/main.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 3e23bef5884d..abbd1c5ef264 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -100,6 +100,7 @@ int cmp_name(const void *name, const void *sym); long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section); char *module_flags(struct module *mod, char *buf); +size_t module_flags_taint(unsigned long taints, char *buf); static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { diff --git a/kernel/module/main.c b/kernel/module/main.c index 05a42d8fcd7a..7dbdd098b995 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -890,13 +890,13 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ -static size_t module_flags_taint(struct module *mod, char *buf) +size_t module_flags_taint(unsigned long taints, char *buf) { size_t l = 0; int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &mod->taints)) + if (taint_flags[i].module && test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } @@ -974,7 +974,7 @@ static ssize_t show_taint(struct module_attribute *mattr, { size_t l; - l = module_flags_taint(mk->mod, buffer); + l = module_flags_taint(mk->mod->taints, buffer); buffer[l++] = '\n'; return l; } @@ -2993,7 +2993,7 @@ char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - bx += module_flags_taint(mod, buf + bx); + bx += module_flags_taint(mod->taints, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; -- cgit v1.2.3 From 6fb0538d0121ffab770a505b183968d93466ad59 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 2 May 2022 21:51:04 +0100 Subject: module: Move module_assert_mutex_or_preempt() to internal.h No functional change. This patch migrates module_assert_mutex_or_preempt() to internal.h. So, the aforementiond function can be used outside of main/or core module code yet will remain restricted for internal use only. Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 12 ++++++++++++ kernel/module/main.c | 11 ----------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index abbd1c5ef264..0bdf64c9dfb5 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 @@ -102,6 +103,17 @@ long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, char *module_flags(struct module *mod, char *buf); size_t module_flags_taint(unsigned long taints, char *buf); +static inline void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON_ONCE(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS diff --git a/kernel/module/main.c b/kernel/module/main.c index 7dbdd098b995..7a0484900320 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -118,17 +118,6 @@ static void mod_update_bounds(struct module *mod) #endif } -static void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From 99bd9956551b27cb6f5b445abaced7e13b9976cd Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 2 May 2022 21:52:52 +0100 Subject: module: Introduce module unload taint tracking Currently, only the initial module that tainted the kernel is recorded e.g. when an out-of-tree module is loaded. The purpose of this patch is to allow the kernel to maintain a record of each unloaded module that taints the kernel. So, in addition to displaying a list of linked modules (see print_modules()) e.g. in the event of a detected bad page, unloaded modules that carried a taint/or taints are displayed too. A tainted module unload count is maintained. The number of tracked modules is not fixed. This feature is disabled by default. Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- init/Kconfig | 11 +++++++++ kernel/module/Makefile | 1 + kernel/module/internal.h | 21 +++++++++++++++++ kernel/module/main.c | 5 ++++ kernel/module/tracking.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 99 insertions(+) create mode 100644 kernel/module/tracking.c (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9..6b30210f787d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2118,6 +2118,17 @@ config MODULE_FORCE_UNLOAD rmmod). This is mainly for kernel developers and desperate users. If unsure, say N. +config MODULE_UNLOAD_TAINT_TRACKING + bool "Tainted module unload tracking" + depends on MODULE_UNLOAD + default n + help + This option allows you to maintain a record of each unloaded + module that tainted the kernel. In addition to displaying a + list of linked (or loaded) modules e.g. on detection of a bad + page (see bad_page()), the aforementioned details are also + shown. If unsure, say N. + config MODVERSIONS bool "Module versioning support" help diff --git a/kernel/module/Makefile b/kernel/module/Makefile index d1ca799c12e2..948efea81e85 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_PROC_FS) += procfs.o obj-$(CONFIG_SYSFS) += sysfs.o obj-$(CONFIG_KGDB_KDB) += kdb.o obj-$(CONFIG_MODVERSIONS) += version.o +obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 0bdf64c9dfb5..bc5507ab8450 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -145,6 +145,27 @@ static inline bool set_livepatch_module(struct module *mod) #endif } +#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING +struct mod_unload_taint { + struct list_head list; + char name[MODULE_NAME_LEN]; + unsigned long taints; + u64 count; +}; + +int try_add_tainted_module(struct module *mod); +void print_unloaded_tainted_modules(void); +#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ +static inline int try_add_tainted_module(struct module *mod) +{ + return 0; +} + +static inline void print_unloaded_tainted_modules(void) +{ +} +#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ + #ifdef CONFIG_MODULE_DECOMPRESS int module_decompress(struct load_info *info, const void *buf, size_t size); void module_decompress_cleanup(struct load_info *info); diff --git a/kernel/module/main.c b/kernel/module/main.c index 7a0484900320..6c3b4a846645 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1190,6 +1190,9 @@ static void free_module(struct module *mod) module_bug_cleanup(mod); /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ synchronize_rcu(); + if (try_add_tainted_module(mod)) + pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", + mod->name); mutex_unlock(&module_mutex); /* Clean up CFI for the module. */ @@ -3125,6 +3128,8 @@ void print_modules(void) continue; pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } + + print_unloaded_tainted_modules(); preempt_enable(); if (last_unloaded_module[0]) pr_cont(" [last unloaded: %s]", last_unloaded_module); diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c new file mode 100644 index 000000000000..7f8133044d09 --- /dev/null +++ b/kernel/module/tracking.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module taint unload tracking support + * + * Copyright (C) 2022 Aaron Tomlin + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static LIST_HEAD(unloaded_tainted_modules); + +int try_add_tainted_module(struct module *mod) +{ + struct mod_unload_taint *mod_taint; + + module_assert_mutex_or_preempt(); + + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list, + lockdep_is_held(&module_mutex)) { + if (!strcmp(mod_taint->name, mod->name) && + mod_taint->taints & mod->taints) { + mod_taint->count++; + goto out; + } + } + + mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL); + if (unlikely(!mod_taint)) + return -ENOMEM; + strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN); + mod_taint->taints = mod->taints; + list_add_rcu(&mod_taint->list, &unloaded_tainted_modules); + mod_taint->count = 1; +out: + return 0; +} + +void print_unloaded_tainted_modules(void) +{ + struct mod_unload_taint *mod_taint; + char buf[MODULE_FLAGS_BUF_SIZE]; + + if (!list_empty(&unloaded_tainted_modules)) { + printk(KERN_DEFAULT "Unloaded tainted modules:"); + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, + list) { + size_t l; + + l = module_flags_taint(mod_taint->taints, buf); + buf[l++] = '\0'; + pr_cont(" %s(%s):%llu", mod_taint->name, buf, + mod_taint->count); + } + } +} -- cgit v1.2.3 From 391e982bfa632b8315235d8be9c0a81374c6a19c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 4 May 2022 12:54:20 +0300 Subject: module: fix [e_shstrndx].sh_size=0 OOB access It is trivial to craft a module to trigger OOB access in this line: if (info->secstrings[strhdr->sh_size - 1] != '\0') { BUG: unable to handle page fault for address: ffffc90000aa0fff PGD 100000067 P4D 100000067 PUD 100066067 PMD 10436f067 PTE 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 7 PID: 1215 Comm: insmod Not tainted 5.18.0-rc5-00007-g9bf578647087-dirty #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014 RIP: 0010:load_module+0x19b/0x2391 Fixes: ec2a29593c83 ("module: harden ELF info handling") Signed-off-by: Alexey Dobriyan [rebased patch onto modules-next] Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 6c3b4a846645..23432fabfde8 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1726,6 +1726,10 @@ static int elf_validity_check(struct load_info *info) * strings in the section safe. */ info->secstrings = (void *)info->hdr + strhdr->sh_offset; + if (strhdr->sh_size == 0) { + pr_err("empty section name table\n"); + goto no_exec; + } if (info->secstrings[strhdr->sh_size - 1] != '\0') { pr_err("ELF Spec violation: section name table isn't null terminated\n"); goto no_exec; -- cgit v1.2.3 From 8eac910a49347821cbafc770a319e00ccd69d58b Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Wed, 27 Apr 2022 15:36:06 +0800 Subject: module: show disallowed symbol name for inherit_taint() The error log for inherit_taint() doesn't really help to find the symbol which violates GPL rules. For example, if a module has 300 symbol and includes 50 disallowed symbols, the log only shows the content below and we have no idea what symbol is. AAA: module using GPL-only symbols uses symbols from proprietary module BBB. It's hard for user who doesn't really know how the symbol was parsing. This patch add symbol name to tell the offending symbols explicitly. AAA: module using GPL-only symbols uses symbols SSS from proprietary module BBB. Signed-off-by: Lecopzer Chen Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 23432fabfde8..ac0a7882899b 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1038,20 +1038,20 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } -static bool inherit_taint(struct module *mod, struct module *owner) +static bool inherit_taint(struct module *mod, struct module *owner, const char *name) { if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) return true; if (mod->using_gplonly_symbols) { - pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", - mod->name, owner->name); + pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n", + mod->name, name, owner->name); return false; } if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { - pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", - mod->name, owner->name); + pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n", + mod->name, name, owner->name); set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); } return true; @@ -1083,7 +1083,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (fsa.license == GPL_ONLY) mod->using_gplonly_symbols = true; - if (!inherit_taint(mod, fsa.owner)) { + if (!inherit_taint(mod, fsa.owner, name)) { fsa.sym = NULL; goto getname; } -- cgit v1.2.3 From c6eee9df57a6d9252bae93a9386d0d872798f5d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 May 2022 12:52:10 +0900 Subject: module: do not pass opaque pointer for symbol search There is no need to use an opaque pointer for check_exported_symbol() or find_exported_symbol_in_section. Pass (struct find_symbol_arg *) explicitly. Signed-off-by: Masahiro Yamada Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index ac0a7882899b..d00a8de6d8f5 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -244,11 +244,9 @@ static __maybe_unused void *any_section_objs(const struct load_info *info, #endif static bool check_exported_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) + struct module *owner, unsigned int symnum, + struct find_symbol_arg *fsa) { - struct find_symbol_arg *fsa = data; - if (!fsa->gplok && syms->license == GPL_ONLY) return false; fsa->owner = owner; @@ -285,16 +283,15 @@ int cmp_name(const void *name, const void *sym) static bool find_exported_symbol_in_section(const struct symsearch *syms, struct module *owner, - void *data) + struct find_symbol_arg *fsa) { - struct find_symbol_arg *fsa = data; struct kernel_symbol *sym; sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); if (sym != NULL && check_exported_symbol(syms, owner, - sym - syms->start, data)) + sym - syms->start, fsa)) return true; return false; -- cgit v1.2.3 From cdd66eb52fdaa9bdab7f1be8dc9162bf4acc64ae Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 May 2022 12:52:11 +0900 Subject: module: do not binary-search in __ksymtab_gpl if fsa->gplok is false Currently, !fsa->gplok && syms->license == GPL_ONLY) is checked after bsearch() succeeds. It is meaningless to do the binary search in the GPL symbol table when fsa->gplok is false because we know find_exported_symbol_in_section() will fail anyway. This check should be done before bsearch(). Signed-off-by: Masahiro Yamada Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index d00a8de6d8f5..8a068fff437c 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -247,8 +247,6 @@ static bool check_exported_symbol(const struct symsearch *syms, struct module *owner, unsigned int symnum, struct find_symbol_arg *fsa) { - if (!fsa->gplok && syms->license == GPL_ONLY) - return false; fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); fsa->sym = &syms->start[symnum]; @@ -287,6 +285,9 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, { struct kernel_symbol *sym; + if (!fsa->gplok && syms->license == GPL_ONLY) + return false; + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); -- cgit v1.2.3 From 7390b94a3c2d93272d6da4945b81a9cf78055b7b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 May 2022 12:52:12 +0900 Subject: module: merge check_exported_symbol() into find_exported_symbol_in_section() Now check_exported_symbol() always succeeds. Merge it into find_exported_symbol_in_search() to make the code concise. Signed-off-by: Masahiro Yamada Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 8a068fff437c..fed58d30725d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -243,17 +243,6 @@ static __maybe_unused void *any_section_objs(const struct load_info *info, #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -static bool check_exported_symbol(const struct symsearch *syms, - struct module *owner, unsigned int symnum, - struct find_symbol_arg *fsa) -{ - fsa->owner = owner; - fsa->crc = symversion(syms->crcs, symnum); - fsa->sym = &syms->start[symnum]; - fsa->license = syms->license; - return true; -} - static const char *kernel_symbol_name(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -290,12 +279,15 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); + if (!sym) + return false; - if (sym != NULL && check_exported_symbol(syms, owner, - sym - syms->start, fsa)) - return true; + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, sym - syms->start); + fsa->sym = sym; + fsa->license = syms->license; - return false; + return true; } /* -- cgit v1.2.3 From a7bd57b87f65e0e1c5d41baf51a0d0b49fb30808 Mon Sep 17 00:00:00 2001 From: lizhe Date: Thu, 12 May 2022 20:38:36 -0700 Subject: kernel/crash_core.c: remove redundant check of ck_cmdline At the end of get_last_crashkernel(), the judgement of ck_cmdline is obviously unnecessary and causes redundance, let's clean it up. Link: https://lkml.kernel.org/r/20220506104116.259323-1-sensor1010@163.com Signed-off-by: lizhe Acked-by: Baoquan He Acked-by: Philipp Rudo Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Andrew Morton --- kernel/crash_core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 256cf6db573c..c232f01a2c54 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -222,9 +222,6 @@ next: p = strstr(p+1, name); } - if (!ck_cmdline) - return NULL; - return ck_cmdline; } -- cgit v1.2.3 From 47b7eae62aa7dc69f0e6d12493e5468ba57bf074 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 12 May 2022 20:38:37 -0700 Subject: relay: remove redundant assignment to pointer buf Pointer buf is being assigned a value that is not being read, buf is being re-assigned in the next starement. The assignment is redundant and can be removed. Cleans up clang scan build warning: kernel/relay.c:443:8: warning: Although the value stored to 'buf' is used in the enclosing expression, the value is never actually read from 'buf' [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220508212152.58753-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Jens Axboe Cc: Christoph Hellwig Cc: Kalle Valo Signed-off-by: Andrew Morton --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index d1a67fbb819d..6a611e779e95 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu) mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, cpu))) + if (*per_cpu_ptr(chan->buf, cpu)) continue; buf = relay_open_buf(chan, cpu); if (!buf) { -- cgit v1.2.3 From 1521c607cabe7c7edb028e211e88ba1e0f19714e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 May 2022 08:44:29 +0200 Subject: swiotlb: don't panic when the swiotlb buffer can't be allocated For historical reasons the switlb code paniced when the metadata could not be allocated, but just printed a warning when the actual main swiotlb buffer could not be allocated. Restore this somewhat unexpected behavior as changing it caused a boot failure on the Microchip RISC-V PolarFire SoC Icicle kit. Fixes: 6424e31b1c05 ("swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl") Reported-by: Conor Dooley Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Acked-by: Conor Dooley Tested-by: Conor Dooley --- kernel/dma/swiotlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e2ef0864eb1e..3e992a308c8a 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -254,8 +254,10 @@ retry: tlb = memblock_alloc(bytes, PAGE_SIZE); else tlb = memblock_alloc_low(bytes, PAGE_SIZE); - if (!tlb) - panic("%s: failed to allocate tlb structure\n", __func__); + if (!tlb) { + pr_warn("%s: failed to allocate tlb structure\n", __func__); + return; + } if (remap && remap(tlb, nslabs) < 0) { memblock_free(tlb, PAGE_ALIGN(bytes)); -- cgit v1.2.3 From a5e891321a219679d5a2828150a7dda29a47d8a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 May 2022 08:13:57 +0200 Subject: swiotlb: use the right nslabs value in swiotlb_init_remap default_nslabs should only be used to initialize nslabs, after that we need to use the local variable that can shrink when allocations or the remap don't succeed. Fixes: 6424e31b1c05 ("swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini --- kernel/dma/swiotlb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 3e992a308c8a..113e1e8aaca3 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -234,7 +234,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, { struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = default_nslabs; - size_t alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); + size_t alloc_size; size_t bytes; void *tlb; @@ -249,7 +249,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, * memory encryption. */ retry: - bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); + bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); if (flags & SWIOTLB_ANY) tlb = memblock_alloc(bytes, PAGE_SIZE); else @@ -269,12 +269,13 @@ retry: goto retry; } + alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); if (!mem->slots) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - swiotlb_init_io_tlb_mem(mem, __pa(tlb), default_nslabs, false); + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); mem->force_bounce = flags & SWIOTLB_FORCE; if (flags & SWIOTLB_VERBOSE) -- cgit v1.2.3 From 1b8e5d1a53696d92374acce2b19a649427f1ec1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 May 2022 08:24:10 +0200 Subject: swiotlb: use the right nslabs-derived sizes in swiotlb_init_late nslabs can shrink when allocations or the remap don't succeed, so make sure to use it for all sizing. For that remove the bytes value that can get stale and replace it with local calculations and a boolean to indicate if the originally requested size could not be allocated. Fixes: 6424e31b1c05 ("swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini --- kernel/dma/swiotlb.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 113e1e8aaca3..d6e62a6a42ce 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -297,9 +297,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, { struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); - unsigned long bytes; unsigned char *vstart = NULL; unsigned int order; + bool retried = false; int rc = 0; if (swiotlb_force_disable) @@ -308,7 +308,6 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; - bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, @@ -316,16 +315,13 @@ retry: if (vstart) break; order--; + nslabs = SLABS_PER_PAGE << order; + retried = true; } if (!vstart) return -ENOMEM; - if (order != get_order(bytes)) { - pr_warn("only able to allocate %ld MB\n", - (PAGE_SIZE << order) >> 20); - nslabs = SLABS_PER_PAGE << order; - } if (remap) rc = remap(vstart, nslabs); if (rc) { @@ -334,9 +330,15 @@ retry: nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); if (nslabs < IO_TLB_MIN_SLABS) return rc; + retried = true; goto retry; } + if (retried) { + pr_warn("only able to allocate %ld MB\n", + (PAGE_SIZE << order) >> 20); + } + mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); if (!mem->slots) { @@ -344,7 +346,8 @@ retry: return -ENOMEM; } - set_memory_decrypted((unsigned long)vstart, bytes >> PAGE_SHIFT); + set_memory_decrypted((unsigned long)vstart, + (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true); swiotlb_print_info(); -- cgit v1.2.3 From 4b6313cf99b0d51b49aeaea98ec76ca8161ecb80 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 12 May 2022 18:10:24 -0700 Subject: bpf: Fix combination of jit blinding and pointers to bpf subprogs. The combination of jit blinding and pointers to bpf subprogs causes: [ 36.989548] BUG: unable to handle page fault for address: 0000000100000001 [ 36.990342] #PF: supervisor instruction fetch in kernel mode [ 36.990968] #PF: error_code(0x0010) - not-present page [ 36.994859] RIP: 0010:0x100000001 [ 36.995209] Code: Unable to access opcode bytes at RIP 0xffffffd7. [ 37.004091] Call Trace: [ 37.004351] [ 37.004576] ? bpf_loop+0x4d/0x70 [ 37.004932] ? bpf_prog_3899083f75e4c5de_F+0xe3/0x13b The jit blinding logic didn't recognize that ld_imm64 with an address of bpf subprogram is a special instruction and proceeded to randomize it. By itself it wouldn't have been an issue, but jit_subprogs() logic relies on two step process to JIT all subprogs and then JIT them again when addresses of all subprogs are known. Blinding process in the first JIT phase caused second JIT to miss adjustment of special ld_imm64. Fix this issue by ignoring special ld_imm64 instructions that don't have user controlled constants and shouldn't be blinded. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220513011025.13344-1-alexei.starovoitov@gmail.com --- kernel/bpf/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 76f68d0a7ae8..9cc91f0f3115 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1434,6 +1434,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* ld_imm64 with an address of bpf subprog is not + * a user controlled constant. Don't randomize it, + * since it will conflict with jit_subprogs() logic. + */ + insn++; + i++; + continue; + } + /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. -- cgit v1.2.3 From 534aa1dc975ac883ad89110534585a96630802a0 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 9 May 2022 18:20:53 -0700 Subject: printk: stop including cache.h from printk.h An inclusion of cache.h in printk.h was added in 2014 in commit c28aa1f0a847 ("printk/cache: mark printk_once test variable __read_mostly") in order to bring in the definition of __read_mostly. The usage of __read_mostly was later removed in commit 3ec25826ae33 ("printk: Tie printk_once / printk_deferred_once into .data.once for reset") which made the inclusion of cache.h unnecessary, so remove it. We have a small amount of code that depended on the inclusion of cache.h from printk.h; fix that code to include the appropriate header. This fixes a circular inclusion on arm64 (linux/printk.h -> linux/cache.h -> asm/cache.h -> linux/kasan-enabled.h -> linux/static_key.h -> linux/jump_label.h -> linux/bug.h -> asm/bug.h -> linux/printk.h) that would otherwise be introduced by the next patch. Build tested using {allyesconfig,defconfig} x {arm64,x86_64}. Link: https://linux-review.googlesource.com/id/I8fd51f72c9ef1f2d6afd3b2cbc875aa4792c1fba Link: https://lkml.kernel.org/r/20220427195820.1716975-1-pcc@google.com Signed-off-by: Peter Collingbourne Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric W. Biederman Cc: Herbert Xu Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arm64/include/asm/mte-kasan.h | 1 + arch/arm64/include/asm/percpu.h | 1 + arch/csky/include/asm/processor.h | 2 +- drivers/firmware/smccc/kvm_guest.c | 1 + include/linux/printk.h | 1 - kernel/bpf/bpf_lru_list.h | 1 + 6 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index a857bcacf0fe..9f79425fc65a 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -6,6 +6,7 @@ #define __ASM_MTE_KASAN_H #include +#include #include #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 8f1661603b78..b9ba19dbdb69 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -10,6 +10,7 @@ #include #include #include +#include static inline void set_my_cpu_offset(unsigned long off) { diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 688c7548b559..9638206bc44f 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -4,9 +4,9 @@ #define __ASM_CSKY_PROCESSOR_H #include +#include #include #include -#include #include #include #include diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 2d3e866decaa..89a68e7eeaa6 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -4,6 +4,7 @@ #include #include +#include #include #include diff --git a/include/linux/printk.h b/include/linux/printk.h index 1522df223c0f..8e8d74edf121 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 6b12f06ee18c..4ea227c9c1ad 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -4,6 +4,7 @@ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ +#include #include #include -- cgit v1.2.3 From 16d1e00c7e8a4950e914223b3112144289a82913 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 9 May 2022 15:42:52 -0700 Subject: bpf: Add MEM_UNINIT as a bpf_type_flag Instead of having uninitialized versions of arguments as separate bpf_arg_types (eg ARG_PTR_TO_UNINIT_MEM as the uninitialized version of ARG_PTR_TO_MEM), we can instead use MEM_UNINIT as a bpf_type_flag modifier to denote that the argument is uninitialized. Doing so cleans up some of the logic in the verifier. We no longer need to do two checks against an argument type (eg "if (base_type(arg_type) == ARG_PTR_TO_MEM || base_type(arg_type) == ARG_PTR_TO_UNINIT_MEM)"), since uninitialized and initialized versions of the same argument type will now share the same base type. In the near future, MEM_UNINIT will be used by dynptr helper functions as well. Signed-off-by: Joanne Koong Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/r/20220509224257.3222614-2-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 18 ++++++++++-------- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 28 ++++++++-------------------- 3 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5061ccd8b2dc..c107392b0ba7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -390,7 +390,10 @@ enum bpf_type_flag { */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_UNTRUSTED, + MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_FLAG_MAX, + __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; /* Max number of base types. */ @@ -409,16 +412,11 @@ enum bpf_arg_type { ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ - ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - /* the following constraints used to prototype bpf_memcmp() and other - * functions that access data on eBPF program stack + /* Used to prototype bpf_memcmp() and other functions that access data + * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, - * helper function must fill all bytes or clear - * them in error case. - */ ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ @@ -450,6 +448,10 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + /* pointer to memory does not need to be initialized, helper function must fill + * all bytes or clear them in error case. + */ + ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d5f104a39092..bad96131a510 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -103,7 +103,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -116,7 +116,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 05c1b6656824..9b59581026f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5377,12 +5377,6 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } -static bool arg_type_is_mem_ptr(enum bpf_arg_type type) -{ - return base_type(type) == ARG_PTR_TO_MEM || - base_type(type) == ARG_PTR_TO_UNINIT_MEM; -} - static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -5522,7 +5516,6 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, @@ -5536,7 +5529,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, @@ -5710,8 +5702,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; @@ -5797,8 +5788,7 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { if (type_may_be_null(arg_type) && register_is_null(reg)) return 0; @@ -5810,7 +5800,7 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); + meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); @@ -5837,11 +5827,11 @@ skip_type_check: return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; - } else if (arg_type_is_mem_ptr(arg_type)) { + } else if (base_type(arg_type) == ARG_PTR_TO_MEM) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. */ - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); + meta->raw_mode = arg_type & MEM_UNINIT; } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -6194,10 +6184,8 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, enum bpf_arg_type arg_next) { - return (arg_type_is_mem_ptr(arg_curr) && - !arg_type_is_mem_size(arg_next)) || - (!arg_type_is_mem_ptr(arg_curr) && - arg_type_is_mem_size(arg_next)); + return (base_type(arg_curr) == ARG_PTR_TO_MEM) != + arg_type_is_mem_size(arg_next); } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -6208,7 +6196,7 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - arg_type_is_mem_ptr(fn->arg5_type) || + base_type(fn->arg5_type) == ARG_PTR_TO_MEM || check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || -- cgit v1.2.3 From 2434031c7cb4906be2d380981caa1f87d8899288 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 4 May 2022 09:09:41 +0200 Subject: kcsan: test: use new suite_{init,exit} support Use the newly added suite_{init,exit} support for suite-wide init and cleanup. This avoids the unsupported method by which the test used to do suite-wide init and cleanup (avoiding issues such as missing TAP headers, and possible future conflicts). Signed-off-by: Marco Elver Signed-off-by: Shuah Khan --- kernel/kcsan/kcsan_test.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index a36fca063a73..59560b5e1d9c 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1565,14 +1565,6 @@ static void test_exit(struct kunit *test) torture_cleanup_end(); } -static struct kunit_suite kcsan_test_suite = { - .name = "kcsan", - .test_cases = kcsan_test_cases, - .init = test_init, - .exit = test_exit, -}; -static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; - __no_kcsan static void register_tracepoints(struct tracepoint *tp, void *ignore) { @@ -1588,11 +1580,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore) tracepoint_probe_unregister(tp, probe_console, NULL); } -/* - * We only want to do tracepoints setup and teardown once, therefore we have to - * customize the init and exit functions and cannot rely on kunit_test_suite(). - */ -static int __init kcsan_test_init(void) +static int kcsan_suite_init(struct kunit_suite *suite) { /* * Because we want to be able to build the test as a module, we need to @@ -1600,18 +1588,25 @@ static int __init kcsan_test_init(void) * won't work here. */ for_each_kernel_tracepoint(register_tracepoints, NULL); - return __kunit_test_suites_init(kcsan_test_suites); + return 0; } -static void kcsan_test_exit(void) +static void kcsan_suite_exit(struct kunit_suite *suite) { - __kunit_test_suites_exit(kcsan_test_suites); for_each_kernel_tracepoint(unregister_tracepoints, NULL); tracepoint_synchronize_unregister(); } -late_initcall_sync(kcsan_test_init); -module_exit(kcsan_test_exit); +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kcsan_suite_init, + .suite_exit = kcsan_suite_exit, +}; + +kunit_test_suites(&kcsan_test_suite); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver "); -- cgit v1.2.3 From 82806744fd7dde603b64c151eeddaa4ee62193fd Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Tue, 10 May 2022 10:21:09 -0400 Subject: swiotlb: max mapping size takes min align mask into account swiotlb_find_slots() skips slots according to io tlb aligned mask calculated from min aligned mask and original physical address offset. This affects max mapping size. The mapping size can't achieve the IO_TLB_SEGSIZE * IO_TLB_SIZE when original offset is non-zero. This will cause system boot up failure in Hyper-V Isolation VM where swiotlb force is enabled. Scsi layer use return value of dma_max_mapping_size() to set max segment size and it finally calls swiotlb_max_mapping_size(). Hyper-V storage driver sets min align mask to 4k - 1. Scsi layer may pass 256k length of request buffer with 0~4k offset and Hyper-V storage driver can't get swiotlb bounce buffer via DMA API. Swiotlb_find_slots() can't find 256k length bounce buffer with offset. Make swiotlb_max_mapping _size() take min align mask into account. Signed-off-by: Tianyu Lan Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d6e62a6a42ce..dfa1de89dc94 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -736,7 +736,18 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; + int min_align_mask = dma_get_min_align_mask(dev); + int min_align = 0; + + /* + * swiotlb_find_slots() skips slots according to + * min align mask. This affects max mapping size. + * Take it into acount here. + */ + if (min_align_mask) + min_align = roundup(min_align_mask, IO_TLB_SIZE); + + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } bool is_swiotlb_active(struct device *dev) -- cgit v1.2.3 From 29ed17389c4dcd09c5be8d88ddf6f4f60241567d Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 17 May 2022 19:25:23 +0800 Subject: cgroup: Make cgroup_debug static Make cgroup_debug static since it's only used in cgroup.c Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 1 - kernel/cgroup/cgroup.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 6e36e854b512..5da09c74228d 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -12,7 +12,6 @@ #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -extern bool cgroup_debug; extern void __init enable_debug_cgroup(void); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index adb820e98f24..a97fd051430b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -bool cgroup_debug __read_mostly; +static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without -- cgit v1.2.3 From 13dfd97a341a5cf9d15f415dd469f45e971ef12a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 19 May 2022 14:02:32 +0300 Subject: notifier: Add atomic_notifier_call_chain_is_empty() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add atomic_notifier_call_chain_is_empty() that returns true if given atomic call chain is empty. The first user of this new notifier API function will be the kernel power-off core code that will support power-off call chains. The core code will need to check whether there is a power-off handler registered at all in order to decide whether to halt machine or power it off. Reviewed-by: Michał Mirosław Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/notifier.h | 2 ++ kernel/notifier.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 87069b8459af..95e2440037de 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -173,6 +173,8 @@ extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); +extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh); + #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ diff --git a/kernel/notifier.c b/kernel/notifier.c index ba005ebf4730..137b902c985b 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -204,6 +204,19 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); +/** + * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty + * @nh: Pointer to head of the atomic notifier chain + * + * Checks whether notifier chain is empty. + * + * Returns true is notifier chain is empty, false otherwise. + */ +bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) +{ + return !rcu_access_pointer(nh->head); +} + /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. -- cgit v1.2.3 From c82f898d873ce10d88f8d60b38c8dd19f1ed7c66 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:10 +0300 Subject: notifier: Add blocking/atomic_notifier_chain_register_unique_prio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add variant of blocking/atomic_notifier_chain_register() functions that allow registration of a notifier only if it has unique priority, otherwise -EBUSY error code is returned by the new functions. Reviewed-by: Michał Mirosław Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/notifier.h | 5 +++ kernel/notifier.c | 88 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 74 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 95e2440037de..aef88c2d1173 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -150,6 +150,11 @@ extern int raw_notifier_chain_register(struct raw_notifier_head *nh, extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *nb); +extern int atomic_notifier_chain_register_unique_prio( + struct atomic_notifier_head *nh, struct notifier_block *nb); +extern int blocking_notifier_chain_register_unique_prio( + struct blocking_notifier_head *nh, struct notifier_block *nb); + extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, diff --git a/kernel/notifier.c b/kernel/notifier.c index 137b902c985b..0d5bd62c480e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -20,7 +20,8 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); */ static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) + struct notifier_block *n, + bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { @@ -30,6 +31,8 @@ static int notifier_chain_register(struct notifier_block **nl, } if (n->priority > (*nl)->priority) break; + if (n->priority == (*nl)->priority && unique_priority) + return -EBUSY; nl = &((*nl)->next); } n->next = *nl; @@ -144,12 +147,35 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, int ret; spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); +/** + * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an atomic notifier chain if there is no other + * notifier registered using the same priority. + * + * Returns 0 on success, %-EEXIST or %-EBUSY on error. + */ +int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n, true); + spin_unlock_irqrestore(&nh->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); + /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain @@ -222,18 +248,9 @@ bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) * synchronized by an rwsem. */ -/** - * blocking_notifier_chain_register - Add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain. - * Must be called in process context. - * - * Returns 0 on success, %-EEXIST on error. - */ -int blocking_notifier_chain_register(struct blocking_notifier_head *nh, - struct notifier_block *n) +static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n, + bool unique_priority) { int ret; @@ -243,15 +260,48 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } + +/** + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. + * + * Returns 0 on success, %-EEXIST on error. + */ +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + return __blocking_notifier_chain_register(nh, n, false); +} EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); +/** + * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an blocking notifier chain if there is no other + * notifier registered using the same priority. + * + * Returns 0 on success, %-EEXIST or %-EBUSY on error. + */ +int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + return __blocking_notifier_chain_register(nh, n, true); +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); + /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain @@ -354,7 +404,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); @@ -433,10 +483,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } -- cgit v1.2.3 From 232edc2f72f5beda3586360de6254d443820050a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:11 +0300 Subject: kernel/reboot: Introduce sys-off handler API In order to support power-off chaining we need to get rid of the global pm_* variables, replacing them with the new kernel API functions that support chaining. Introduce new generic sys-off handler API that brings the following features: 1. Power-off and restart handlers are registered using same API function that supports chaining, hence all power-off and restart modes will support chaining using this unified function. 2. Prevents notifier priority collisions by disallowing registration of multiple handlers at the non-default priority level. 3. Supports passing opaque user argument to callback, which allows us to remove global variables from drivers. This patch adds support of the following sys-off modes: - SYS_OFF_MODE_POWER_OFF_PREPARE that replaces global pm_power_off_prepare variable and provides chaining support for power-off-prepare handlers. - SYS_OFF_MODE_POWER_OFF that replaces global pm_power_off variable and provides chaining support for power-off handlers. - SYS_OFF_MODE_RESTART that provides a better restart API, removing a need from drivers to have a global scratch variable by utilizing the opaque callback argument. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 77 +++++++++++++++++++++ kernel/reboot.c | 182 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index af907a3d68d1..b79f8942ea5f 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -7,6 +7,7 @@ #include struct device; +struct sys_off_handler; #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN @@ -62,6 +63,82 @@ extern void machine_shutdown(void); struct pt_regs; extern void machine_crash_shutdown(struct pt_regs *); +/* + * sys-off handler API. + */ + +/* + * Standard sys-off priority levels. Users are expected to set priorities + * relative to the standard levels. + * + * SYS_OFF_PRIO_PLATFORM: Use this for platform-level handlers. + * + * SYS_OFF_PRIO_LOW: Use this for handler of last resort. + * + * SYS_OFF_PRIO_DEFAULT: Use this for normal handlers. + * + * SYS_OFF_PRIO_HIGH: Use this for higher priority handlers. + * + * SYS_OFF_PRIO_FIRMWARE: Use this if handler uses firmware call. + */ +#define SYS_OFF_PRIO_PLATFORM -256 +#define SYS_OFF_PRIO_LOW -128 +#define SYS_OFF_PRIO_DEFAULT 0 +#define SYS_OFF_PRIO_HIGH 192 +#define SYS_OFF_PRIO_FIRMWARE 224 + +enum sys_off_mode { + /** + * @SYS_OFF_MODE_POWER_OFF_PREPARE: + * + * Handlers prepare system to be powered off. Handlers are + * allowed to sleep. + */ + SYS_OFF_MODE_POWER_OFF_PREPARE, + + /** + * @SYS_OFF_MODE_POWER_OFF: + * + * Handlers power-off system. Handlers are disallowed to sleep. + */ + SYS_OFF_MODE_POWER_OFF, + + /** + * @SYS_OFF_MODE_RESTART: + * + * Handlers restart system. Handlers are disallowed to sleep. + */ + SYS_OFF_MODE_RESTART, +}; + +/** + * struct sys_off_data - sys-off callback argument + * + * @mode: Mode ID. Currently used only by the sys-off restart mode, + * see enum reboot_mode for the available modes. + * @cb_data: User's callback data. + * @cmd: Command string. Currently used only by the sys-off restart mode, + * NULL otherwise. + */ +struct sys_off_data { + int mode; + void *cb_data; + const char *cmd; +}; + +struct sys_off_handler * +register_sys_off_handler(enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data); +void unregister_sys_off_handler(struct sys_off_handler *handler); + +int devm_register_sys_off_handler(struct device *dev, + enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data); + /* * Architecture independent implemenations of sys_reboot commands. */ diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..3e6867ee61e0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -48,6 +48,15 @@ int reboot_cpu; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; +struct sys_off_handler { + struct notifier_block nb; + int (*sys_off_cb)(struct sys_off_data *data); + void *cb_data; + enum sys_off_mode mode; + bool blocking; + void *list; +}; + /* * If set, this is used for preparing the system to power off. */ @@ -281,6 +290,179 @@ void kernel_halt(void) } EXPORT_SYMBOL_GPL(kernel_halt); +/* + * Notifier list for kernel code which wants to be called + * to prepare system for power off. + */ +static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list); + +/* + * Notifier list for kernel code which wants to be called + * to power off system. + */ +static ATOMIC_NOTIFIER_HEAD(power_off_handler_list); + +static int sys_off_notify(struct notifier_block *nb, + unsigned long mode, void *cmd) +{ + struct sys_off_handler *handler; + struct sys_off_data data = {}; + + handler = container_of(nb, struct sys_off_handler, nb); + data.cb_data = handler->cb_data; + data.mode = mode; + data.cmd = cmd; + + return handler->sys_off_cb(&data); +} + +/** + * register_sys_off_handler - Register sys-off handler + * @mode: Sys-off mode + * @priority: Handler priority + * @callback: Callback function + * @cb_data: Callback argument + * + * Registers system power-off or restart handler that will be invoked + * at the step corresponding to the given sys-off mode. Handler's callback + * should return NOTIFY_DONE to permit execution of the next handler in + * the call chain or NOTIFY_STOP to break the chain (in error case for + * example). + * + * Multiple handlers can be registered at the default priority level. + * + * Only one handler can be registered at the non-default priority level, + * otherwise ERR_PTR(-EBUSY) is returned. + * + * Returns a new instance of struct sys_off_handler on success, or + * an ERR_PTR()-encoded error code otherwise. + */ +struct sys_off_handler * +register_sys_off_handler(enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + struct sys_off_handler *handler; + int err; + + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler) + return ERR_PTR(-ENOMEM); + + switch (mode) { + case SYS_OFF_MODE_POWER_OFF_PREPARE: + handler->list = &power_off_prep_handler_list; + handler->blocking = true; + break; + + case SYS_OFF_MODE_POWER_OFF: + handler->list = &power_off_handler_list; + break; + + case SYS_OFF_MODE_RESTART: + handler->list = &restart_handler_list; + break; + + default: + kfree(handler); + return ERR_PTR(-EINVAL); + } + + handler->nb.notifier_call = sys_off_notify; + handler->nb.priority = priority; + handler->sys_off_cb = callback; + handler->cb_data = cb_data; + handler->mode = mode; + + if (handler->blocking) { + if (priority == SYS_OFF_PRIO_DEFAULT) + err = blocking_notifier_chain_register(handler->list, + &handler->nb); + else + err = blocking_notifier_chain_register_unique_prio(handler->list, + &handler->nb); + } else { + if (priority == SYS_OFF_PRIO_DEFAULT) + err = atomic_notifier_chain_register(handler->list, + &handler->nb); + else + err = atomic_notifier_chain_register_unique_prio(handler->list, + &handler->nb); + } + + if (err) { + kfree(handler); + return ERR_PTR(err); + } + + return handler; +} +EXPORT_SYMBOL_GPL(register_sys_off_handler); + +/** + * unregister_sys_off_handler - Unregister sys-off handler + * @handler: Sys-off handler + * + * Unregisters given sys-off handler. + */ +void unregister_sys_off_handler(struct sys_off_handler *handler) +{ + int err; + + if (!handler) + return; + + if (handler->blocking) + err = blocking_notifier_chain_unregister(handler->list, + &handler->nb); + else + err = atomic_notifier_chain_unregister(handler->list, + &handler->nb); + + /* sanity check, shall never happen */ + WARN_ON(err); + + kfree(handler); +} +EXPORT_SYMBOL_GPL(unregister_sys_off_handler); + +static void devm_unregister_sys_off_handler(void *data) +{ + struct sys_off_handler *handler = data; + + unregister_sys_off_handler(handler); +} + +/** + * devm_register_sys_off_handler - Register sys-off handler + * @dev: Device that registers handler + * @mode: Sys-off mode + * @priority: Handler priority + * @callback: Callback function + * @cb_data: Callback argument + * + * Registers resource-managed sys-off handler. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_sys_off_handler(struct device *dev, + enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + struct sys_off_handler *handler; + + handler = register_sys_off_handler(mode, priority, callback, cb_data); + if (IS_ERR(handler)) + return PTR_ERR(handler); + + return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler, + handler); +} +EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); + /** * kernel_power_off - power_off the system * -- cgit v1.2.3 From 7b9a3de9ffe76e4a965bafe26c91d9c6351e1e18 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:12 +0300 Subject: kernel/reboot: Wrap legacy power-off callbacks into sys-off handlers Wrap legacy power-off callbacks into sys-off handlers in order to support co-existence of both legacy and new callbacks while we're in process of upgrading legacy callbacks to the new API. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- kernel/reboot.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 3e6867ee61e0..96f681f7b20c 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -463,6 +463,47 @@ int devm_register_sys_off_handler(struct device *dev, } EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); +static int legacy_pm_power_off_prepare(struct sys_off_data *data) +{ + if (pm_power_off_prepare) + pm_power_off_prepare(); + + return NOTIFY_DONE; +} + +static int legacy_pm_power_off(struct sys_off_data *data) +{ + if (pm_power_off) + pm_power_off(); + + return NOTIFY_DONE; +} + +/* + * Register sys-off handlers for legacy PM callbacks. This allows legacy + * PM callbacks co-exist with the new sys-off API. + * + * TODO: Remove legacy handlers once all legacy PM users will be switched + * to the sys-off based APIs. + */ +static int __init legacy_pm_init(void) +{ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF_PREPARE, + SYS_OFF_PRIO_DEFAULT, + legacy_pm_power_off_prepare, NULL); + + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_DEFAULT, + legacy_pm_power_off, NULL); + + return 0; +} +core_initcall(legacy_pm_init); + +static void do_kernel_power_off_prepare(void) +{ + blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL); +} + /** * kernel_power_off - power_off the system * @@ -471,8 +512,7 @@ EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); void kernel_power_off(void) { kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); + do_kernel_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); -- cgit v1.2.3 From 2b6aa7332f8020dfdaffe340ff038aac4df35238 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:13 +0300 Subject: kernel/reboot: Add do_kernel_power_off() Add do_kernel_power_off() helper that will remove open-coded pm_power_off invocations from the architecture code. This is the first step on the way to remove the global pm_power_off variable, which will allow us to implement consistent power-off chaining support. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 2 ++ kernel/reboot.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index b79f8942ea5f..99a9753e77bc 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -63,6 +63,8 @@ extern void machine_shutdown(void); struct pt_regs; extern void machine_crash_shutdown(struct pt_regs *); +void do_kernel_power_off(void); + /* * sys-off handler API. */ diff --git a/kernel/reboot.c b/kernel/reboot.c index 96f681f7b20c..892d25aa12df 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -504,6 +504,19 @@ static void do_kernel_power_off_prepare(void) blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL); } +/** + * do_kernel_power_off - Execute kernel power-off handler call chain + * + * Expected to be called as last step of the power-off sequence. + * + * Powers off the system immediately if a power-off handler function has + * been registered. Otherwise does nothing. + */ +void do_kernel_power_off(void) +{ + atomic_notifier_call_chain(&power_off_handler_list, 0, NULL); +} + /** * kernel_power_off - power_off the system * -- cgit v1.2.3 From 5d34b41aa420c3908793b43419a1097362ca2668 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:14 +0300 Subject: kernel/reboot: Add stub for pm_power_off Add weak stub for the global pm_power_off callback variable. This will allow us to remove pm_power_off definitions from arch/ code and transition to the new sys-off based API that will replace the global variable. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- kernel/reboot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 892d25aa12df..7723aff4d09b 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -57,6 +57,12 @@ struct sys_off_handler { void *list; }; +/* + * Temporary stub that prevents linkage failure while we're in process + * of removing all uses of legacy pm_power_off() around the kernel. + */ +void __weak (*pm_power_off)(void); + /* * If set, this is used for preparing the system to power off. */ -- cgit v1.2.3 From 0e2110d2e910e44cc7cf23fce14613e232602602 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:15 +0300 Subject: kernel/reboot: Add kernel_can_power_off() Add kernel_can_power_off() helper that replaces open-coded checks of the global pm_power_off variable. This is a necessary step towards supporting chained power-off handlers. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 1 + kernel/reboot.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 99a9753e77bc..5837c6c2fb40 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -149,6 +149,7 @@ extern void kernel_restart_prepare(char *cmd); extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); +extern bool kernel_can_power_off(void); extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); diff --git a/kernel/reboot.c b/kernel/reboot.c index 7723aff4d09b..fb3f49350927 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -523,6 +523,18 @@ void do_kernel_power_off(void) atomic_notifier_call_chain(&power_off_handler_list, 0, NULL); } +/** + * kernel_can_power_off - check whether system can be powered off + * + * Returns true if power-off handler is registered and system can be + * powered off, false otherwise. + */ +bool kernel_can_power_off(void) +{ + return !atomic_notifier_call_chain_is_empty(&power_off_handler_list); +} +EXPORT_SYMBOL_GPL(kernel_can_power_off); + /** * kernel_power_off - power_off the system * @@ -581,7 +593,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) cmd = LINUX_REBOOT_CMD_HALT; mutex_lock(&system_transition_mutex); -- cgit v1.2.3 From fb61375ecfba49e153af561402f49f6fe3bebd39 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:16 +0300 Subject: kernel/reboot: Add register_platform_power_off() Add platform-level registration helpers that will ease transition of the arch/platform power-off callbacks to the new sys-off based API, allowing us to remove the global pm_power_off variable in the future. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 3 +++ kernel/reboot.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 5837c6c2fb40..a8fee1d09c65 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -141,6 +141,9 @@ int devm_register_sys_off_handler(struct device *dev, int (*callback)(struct sys_off_data *data), void *cb_data); +int register_platform_power_off(void (*power_off)(void)); +void unregister_platform_power_off(void (*power_off)(void)); + /* * Architecture independent implemenations of sys_reboot commands. */ diff --git a/kernel/reboot.c b/kernel/reboot.c index fb3f49350927..673d3f54dcad 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -469,6 +469,61 @@ int devm_register_sys_off_handler(struct device *dev, } EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); +static struct sys_off_handler *platform_power_off_handler; + +static int platform_power_off_notify(struct sys_off_data *data) +{ + void (*platform_power_power_off_cb)(void) = data->cb_data; + + platform_power_power_off_cb(); + + return NOTIFY_DONE; +} + +/** + * register_platform_power_off - Register platform-level power-off callback + * @power_off: Power-off callback + * + * Registers power-off callback that will be called as last step + * of the power-off sequence. This callback is expected to be invoked + * for the last resort. Only one platform power-off callback is allowed + * to be registered at a time. + * + * Returns zero on success, or error code on failure. + */ +int register_platform_power_off(void (*power_off)(void)) +{ + struct sys_off_handler *handler; + + handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_PLATFORM, + platform_power_off_notify, + power_off); + if (IS_ERR(handler)) + return PTR_ERR(handler); + + platform_power_off_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(register_platform_power_off); + +/** + * unregister_platform_power_off - Unregister platform-level power-off callback + * @power_off: Power-off callback + * + * Unregisters previously registered platform power-off callback. + */ +void unregister_platform_power_off(void (*power_off)(void)) +{ + if (platform_power_off_handler && + platform_power_off_handler->cb_data == power_off) { + unregister_sys_off_handler(platform_power_off_handler); + platform_power_off_handler = NULL; + } +} +EXPORT_SYMBOL_GPL(unregister_platform_power_off); + static int legacy_pm_power_off_prepare(struct sys_off_data *data) { if (pm_power_off_prepare) -- cgit v1.2.3 From 5b71808eb7c97815fc3b9c386ca0ef6daf2dc053 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:32 +0300 Subject: reboot: Remove pm_power_off_prepare() All pm_power_off_prepare() users were converted to sys-off handler API. Remove the obsolete global callback variable. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 1 - kernel/reboot.c | 19 ------------------- 2 files changed, 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm.h b/include/linux/pm.h index e65b3ab28377..3786d306dad2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -21,7 +21,6 @@ * Callbacks for platform drivers to implement. */ extern void (*pm_power_off)(void); -extern void (*pm_power_off_prepare)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP diff --git a/kernel/reboot.c b/kernel/reboot.c index 673d3f54dcad..88c835167d9e 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -63,13 +63,6 @@ struct sys_off_handler { */ void __weak (*pm_power_off)(void); -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); -EXPORT_SYMBOL_GPL(pm_power_off_prepare); - /** * emergency_restart - reboot the system * @@ -524,14 +517,6 @@ void unregister_platform_power_off(void (*power_off)(void)) } EXPORT_SYMBOL_GPL(unregister_platform_power_off); -static int legacy_pm_power_off_prepare(struct sys_off_data *data) -{ - if (pm_power_off_prepare) - pm_power_off_prepare(); - - return NOTIFY_DONE; -} - static int legacy_pm_power_off(struct sys_off_data *data) { if (pm_power_off) @@ -549,10 +534,6 @@ static int legacy_pm_power_off(struct sys_off_data *data) */ static int __init legacy_pm_init(void) { - register_sys_off_handler(SYS_OFF_MODE_POWER_OFF_PREPARE, - SYS_OFF_PRIO_DEFAULT, - legacy_pm_power_off_prepare, NULL); - register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_DEFAULT, legacy_pm_power_off, NULL); -- cgit v1.2.3 From d2c5415327171e6c1bca2dc4d8a77f450ecf7150 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:34 +0300 Subject: kernel/reboot: Add devm_register_power_off_handler() Add devm_register_power_off_handler() helper that registers sys-off handler using power-off mode and with a default priority. Most drivers will want to register power-off handler with a default priority, so this helper will reduce the boilerplate code and make code easier to read and follow. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 4 ++++ kernel/reboot.c | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a8fee1d09c65..3b063c5b6118 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -141,6 +141,10 @@ int devm_register_sys_off_handler(struct device *dev, int (*callback)(struct sys_off_data *data), void *cb_data); +int devm_register_power_off_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data); + int register_platform_power_off(void (*power_off)(void)); void unregister_platform_power_off(void (*power_off)(void)); diff --git a/kernel/reboot.c b/kernel/reboot.c index 88c835167d9e..4dbb6e29247a 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -462,6 +462,28 @@ int devm_register_sys_off_handler(struct device *dev, } EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); +/** + * devm_register_power_off_handler - Register power-off handler + * @dev: Device that registers callback + * @callback: Callback function + * @cb_data: Callback's argument + * + * Registers resource-managed sys-off handler with a default priority + * and using power-off mode. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_power_off_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + return devm_register_sys_off_handler(dev, + SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + callback, cb_data); +} +EXPORT_SYMBOL_GPL(devm_register_power_off_handler); + static struct sys_off_handler *platform_power_off_handler; static int platform_power_off_notify(struct sys_off_data *data) -- cgit v1.2.3 From 6779db970bd287bb35b28bd5dc256fd7aef19d1c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:35 +0300 Subject: kernel/reboot: Add devm_register_restart_handler() Add devm_register_restart_handler() helper that registers sys-off handler using restart mode and with a default priority. Most drivers will want to register restart handler with a default priority, so this helper will reduce the boilerplate code and make code easier to read and follow. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 4 ++++ kernel/reboot.c | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 3b063c5b6118..d29d37a2abb6 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -145,6 +145,10 @@ int devm_register_power_off_handler(struct device *dev, int (*callback)(struct sys_off_data *data), void *cb_data); +int devm_register_restart_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data); + int register_platform_power_off(void (*power_off)(void)); void unregister_platform_power_off(void (*power_off)(void)); diff --git a/kernel/reboot.c b/kernel/reboot.c index 4dbb6e29247a..bdb635842ec8 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -484,6 +484,28 @@ int devm_register_power_off_handler(struct device *dev, } EXPORT_SYMBOL_GPL(devm_register_power_off_handler); +/** + * devm_register_restart_handler - Register restart handler + * @dev: Device that registers callback + * @callback: Callback function + * @cb_data: Callback's argument + * + * Registers resource-managed sys-off handler with a default priority + * and using restart mode. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_restart_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + return devm_register_sys_off_handler(dev, + SYS_OFF_MODE_RESTART, + SYS_OFF_PRIO_DEFAULT, + callback, cb_data); +} +EXPORT_SYMBOL_GPL(devm_register_restart_handler); + static struct sys_off_handler *platform_power_off_handler; static int platform_power_off_notify(struct sys_off_data *data) -- cgit v1.2.3 From 4853f68d158ac59b05985a6af5b7da7ccdbc14c8 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Fri, 8 Apr 2022 18:09:09 +0800 Subject: kexec_file: Fix kexec_file.c build error for riscv platform When CONFIG_KEXEC_FILE is set for riscv platform, the compilation of kernel/kexec_file.c generate build error: kernel/kexec_file.c: In function 'crash_prepare_elf64_headers': ./arch/riscv/include/asm/page.h:110:71: error: request for member 'virt_addr' in something not a structure or union 110 | ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < kernel_map.virt_addr)) | ^ ./arch/riscv/include/asm/page.h:131:2: note: in expansion of macro 'is_linear_mapping' 131 | is_linear_mapping(_x) ? \ | ^~~~~~~~~~~~~~~~~ ./arch/riscv/include/asm/page.h:140:31: note: in expansion of macro '__va_to_pa_nodebug' 140 | #define __phys_addr_symbol(x) __va_to_pa_nodebug(x) | ^~~~~~~~~~~~~~~~~~ ./arch/riscv/include/asm/page.h:143:24: note: in expansion of macro '__phys_addr_symbol' 143 | #define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0)) | ^~~~~~~~~~~~~~~~~~ kernel/kexec_file.c:1327:36: note: in expansion of macro '__pa_symbol' 1327 | phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); This occurs is because the "kernel_map" referenced in macro is_linear_mapping() is suppose to be the one of struct kernel_mapping defined in arch/riscv/mm/init.c, but the 2nd argument of crash_prepare_elf64_header() has same symbol name, in expansion of macro is_linear_mapping in function crash_prepare_elf64_header(), "kernel_map" actually is the local variable. Signed-off-by: Liao Chang Link: https://lore.kernel.org/r/20220408100914.150110-2-lizhengyu3@huawei.com Signed-off-by: Palmer Dabbelt --- include/linux/kexec.h | 2 +- kernel/kexec_file.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 58d1b58a971e..ebb1bffbf068 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -227,7 +227,7 @@ struct crash_mem { extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend); -extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, +extern int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz); #endif /* CONFIG_KEXEC_FILE */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 8347fc158d2b..331a4f0f10f5 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1260,7 +1260,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, +int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { Elf64_Ehdr *ehdr; @@ -1324,7 +1324,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr++; /* Prepare PT_LOAD type program header for kernel text region */ - if (kernel_map) { + if (need_kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_vaddr = (unsigned long) _text; -- cgit v1.2.3 From d2081b2bf8195b8239c67fdd61518e077da7cbec Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: make khugepaged_enter() void function The most callers of khugepaged_enter() don't care about the return value. Only dup_mmap(), anonymous THP page fault and MADV_HUGEPAGE handle the error by returning -ENOMEM. Actually it is not harmful for them to ignore the error case either. It also sounds overkilling to fail fork() and page fault early due to khugepaged_enter() error, and MADV_HUGEPAGE does set VM_HUGEPAGE flag regardless of the error. Link: https://lkml.kernel.org/r/20220510203222.24246-6-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 30 ++++++++++++------------------ kernel/fork.c | 4 +--- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 18 +++++++----------- 4 files changed, 22 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 2fcc01891b47..0423d3619f26 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -12,10 +12,10 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags); +extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); @@ -40,11 +40,10 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm, (transparent_hugepage_flags & \ (1<flags)) - return __khugepaged_enter(mm); - return 0; + __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) @@ -53,7 +52,7 @@ static inline void khugepaged_exit(struct mm_struct *mm) __khugepaged_exit(mm); } -static inline int khugepaged_enter(struct vm_area_struct *vma, +static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) @@ -62,27 +61,22 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && !(vm_flags & VM_NOHUGEPAGE) && !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - if (__khugepaged_enter(vma->vm_mm)) - return -ENOMEM; - return 0; + __khugepaged_enter(vma->vm_mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - return 0; } static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline int khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { - return 0; } -static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { - return 0; } static inline void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) diff --git a/kernel/fork.c b/kernel/fork.c index 9796897560ab..0d13baf86650 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -612,9 +612,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = ksm_fork(mm, oldmm); if (retval) goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; + khugepaged_fork(mm, oldmm); prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a00bdf0f1e8..efd06af61fca 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -726,8 +726,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma, vma->vm_flags))) - return VM_FAULT_OOM; + khugepaged_enter(vma, vma->vm_flags); + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1f4822dcc970..b43b294ca5f8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -365,8 +365,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (khugepaged_enter_vma_merge(vma, *vm_flags)) - return -ENOMEM; + khugepaged_enter_vma_merge(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; @@ -475,20 +474,20 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, return true; } -int __khugepaged_enter(struct mm_struct *mm) +void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; int wakeup; mm_slot = alloc_mm_slot(); if (!mm_slot) - return -ENOMEM; + return; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); - return 0; + return; } spin_lock(&khugepaged_mm_lock); @@ -504,11 +503,9 @@ int __khugepaged_enter(struct mm_struct *mm) mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); - - return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma, +void khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { unsigned long hstart, hend; @@ -519,13 +516,12 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * file-private shmem THP is not supported. */ if (!hugepage_vma_check(vma, vm_flags)) - return 0; + return; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma, vm_flags); - return 0; + khugepaged_enter(vma, vm_flags); } void __khugepaged_exit(struct mm_struct *mm) -- cgit v1.2.3 From 279b192c23d2fc9cb9e8c3851d6cad968f51789f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Mar 2022 16:11:23 -0500 Subject: blob_to_mnt(): kern_unmount() is needed to undo kern_mount() plain mntput() won't do. Signed-off-by: Al Viro --- kernel/usermode_driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 9dae1f648713..8303f4c7ca71 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -28,7 +28,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { - mntput(mnt); + kern_unmount(mnt); return ERR_CAST(file); } @@ -38,7 +38,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (err >= 0) err = -ENOMEM; filp_close(file, NULL); - mntput(mnt); + kern_unmount(mnt); return ERR_PTR(err); } -- cgit v1.2.3 From b154a017c92011d8f71ce804583e5f9c3d90bb9a Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Wed, 18 May 2022 09:36:47 +0800 Subject: cgroup: remove the superfluous judgment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the superfluous judgment since the function is never called for a root cgroup, as suggested by Tejun. Suggested-by: Tejun Heo Signed-off-by: Shida Zhang Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a97fd051430b..1779ccddb734 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5685,7 +5685,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) + if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); -- cgit v1.2.3 From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 May 2022 16:30:10 -0700 Subject: bpf: Add bpf_skc_to_mptcp_sock_proto This patch implements a new struct bpf_func_proto, named bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP, and a new helper bpf_skc_to_mptcp_sock(), which invokes another new helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct mptcp_sock from a given subflow socket. v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c, remove build check for CONFIG_BPF_JIT v5: Drop EXPORT_SYMBOL (Martin) Co-developed-by: Nicolas Rybowski Co-developed-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com --- include/linux/bpf.h | 1 + include/linux/btf_ids.h | 3 ++- include/net/mptcp.h | 6 ++++++ include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 1 + kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 18 ++++++++++++++++++ net/mptcp/Makefile | 2 ++ net/mptcp/bpf.c | 21 +++++++++++++++++++++ scripts/bpf_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 7 +++++++ 11 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 net/mptcp/bpf.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c107392b0ba7..a3ef078401cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2231,6 +2231,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bc5d9cc34e4c..335a19092368 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -178,7 +178,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) enum { #define BTF_SOCK_TYPE(name, str) name, diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8b1afd6f5cc4..2ba09de955c7 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -284,4 +284,10 @@ static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif +#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); +#else +static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } +#endif + #endif /* __NET_MPTCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0210f85131b3..56688bee20d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5172,6 +5172,12 @@ union bpf_attr { * Return * Map value associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5370,6 +5376,7 @@ union bpf_attr { FN(ima_file_hash), \ FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ + FN(skc_to_mptcp_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b59581026f8..14e8c17d3d8d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -509,6 +509,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7141ca8a1c2d..10b157a6d73e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1705,6 +1705,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; + case BPF_FUNC_skc_to_mptcp_sock: + return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: diff --git a/net/core/filter.c b/net/core/filter.c index fe0da529d00f..5af58eb48587 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -78,6 +78,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -11281,6 +11282,20 @@ const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); @@ -11323,6 +11338,9 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e54daceac58b..99dddf08ca73 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -10,3 +10,5 @@ obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o + +obj-$(CONFIG_BPF_SYSCALL) += bpf.o diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c new file mode 100644 index 000000000000..5a0a84ad94af --- /dev/null +++ b/net/mptcp/bpf.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Tessares SA. + * Copyright (c) 2022, SUSE. + * + * Author: Nicolas Rybowski + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include +#include "protocol.h" + +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + return mptcp_sk(mptcp_subflow_ctx(sk)->conn); + + return NULL; +} diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 096625242475..d5452f7eb996 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -633,6 +633,7 @@ class PrinterHelpers(Printer): 'struct socket', 'struct file', 'struct bpf_timer', + 'struct mptcp_sock', ] known_types = { '...', @@ -682,6 +683,7 @@ class PrinterHelpers(Printer): 'struct socket', 'struct file', 'struct bpf_timer', + 'struct mptcp_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0210f85131b3..56688bee20d9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5172,6 +5172,12 @@ union bpf_attr { * Return * Map value associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5370,6 +5376,7 @@ union bpf_attr { FN(ima_file_hash), \ FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ + FN(skc_to_mptcp_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 979497674e63666a99fd7d242dba53a5ca5d628b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 18 May 2022 22:59:08 +0200 Subject: bpf: Allow kfunc in tracing and syscall programs. Tracing and syscall BPF program types are very convenient to add BPF capabilities to subsystem otherwise not BPF capable. When we add kfuncs capabilities to those program types, we can add BPF features to subsystems without having to touch BPF core. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220518205924.399291-2-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2f0b0440131c..7bccaa4646e5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -202,6 +202,8 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_TRACING, + BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_MAX, }; @@ -7110,6 +7112,10 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_TC; case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; + case BPF_PROG_TYPE_TRACING: + return BTF_KFUNC_HOOK_TRACING; + case BPF_PROG_TYPE_SYSCALL: + return BTF_KFUNC_HOOK_SYSCALL; default: return BTF_KFUNC_HOOK_MAX; } -- cgit v1.2.3 From c8644cd0efe719608ddcb341bcf087d4bc0bf6b8 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 19 May 2022 15:25:33 +0100 Subject: bpf: refine kernel.unprivileged_bpf_disabled behaviour With unprivileged BPF disabled, all cmds associated with the BPF syscall are blocked to users without CAP_BPF/CAP_SYS_ADMIN. However there are use cases where we may wish to allow interactions with BPF programs without being able to load and attach them. So for example, a process with required capabilities loads/attaches a BPF program, and a process with less capabilities interacts with it; retrieving perf/ring buffer events, modifying map-specified config etc. With all BPF syscall commands blocked as a result of unprivileged BPF being disabled, this mode of interaction becomes impossible for processes without CAP_BPF. As Alexei notes "The bpf ACL model is the same as traditional file's ACL. The creds and ACLs are checked at open(). Then during file's write/read additional checks might be performed. BPF has such functionality already. Different map_creates have capability checks while map_lookup has: map_get_sys_perms(map, f) & FMODE_CAN_READ. In other words it's enough to gate FD-receiving parts of bpf with unprivileged_bpf_disabled sysctl. The rest is handled by availability of FD and access to files in bpffs." So key fd creation syscall commands BPF_PROG_LOAD and BPF_MAP_CREATE are blocked with unprivileged BPF disabled and no CAP_BPF. And as Alexei notes, map creation with unprivileged BPF disabled off blocks creation of maps aside from array, hash and ringbuf maps. Programs responsible for loading and attaching the BPF program can still control access to its pinned representation by restricting permissions on the pin path, as with normal files. Signed-off-by: Alan Maguire Acked-by: Yonghong Song Acked-by: Shung-Hsi Yu Acked-by: KP Singh Link: https://lore.kernel.org/r/1652970334-30510-2-git-send-email-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 72e53489165d..2b69306d3c6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4863,9 +4863,21 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; + bool capable; int err; - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; + + /* Intent here is for unprivileged_bpf_disabled to block key object + * creation commands for unprivileged users; other actions depend + * of fd availability and access to bpffs, so are dependent on + * object creation success. Capabilities are later verified for + * operations such as load and map create, so even with unprivileged + * BPF disabled, capability checks are still carried out for these + * and other operations. + */ + if (!capable && + (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); -- cgit v1.2.3 From 4a37f3dd9a83186cb88d44808ab35b78375082c9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 20 May 2022 18:10:13 +0100 Subject: dma-direct: don't over-decrypt memory The original x86 sev_alloc() only called set_memory_decrypted() on memory returned by alloc_pages_node(), so the page order calculation fell out of that logic. However, the common dma-direct code has several potential allocators, not all of which are guaranteed to round up the underlying allocation to a power-of-two size, so carrying over that calculation for the encryption/decryption size was a mistake. Fix it by rounding to a *number* of pages, rather than an order. Until recently there was an even worse interaction with DMA_DIRECT_REMAP where we could have ended up decrypting part of the next adjacent vmalloc area, only averted by no architecture actually supporting both configs at once. Don't ask how I found that one out... Fixes: c10f07aa27da ("dma/direct: Handle force decryption for DMA coherent buffers in common code") Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig Acked-by: David Rientjes --- kernel/dma/direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3e7f4aab740e..e978f36e6be8 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -79,7 +79,7 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) { if (!force_dma_unencrypted(dev)) return 0; - return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size)); + return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size)); } static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) @@ -88,7 +88,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) if (!force_dma_unencrypted(dev)) return 0; - ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size)); if (ret) pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); return ret; -- cgit v1.2.3 From d88bb5eed04ce50cc20e7f9282977841728be798 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:51 -0700 Subject: bpf: Fill new bpf_prog_pack with illegal instructions bpf_prog_pack enables sharing huge pages among multiple BPF programs. These pages are marked as executable before the JIT engine fill it with BPF programs. To make these pages safe, fill the hole bpf_prog_pack with illegal instructions before making it executable. Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]") Reported-by: Linus Torvalds Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220520235758.1858153-2-song@kernel.org --- kernel/bpf/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9cc91f0f3115..2d0c9d4696ad 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void) return size; } -static struct bpf_prog_pack *alloc_new_pack(void) +static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; @@ -886,6 +886,7 @@ static struct bpf_prog_pack *alloc_new_pack(void) kfree(pack); return NULL; } + bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size); bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); @@ -895,7 +896,7 @@ static struct bpf_prog_pack *alloc_new_pack(void) return pack; } -static void *bpf_prog_pack_alloc(u32 size) +static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -910,6 +911,7 @@ static void *bpf_prog_pack_alloc(u32 size) size = round_up(size, PAGE_SIZE); ptr = module_alloc(size); if (ptr) { + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); set_memory_x((unsigned long)ptr, size / PAGE_SIZE); @@ -923,7 +925,7 @@ static void *bpf_prog_pack_alloc(u32 size) goto found_free_area; } - pack = alloc_new_pack(); + pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; @@ -1102,7 +1104,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(size)) return NULL; - ro_header = bpf_prog_pack_alloc(size); + ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; -- cgit v1.2.3 From fe736565efb775620dbcf3c459c1cd80d3e868da Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:53 -0700 Subject: bpf: Introduce bpf_arch_text_invalidate for bpf_prog_pack Introduce bpf_arch_text_invalidate and use it to fill unused part of the bpf_prog_pack with illegal instructions when a BPF program is freed. Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]") Reported-by: Linus Torvalds Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220520235758.1858153-4-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 5 +++++ include/linux/bpf.h | 1 + kernel/bpf/core.c | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a2b6d197c226..f298b18a9a3d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -228,6 +228,11 @@ static void jit_fill_hole(void *area, unsigned int size) memset(area, 0xcc, size); } +int bpf_arch_text_invalidate(void *dst, size_t len) +{ + return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len)); +} + struct jit_context { int cleanup_addr; /* Epilogue code offset */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc4d5e394031..a9b1875212f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2365,6 +2365,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); void *bpf_arch_text_copy(void *dst, void *src, size_t len); +int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2d0c9d4696ad..cacd8684c3c4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -968,6 +968,9 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); + bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, bpf_prog_chunk_count(), 0) == 0) { @@ -2740,6 +2743,11 @@ void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) return ERR_PTR(-ENOTSUPP); } +int __weak bpf_arch_text_invalidate(void *dst, size_t len) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit v1.2.3 From 1ec5ee8c8a5a65ea377f8bea64bf4d5b743f6f79 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 May 2022 18:56:20 +0530 Subject: bpf: Suppress 'passing zero to PTR_ERR' warning Kernel Test Robot complains about passing zero to PTR_ERR for the said line, suppress it by using PTR_ERR_OR_ZERO. Fixes: c0a5a21c25f3 ("bpf: Allow storing referenced kptr in map") Reported-by: kernel test robot Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220521132620.1976921-1-memxor@gmail.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14e8c17d3d8d..45153cbc2bd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5352,7 +5352,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (!map_value_has_kptrs(map_ptr)) { - ret = PTR_ERR(map_ptr->kptr_off_tab); + ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); if (ret == -E2BIG) verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, BPF_MAP_VALUE_OFF_MAX); -- cgit v1.2.3 From 97e03f521050c092919591e668107b3d69c5f426 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:07 -0700 Subject: bpf: Add verifier support for dynptrs This patch adds the bulk of the verifier work for supporting dynamic pointers (dynptrs) in bpf. A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure defined internally as: struct bpf_dynptr_kern { void *data; u32 size; u32 offset; } __aligned(8); The upper 8 bits of *size* is reserved (it contains extra metadata about read-only status and dynptr type). Consequently, a dynptr only supports memory less than 16 MB. There are different types of dynptrs (eg malloc, ringbuf, ...). In this patchset, the most basic one, dynptrs to a bpf program's local memory, is added. For now only local memory that is of reg type PTR_TO_MAP_VALUE is supported. In the verifier, dynptr state information will be tracked in stack slots. When the program passes in an uninitialized dynptr (ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding to the frame pointer where the dynptr resides at are marked STACK_DYNPTR. For helper functions that take in initialized dynptrs (eg bpf_dynptr_read + bpf_dynptr_write which are added later in this patchset), the verifier enforces that the dynptr has been initialized properly by checking that their corresponding stack slots have been marked as STACK_DYNPTR. The 6th patch in this patchset adds test cases that the verifier should successfully reject, such as for example attempting to use a dynptr after doing a direct write into it inside the bpf program. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-2-joannelkoong@gmail.com --- include/linux/bpf.h | 28 ++++++ include/linux/bpf_verifier.h | 18 ++++ include/uapi/linux/bpf.h | 5 ++ kernel/bpf/verifier.c | 188 ++++++++++++++++++++++++++++++++++++++++- scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 5 ++ 6 files changed, 243 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b1875212f6..b26c8176b9e0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -392,10 +392,15 @@ enum bpf_type_flag { MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to memory local to the bpf program. */ + DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; +#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL + /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -438,6 +443,7 @@ enum bpf_arg_type { ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -2376,4 +2382,26 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 **bin_buf, u32 num_args); void bpf_bprintf_cleanup(void); +/* the implementation of the opaque uapi struct bpf_dynptr */ +struct bpf_dynptr_kern { + void *data; + /* Size represents the number of usable bytes of dynptr data. + * If for example the offset is at 4 for a local dynptr whose data is + * of type u64, the number of usable bytes is 4. + * + * The upper 8 bits are reserved. It is as follows: + * Bits 0 - 23 = size + * Bits 24 - 30 = dynptr type + * Bit 31 = whether dynptr is read-only + */ + u32 size; + u32 offset; +} __aligned(8); + +enum bpf_dynptr_type { + BPF_DYNPTR_TYPE_INVALID, + /* Points to memory that is local to the bpf program */ + BPF_DYNPTR_TYPE_LOCAL, +}; + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1f1e7f2ea967..af5b2135215e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -72,6 +72,18 @@ struct bpf_reg_state { u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* For dynptr stack slots */ + struct { + enum bpf_dynptr_type type; + /* A dynptr is 16 bytes so it takes up 2 stack slots. + * We need to track which slot is the first slot + * to protect against cases where the user may try to + * pass in an address starting at the second slot of the + * dynptr. + */ + bool first_slot; + } dynptr; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -174,9 +186,15 @@ enum bpf_stack_slot_type { STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ + /* A dynptr is stored in this stack slot. The type of dynptr + * is stored in bpf_stack_state->spilled_ptr.dynptr.type + */ + STACK_DYNPTR, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +#define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) +#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 56688bee20d9..610944cb3389 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6528,6 +6528,11 @@ struct bpf_timer { __u64 :64; } __attribute__((aligned(8))); +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45153cbc2bd6..1fd0b81c3fec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -259,6 +259,7 @@ struct bpf_call_arg_meta { u32 ret_btf_id; u32 subprogno; struct bpf_map_value_off_desc *kptr_off_desc; + u8 uninit_dynptr_regno; }; struct btf *btf_vmlinux; @@ -581,6 +582,7 @@ static char slot_type_char[] = { [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', }; static void print_liveness(struct bpf_verifier_env *env, @@ -596,6 +598,25 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } +static int get_spi(s32 off) +{ + return (-off - 1) / BPF_REG_SIZE; +} + +static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) +{ + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; +} + static struct bpf_func_state *func(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { @@ -647,6 +668,108 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) +{ + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + return BPF_DYNPTR_TYPE_LOCAL; + default: + return BPF_DYNPTR_TYPE_INVALID; + } +} + +static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type, int insn_idx) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type type; + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_DYNPTR; + state->stack[spi - 1].slot_type[i] = STACK_DYNPTR; + } + + type = arg_to_dynptr_type(arg_type); + if (type == BPF_DYNPTR_TYPE_INVALID) + return -EINVAL; + + state->stack[spi].spilled_ptr.dynptr.first_slot = true; + state->stack[spi].spilled_ptr.dynptr.type = type; + state->stack[spi - 1].spilled_ptr.dynptr.type = type; + + return 0; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; + } + + state->stack[spi].spilled_ptr.dynptr.first_slot = false; + state->stack[spi].spilled_ptr.dynptr.type = 0; + state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return true; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) + return false; + } + + return true; +} + +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.dynptr.first_slot) + return false; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] != STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) + return false; + } + + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ + if (arg_type == ARG_PTR_TO_DYNPTR) + return true; + + return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -5400,6 +5523,11 @@ static bool arg_type_is_release(enum bpf_arg_type type) return type & OBJ_RELEASE; } +static bool arg_type_is_dynptr(enum bpf_arg_type type) +{ + return base_type(type) == ARG_PTR_TO_DYNPTR; +} + static int int_ptr_type_to_size(enum bpf_arg_type type) { if (type == ARG_PTR_TO_INT) @@ -5539,6 +5667,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -5628,8 +5757,13 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, bool fixed_off_ok = false; switch ((u32)type) { - case SCALAR_VALUE: /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_STACK: + if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset\n"); + return -EINVAL; + } + fallthrough; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: @@ -5639,7 +5773,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: + case SCALAR_VALUE: /* Some of the argument types nevertheless require a * zero register offset. */ @@ -5837,6 +5971,36 @@ skip_type_check: bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); + } else if (arg_type_is_dynptr(arg_type)) { + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. + */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local "; + break; + default: + break; + } + verbose(env, "Expected an initialized %sdynptr as arg #%d\n", + err_extra, arg + 1); + return -EINVAL; + } } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", @@ -6970,9 +7134,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + if (meta.uninit_dynptr_regno) { + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], + fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], + insn_idx); + if (err) + return err; + } + if (meta.release_regno) { err = -EINVAL; - if (meta.ref_obj_id) + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + else if (meta.ref_obj_id) err = release_reference(env, meta.ref_obj_id); /* meta.ref_obj_id can only be 0 if register that is meant to be * released is NULL, which must be > R0. diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index d5452f7eb996..855b937e7585 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -634,6 +634,7 @@ class PrinterHelpers(Printer): 'struct file', 'struct bpf_timer', 'struct mptcp_sock', + 'struct bpf_dynptr', ] known_types = { '...', @@ -684,6 +685,7 @@ class PrinterHelpers(Printer): 'struct file', 'struct bpf_timer', 'struct mptcp_sock', + 'struct bpf_dynptr', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 56688bee20d9..610944cb3389 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6528,6 +6528,11 @@ struct bpf_timer { __u64 :64; } __attribute__((aligned(8))); +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3 From 263ae152e96253f40c2c276faad8629e096b3bad Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:08 -0700 Subject: bpf: Add bpf_dynptr_from_mem for local dynptrs This patch adds a new api bpf_dynptr_from_mem: long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr); which initializes a dynptr to point to a bpf program's local memory. For now only local memory that is of reg type PTR_TO_MAP_VALUE is supported. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220523210712.3641569-3-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 12 ++++++++ kernel/bpf/helpers.c | 65 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 ++++ tools/include/uapi/linux/bpf.h | 12 ++++++++ 4 files changed, 95 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 610944cb3389..9be3644457dd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5178,6 +5178,17 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5377,6 +5388,7 @@ union bpf_attr { FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ + FN(dynptr_from_mem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bad96131a510..d3e935c2e25e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1412,6 +1412,69 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = { .arg2_btf_id = BPF_PTR_POISON, }; +/* Since the upper 8 bits of dynptr->size is reserved, the + * maximum supported size is 2^24 - 1. + */ +#define DYNPTR_MAX_SIZE ((1UL << 24) - 1) +#define DYNPTR_TYPE_SHIFT 28 + +static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) +{ + ptr->size |= type << DYNPTR_TYPE_SHIFT; +} + +static int bpf_dynptr_check_size(u32 size) +{ + return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; +} + +static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ + ptr->data = data; + ptr->offset = offset; + ptr->size = size; + bpf_dynptr_set_type(ptr, type); +} + +static void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ + memset(ptr, 0, sizeof(*ptr)); +} + +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +{ + int err; + + err = bpf_dynptr_check_size(size); + if (err) + goto error; + + /* flags is currently unsupported */ + if (flags) { + err = -EINVAL; + goto error; + } + + bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); + + return 0; + +error: + bpf_dynptr_set_null(ptr); + return err; +} + +const struct bpf_func_proto bpf_dynptr_from_mem_proto = { + .func = bpf_dynptr_from_mem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1466,6 +1529,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1fd0b81c3fec..b657d46f886e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7204,6 +7204,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; + case BPF_FUNC_dynptr_from_mem: + if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { + verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n", + reg_type_str(env, regs[BPF_REG_1].type)); + return -EACCES; + } } if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 610944cb3389..9be3644457dd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5178,6 +5178,17 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5377,6 +5388,7 @@ union bpf_attr { FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ + FN(dynptr_from_mem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From bc34dee65a65e9c920c420005b8a43f2a721a458 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:09 -0700 Subject: bpf: Dynptr support for ring buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, our only way of writing dynamically-sized data into a ring buffer is through bpf_ringbuf_output but this incurs an extra memcpy cost. bpf_ringbuf_reserve + bpf_ringbuf_commit avoids this extra memcpy, but it can only safely support reservation sizes that are statically known since the verifier cannot guarantee that the bpf program won’t access memory outside the reserved space. The bpf_dynptr abstraction allows for dynamically-sized ring buffer reservations without the extra memcpy. There are 3 new APIs: long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr); void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags); void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags); These closely follow the functionalities of the original ringbuf APIs. For example, all ringbuffer dynptrs that have been reserved must be either submitted or discarded before the program exits. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-4-joannelkoong@gmail.com --- include/linux/bpf.h | 15 +++++++- include/linux/bpf_verifier.h | 2 ++ include/uapi/linux/bpf.h | 35 +++++++++++++++++++ kernel/bpf/helpers.c | 14 +++++--- kernel/bpf/ringbuf.c | 78 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 35 +++++++++++++++++++ 7 files changed, 223 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b26c8176b9e0..c72321b6f306 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -395,11 +395,14 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to a ringbuf record. */ + DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -2231,6 +2234,9 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; @@ -2402,6 +2408,13 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, }; +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size); +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); +int bpf_dynptr_check_size(u32 size); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index af5b2135215e..e8439f6cbe57 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -100,6 +100,8 @@ struct bpf_reg_state { * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. + * For stack slots that are dynptrs, this is used to track references to + * the dynptr to determine proper reference freeing. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9be3644457dd..081a55540aa5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5189,6 +5189,38 @@ union bpf_attr { * Return * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5389,6 +5421,9 @@ union bpf_attr { FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ FN(dynptr_from_mem), \ + FN(ringbuf_reserve_dynptr), \ + FN(ringbuf_submit_dynptr), \ + FN(ringbuf_discard_dynptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d3e935c2e25e..abb08999ff56 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1423,13 +1423,13 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -static int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } -static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, - enum bpf_dynptr_type type, u32 offset, u32 size) +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; @@ -1437,7 +1437,7 @@ static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, bpf_dynptr_set_type(ptr, type); } -static void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } @@ -1523,6 +1523,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 311264ab80c4..ded4faeca192 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -475,3 +475,81 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, + struct bpf_dynptr_kern *, ptr) +{ + struct bpf_ringbuf_map *rb_map; + void *sample; + int err; + + if (unlikely(flags)) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + err = bpf_dynptr_check_size(size); + if (err) { + bpf_dynptr_set_null(ptr); + return err; + } + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + sample = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!sample) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { + .func = bpf_ringbuf_reserve_dynptr, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, +}; + +BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, false /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { + .func = bpf_ringbuf_submit_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, true /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { + .func = bpf_ringbuf_discard_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b657d46f886e..8be140351966 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -187,6 +187,9 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); + static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; @@ -673,17 +676,24 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { case DYNPTR_TYPE_LOCAL: return BPF_DYNPTR_TYPE_LOCAL; + case DYNPTR_TYPE_RINGBUF: + return BPF_DYNPTR_TYPE_RINGBUF; default: return BPF_DYNPTR_TYPE_INVALID; } } +static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +{ + return type == BPF_DYNPTR_TYPE_RINGBUF; +} + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; - int spi, i; + int spi, i, id; spi = get_spi(reg->off); @@ -703,6 +713,16 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi].spilled_ptr.dynptr.type = type; state->stack[spi - 1].spilled_ptr.dynptr.type = type; + if (dynptr_type_refcounted(type)) { + /* The id is used to track proper releasing */ + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + state->stack[spi].spilled_ptr.id = id; + state->stack[spi - 1].spilled_ptr.id = id; + } + return 0; } @@ -721,6 +741,13 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re state->stack[spi - 1].slot_type[i] = STACK_INVALID; } + /* Invalidate any slices associated with this dynptr */ + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + release_reference(env, state->stack[spi].spilled_ptr.id); + state->stack[spi].spilled_ptr.id = 0; + state->stack[spi - 1].spilled_ptr.id = 0; + } + state->stack[spi].spilled_ptr.dynptr.first_slot = false; state->stack[spi].spilled_ptr.dynptr.type = 0; state->stack[spi - 1].spilled_ptr.dynptr.type = 0; @@ -5859,7 +5886,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, skip_type_check: if (arg_type_is_release(arg_type)) { - if (!reg->ref_obj_id && !register_is_null(reg)) { + if (arg_type_is_dynptr(arg_type)) { + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else if (!reg->ref_obj_id && !register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; @@ -5994,9 +6030,13 @@ skip_type_check: case DYNPTR_TYPE_LOCAL: err_extra = "local "; break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf "; + break; default: break; } + verbose(env, "Expected an initialized %sdynptr as arg #%d\n", err_extra, arg + 1); return -EINVAL; @@ -6122,7 +6162,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_query) + func_id != BPF_FUNC_ringbuf_query && + func_id != BPF_FUNC_ringbuf_reserve_dynptr && + func_id != BPF_FUNC_ringbuf_submit_dynptr && + func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -6238,6 +6281,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: + case BPF_FUNC_ringbuf_reserve_dynptr: + case BPF_FUNC_ringbuf_submit_dynptr: + case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9be3644457dd..081a55540aa5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5189,6 +5189,38 @@ union bpf_attr { * Return * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5389,6 +5421,9 @@ union bpf_attr { FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ FN(dynptr_from_mem), \ + FN(ringbuf_reserve_dynptr), \ + FN(ringbuf_submit_dynptr), \ + FN(ringbuf_discard_dynptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 13bbbfbea7598ea9f8d9c3d73bf053bb57f9c4b2 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:10 -0700 Subject: bpf: Add bpf_dynptr_read and bpf_dynptr_write This patch adds two helper functions, bpf_dynptr_read and bpf_dynptr_write: long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset); long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len); The dynptr passed into these functions must be valid dynptrs that have been initialized. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220523210712.3641569-5-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 19 ++++++++++ kernel/bpf/helpers.c | 78 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 19 ++++++++++ 3 files changed, 116 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 081a55540aa5..efe2505650e6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5221,6 +5221,23 @@ union bpf_attr { * 'bpf_ringbuf_discard'. * Return * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5424,6 +5441,8 @@ union bpf_attr { FN(ringbuf_reserve_dynptr), \ FN(ringbuf_submit_dynptr), \ FN(ringbuf_discard_dynptr), \ + FN(dynptr_read), \ + FN(dynptr_write), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index abb08999ff56..8cef3fb0d143 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1417,12 +1417,24 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = { */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 +#define DYNPTR_SIZE_MASK 0xFFFFFF +#define DYNPTR_RDONLY_BIT BIT(31) + +static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_RDONLY_BIT; +} static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } +static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_SIZE_MASK; +} + int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; @@ -1442,6 +1454,16 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } +static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = bpf_dynptr_get_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} + BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1475,6 +1497,58 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; +BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +{ + int err; + + if (!src->data) + return -EINVAL; + + err = bpf_dynptr_check_off_len(src, offset, len); + if (err) + return err; + + memcpy(dst, src->data + src->offset + offset, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_read_proto = { + .func = bpf_dynptr_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_DYNPTR, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +{ + int err; + + if (!dst->data || bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(dst, offset, len); + if (err) + return err; + + memcpy(dst->data + dst->offset + offset, src, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_write_proto = { + .func = bpf_dynptr_write, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1537,6 +1611,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_strncmp_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; default: break; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 081a55540aa5..efe2505650e6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5221,6 +5221,23 @@ union bpf_attr { * 'bpf_ringbuf_discard'. * Return * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5424,6 +5441,8 @@ union bpf_attr { FN(ringbuf_reserve_dynptr), \ FN(ringbuf_submit_dynptr), \ FN(ringbuf_discard_dynptr), \ + FN(dynptr_read), \ + FN(dynptr_write), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 34d4ef5775f776ec4b0d53a02d588bf3195cada6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:11 -0700 Subject: bpf: Add dynptr data slices This patch adds a new helper function void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len); which returns a pointer to the underlying data of a dynptr. *len* must be a statically known value. The bpf program may access the returned data slice as a normal buffer (eg can do direct reads and writes), since the verifier associates the length with the returned pointer, and enforces that no out of bounds accesses occur. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523210712.3641569-6-joannelkoong@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 12 ++++++++++++ kernel/bpf/helpers.c | 28 ++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 23 +++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 12 ++++++++++++ 5 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c72321b6f306..a7080c86fa76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index efe2505650e6..f4009dbdf62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5238,6 +5238,17 @@ union bpf_attr { * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr. + * + * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5443,6 +5454,7 @@ union bpf_attr { FN(ringbuf_discard_dynptr), \ FN(dynptr_read), \ FN(dynptr_write), \ + FN(dynptr_data), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8cef3fb0d143..225806a02efb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1549,6 +1549,32 @@ const struct bpf_func_proto bpf_dynptr_write_proto = { .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +{ + int err; + + if (!ptr->data) + return 0; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return 0; + + if (bpf_dynptr_is_rdonly(ptr)) + return 0; + + return (unsigned long)(ptr->data + ptr->offset + offset); +} + +const struct bpf_func_proto bpf_dynptr_data_proto = { + .func = bpf_dynptr_data, + .gpl_only = false, + .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1615,6 +1641,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8be140351966..aedac2ac02b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5832,6 +5832,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } +static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + return state->stack[spi].spilled_ptr.id; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -7384,6 +7392,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; + } else if (func_id == BPF_FUNC_dynptr_data) { + int dynptr_id = 0, i; + + /* Find the id of the dynptr we're acquiring a reference to */ + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (dynptr_id) { + verbose(env, "verifier internal error: multiple dynptr args in func\n"); + return -EFAULT; + } + dynptr_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + } + } + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = dynptr_id; } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index efe2505650e6..f4009dbdf62d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5238,6 +5238,17 @@ union bpf_attr { * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr. + * + * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5443,6 +5454,7 @@ union bpf_attr { FN(ringbuf_discard_dynptr), \ FN(dynptr_read), \ FN(dynptr_write), \ + FN(dynptr_data), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Mar 2022 08:26:48 +0200 Subject: livepatch: Remove klp_arch_set_pc() and asm/livepatch.h All three versions of klp_arch_set_pc() do exactly the same: they call ftrace_instruction_pointer_set(). Call ftrace_instruction_pointer_set() directly and remove klp_arch_set_pc(). As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h on x86 and s390, remove asm/livepatch.h livepatch.h remains on powerpc but its content is exclusively used by powerpc specific code. Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Petr Mladek --- MAINTAINERS | 2 -- arch/powerpc/include/asm/livepatch.h | 10 +--------- arch/powerpc/kernel/irq.c | 1 - arch/powerpc/kernel/setup_64.c | 2 +- arch/s390/include/asm/livepatch.h | 22 ---------------------- arch/x86/include/asm/livepatch.h | 20 -------------------- include/linux/livepatch.h | 2 -- kernel/livepatch/patch.c | 2 +- 8 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 arch/s390/include/asm/livepatch.h delete mode 100644 arch/x86/include/asm/livepatch.h (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index 741825cff43c..589c61412e3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11338,8 +11338,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: arch/s390/include/asm/livepatch.h -F: arch/x86/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: lib/livepatch/ diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 1c60094ea0cd..d044a1fd4f44 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -7,17 +7,9 @@ #ifndef _ASM_POWERPC_LIVEPATCH_H #define _ASM_POWERPC_LIVEPATCH_H -#include -#include +#include #include -#ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} -#endif /* CONFIG_LIVEPATCH */ - #ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) { diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 752fb182eacb..fb99e3fa034d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e547066a06aa..2c1e9a5ac6ca 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 5209f223331a..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h deleted file mode 100644 index 7c5cc6660e4b..000000000000 --- a/arch/x86/include/asm/livepatch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * livepatch.h - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings - * Copyright (C) 2014 SUSE - */ - -#ifndef _ASM_X86_LIVEPATCH_H -#define _ASM_X86_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 2614247a9781..293e29960c6e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -16,8 +16,6 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) -#include - /* task patch states */ #define KLP_UNDEFINED -1 #define KLP_UNPATCHED 0 diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index c172bf92b576..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); -- cgit v1.2.3 From 7b4537199a4a8480b8c3ba37a2d44765ce76cd9b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 May 2022 20:39:22 +0900 Subject: kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS include/{linux,asm-generic}/export.h defines a weak symbol, __crc_* as a placeholder. Genksyms writes the version CRCs into the linker script, which will be used for filling the __crc_* symbols. The linker script format depends on CONFIG_MODULE_REL_CRCS. If it is enabled, __crc_* holds the offset to the reference of CRC. It is time to get rid of this complexity. Now that modpost parses text files (.*.cmd) to collect all the CRCs, it can generate C code that will be linked to the vmlinux or modules. Generate a new C file, .vmlinux.export.c, which contains the CRCs of symbols exported by vmlinux. It is compiled and linked to vmlinux in scripts/link-vmlinux.sh. Put the CRCs of symbols exported by modules into the existing *.mod.c files. No additional build step is needed for modules. As before, *.mod.c are compiled and linked to *.ko in scripts/Makefile.modfinal. No linker magic is used here. The new C implementation works in the same way, whether CONFIG_RELOCATABLE is enabled or not. CONFIG_MODULE_REL_CRCS is no longer needed. Previously, Kbuild invoked additional $(LD) to update the CRCs in objects, but this step is unneeded too. Signed-off-by: Masahiro Yamada Tested-by: Nathan Chancellor Tested-by: Nicolas Schier Reviewed-by: Nicolas Schier Tested-by: Sedat Dilek # LLVM-14 (x86-64) --- arch/m68k/include/asm/Kbuild | 1 + arch/m68k/include/asm/export.h | 2 -- arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/um/Kconfig | 1 - include/asm-generic/export.h | 22 ++++++++-------------- include/linux/export-internal.h | 17 +++++++++++++++++ include/linux/export.h | 30 ++++++++---------------------- init/Kconfig | 4 ---- kernel/module.c | 10 +--------- scripts/Makefile.build | 27 ++++----------------------- scripts/Makefile.vmlinux | 32 ++++++++++++++++++++++++++++++++ scripts/genksyms/genksyms.c | 18 ++++-------------- scripts/link-vmlinux.sh | 10 +++++++++- scripts/mod/modpost.c | 28 ++++++++++++++++++++++++---- 15 files changed, 108 insertions(+), 96 deletions(-) delete mode 100644 arch/m68k/include/asm/export.h create mode 100644 include/linux/export-internal.h create mode 100644 scripts/Makefile.vmlinux (limited to 'kernel') diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 0dbf9c5c6fae..1b720299deb1 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generated-y += syscall_table.h +generic-y += export.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h diff --git a/arch/m68k/include/asm/export.h b/arch/m68k/include/asm/export.h deleted file mode 100644 index b53008b67ce1..000000000000 --- a/arch/m68k/include/asm/export.h +++ /dev/null @@ -1,2 +0,0 @@ -#define KCRC_ALIGN 2 -#include diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 174edabb74fa..a4e8dd889e29 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -566,7 +566,6 @@ config RELOCATABLE bool "Build a relocatable kernel" depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE)) select NONSTATIC_KERNEL - select MODULE_REL_CRCS if MODVERSIONS help This builds a kernel image that is capable of running at the location the kernel is loaded at. For ppc32, there is no any diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 77b5a03de13a..aa5848004c76 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -567,7 +567,6 @@ endchoice config RELOCATABLE bool "Build a relocatable kernel" - select MODULE_REL_CRCS if MODVERSIONS default y help This builds a kernel image that retains relocation information diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 4d398b80aea8..e8983d098e73 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -106,7 +106,6 @@ config LD_SCRIPT_DYN bool default y depends on !LD_SCRIPT_STATIC - select MODULE_REL_CRCS if MODVERSIONS config LD_SCRIPT_DYN_RPATH bool "set rpath in the binary" if EXPERT diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 07a36a874dca..5e4b1f2369d2 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -2,6 +2,14 @@ #ifndef __ASM_GENERIC_EXPORT_H #define __ASM_GENERIC_EXPORT_H +/* + * This comment block is used by fixdep. Please do not remove. + * + * When CONFIG_MODVERSIONS is changed from n to y, all source files having + * EXPORT_SYMBOL variants must be re-compiled because genksyms is run as a + * side effect of the *.o build rule. + */ + #ifndef KSYM_FUNC #define KSYM_FUNC(x) x #endif @@ -12,9 +20,6 @@ #else #define KSYM_ALIGN 4 #endif -#ifndef KCRC_ALIGN -#define KCRC_ALIGN 4 -#endif .macro __put, val, name #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -43,17 +48,6 @@ __ksymtab_\name: __kstrtab_\name: .asciz "\name" .previous -#ifdef CONFIG_MODVERSIONS - .section ___kcrctab\sec+\name,"a" - .balign KCRC_ALIGN -#if defined(CONFIG_MODULE_REL_CRCS) - .long __crc_\name - . -#else - .long __crc_\name -#endif - .weak __crc_\name - .previous -#endif #endif .endm diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h new file mode 100644 index 000000000000..c2b1d4fd5987 --- /dev/null +++ b/include/linux/export-internal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Please do not include this explicitly. + * This is used by C files generated by modpost. + */ + +#ifndef __LINUX_EXPORT_INTERNAL_H__ +#define __LINUX_EXPORT_INTERNAL_H__ + +#include +#include + +/* __used is needed to keep __crc_* for LTO */ +#define SYMBOL_CRC(sym, crc, sec) \ + u32 __section("___kcrctab" sec "+" #sym) __used __crc_##sym = crc + +#endif /* __LINUX_EXPORT_INTERNAL_H__ */ diff --git a/include/linux/export.h b/include/linux/export.h index 27d848712b90..565c5ffcb26f 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -11,6 +11,14 @@ * hackers place grumpy comments in header files. */ +/* + * This comment block is used by fixdep. Please do not remove. + * + * When CONFIG_MODVERSIONS is changed from n to y, all source files having + * EXPORT_SYMBOL variants must be re-compiled because genksyms is run as a + * side effect of the *.o build rule. + */ + #ifndef __ASSEMBLY__ #ifdef MODULE extern struct module __this_module; @@ -19,26 +27,6 @@ extern struct module __this_module; #define THIS_MODULE ((struct module *)0) #endif -#ifdef CONFIG_MODVERSIONS -/* Mark the CRC weak since genksyms apparently decides not to - * generate a checksums for some symbols */ -#if defined(CONFIG_MODULE_REL_CRCS) -#define __CRC_SYMBOL(sym, sec) \ - asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ - " .weak __crc_" #sym " \n" \ - " .long __crc_" #sym " - . \n" \ - " .previous \n") -#else -#define __CRC_SYMBOL(sym, sec) \ - asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ - " .weak __crc_" #sym " \n" \ - " .long __crc_" #sym " \n" \ - " .previous \n") -#endif -#else -#define __CRC_SYMBOL(sym, sec) -#endif - #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS #include /* @@ -85,7 +73,6 @@ struct kernel_symbol { /* * For every exported symbol, do the following: * - * - If applicable, place a CRC entry in the __kcrctab section. * - Put the name of the symbol and namespace (empty string "" for none) in * __ksymtab_strings. * - Place a struct kernel_symbol entry in the __ksymtab section. @@ -98,7 +85,6 @@ struct kernel_symbol { extern typeof(sym) sym; \ extern const char __kstrtab_##sym[]; \ extern const char __kstrtabns_##sym[]; \ - __CRC_SYMBOL(sym, sec); \ asm(" .section \"__ksymtab_strings\",\"aMS\",%progbits,1 \n" \ "__kstrtab_" #sym ": \n" \ " .asciz \"" #sym "\" \n" \ diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9..f5b14318dfcb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2136,10 +2136,6 @@ config ASM_MODVERSIONS assembly. This can be enabled only when the target architecture supports it. -config MODULE_REL_CRCS - bool - depends on MODVERSIONS - config MODULE_SRCVERSION_ALL bool "Source checksum for all modules" help diff --git a/kernel/module.c b/kernel/module.c index 6cea788fd965..c9e2342da28e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1231,11 +1231,6 @@ static int try_to_force_load(struct module *mod, const char *reason) #ifdef CONFIG_MODVERSIONS -static u32 resolve_rel_crc(const s32 *crc) -{ - return *(u32 *)((void *)crc + *crc); -} - static int check_version(const struct load_info *info, const char *symname, struct module *mod, @@ -1264,10 +1259,7 @@ static int check_version(const struct load_info *info, if (strcmp(versions[i].name, symname) != 0) continue; - if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) - crcval = resolve_rel_crc(crc); - else - crcval = *crc; + crcval = *crc; if (versions[i].crc == crcval) return 1; pr_debug("Found checksum %X vs module %lX\n", diff --git a/scripts/Makefile.build b/scripts/Makefile.build index a1023868775f..ddd9080fc028 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -128,7 +128,6 @@ $(obj)/%.i: $(src)/%.c FORCE genksyms = scripts/genksyms/genksyms \ $(if $(1), -T $(2)) \ - $(if $(CONFIG_MODULE_REL_CRCS), -R) \ $(if $(KBUILD_PRESERVE), -p) \ -r $(or $(wildcard $(2:.symtypes=.symref)), /dev/null) @@ -162,19 +161,11 @@ ifdef CONFIG_MODVERSIONS # o if .o doesn't contain a __ksymtab version, i.e. does # not export symbols, it's done. # o otherwise, we calculate symbol versions using the good old -# genksyms on the preprocessed source and postprocess them in a way -# that they are usable as a linker script -# o generate .tmp_.o from .o using the linker to -# replace the unresolved symbols __crc_exported_symbol with -# the actual value of the checksum generated by genksyms -# o remove .tmp_.o to .o +# genksyms on the preprocessed source and dump them into the .cmd file. +# o modpost will extract versions from that file and create *.c files that will +# be compiled and linked to the kernel and/or modules. -# Generate .o.symversions files for each .o with exported symbols, and link these -# to the kernel and/or modules at the end. - -genksyms_format_rel_crc := [^_]*__crc_\([^ ]*\) = \.; LONG(\([^)]*\)).* -genksyms_format_normal := __crc_\(.*\) = \(.*\); -genksyms_format := $(if $(CONFIG_MODULE_REL_CRCS),$(genksyms_format_rel_crc),$(genksyms_format_normal)) +genksyms_format := __crc_\(.*\) = \(.*\); gen_symversions = \ if $(NM) $@ 2>/dev/null | grep -q __ksymtab; then \ @@ -188,12 +179,6 @@ gen_symversions = \ cmd_gen_symversions_c = $(call gen_symversions,c) -cmd_modversions = \ - if [ -r $@.symversions ]; then \ - $(LD) $(KBUILD_LDFLAGS) -r -o $(@D)/.tmp_$(@F) $@ \ - -T $@.symversions; \ - mv -f $(@D)/.tmp_$(@F) $@; \ - fi endif ifdef CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT @@ -273,7 +258,6 @@ define rule_cc_o_c $(call cmd,checkdoc) $(call cmd,gen_objtooldep) $(call cmd,gen_symversions_c) - $(if $(CONFIG_LTO_CLANG),,$(call cmd,modversions)) $(call cmd,record_mcount) endef @@ -282,7 +266,6 @@ define rule_as_o_S $(call cmd,gen_ksymdeps) $(call cmd,gen_objtooldep) $(call cmd,gen_symversions_S) - $(call cmd,modversions) endef # Built-in and composite module parts @@ -296,8 +279,6 @@ ifneq ($(CONFIG_LTO_CLANG)$(CONFIG_X86_KERNEL_IBT),) quiet_cmd_cc_prelink_modules = LD [M] $@ cmd_cc_prelink_modules = \ $(LD) $(ld_flags) -r -o $@ \ - $(shell [ -s $(@:.prelink.o=.o.symversions) ] && \ - echo -T $(@:.prelink.o=.o.symversions)) \ --whole-archive $(filter-out FORCE,$^) \ $(cmd_objtool) diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux new file mode 100644 index 000000000000..7a63abf22399 --- /dev/null +++ b/scripts/Makefile.vmlinux @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0-only + +include include/config/auto.conf +include $(srctree)/scripts/Kbuild.include + +# for c_flags +include $(srctree)/scripts/Makefile.lib + +quiet_cmd_cc_o_c = CC $@ + cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< + +%.o: %.c FORCE + $(call if_changed_dep,cc_o_c) + +targets := $(MAKECMDGOALS) + +# Add FORCE to the prequisites of a target to force it to be always rebuilt. +# --------------------------------------------------------------------------- + +PHONY += FORCE +FORCE: + +# Read all saved command lines and dependencies for the $(targets) we +# may be building above, using $(if_changed{,_dep}). As an +# optimization, we don't need to read them if the target does not +# exist, we will rebuild anyway in that case. + +existing-targets := $(wildcard $(sort $(targets))) + +-include $(foreach f,$(existing-targets),$(dir $(f)).$(notdir $(f)).cmd) + +.PHONY: $(PHONY) diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c index 4827c5abe5b7..67b23cc0df0f 100644 --- a/scripts/genksyms/genksyms.c +++ b/scripts/genksyms/genksyms.c @@ -33,7 +33,7 @@ char *cur_filename; int in_source_file; static int flag_debug, flag_dump_defs, flag_reference, flag_dump_types, - flag_preserve, flag_warnings, flag_rel_crcs; + flag_preserve, flag_warnings; static int errors; static int nsyms; @@ -680,11 +680,7 @@ void export_symbol(const char *name) if (flag_dump_defs) fputs(">\n", debugfile); - /* Used as a linker script. */ - printf(!flag_rel_crcs ? "__crc_%s = 0x%08lx;\n" : - "SECTIONS { .rodata : ALIGN(4) { " - "__crc_%s = .; LONG(0x%08lx); } }\n", - name, crc); + printf("__crc_%s = 0x%08lx;\n", name, crc); } } @@ -733,7 +729,6 @@ static void genksyms_usage(void) " -q, --quiet Disable warnings (default)\n" " -h, --help Print this message\n" " -V, --version Print the release version\n" - " -R, --relative-crc Emit section relative symbol CRCs\n" #else /* __GNU_LIBRARY__ */ " -s Select symbol prefix\n" " -d Increment the debug level (repeatable)\n" @@ -745,7 +740,6 @@ static void genksyms_usage(void) " -q Disable warnings (default)\n" " -h Print this message\n" " -V Print the release version\n" - " -R Emit section relative symbol CRCs\n" #endif /* __GNU_LIBRARY__ */ , stderr); } @@ -766,14 +760,13 @@ int main(int argc, char **argv) {"preserve", 0, 0, 'p'}, {"version", 0, 0, 'V'}, {"help", 0, 0, 'h'}, - {"relative-crc", 0, 0, 'R'}, {0, 0, 0, 0} }; - while ((o = getopt_long(argc, argv, "s:dwqVDr:T:phR", + while ((o = getopt_long(argc, argv, "s:dwqVDr:T:ph", &long_opts[0], NULL)) != EOF) #else /* __GNU_LIBRARY__ */ - while ((o = getopt(argc, argv, "s:dwqVDr:T:phR")) != EOF) + while ((o = getopt(argc, argv, "s:dwqVDr:T:ph")) != EOF) #endif /* __GNU_LIBRARY__ */ switch (o) { case 'd': @@ -813,9 +806,6 @@ int main(int argc, char **argv) case 'h': genksyms_usage(); return 0; - case 'R': - flag_rel_crcs = 1; - break; default: genksyms_usage(); return 1; diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index eceb3ee7ec06..db34dc3163ae 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -90,7 +90,6 @@ modpost_link() if is_enabled CONFIG_MODVERSIONS; then gen_symversions - lds="${lds} -T .tmp_symversions.lds" fi # This might take a while, so indicate that we're doing @@ -183,6 +182,10 @@ vmlinux_link() libs="${KBUILD_VMLINUX_LIBS}" fi + if is_enabled CONFIG_MODULES; then + objs="${objs} .vmlinux.export.o" + fi + if [ "${SRCARCH}" = "um" ]; then wl=-Wl, ld="${CC}" @@ -312,6 +315,7 @@ cleanup() rm -f vmlinux.o rm -f .vmlinux.d rm -f .vmlinux.objs + rm -f .vmlinux.export.c } # Use "make V=1" to debug this script @@ -363,6 +367,10 @@ info GEN modules.builtin tr '\0' '\n' < modules.builtin.modinfo | sed -n 's/^[[:alnum:]:_]*\.file=//p' | tr ' ' '\n' | uniq | sed -e 's:^:kernel/:' -e 's/$/.ko/' > modules.builtin +if is_enabled CONFIG_MODULES; then + ${MAKE} -f "${srctree}/scripts/Makefile.vmlinux" .vmlinux.export.o +fi + btf_vmlinux_bin_o="" if is_enabled CONFIG_DEBUG_INFO_BTF; then btf_vmlinux_bin_o=.btf.vmlinux.bin.o diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 1e2949775e1b..42e949cbc255 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2234,6 +2234,7 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "#define INCLUDE_VERMAGIC\n"); buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); + buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); buf_printf(b, "\n"); @@ -2268,20 +2269,26 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "\nMODULE_INFO(staging, \"Y\");\n"); } -static void check_symversions(struct module *mod) +static void add_exported_symbols(struct buffer *buf, struct module *mod) { struct symbol *sym; if (!modversions) return; + /* record CRCs for exported symbols */ + buf_printf(buf, "\n"); list_for_each_entry(sym, &mod->exported_symbols, list) { if (!sym->crc_valid) { warn("EXPORT symbol \"%s\" [%s%s] version generation failed, symbol will not be versioned.\n" "Is \"%s\" prototyped in ?\n", sym->name, mod->name, mod->is_vmlinux ? "" : ".ko", sym->name); + continue; } + + buf_printf(buf, "SYMBOL_CRC(%s, 0x%08x, \"%s\");\n", + sym->name, sym->crc, sym->is_gpl_only ? "_gpl" : ""); } } @@ -2418,6 +2425,18 @@ static void write_if_changed(struct buffer *b, const char *fname) write_buf(b, fname); } +static void write_vmlinux_export_c_file(struct module *mod) +{ + struct buffer buf = { }; + + buf_printf(&buf, + "#include \n"); + + add_exported_symbols(&buf, mod); + write_if_changed(&buf, ".vmlinux.export.c"); + free(buf.p); +} + /* do sanity checks, and generate *.mod.c file */ static void write_mod_c_file(struct module *mod) { @@ -2429,6 +2448,7 @@ static void write_mod_c_file(struct module *mod) check_exports(mod); add_header(&buf, mod); + add_exported_symbols(&buf, mod); add_versions(&buf, mod); add_depends(&buf, mod); add_moddevtable(&buf, mod); @@ -2626,9 +2646,9 @@ int main(int argc, char **argv) if (mod->from_dump) continue; - check_symversions(mod); - - if (!mod->is_vmlinux) + if (mod->is_vmlinux) + write_vmlinux_export_c_file(mod); + else write_mod_c_file(mod); } -- cgit v1.2.3 From da007f171fc9dd993d03a360b515cac1c87921bb Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 25 May 2022 00:21:18 +0300 Subject: kernel/reboot: Change registration order of legacy power-off handler We're unconditionally registering sys-off handler for the legacy pm_power_off() callback, this causes problem for platforms that don't use power-off handlers at all and should be halted. Now reboot syscall assumes that there is a power-off handler installed and tries to power off system instead of halting it. To fix the trouble, move the handler's registration to the reboot syscall and check the pm_power_off() presence. Fixes: 0e2110d2e910 ("kernel/reboot: Add kernel_can_power_off()") Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- kernel/reboot.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index bdb635842ec8..d4aa372c0cdc 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -569,22 +569,6 @@ static int legacy_pm_power_off(struct sys_off_data *data) return NOTIFY_DONE; } -/* - * Register sys-off handlers for legacy PM callbacks. This allows legacy - * PM callbacks co-exist with the new sys-off API. - * - * TODO: Remove legacy handlers once all legacy PM users will be switched - * to the sys-off based APIs. - */ -static int __init legacy_pm_init(void) -{ - register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_DEFAULT, - legacy_pm_power_off, NULL); - - return 0; -} -core_initcall(legacy_pm_init); - static void do_kernel_power_off_prepare(void) { blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL); @@ -646,6 +630,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { struct pid_namespace *pid_ns = task_active_pid_ns(current); + struct sys_off_handler *sys_off = NULL; char buffer[256]; int ret = 0; @@ -670,6 +655,21 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if (ret) return ret; + /* + * Register sys-off handlers for legacy PM callback. This allows + * legacy PM callbacks temporary co-exist with the new sys-off API. + * + * TODO: Remove legacy handlers once all legacy PM users will be + * switched to the sys-off based APIs. + */ + if (pm_power_off) { + sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + legacy_pm_power_off, NULL); + if (IS_ERR(sys_off)) + return PTR_ERR(sys_off); + } + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ @@ -727,6 +727,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, break; } mutex_unlock(&system_transition_mutex); + unregister_sys_off_handler(sys_off); return ret; } -- cgit v1.2.3 From 3159d79b56c15068aeb7e4630cd5f6dacd20fda4 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Mon, 23 May 2022 05:35:31 +0000 Subject: kcov: update pos before writing pc in trace function In __sanitizer_cov_trace_pc(), previously we write pc before updating pos. However, some early interrupt code could bypass check_kcov_mode() check and invoke __sanitizer_cov_trace_pc(). If such interrupt is raised between writing pc and updating pos, the pc could be overitten by the recursive __sanitizer_cov_trace_pc(). As suggested by Dmitry, we cold update pos before writing pc to avoid such interleaving. Apply the same change to write_comp_data(). Link: https://lkml.kernel.org/r/20220523053531.1572793-1-liu3101@purdue.edu Signed-off-by: Congyu Liu Reviewed-by: Dmitry Vyukov Cc: Andrey Konovalov Signed-off-by: Andrew Morton --- kernel/kcov.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index b3732b210593..e19c84b02452 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first 64-bit word is the number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = ip; + /* Previously we write pc before updating pos. However, some + * early interrupt code could bypass check_kcov_mode() check + * and invoke __sanitizer_cov_trace_pc(). If such interrupt is + * raised between writing pc and updating pos, the pc could be + * overitten by the recursive __sanitizer_cov_trace_pc(). + * Update pos before writing pc to avoid such interleaving. + */ WRITE_ONCE(area[0], pos); + barrier(); + area[pos] = ip; } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); @@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) start_index = 1 + count * KCOV_WORDS_PER_CMP; end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); if (likely(end_pos <= max_pos)) { + /* See comment in __sanitizer_cov_trace_pc(). */ + WRITE_ONCE(area[0], count + 1); + barrier(); area[start_index] = type; area[start_index + 1] = arg1; area[start_index + 2] = arg2; area[start_index + 3] = ip; - WRITE_ONCE(area[0], count + 1); } } -- cgit v1.2.3 From 499f12168aebd6da8fa32c9b7d6203ca9b5eb88d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Apr 2022 14:56:32 -0400 Subject: tracing: Have event format check not flag %p* on __get_dynamic_array() The print fmt check against trace events to make sure that the format does not use pointers that may be freed from the time of the trace to the time the event is read, gives a false positive on %pISpc when reading data that was saved in __get_dynamic_array() when it is perfectly fine to do so, as the data being read is on the ring buffer. Link: https://lore.kernel.org/all/20220407144524.2a592ed6@canb.auug.org.au/ Cc: stable@vger.kernel.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78f313b7b315..d5913487821a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -392,12 +392,6 @@ static void test_event_printk(struct trace_event_call *call) if (!(dereference_flags & (1ULL << arg))) goto next_arg; - /* Check for __get_sockaddr */; - if (str_has_prefix(fmt + i, "__get_sockaddr(")) { - dereference_flags &= ~(1ULL << arg); - goto next_arg; - } - /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); @@ -413,7 +407,14 @@ static void test_event_printk(struct trace_event_call *call) a = strchr(fmt + i, '&'); if ((a && (a < r)) || test_field(r, call)) dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); } + next_arg: i--; arg++; -- cgit v1.2.3 From e35c2d8e22745751cf304ec3fe39616643db2e0a Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Wed, 27 Apr 2022 11:41:19 +0800 Subject: tracing: Reset the function filter after completing trampoline/graph selftest The direct trampoline and graph coexistence test sets global_ops to trace only 'trace_selftest_dynamic_test_func', but does not reset it after the test is completed, resulting in the function filter being set already after the system starts. Although it can be reset through the tracefs interface, it is more or less confusing to the user, and we should reset it to trace all functions after the trampoline/graph test completes. Link: https://lkml.kernel.org/r/20220427034119.24668-1-lihuafei1@huawei.com Link: https://lore.kernel.org/all/20220418073958.104029-1-lihuafei1@huawei.com/ Fixes: 130c08065848 ("tracing: Add trampoline/graph selftest") Signed-off-by: Li Huafei Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index abcadbe933bb..a2d301f58ced 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -895,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace, ret = -1; goto out; } + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); #endif /* Don't test dynamic tracing, the function tracer already did */ -- cgit v1.2.3 From e4931b824a6f36cae9cc5632709f015cfa748a25 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 14 Jan 2022 21:10:52 +0800 Subject: tracing: Use trace_create_file() to simplify creation of tracefs entries Creating tracefs entries with tracefs_create_file() followed by pr_warn() is tedious and repetitive, we can use trace_create_file() to simplify this process and make the code more readable. Link: https://lkml.kernel.org/r/20220114131052.534382-1-ytcoode@gmail.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 9 +++------ kernel/trace/trace_dynevent.c | 9 ++------- kernel/trace/trace_events.c | 29 ++++++++++------------------- kernel/trace/trace_kprobe.c | 15 ++++----------- kernel/trace/trace_recursion_record.c | 7 ++----- 5 files changed, 21 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5c465e70d146..6c5b12669e06 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -952,7 +952,6 @@ static struct tracer_stat function_stats __initdata = { static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; - struct dentry *entry; char *name; int ret; int cpu; @@ -983,11 +982,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", - TRACE_MODE_WRITE, d_tracer, NULL, - &ftrace_profile_fops); - if (!entry) - pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); + trace_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); } #else /* CONFIG_FUNCTION_PROFILER */ diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e34e8182ee4b..076b447a1b88 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -255,19 +255,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *entry; int ret; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, - NULL, &dynamic_events_ops); - - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'dynamic_events' entry\n"); + trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, + NULL, &dynamic_events_ops); return 0; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d5913487821a..9a88de9d7815 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3546,12 +3546,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) { - pr_warn("Could not create tracefs 'set_event' entry\n"); + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) return -ENOMEM; - } d_events = tracefs_create_dir("events", parent); if (!d_events) { @@ -3566,16 +3564,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_pid' entry\n"); + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); - entry = tracefs_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ trace_create_file("header_page", TRACE_MODE_READ, d_events, @@ -3790,17 +3784,14 @@ static __init int event_trace_init_fields(void) __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *entry; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", TRACE_MODE_READ, - NULL, tr, &ftrace_avail_fops); - if (!entry) - pr_warn("Could not create tracefs 'available_events' entry\n"); + trace_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 47cebef78532..93507330462c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1907,25 +1907,18 @@ core_initcall(init_kprobe_trace_early); static __init int init_kprobe_trace(void) { int ret; - struct dentry *entry; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, - NULL, NULL, &kprobe_events_ops); - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'kprobe_events' entry\n"); + trace_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, - NULL, NULL, &kprobe_profile_ops); - - if (!entry) - pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); + trace_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); setup_boot_kprobe_events(); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index 4d4b78c8ca25..a520b11afb0d 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = { __init static int create_recursed_functions(void) { - struct dentry *dentry; - dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, - NULL, NULL, &recursed_functions_fops); - if (!dentry) - pr_warn("WARNING: Failed to create recursed_functions\n"); + trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); return 0; } -- cgit v1.2.3 From 2889c658b2fbc7ad4c5d541734fdc1d97b130753 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 20 Jan 2022 06:59:49 +0000 Subject: ftrace: Deal with error return code of the ftrace_process_locs() function The ftrace_process_locs() function may return -ENOMEM error code, which should be handled by the callers. Link: https://lkml.kernel.org/r/20220120065949.1813231-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c5b12669e06..737e895e2aad 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6864,11 +6864,16 @@ void ftrace_module_enable(struct module *mod) void ftrace_module_init(struct module *mod) { + int ret; + if (ftrace_disabled || !mod->num_ftrace_callsites) return; - ftrace_process_locs(mod, mod->ftrace_callsites, - mod->ftrace_callsites + mod->num_ftrace_callsites); + ret = ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); + if (ret) + pr_warn("ftrace: failed to allocate entries for module '%s' functions\n", + mod->name); } static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, @@ -7201,15 +7206,19 @@ void __init ftrace_init(void) pr_info("ftrace: allocating %ld entries in %ld pages\n", count, count / ENTRIES_PER_PAGE + 1); - last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); + if (ret) { + pr_warn("ftrace: failed to allocate entries for functions\n"); + goto failed; + } pr_info("ftrace: allocated %ld pages with %ld groups\n", ftrace_number_of_pages, ftrace_number_of_groups); + last_ftrace_enabled = ftrace_enabled = 1; + set_ftrace_early_filters(); return; -- cgit v1.2.3 From cb24693d94ceaf658944ad2e922203c0503775d2 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 21 Jan 2022 09:56:23 +0000 Subject: tracing: Use strim() to remove whitespace instead of doing it manually The tracing_set_trace_write() function just removes the trailing whitespace from the user supplied tracer name, but the leading whitespace should also be removed. In addition, if the user supplied tracer name contains only a few whitespace characters, the first one will not be removed using the current method, which results it a single whitespace character left in the buf. To fix all of these issues, we use strim() to correctly remove both the leading and trailing whitespace. Link: https://lkml.kernel.org/r/20220121095623.1826679-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 087740f63d92..498ae22d4ffa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6461,7 +6461,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; - int i; + char *name; size_t ret; int err; @@ -6475,11 +6475,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + name = strim(buf); - err = tracing_set_tracer(tr, buf); + err = tracing_set_tracer(tr, name); if (err) return err; -- cgit v1.2.3 From 99696a2592bca641eb88cc9a80c90e591afebd0f Mon Sep 17 00:00:00 2001 From: Keita Suzuki Date: Mon, 25 Apr 2022 06:37:38 +0000 Subject: tracing: Fix potential double free in create_var_ref() In create_var_ref(), init_var_ref() is called to initialize the fields of variable ref_field, which is allocated in the previous function call to create_hist_field(). Function init_var_ref() allocates the corresponding fields such as ref_field->system, but frees these fields when the function encounters an error. The caller later calls destroy_hist_field() to conduct error handling, which frees the fields and the variable itself. This results in double free of the fields which are already freed in the previous function. Fix this by storing NULL to the corresponding fields when they are freed in init_var_ref(). Link: https://lkml.kernel.org/r/20220425063739.3859998-1-keitasuzuki.park@sslab.ics.keio.ac.jp Fixes: 067fe038e70f ("tracing: Add variable reference handling to hist triggers") CC: stable@vger.kernel.org Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Signed-off-by: Keita Suzuki Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 038dc591545d..c6a65738feb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2093,8 +2093,11 @@ static int init_var_ref(struct hist_field *ref_field, return err; free: kfree(ref_field->system); + ref_field->system = NULL; kfree(ref_field->event_name); + ref_field->event_name = NULL; kfree(ref_field->name); + ref_field->name = NULL; goto out; } -- cgit v1.2.3 From b27f266f74fbda4ee36c2b2b04d15992860cf23b Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 3 May 2022 14:05:46 +0900 Subject: tracing: Fix return value of trace_pid_write() Setting set_event_pid with trailing whitespace lead to endless write system calls like below. $ strace echo "123 " > /sys/kernel/debug/tracing/set_event_pid execve("/usr/bin/echo", ["echo", "123 "], ...) = 0 ... write(1, "123 \n", 5) = 4 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 .... This is because, the result of trace_get_user's are not returned when it read at least one pid. To fix it, update read variable even if parser->idx == 0. The result of applied patch is below. $ strace echo "123 " > /sys/kernel/debug/tracing/set_event_pid execve("/usr/bin/echo", ["echo", "123 "], ...) = 0 ... write(1, "123 \n", 5) = 5 close(1) = 0 Link: https://lkml.kernel.org/r/20220503050546.288911-1-vvghjk1234@gmail.com Cc: Ingo Molnar Cc: Baik Song An Cc: Hong Yeon Kim Cc: Taeung Song Cc: linuxgeek@linuxgeek.io Cc: stable@vger.kernel.org Fixes: 4909010788640 ("tracing: Add set_event_pid directory for future use") Signed-off-by: Wonhyuk Yang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 498ae22d4ffa..4825883b2ffd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -721,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); - if (ret < 0 || !trace_parser_loaded(&parser)) + if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; + if (!trace_parser_loaded(&parser)) + break; + ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -753,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); - read = ret; pid_list = NULL; } -- cgit v1.2.3 From 43994049180704fd1faf78623fabd9a5cd443708 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 4 May 2022 12:36:31 +0900 Subject: kprobes: Fix build errors with CONFIG_KRETPROBES=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Max Filippov reported: When building kernel with CONFIG_KRETPROBES=n kernel/kprobes.c compilation fails with the following messages: kernel/kprobes.c: In function ‘recycle_rp_inst’: kernel/kprobes.c:1273:32: error: implicit declaration of function ‘get_kretprobe’ kernel/kprobes.c: In function ‘kprobe_flush_task’: kernel/kprobes.c:1299:35: error: ‘struct task_struct’ has no member named ‘kretprobe_instances’ This came from the commit d741bf41d7c7 ("kprobes: Remove kretprobe hash") which introduced get_kretprobe() and kretprobe_instances member in task_struct when CONFIG_KRETPROBES=y, but did not make recycle_rp_inst() and kprobe_flush_task() depending on CONFIG_KRETPORBES. Since those functions are only used for kretprobe, move those functions into #ifdef CONFIG_KRETPROBE area. Link: https://lkml.kernel.org/r/165163539094.74407.3838114721073251225.stgit@devnote2 Reported-by: Max Filippov Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: "David S . Miller" Cc: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Tested-by: Max Filippov Signed-off-by: Steven Rostedt (Google) --- include/linux/kprobes.h | 2 +- kernel/kprobes.c | 144 ++++++++++++++++++++++++------------------------ 2 files changed, 72 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 157168769fc2..55041d2f884d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -424,7 +424,7 @@ void unregister_kretprobe(struct kretprobe *rp); int register_kretprobes(struct kretprobe **rps, int num); void unregister_kretprobes(struct kretprobe **rps, int num); -#ifdef CONFIG_KRETPROBE_ON_RETHOOK +#if defined(CONFIG_KRETPROBE_ON_RETHOOK) || !defined(CONFIG_KRETPROBES) #define kprobe_flush_task(tk) do {} while (0) #else void kprobe_flush_task(struct task_struct *tk); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index dbe57df2e199..172ba75a4a49 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1257,79 +1257,6 @@ void kprobe_busy_end(void) preempt_enable(); } -#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) -static void free_rp_inst_rcu(struct rcu_head *head) -{ - struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); - - if (refcount_dec_and_test(&ri->rph->ref)) - kfree(ri->rph); - kfree(ri); -} -NOKPROBE_SYMBOL(free_rp_inst_rcu); - -static void recycle_rp_inst(struct kretprobe_instance *ri) -{ - struct kretprobe *rp = get_kretprobe(ri); - - if (likely(rp)) - freelist_add(&ri->freelist, &rp->freelist); - else - call_rcu(&ri->rcu, free_rp_inst_rcu); -} -NOKPROBE_SYMBOL(recycle_rp_inst); - -/* - * This function is called from delayed_put_task_struct() when a task is - * dead and cleaned up to recycle any kretprobe instances associated with - * this task. These left over instances represent probed functions that - * have been called but will never return. - */ -void kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - struct llist_node *node; - - /* Early boot, not yet initialized. */ - if (unlikely(!kprobes_initialized)) - return; - - kprobe_busy_begin(); - - node = __llist_del_all(&tk->kretprobe_instances); - while (node) { - ri = container_of(node, struct kretprobe_instance, llist); - node = node->next; - - recycle_rp_inst(ri); - } - - kprobe_busy_end(); -} -NOKPROBE_SYMBOL(kprobe_flush_task); - -static inline void free_rp_inst(struct kretprobe *rp) -{ - struct kretprobe_instance *ri; - struct freelist_node *node; - int count = 0; - - node = rp->freelist.head; - while (node) { - ri = container_of(node, struct kretprobe_instance, freelist); - node = node->next; - - kfree(ri); - count++; - } - - if (refcount_sub_and_test(count, &rp->rph->ref)) { - kfree(rp->rph); - rp->rph = NULL; - } -} -#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ - /* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { @@ -1928,6 +1855,77 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES #if !defined(CONFIG_KRETPROBE_ON_RETHOOK) +static void free_rp_inst_rcu(struct rcu_head *head) +{ + struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); + + if (refcount_dec_and_test(&ri->rph->ref)) + kfree(ri->rph); + kfree(ri); +} +NOKPROBE_SYMBOL(free_rp_inst_rcu); + +static void recycle_rp_inst(struct kretprobe_instance *ri) +{ + struct kretprobe *rp = get_kretprobe(ri); + + if (likely(rp)) + freelist_add(&ri->freelist, &rp->freelist); + else + call_rcu(&ri->rcu, free_rp_inst_rcu); +} +NOKPROBE_SYMBOL(recycle_rp_inst); + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + struct llist_node *node; + + /* Early boot, not yet initialized. */ + if (unlikely(!kprobes_initialized)) + return; + + kprobe_busy_begin(); + + node = __llist_del_all(&tk->kretprobe_instances); + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + node = node->next; + + recycle_rp_inst(ri); + } + + kprobe_busy_end(); +} +NOKPROBE_SYMBOL(kprobe_flush_task); + +static inline void free_rp_inst(struct kretprobe *rp) +{ + struct kretprobe_instance *ri; + struct freelist_node *node; + int count = 0; + + node = rp->freelist.head; + while (node) { + ri = container_of(node, struct kretprobe_instance, freelist); + node = node->next; + + kfree(ri); + count++; + } + + if (refcount_sub_and_test(count, &rp->rph->ref)) { + kfree(rp->rph); + rp->rph = NULL; + } +} + /* This assumes the 'tsk' is the current task or the is not running. */ static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, struct llist_node **cur) -- cgit v1.2.3 From aa748949b4e665f473bc5abdc5f66029cb5f5522 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 10 May 2022 11:45:23 +0200 Subject: tracing/timerlat: Notify IRQ new max latency only if stop tracing is set Currently, the notification of a new max latency is sent from timerlat's IRQ handler anytime a new max latency is found. While this behavior is not wrong, the send IPI overhead itself will increase the thread latency and that is not the desired effect (tracing overhead). Moreover, the thread will notify a new max latency again because the thread latency as it is always higher than the IRQ latency that woke it up. The only case in which it is helpful to notify a new max latency from IRQ is when stop tracing (for the IRQ) is set, as in this case, the thread will not be dispatched. Notify a new max latency from the IRQ handler only if stop tracing is set for the IRQ handler. Link: https://lkml.kernel.org/r/2c2d9a56c0886c8402ba320de32856cbbb10c2bb.1652175637.git.bristot@kernel.org Cc: Juri Lelli Cc: Ingo Molnar Reported-by: Clark Williams Fixes: a955d7eac177 ("trace: Add timerlat tracer") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e9ae1f33a7f0..6494ca27ea6f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1578,11 +1578,12 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) trace_timerlat_sample(&s); - notify_new_max_latency(diff); - - if (osnoise_data.stop_tracing) - if (time_to_us(diff) >= osnoise_data.stop_tracing) + if (osnoise_data.stop_tracing) { + if (time_to_us(diff) >= osnoise_data.stop_tracing) { osnoise_stop_tracing(); + notify_new_max_latency(diff); + } + } wake_up_process(tlat->kthread); -- cgit v1.2.3 From 4dd2aea24ed7613735664feadc9879d37f718c23 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 10 May 2022 11:45:24 +0200 Subject: tracing/timerlat: Print stacktrace in the IRQ handler if needed If print_stack and stop_tracing_us are set, and stop_tracing_us is hit with latency higher than or equal to print_stack, print the stack at the IRQ handler as it is useful to define the root cause for the IRQ latency. Link: https://lkml.kernel.org/r/fd04530ce98ae9270e41bb124ee5bf67b05ecfed.1652175637.git.bristot@kernel.org Cc: Juri Lelli Cc: Clark Williams Cc: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/timerlat-tracer.rst | 5 +++-- kernel/trace/trace_osnoise.c | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst index 64d1fe6e9b93..d643c95c01eb 100644 --- a/Documentation/trace/timerlat-tracer.rst +++ b/Documentation/trace/timerlat-tracer.rst @@ -74,8 +74,9 @@ directory. The timerlat configs are: - stop_tracing_total_us: stop the system tracing if a timer latency at the *thread* context is higher than the configured value happens. Writing 0 disables this option. - - print_stack: save the stack of the IRQ occurrence, and print - it after the *thread context* event". + - print_stack: save the stack of the IRQ occurrence. The stack is printed + after the *thread context* event, or at the IRQ handler if *stop_tracing_us* + is hit. timerlat and osnoise ---------------------------- diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 6494ca27ea6f..9b204ee3c6f5 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1580,6 +1580,19 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { + + /* + * At this point, if stop_tracing is set and <= print_stack, + * print_stack is set and would be printed in the thread handler. + * + * Thus, print the stack trace as it is helpful to define the + * root cause of an IRQ latency. + */ + if (osnoise_data.stop_tracing <= osnoise_data.print_stack) { + timerlat_save_stack(0); + timerlat_dump_stack(time_to_us(diff)); + } + osnoise_stop_tracing(); notify_new_max_latency(diff); } -- cgit v1.2.3 From 9c556e5a4dd5cfc0939a0575577d0517118f98af Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 10 May 2022 11:45:25 +0200 Subject: tracing/timerlat: Do not wakeup the thread if the trace stops at the IRQ There is no need to wakeup the timerlat/ thread if stop tracing is hit at the timerlat's IRQ handler. Return before waking up timerlat's thread. Link: https://lkml.kernel.org/r/b392356c91b56aedd2b289513cc56a84cf87e60d.1652175637.git.bristot@kernel.org Cc: Juri Lelli Cc: Clark Williams Cc: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 9b204ee3c6f5..035ec8b84e12 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1595,6 +1595,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) osnoise_stop_tracing(); notify_new_max_latency(diff); + + return HRTIMER_NORESTART; } } -- cgit v1.2.3 From 2d601b98643dd2846e2958d931826e7b7af44969 Mon Sep 17 00:00:00 2001 From: liqiong Date: Thu, 12 May 2022 22:32:30 +0800 Subject: tracing: Change "char *" string form to "char []" The "char []" string form declares a single variable. It is better than "char *" which creates two variables in the final assembly. Link: https://lkml.kernel.org/r/20220512143230.28796-1-liqiong@nfschina.com Signed-off-by: liqiong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4825883b2ffd..e38a7ca4cdd0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4252,7 +4252,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; + static const char space[] = " "; int prec = tgid ? 12 : 2; print_event_info(buf, m); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c6a65738feb3..48e82e141d54 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4165,7 +4165,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } -static const char *no_comm = "(no comm)"; +static const char no_comm[] = "(no comm)"; static u64 hist_field_execname(struct hist_field *hist_field, struct tracing_map_elt *elt, -- cgit v1.2.3 From 2decd16f47e3df3234b5486fe89a9aa5a1102af1 Mon Sep 17 00:00:00 2001 From: liqiong Date: Fri, 13 May 2022 15:52:21 +0800 Subject: tracing: Cleanup code by removing init "char *name" The pointer is assigned to "type->name" anyway. no need to initialize with "preemption". Link: https://lkml.kernel.org/r/20220513075221.26275-1-liqiong@nfschina.com Signed-off-by: liqiong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e38a7ca4cdd0..f400800bc910 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4276,9 +4276,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct tracer *type = iter->trace; unsigned long entries; unsigned long total; - const char *name = "preemption"; - - name = type->name; + const char *name = type->name; get_total_entries(buf, &total, &entries); -- cgit v1.2.3 From 3a2bfec0b02f2226ff3376a5d2ff604d799bd7ea Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 18 May 2022 10:36:40 +0800 Subject: ftrace: Remove return value of ftrace_arch_modify_*() All instances of the function ftrace_arch_modify_prepare() and ftrace_arch_modify_post_process() return zero. There's no point in checking their return value. Just have them be void functions. Link: https://lkml.kernel.org/r/20220518023639.4065-1-kunyu@nfschina.com Signed-off-by: Li kunyu Signed-off-by: Steven Rostedt (Google) --- arch/arm/kernel/ftrace.c | 6 ++---- arch/riscv/kernel/ftrace.c | 6 ++---- arch/s390/kernel/ftrace.c | 3 +-- arch/x86/kernel/ftrace.c | 6 ++---- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 16 ++++------------ 6 files changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 83cc068586bc..a0b6d1e3812f 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -79,16 +79,14 @@ static unsigned long __ref adjust_address(struct dyn_ftrace *rec, return (unsigned long)&ftrace_regs_caller_from_init; } -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) { - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) { /* Make sure any TLB misses during machine stop are cleared. */ flush_tlb_all(); - return 0; } static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr, diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 4716f4cdc038..2086f6585773 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -12,16 +12,14 @@ #include #ifdef CONFIG_DYNAMIC_FTRACE -int ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { mutex_lock(&text_mutex); - return 0; } -int ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { mutex_unlock(&text_mutex); - return 0; } static int ftrace_check_current_call(unsigned long hook_pos, diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1852d46babb1..416b5a94353d 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -225,14 +225,13 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) { /* * Flush any pre-fetched instructions on all * CPUs to make the new code visible. */ text_poke_sync_lock(); - return 0; } #ifdef CONFIG_MODULES diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1e31c7d21597..73d2719ed12c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -37,7 +37,7 @@ static int ftrace_poke_late = 0; -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { /* @@ -47,10 +47,9 @@ int ftrace_arch_code_modify_prepare(void) */ mutex_lock(&text_mutex); ftrace_poke_late = 1; - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { /* @@ -61,7 +60,6 @@ int ftrace_arch_code_modify_post_process(void) text_poke_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); - return 0; } static const char *ftrace_nop_replace(void) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..a5f74f6e7e4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -449,8 +449,8 @@ static inline void stack_tracer_enable(void) { } #ifdef CONFIG_DYNAMIC_FTRACE -int ftrace_arch_code_modify_prepare(void); -int ftrace_arch_code_modify_post_process(void); +void ftrace_arch_code_modify_prepare(void); +void ftrace_arch_code_modify_post_process(void); enum ftrace_bug_type { FTRACE_BUG_UNKNOWN, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 737e895e2aad..852c731cbd9a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2704,18 +2704,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) * archs can override this function if they must do something * before the modifying code is performed. */ -int __weak ftrace_arch_code_modify_prepare(void) +void __weak ftrace_arch_code_modify_prepare(void) { - return 0; } /* * archs can override this function if they must do something * after the modifying code is performed. */ -int __weak ftrace_arch_code_modify_post_process(void) +void __weak ftrace_arch_code_modify_post_process(void) { - return 0; } void ftrace_modify_all_code(int command) @@ -2801,12 +2799,7 @@ void __weak arch_ftrace_update_code(int command) static void ftrace_run_update_code(int command) { - int ret; - - ret = ftrace_arch_code_modify_prepare(); - FTRACE_WARN_ON(ret); - if (ret) - return; + ftrace_arch_code_modify_prepare(); /* * By default we use stop_machine() to modify the code. @@ -2816,8 +2809,7 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - ret = ftrace_arch_code_modify_post_process(); - FTRACE_WARN_ON(ret); + ftrace_arch_code_modify_post_process(); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, -- cgit v1.2.3 From 50c697819d59c5013a66728940015348919a0c0c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:31 +0200 Subject: ftrace: Fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20220521111145.81697-81-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 852c731cbd9a..fb8f08b4bd41 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -119,7 +119,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */ void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); -- cgit v1.2.3 From 154827f8e53d8c492b3fb0cb757fbcadb5d516b5 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Sat, 21 May 2022 23:18:26 -0700 Subject: tracing: Initialize integer variable to prevent garbage return value Initialize the integer variable to 0 to fix the clang scan warning: Undefined or garbage value returned to caller [core.uninitialized.UndefReturn] return ret; Link: https://lkml.kernel.org/r/20220522061826.1751-1-gautammenghani201@gmail.com Cc: stable@vger.kernel.org Fixes: 8993665abcce ("tracing/boot: Support multiple handlers for per-event histogram") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Gautam Menghani Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0580287d7a0d..778200dd8ede 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, { struct xbc_node *node; const char *p, *handler; - int ret; + int ret = 0; handler = xbc_node_get_data(hnode); -- cgit v1.2.3 From bb5eb8f3b329789fbf22c85328dcf696a3e97ffb Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Mon, 23 May 2022 06:30:33 +0000 Subject: tracing: Disable kcov on trace_preemptirq.c Functions in trace_preemptirq.c could be invoked from early interrupt code that bypasses kcov trace function's in_task() check. Disable kcov on this file to reduce random code coverage. Link: https://lkml.kernel.org/r/20220523063033.1778974-1-liu3101@purdue.edu Acked-by: Dmitry Vyukov Signed-off-by: Congyu Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d77cd8032213..0d261774d6f3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +# Functions in this file could be invoked from early interrupt +# code and produce random code coverage. +KCOV_INSTRUMENT_trace_preemptirq.o := n + CFLAGS_bpf_trace.o := -I$(src) CFLAGS_trace_benchmark.o := -I$(src) -- cgit v1.2.3 From 0a54f556b035e5217e21724d82dd98ce695bd6d6 Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 24 May 2022 14:39:37 +0800 Subject: tracing: Fix comments of create_filter() The name in comments of parameter "filter_string" in function create_filter is annotated as "filter_str", just fix it. Link: https://lkml.kernel.org/r/20220524063937.52873-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b458a9afa2c0..4b1057ab9d96 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1816,7 +1816,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * create_filter - create a filter for a trace_event_call * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for - * @filter_str: filter string + * @filter_string: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) * Must be a pointer that references a NULL pointer. -- cgit v1.2.3 From 7d54c15cb89a29a5f59e5ffc9ee62e6591769ef1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 24 May 2022 10:08:39 -0700 Subject: ftrace: Clean up hash direct_functions on register failures We see the following GPF when register_ftrace_direct fails: [ ] general protection fault, probably for non-canonical address \ 0x200000000000010: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [...] [ ] RIP: 0010:ftrace_find_rec_direct+0x53/0x70 [ ] Code: 48 c1 e0 03 48 03 42 08 48 8b 10 31 c0 48 85 d2 74 [...] [ ] RSP: 0018:ffffc9000138bc10 EFLAGS: 00010206 [ ] RAX: 0000000000000000 RBX: ffffffff813e0df0 RCX: 000000000000003b [ ] RDX: 0200000000000000 RSI: 000000000000000c RDI: ffffffff813e0df0 [ ] RBP: ffffffffa00a3000 R08: ffffffff81180ce0 R09: 0000000000000001 [ ] R10: ffffc9000138bc18 R11: 0000000000000001 R12: ffffffff813e0df0 [ ] R13: ffffffff813e0df0 R14: ffff888171b56400 R15: 0000000000000000 [ ] FS: 00007fa9420c7780(0000) GS:ffff888ff6a00000(0000) knlGS:000000000 [ ] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ ] CR2: 000000000770d000 CR3: 0000000107d50003 CR4: 0000000000370ee0 [ ] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ ] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ ] Call Trace: [ ] [ ] register_ftrace_direct+0x54/0x290 [ ] ? render_sigset_t+0xa0/0xa0 [ ] bpf_trampoline_update+0x3f5/0x4a0 [ ] ? 0xffffffffa00a3000 [ ] bpf_trampoline_link_prog+0xa9/0x140 [ ] bpf_tracing_prog_attach+0x1dc/0x450 [ ] bpf_raw_tracepoint_open+0x9a/0x1e0 [ ] ? find_held_lock+0x2d/0x90 [ ] ? lock_release+0x150/0x430 [ ] __sys_bpf+0xbd6/0x2700 [ ] ? lock_is_held_type+0xd8/0x130 [ ] __x64_sys_bpf+0x1c/0x20 [ ] do_syscall_64+0x3a/0x80 [ ] entry_SYSCALL_64_after_hwframe+0x44/0xae [ ] RIP: 0033:0x7fa9421defa9 [ ] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 9 f8 [...] [ ] RSP: 002b:00007ffed743bd78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 [ ] RAX: ffffffffffffffda RBX: 00000000069d2480 RCX: 00007fa9421defa9 [ ] RDX: 0000000000000078 RSI: 00007ffed743bd80 RDI: 0000000000000011 [ ] RBP: 00007ffed743be00 R08: 0000000000bb7270 R09: 0000000000000000 [ ] R10: 00000000069da210 R11: 0000000000000246 R12: 0000000000000001 [ ] R13: 00007ffed743c4b0 R14: 00000000069d2480 R15: 0000000000000001 [ ] [ ] Modules linked in: klp_vm(OK) [ ] ---[ end trace 0000000000000000 ]--- One way to trigger this is: 1. load a livepatch that patches kernel function xxx; 2. run bpftrace -e 'kfunc:xxx {}', this will fail (expected for now); 3. repeat #2 => gpf. This is because the entry is added to direct_functions, but not removed. Fix this by remove the entry from direct_functions when register_ftrace_direct fails. Also remove the last trailing space from ftrace.c, so we don't have to worry about it anymore. Link: https://lkml.kernel.org/r/20220524170839.900849-1-song@kernel.org Cc: stable@vger.kernel.org Fixes: 763e34e74bb7 ("ftrace: Add register_ftrace_direct()") Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb8f08b4bd41..d653ef4febc5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4454,7 +4454,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to remove the data from * * Returns the data if it is found, otherwise NULL. - * Note, if the data pointer is used as the data itself, (see + * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. */ @@ -5188,8 +5188,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); - if (ret) - remove_hash_entry(direct_functions, entry); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { ret = register_ftrace_function(&direct_ops); @@ -5198,6 +5196,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } if (ret) { + remove_hash_entry(direct_functions, entry); kfree(entry); if (!direct->count) { list_del_rcu(&direct->next); -- cgit v1.2.3 From 8d4a21b5ac9dad5d5221c60060b3ac28d22f57ca Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 26 May 2022 15:29:57 +0800 Subject: tracing: Fix comments for event_trigger_separate_filter() The parameter name in comments of event_trigger_separate_filter() is inconsistent with actual parameter name, fix it. Link: https://lkml.kernel.org/r/20220526072957.165655-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 21592bed2e89..cb866c3141af 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -738,15 +738,15 @@ bool event_trigger_empty_param(const char *param) /** * event_trigger_separate_filter - separate an event trigger from a filter - * @param: The param string containing trigger and possibly filter - * @trigger: outparam, will be filled with a pointer to the trigger + * @param_and_filter: String containing trigger and possibly filter + * @param: outparam, will be filled with a pointer to the trigger * @filter: outparam, will be filled with a pointer to the filter * @param_required: Specifies whether or not the param string is required * * Given a param string of the form '[trigger] [if filter]', this * function separates the filter from the trigger and returns the - * trigger in *trigger and the filter in *filter. Either the *trigger - * or the *filter may be set to NULL by this function - if not set to + * trigger in @param and the filter in @filter. Either the @param + * or the @filter may be set to NULL by this function - if not set to * NULL, they will contain strings corresponding to the trigger and * filter. * -- cgit v1.2.3 From 8b4dd2d8627e88dc3bd71bf29c48aaae2b69572b Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Fri, 27 May 2022 18:03:54 +0800 Subject: perf/core: Remove unused local variable Drop LIST_HEAD() where the variable it declares is never used. Compiler probably never warned us, because the LIST_HEAD() initializer is technically 'usage'. [ mingo: Tweak changelog. ] Signed-off-by: Haowen Bai Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/1653645835-29206-1-git-send-email-baihaowen@meizu.com --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 950b25c3f210..80782cddb1da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4257,7 +4257,6 @@ static void perf_event_remove_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *event, *next; - LIST_HEAD(free_list); unsigned long flags; bool modified = false; -- cgit v1.2.3 From 809631e2bff5aba056df75cc4e43127dbd0278c9 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 26 May 2022 22:36:56 +0206 Subject: Revert "printk: wake up all waiters" This reverts commit 938ba4084abcf6fdd21d9078513c52f8fb9b00d0. The wait queue @log_wait never has exclusive waiters, so there is no need to use wake_up_interruptible_all(). Using wake_up_interruptible() was the correct function to wake all waiters. Since there are no exclusive waiters, erroneously changing wake_up_interruptible() to wake_up_interruptible_all() did not result in any behavior change. However, using wake_up_interruptible_all() on a wait queue without exclusive waiters is fundamentally wrong. Go back to using wake_up_interruptible() to wake all waiters. Signed-off-by: John Ogness Acked-by: Steven Rostedt (Google) Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220526203056.81123-1-john.ogness@linutronix.de --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a3e1035929b0..ea3dd55709e7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3904,7 +3904,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) } if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible_all(&log_wait); + wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = -- cgit v1.2.3 From 3e35142ef99fe6b4fe5d834ad43ee13cca10a2dc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 19 May 2022 14:42:37 +0530 Subject: kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add] Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") [1], binutils (v2.36+) started dropping section symbols that it thought were unused. This isn't an issue in general, but with kexec_file.c, gcc is placing kexec_arch_apply_relocations[_add] into a separate .text.unlikely section and the section symbol ".text.unlikely" is being dropped. Due to this, recordmcount is unable to find a non-weak symbol in .text.unlikely to generate a relocation record against. Address this by dropping the weak attribute from these functions. Instead, follow the existing pattern of having architectures #define the name of the function they want to override in their headers. [1] https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d1bcae833b32f1 [akpm@linux-foundation.org: arch/s390/include/asm/kexec.h needs linux/module.h] Link: https://lkml.kernel.org/r/20220519091237.676736-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Michael Ellerman Signed-off-by: Naveen N. Rao Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton --- arch/s390/include/asm/kexec.h | 10 ++++++++++ arch/x86/include/asm/kexec.h | 8 ++++++++ include/linux/kexec.h | 46 +++++++++++++++++++++++++++++++++++-------- kernel/kexec_file.c | 34 -------------------------------- 4 files changed, 56 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 7f3c9ac34bd8..63098df81c9f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -9,6 +9,8 @@ #ifndef _S390_KEXEC_H #define _S390_KEXEC_H +#include + #include #include #include @@ -83,4 +85,12 @@ struct kimage_arch { extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add +#endif #endif /*_S390_KEXEC_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 11b7c06e2828..6ad8d946cd3e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -186,6 +186,14 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add +#endif #endif typedef void crash_vmclear_fn(void); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 58d1b58a971e..fcd5035209f1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -193,14 +193,6 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len); void *arch_kexec_kernel_image_load(struct kimage *image); -int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); -int arch_kexec_apply_relocations(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); int arch_kimage_file_post_load_cleanup(struct kimage *image); #ifdef CONFIG_KEXEC_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, @@ -229,6 +221,44 @@ extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mend); extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, void **addr, unsigned long *sz); + +#ifndef arch_kexec_apply_relocations_add +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ +static inline int +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} +#endif + +#ifndef arch_kexec_apply_relocations +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ +static inline int +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} +#endif #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_KEXEC_ELF diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 8347fc158d2b..c108a2a88754 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -108,40 +108,6 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, } #endif -/* - * arch_kexec_apply_relocations_add - apply relocations of type RELA - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELAs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * arch_kexec_apply_relocations - apply relocations of type REL - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - /* * Free up memory used by kernel, initrd, and command line. This is temporary * memory allocation which is not needed any more after these buffers have -- cgit v1.2.3 From caff1fa4118cec4dfd4336521ebd22a6408a1e3e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 24 May 2022 10:12:27 +0800 Subject: bpf: Fix probe read error in ___bpf_prog_run() I think there is something wrong with BPF_PROBE_MEM in ___bpf_prog_run() in big-endian machine. Let's make a test and see what will happen if we want to load a 'u16' with BPF_PROBE_MEM. Let's make the src value '0x0001', the value of dest register will become 0x0001000000000000, as the value will be loaded to the first 2 byte of DST with following code: bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); Obviously, the value in DST is not correct. In fact, we can compare BPF_PROBE_MEM with LDX_MEM_H: DST = *(SIZE *)(unsigned long) (SRC + insn->off); If the memory load is done by LDX_MEM_H, the value in DST will be 0x1 now. And I think this error results in the test case 'test_bpf_sk_storage_map' failing: test_bpf_sk_storage_map:PASS:bpf_iter_bpf_sk_storage_map__open_and_load 0 nsec test_bpf_sk_storage_map:PASS:socket 0 nsec test_bpf_sk_storage_map:PASS:map_update 0 nsec test_bpf_sk_storage_map:PASS:socket 0 nsec test_bpf_sk_storage_map:PASS:map_update 0 nsec test_bpf_sk_storage_map:PASS:socket 0 nsec test_bpf_sk_storage_map:PASS:map_update 0 nsec test_bpf_sk_storage_map:PASS:attach_iter 0 nsec test_bpf_sk_storage_map:PASS:create_iter 0 nsec test_bpf_sk_storage_map:PASS:read 0 nsec test_bpf_sk_storage_map:FAIL:ipv6_sk_count got 0 expected 3 $10/26 bpf_iter/bpf_sk_storage_map:FAIL The code of the test case is simply, it will load sk->sk_family to the register with BPF_PROBE_MEM and check if it is AF_INET6. With this patch, now the test case 'bpf_iter' can pass: $10 bpf_iter:OK Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Menglong Dong Signed-off-by: Daniel Borkmann Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Cc: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/20220524021228.533216-1-imagedong@tencent.com --- kernel/bpf/core.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cacd8684c3c4..5f6f3f829b36 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1953,6 +1953,11 @@ out: CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read_kernel(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ + DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) @@ -1960,15 +1965,6 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST -#define LDX_PROBE(SIZEOP, SIZE) \ - LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \ - CONT; - LDX_PROBE(B, 1) - LDX_PROBE(H, 2) - LDX_PROBE(W, 4) - LDX_PROBE(DW, 8) -#undef LDX_PROBE #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ -- cgit v1.2.3 From b39181f7c6907dc66ff937b74758671fa6ba430c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 26 May 2022 14:19:12 -0400 Subject: ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function If an unused weak function was traced, it's call to fentry will still exist, which gets added into the __mcount_loc table. Ftrace will use kallsyms to retrieve the name for each location in __mcount_loc to display it in the available_filter_functions and used to enable functions via the name matching in set_ftrace_filter/notrace. Enabling these functions do nothing but enable an unused call to ftrace_caller. If a traced weak function is overridden, the symbol of the function would be used for it, which will either created duplicate names, or if the previous function was not traced, it would be incorrectly be listed in available_filter_functions as a function that can be traced. This became an issue with BPF[1] as there are tooling that enables the direct callers via ftrace but then checks to see if the functions were actually enabled. The case of one function that was marked notrace, but was followed by an unused weak function that was traced. The unused function's call to fentry was added to the __mcount_loc section, and kallsyms retrieved the untraced function's symbol as the weak function was overridden. Since the untraced function would not get traced, the BPF check would detect this and fail. The real fix would be to fix kallsyms to not show addresses of weak functions as the function before it. But that would require adding code in the build to add function size to kallsyms so that it can know when the function ends instead of just using the start of the next known symbol. In the mean time, this is a work around. Add a FTRACE_MCOUNT_MAX_OFFSET macro that if defined, ftrace will ignore any function that has its call to fentry/mcount that has an offset from the symbol that is greater than FTRACE_MCOUNT_MAX_OFFSET. If CONFIG_HAVE_FENTRY is defined for x86, define FTRACE_MCOUNT_MAX_OFFSET to zero (unless IBT is enabled), which will have ftrace ignore all locations that are not at the start of the function (or one after the ENDBR instruction). A worker thread is added at boot up to scan all the ftrace record entries, and will mark any that fail the FTRACE_MCOUNT_MAX_OFFSET test as disabled. They will still appear in the available_filter_functions file as: __ftrace_invalid_address___ (showing the offset that caused it to be invalid). This is required for tools that use libtracefs (like trace-cmd does) that scan the available_filter_functions and enable set_ftrace_filter and set_ftrace_notrace using indexes of the function listed in the file (this is a speedup, as enabling thousands of files via names is an O(n^2) operation and can take minutes to complete, where the indexing takes less than a second). The invalid functions cannot be removed from available_filter_functions as the names there correspond to the ftrace records in the array that manages them (and the indexing depends on this). [1] https://lore.kernel.org/all/20220412094923.0abe90955e5db486b7bca279@kernel.org/ Link: https://lkml.kernel.org/r/20220526141912.794c2786@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 7 +++ kernel/trace/ftrace.c | 141 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 024d9797646e..b5ef474be858 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -9,6 +9,13 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +/* Ignore unused weak functions which will have non zero offsets */ +#ifdef CONFIG_HAVE_FENTRY +# include +/* Add offset for endbr64 if IBT enabled */ +# define FTRACE_MCOUNT_MAX_OFFSET ENDBR_INSN_SIZE +#endif + #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d653ef4febc5..c5088c76a108 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,6 +45,8 @@ #include "trace_output.h" #include "trace_stat.h" +#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" + #define FTRACE_WARN_ON(cond) \ ({ \ int ___r = cond; \ @@ -3654,6 +3656,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, seq_printf(m, " ->%pS", ptr); } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +/* + * Weak functions can still have an mcount/fentry that is saved in + * the __mcount_loc section. These can be detected by having a + * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the + * symbol found by kallsyms is not the function that the mcount/fentry + * is part of. The offset is much greater in these cases. + * + * Test the record to make sure that the ip points to a valid kallsyms + * and if not, mark it disabled. + */ +static int test_for_valid_rec(struct dyn_ftrace *rec) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned long offset; + const char *ret; + + ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str); + + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + rec->flags |= FTRACE_FL_DISABLED; + return 0; + } + return 1; +} + +static struct workqueue_struct *ftrace_check_wq __initdata; +static struct work_struct ftrace_check_work __initdata; + +/* + * Scan all the mcount/fentry entries to make sure they are valid. + */ +static __init void ftrace_check_work_func(struct work_struct *work) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + test_for_valid_rec(rec); + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static int __init ftrace_check_for_weak_functions(void) +{ + INIT_WORK(&ftrace_check_work, ftrace_check_work_func); + + ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0); + + queue_work(ftrace_check_wq, &ftrace_check_work); + return 0; +} + +static int __init ftrace_check_sync(void) +{ + /* Make sure the ftrace_check updates are finished */ + if (ftrace_check_wq) + destroy_workqueue(ftrace_check_wq); + return 0; +} + +late_initcall_sync(ftrace_check_sync); +subsys_initcall(ftrace_check_for_weak_functions); + +static int print_rec(struct seq_file *m, unsigned long ip) +{ + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(ip, NULL, &offset, &modname, str); + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld", + FTRACE_INVALID_FUNCTION, offset); + ret = NULL; + } + + seq_puts(m, str); + if (modname) + seq_printf(m, " [%s]", modname); + return ret == NULL ? -1 : 0; +} +#else +static inline int test_for_valid_rec(struct dyn_ftrace *rec) +{ + return 1; +} + +static inline int print_rec(struct seq_file *m, unsigned long ip) +{ + seq_printf(m, "%ps", (void *)ip); + return 0; +} +#endif + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -3678,7 +3779,13 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps", (void *)rec->ip); + if (print_rec(m, rec->ip)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); + seq_putc(m, '\n'); + return 0; + } + if (iter->flags & FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; @@ -3996,6 +4103,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, return 0; } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + unsigned long offset; + + kallsyms_lookup(ip, NULL, &offset, modname, str); + if (offset > FTRACE_MCOUNT_MAX_OFFSET) + return -1; + return 0; +} +#else +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + kallsyms_lookup(ip, NULL, NULL, modname, str); + return 0; +} +#endif + static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) @@ -4003,7 +4128,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, char str[KSYM_SYMBOL_LEN]; char *modname; - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + if (lookup_ip(rec->ip, &modname, str)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING && + !(rec->flags & FTRACE_FL_DISABLED)); + return 0; + } if (mod_g) { int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; @@ -6819,6 +6949,13 @@ void ftrace_module_enable(struct module *mod) !within_module_init(rec->ip, mod)) break; + /* Weak functions should still be ignored */ + if (!test_for_valid_rec(rec)) { + /* Clear all other flags. Should not be enabled anyway */ + rec->flags = FTRACE_FL_DISABLED; + continue; + } + cnt = 0; /* -- cgit v1.2.3 From 82f586f923e3ac6062bc7867717a7f8afc09e0ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2022 09:45:38 +0200 Subject: sched/autogroup: Fix sysctl move Ivan reported /proc/sys/kernel/sched_autogroup_enabled went walk-about and using the noautogroup command line parameter would result in a boot error message. Turns out the sysctl move placed the init function wrong. Fixes: c8eaf6ac76f4 ("sched: move autogroup sysctls into its own file") Reported-by: Ivan Kozik Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ivan Kozik Link: https://lkml.kernel.org/r/YpR2IqndgsyMzN00@worktop.programming.kicks-ass.net --- kernel/sched/autogroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 16092b49ff6a..4ebaf97f7bd8 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -36,6 +36,7 @@ void __init autogroup_init(struct task_struct *init_task) kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; + sched_autogroup_sysctl_init(); } void autogroup_free(struct task_group *tg) @@ -219,7 +220,6 @@ void sched_autogroup_exit(struct signal_struct *sig) static int __init setup_autogroup(char *str) { sysctl_sched_autogroup_enabled = 0; - sched_autogroup_sysctl_init(); return 1; } -- cgit v1.2.3 From ff979b2a9d9779382030023bfc4e3b1989c8c314 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 6 May 2022 11:27:37 +0800 Subject: ftrace/fgraph: fix increased missing-prototypes warnings After commit e999995c84c3 ("ftrace: cleanup ftrace_graph_caller enable and disable") merged into the linux-next tree, the kernel test robot (lkp@intel.com) has send out report that there are increased missing-prototypes warnings caused by that commit. COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 \ O=build_dir ARCH=sh SHELL=/bin/bash kernel/trace/ warning: no previous prototype for 'ftrace_enable_ftrace_graph_caller' [-Wmissing-prototypes] warning: no previous prototype for 'ftrace_disable_ftrace_graph_caller' [-Wmissing-prototypes] warning: no previous prototype for 'ftrace_return_to_handler' [-Wmissing-prototypes] warning: no previous prototype for 'ftrace_graph_sleep_time_control' [-Wmissing-prototypes] BTW there are so many missing-prototypes warnings if build kernel with "W=1". The increased warnings for 'ftrace_[enable,disable]_ftrace_graph_caller' is caused by CONFIG_FUNCTION_GRAPH_TRACER && !CONFIG_DYNAMIC_FTRACE, so the declarations in can't be seen in fgraph.c. And this warning can't reproduce on x86_64 since x86_64 select HAVE_FUNCTION_GRAPH_TRACER only when DYNAMIC_FTRACE, so fgraph.c will always see the declarations in . This patch fix the increased warnings by put the definitions in CONFIG_DYNAMIC_FTRACE although there are no real problems exist. Signed-off-by: Chengming Zhou Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20220506032737.23375-1-zhouchengming@bytedance.com Signed-off-by: Catalin Marinas --- kernel/trace/fgraph.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 289311680c29..2cd374294be7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -30,6 +30,7 @@ int ftrace_graph_active; /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; +#ifdef CONFIG_DYNAMIC_FTRACE /* * archs can override this function if they must do something * to enable hook for graph tracer. @@ -47,6 +48,7 @@ int __weak ftrace_disable_ftrace_graph_caller(void) { return 0; } +#endif /** * ftrace_graph_stop - set to permanently disable function graph tracing -- cgit v1.2.3 From 662ce1dc9caf493c309200edbe38d186f1ea20d0 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 1 Jun 2022 15:55:25 -0700 Subject: delayacct: track delays from write-protect copy Delay accounting does not track the delay of write-protect copy. When tasks trigger many write-protect copys(include COW and unsharing of anonymous pages[1]), it may spend a amount of time waiting for them. To get the delay of tasks in write-protect copy, could help users to evaluate the impact of using KSM or fork() or GUP. Also update tools/accounting/getdelays.c: / # ./getdelays -dl -p 231 print delayacct stats ON listen forever PID 231 CPU count real total virtual total delay total delay average 6247 1859000000 2154070021 1674255063 0.268ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms WPCOPY count delay total delay average 3635 271567604 0ms [1] commit 31cc5bc4af70("mm: support GUP-triggered unsharing of anonymous pages") Link: https://lkml.kernel.org/r/20220409014342.2505532-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: David Hildenbrand Reviewed-by: Jiang Xuexin Reviewed-by: Ran Xiaokai Reviewed-by: wangyong Cc: Jonathan Corbet Cc: Balbir Singh Cc: Mike Kravetz Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 5 ++++- include/linux/delayacct.h | 28 +++++++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 16 +++++++++++++++ mm/hugetlb.c | 8 ++++++++ mm/memory.c | 8 ++++++++ tools/accounting/getdelays.c | 8 +++++++- 7 files changed, 76 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 197fe319cbec..241d1a87f2cd 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -15,6 +15,7 @@ c) swapping in pages d) memory reclaim e) thrashing page cache f) direct compact +g) write-protect copy and makes these statistics available to userspace through the taskstats interface. @@ -48,7 +49,7 @@ this structure. See for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page -cache, direct compact etc. +cache, direct compact, write-protect copy etc. Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -117,6 +118,8 @@ Get sum of delays, since system boot, for all pids with tgid 5:: 0 0 0ms COMPACT count delay total delay average 0 0 0ms + WPCOPY count delay total delay average + 0 0 0ms Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6b16a6930a19..58aea2d7385c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -45,9 +45,13 @@ struct task_delay_info { u64 compact_start; u64 compact_delay; /* wait for memory compact */ + u64 wpcopy_start; + u64 wpcopy_delay; /* wait for write-protect copy */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ + u32 wpcopy_count; /* total count of write-protect copy */ }; #endif @@ -75,6 +79,8 @@ extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); +extern void __delayacct_wpcopy_start(void); +extern void __delayacct_wpcopy_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -191,6 +197,24 @@ static inline void delayacct_compact_end(void) __delayacct_compact_end(); } +static inline void delayacct_wpcopy_start(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_wpcopy_start(); +} + +static inline void delayacct_wpcopy_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_wpcopy_end(); +} + #else static inline void delayacct_init(void) {} @@ -225,6 +249,10 @@ static inline void delayacct_compact_start(void) {} static inline void delayacct_compact_end(void) {} +static inline void delayacct_wpcopy_start(void) +{} +static inline void delayacct_wpcopy_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 736154171489..a7f5b11a8f1b 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 12 +#define TASKSTATS_VERSION 13 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -194,6 +194,10 @@ struct taskstats { __u64 ac_exe_dev; /* program binary device ID */ __u64 ac_exe_inode; /* program binary inode number */ /* v12 end */ + + /* v13: Delay waiting for write-protect copy */ + __u64 wpcopy_count; + __u64 wpcopy_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2c1e18f7c5cf..164ed9ef77a3 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -177,11 +177,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; + tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; + d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; + d->wpcopy_count += tsk->delays->wpcopy_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -249,3 +252,16 @@ void __delayacct_compact_end(void) ¤t->delays->compact_delay, ¤t->delays->compact_count); } + +void __delayacct_wpcopy_start(void) +{ + current->delays->wpcopy_start = local_clock(); +} + +void __delayacct_wpcopy_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->wpcopy_start, + ¤t->delays->wpcopy_delay, + ¤t->delays->wpcopy_count); +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c468ac1d069..a57e1be41401 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -5230,6 +5231,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, pte = huge_ptep_get(ptep); old_page = pte_page(pte); + delayacct_wpcopy_start(); + retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive @@ -5240,6 +5243,8 @@ retry_avoidcopy: page_move_anon_rmap(old_page, vma); if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); + + delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), @@ -5309,6 +5314,7 @@ retry_avoidcopy: * race occurs while re-acquiring page table * lock, and our job is done. */ + delayacct_wpcopy_end(); return 0; } @@ -5367,6 +5373,8 @@ out_release_old: put_page(old_page); spin_lock(ptl); /* Caller expects lock to be held */ + + delayacct_wpcopy_end(); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 21dadf03f089..7a089145cad4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3090,6 +3090,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) int page_copied = 0; struct mmu_notifier_range range; + delayacct_wpcopy_start(); + if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -3114,6 +3116,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(new_page); if (old_page) put_page(old_page); + + delayacct_wpcopy_end(); return 0; } } @@ -3220,12 +3224,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) free_swap_cache(old_page); put_page(old_page); } + + delayacct_wpcopy_end(); return (page_copied && !unshare) ? VM_FAULT_WRITE : 0; oom_free_new: put_page(new_page); oom: if (old_page) put_page(old_page); + + delayacct_wpcopy_end(); return VM_FAULT_OOM; } diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 11e86739456d..e83e6e47a21e 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -207,6 +207,8 @@ static void print_delayacct(struct taskstats *t) "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n" "COMPACT %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "WPCOPY %12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -234,7 +236,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->compact_count, (unsigned long long)t->compact_delay_total, - average_ms(t->compact_delay_total, t->compact_count)); + average_ms(t->compact_delay_total, t->compact_count), + "count", "delay total", "delay average", + (unsigned long long)t->wpcopy_count, + (unsigned long long)t->wpcopy_delay_total, + average_ms(t->wpcopy_delay_total, t->wpcopy_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3 From e19f8fa6ce1ca9b8b934ba7d2e8f34c95abc6e60 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 1 Jun 2022 07:51:16 -0700 Subject: dma-debug: make things less spammy under memory pressure Limit the error msg to avoid flooding the console. If you have a lot of threads hitting this at once, they could have already gotten passed the dma_debug_disabled() check before they get to the point of allocation failure, resulting in quite a lot of this error message spamming the log. Use pr_err_once() to limit that. Signed-off-by: Rob Clark Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index ac740630c79c..2caafd13f8aa 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -564,7 +564,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { - pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); + pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { err_printk(entry->dev, entry, -- cgit v1.2.3 From e15db62bc5648ab459a570862f654e787c498faf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jun 2022 20:49:39 +0200 Subject: swiotlb: fix setting ->force_bounce The swiotlb_init refactor messed up assigning ->force_bounce by doing it in different places based on what caused the setting of the flag. Fix this by passing the SWIOTLB_* flags to swiotlb_init_io_tlb_mem and just setting it there. Fixes: c6af2aa9ffc9 ("swiotlb: make the swiotlb_init interface more useful") Reported-by: Nathan Chancellor Signed-off-by: Christoph Hellwig Tested-by: Nathan Chancellor --- kernel/dma/swiotlb.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index dfa1de89dc94..cb50f8d38360 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -192,7 +192,7 @@ void __init swiotlb_update_mem_attributes(void) } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, - unsigned long nslabs, bool late_alloc) + unsigned long nslabs, unsigned int flags, bool late_alloc) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; @@ -203,8 +203,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->index = 0; mem->late_alloc = late_alloc; - if (swiotlb_force_bounce) - mem->force_bounce = true; + mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); spin_lock_init(&mem->lock); for (i = 0; i < mem->nslabs; i++) { @@ -275,8 +274,7 @@ retry: panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); - mem->force_bounce = flags & SWIOTLB_FORCE; + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); @@ -348,7 +346,7 @@ retry: set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true); swiotlb_print_info(); return 0; @@ -835,8 +833,8 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); - mem->force_bounce = true; + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE, + false); mem->for_alloc = true; rmem->priv = mem; -- cgit v1.2.3 From 587b9bfe0668bc997e51af9526a0c7c084d4660f Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 1 Jun 2022 01:11:02 +0300 Subject: kernel/reboot: Use static handler for register_platform_power_off() The register_platform_power_off() fails on m68k platform due to the memory allocation error that happens at a very early boot time when memory allocator isn't available yet. Fix it by using a static sys-off handler for the platform-level power-off handlers. Fixes: f0f7e5265b3b ("m68k: Switch to new sys-off handler API") Reported-by: Geert Uytterhoeven Signed-off-by: Dmitry Osipenko Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- kernel/reboot.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index a091145ee710..3b19b123efec 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -315,6 +315,37 @@ static int sys_off_notify(struct notifier_block *nb, return handler->sys_off_cb(&data); } +static struct sys_off_handler platform_sys_off_handler; + +static struct sys_off_handler *alloc_sys_off_handler(int priority) +{ + struct sys_off_handler *handler; + + /* + * Platforms like m68k can't allocate sys_off handler dynamically + * at the early boot time because memory allocator isn't available yet. + */ + if (priority == SYS_OFF_PRIO_PLATFORM) { + handler = &platform_sys_off_handler; + if (handler->cb_data) + return ERR_PTR(-EBUSY); + } else { + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler) + return ERR_PTR(-ENOMEM); + } + + return handler; +} + +static void free_sys_off_handler(struct sys_off_handler *handler) +{ + if (handler == &platform_sys_off_handler) + memset(handler, 0, sizeof(*handler)); + else + kfree(handler); +} + /** * register_sys_off_handler - Register sys-off handler * @mode: Sys-off mode @@ -345,9 +376,9 @@ register_sys_off_handler(enum sys_off_mode mode, struct sys_off_handler *handler; int err; - handler = kzalloc(sizeof(*handler), GFP_KERNEL); - if (!handler) - return ERR_PTR(-ENOMEM); + handler = alloc_sys_off_handler(priority); + if (IS_ERR(handler)) + return handler; switch (mode) { case SYS_OFF_MODE_POWER_OFF_PREPARE: @@ -364,7 +395,7 @@ register_sys_off_handler(enum sys_off_mode mode, break; default: - kfree(handler); + free_sys_off_handler(handler); return ERR_PTR(-EINVAL); } @@ -391,7 +422,7 @@ register_sys_off_handler(enum sys_off_mode mode, } if (err) { - kfree(handler); + free_sys_off_handler(handler); return ERR_PTR(err); } @@ -422,7 +453,7 @@ void unregister_sys_off_handler(struct sys_off_handler *handler) /* sanity check, shall never happen */ WARN_ON(err); - kfree(handler); + free_sys_off_handler(handler); } EXPORT_SYMBOL_GPL(unregister_sys_off_handler); -- cgit v1.2.3 From 73503963b715a64a44aa2b1c486114b917a17c73 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 1 Jun 2022 20:56:52 -0700 Subject: module: Fix prefix for module.sig_enforce module param Commit cfc1d277891e ("module: Move all into module/") changed the prefix of the module param by moving/renaming files. A later commit also moves the module_param() into a different file, thereby changing the prefix yet again. This would break kernel cmdline compatibility and also userspace compatibility at /sys/module/module/parameters/sig_enforce. So, set the prefix back to "module.". Fixes: cfc1d277891e ("module: Move all into module/") Link: https://lore.kernel.org/lkml/20220602034111.4163292-1-saravanak@google.com/ Cc: Christophe Leroy Cc: Aaron Tomlin Acked-by: Luis Chamberlain Signed-off-by: Saravana Kannan Signed-off-by: Linus Torvalds --- kernel/module/signing.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module/signing.c b/kernel/module/signing.c index 85c8999dfecf..a2ff4242e623 100644 --- a/kernel/module/signing.c +++ b/kernel/module/signing.c @@ -16,6 +16,9 @@ #include #include "internal.h" +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." + static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); -- cgit v1.2.3 From 2130a790ca49763f724ec45cf93b9dd765e2023e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Jun 2022 15:05:26 +0200 Subject: kernel: add platform_has() infrastructure Add a simple infrastructure for setting, resetting and querying platform feature flags. Flags can be either global or architecture specific. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Tested-by: Oleksandr Tyshchenko # Arm64 only Reviewed-by: Christoph Hellwig Acked-by: Borislav Petkov Signed-off-by: Juergen Gross --- MAINTAINERS | 8 ++++++++ include/asm-generic/Kbuild | 1 + include/asm-generic/platform-feature.h | 8 ++++++++ include/linux/platform-feature.h | 15 +++++++++++++++ kernel/Makefile | 2 +- kernel/platform-feature.c | 27 +++++++++++++++++++++++++++ 6 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 include/asm-generic/platform-feature.h create mode 100644 include/linux/platform-feature.h create mode 100644 kernel/platform-feature.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index a6d3bd9d2a8d..06678abc22ca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15824,6 +15824,14 @@ S: Maintained F: Documentation/devicetree/bindings/iio/chemical/plantower,pms7003.yaml F: drivers/iio/chemical/pms7003.c +PLATFORM FEATURE INFRASTRUCTURE +M: Juergen Gross +S: Maintained +F: arch/*/include/asm/platform-feature.h +F: include/asm-generic/platform-feature.h +F: include/linux/platform-feature.h +F: kernel/platform-feature.c + PLDMFW LIBRARY M: Jacob Keller S: Maintained diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 302506bbc2a4..8e47d483b524 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -44,6 +44,7 @@ mandatory-y += msi.h mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h +mandatory-y += platform-feature.h mandatory-y += preempt.h mandatory-y += rwonce.h mandatory-y += sections.h diff --git a/include/asm-generic/platform-feature.h b/include/asm-generic/platform-feature.h new file mode 100644 index 000000000000..4b0af3d51588 --- /dev/null +++ b/include/asm-generic/platform-feature.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_PLATFORM_FEATURE_H +#define _ASM_GENERIC_PLATFORM_FEATURE_H + +/* Number of arch specific feature flags. */ +#define PLATFORM_ARCH_FEAT_N 0 + +#endif /* _ASM_GENERIC_PLATFORM_FEATURE_H */ diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h new file mode 100644 index 000000000000..6ed859928b97 --- /dev/null +++ b/include/linux/platform-feature.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PLATFORM_FEATURE_H +#define _PLATFORM_FEATURE_H + +#include +#include + +/* The platform features are starting with the architecture specific ones. */ +#define PLATFORM_FEAT_N (0 + PLATFORM_ARCH_FEAT_N) + +void platform_set(unsigned int feature); +void platform_clear(unsigned int feature); +bool platform_has(unsigned int feature); + +#endif /* _PLATFORM_FEATURE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d3..a7e1f49ab2b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ - extable.o params.o \ + extable.o params.o platform-feature.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o diff --git a/kernel/platform-feature.c b/kernel/platform-feature.c new file mode 100644 index 000000000000..cb6a6c3e4fed --- /dev/null +++ b/kernel/platform-feature.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#define PLATFORM_FEAT_ARRAY_SZ BITS_TO_LONGS(PLATFORM_FEAT_N) +static unsigned long __read_mostly platform_features[PLATFORM_FEAT_ARRAY_SZ]; + +void platform_set(unsigned int feature) +{ + set_bit(feature, platform_features); +} +EXPORT_SYMBOL_GPL(platform_set); + +void platform_clear(unsigned int feature) +{ + clear_bit(feature, platform_features); +} +EXPORT_SYMBOL_GPL(platform_clear); + +bool platform_has(unsigned int feature) +{ + return test_bit(feature, platform_features); +} +EXPORT_SYMBOL_GPL(platform_has); -- cgit v1.2.3 From 3e684903a8574ffc9475fdf13c4780a7adb506ad Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 4 May 2022 13:08:40 -0500 Subject: entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set A livepatch transition may stall indefinitely when a kvm vCPU is heavily loaded. To the host, the vCPU task is a user thread which is spending a very long time in the ioctl(KVM_RUN) syscall. During livepatch transition, set_notify_signal() will be called on such tasks to interrupt the syscall so that the task can be transitioned. This interrupts guest execution, but when xfer_to_guest_mode_work() sees that TIF_NOTIFY_SIGNAL is set but not TIF_SIGPENDING it concludes that an exit to user mode is unnecessary, and guest execution is resumed without transitioning the task for the livepatch. This handling of TIF_NOTIFY_SIGNAL is incorrect, as set_notify_signal() is expected to break tasks out of interruptible kernel loops and cause them to return to userspace. Change xfer_to_guest_mode_work() to handle TIF_NOTIFY_SIGNAL the same as TIF_SIGPENDING, signaling to the vCPU run loop that an exit to userpsace is needed. Any pending task_work will be run when get_signal() is called from exit_to_user_mode_loop(), so there is no longer any need to run task work from xfer_to_guest_mode_work(). Suggested-by: "Eric W. Biederman" Cc: Petr Mladek Signed-off-by: Seth Forshee Message-Id: <20220504180840.2907296-1-sforshee@digitalocean.com> Signed-off-by: Paolo Bonzini --- kernel/entry/kvm.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 9d09f489b60e..2e0f75bcb7fd 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) int ret; if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - } - - if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; } -- cgit v1.2.3 From c4f135d643823a869becfa87539f7820ef9d5bfa Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 1 Jun 2022 16:32:47 +0900 Subject: workqueue: Wrap flush_workqueue() using a macro Since flush operation synchronously waits for completion, flushing system-wide WQs (e.g. system_wq) might introduce possibility of deadlock due to unexpected locking dependency. Tejun Heo commented at [1] that it makes no sense at all to call flush_workqueue() on the shared WQs as the caller has no idea what it's gonna end up waiting for. Although there is flush_scheduled_work() which flushes system_wq WQ with "Think twice before calling this function! It's very easy to get into trouble if you don't take great care." warning message, syzbot found a circular locking dependency caused by flushing system_wq WQ [2]. Therefore, let's change the direction to that developers had better use their local WQs if flush_scheduled_work()/flush_workqueue(system_*_wq) is inevitable. Steps for converting system-wide WQs into local WQs are explained at [3], and a conversion to stop flushing system-wide WQs is in progress. Now we want some mechanism for preventing developers who are not aware of this conversion from again start flushing system-wide WQs. Since I found that WARN_ON() is complete but awkward approach for teaching developers about this problem, let's use __compiletime_warning() for incomplete but handy approach. For completeness, we will also insert WARN_ON() into __flush_workqueue() after all in-tree users stopped calling flush_scheduled_work(). Link: https://lore.kernel.org/all/YgnQGZWT%2Fn3VAITX@slm.duckdns.org/ [1] Link: https://syzkaller.appspot.com/bug?extid=bde0f89deacca7c765b8 [2] Link: https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp [3] Signed-off-by: Tetsuo Handa Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 64 +++++++++++++++++++++++++++++++++++++++++------ kernel/workqueue.c | 16 +++++++++--- 2 files changed, 68 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7fee9b6cfede..e1f1c8b1121b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -445,7 +445,7 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); -extern void flush_workqueue(struct workqueue_struct *wq); +extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); @@ -563,15 +563,23 @@ static inline bool schedule_work(struct work_struct *work) return queue_work(system_wq, work); } +/* + * Detect attempt to flush system-wide workqueues at compile time when possible. + * + * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp + * for reasons and steps for converting system-wide workqueues into local workqueues. + */ +extern void __warn_flushing_systemwide_wq(void) + __compiletime_warning("Please avoid flushing system-wide workqueues."); + /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * - * Think twice before calling this function! It's very easy to get into - * trouble if you don't take great care. Either of the following situations - * will lead to deadlock: + * It's very easy to get into trouble if you don't take great care. + * Either of the following situations will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. @@ -586,11 +594,51 @@ static inline bool schedule_work(struct work_struct *work) * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. + * + * Please stop calling this function! A conversion to stop flushing system-wide + * workqueues is in progress. This function will be removed after all in-tree + * users stopped calling this function. */ -static inline void flush_scheduled_work(void) -{ - flush_workqueue(system_wq); -} +/* + * The background of commit 771c035372a036f8 ("deprecate the + * '__deprecated' attribute warnings entirely and for good") is that, + * since Linus builds all modules between every single pull he does, + * the standard kernel build needs to be _clean_ in order to be able to + * notice when new problems happen. Therefore, don't emit warning while + * there are in-tree users. + */ +#define flush_scheduled_work() \ +({ \ + if (0) \ + __warn_flushing_systemwide_wq(); \ + __flush_workqueue(system_wq); \ +}) + +/* + * Although there is no longer in-tree caller, for now just emit warning + * in order to give out-of-tree callers time to update. + */ +#define flush_workqueue(wq) \ +({ \ + struct workqueue_struct *_wq = (wq); \ + \ + if ((__builtin_constant_p(_wq == system_wq) && \ + _wq == system_wq) || \ + (__builtin_constant_p(_wq == system_highpri_wq) && \ + _wq == system_highpri_wq) || \ + (__builtin_constant_p(_wq == system_long_wq) && \ + _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_unbound_wq) && \ + _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_freezable_wq) && \ + _wq == system_freezable_wq) || \ + (__builtin_constant_p(_wq == system_power_efficient_wq) && \ + _wq == system_power_efficient_wq) || \ + (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ + _wq == system_freezable_power_efficient_wq)) \ + __warn_flushing_systemwide_wq(); \ + __flush_workqueue(_wq); \ +}) /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4056f2a3f9d5..1ea50f6be843 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2788,13 +2788,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } /** - * flush_workqueue - ensure that any scheduled work has run to completion. + * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ -void flush_workqueue(struct workqueue_struct *wq) +void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), @@ -2943,7 +2943,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL(flush_workqueue); +EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2971,7 +2971,7 @@ void drain_workqueue(struct workqueue_struct *wq) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: - flush_workqueue(wq); + __flush_workqueue(wq); mutex_lock(&wq->mutex); @@ -6111,3 +6111,11 @@ void __init workqueue_init(void) wq_online = true; wq_watchdog_init(); } + +/* + * Despite the naming, this is a no-op function which is here only for avoiding + * link error. Since compile-time warning may fail to catch, we will need to + * emit run-time warning from __flush_workqueue(). + */ +void __warn_flushing_systemwide_wq(void) { } +EXPORT_SYMBOL(__warn_flushing_systemwide_wq); -- cgit v1.2.3 From fd58f7df2415ef747782e01f94880fefad1247cf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 May 2022 13:24:05 +0300 Subject: bpf: Use safer kvmalloc_array() where possible The kvmalloc_array() function is safer because it has a check for integer overflows. These sizes come from the user and I was not able to see any bounds checking so an integer overflow seems like a realistic concern. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Signed-off-by: Dan Carpenter Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/Yo9VRVMeHbALyjUH@kili Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 10b157a6d73e..7a13e6ac6327 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2263,11 +2263,11 @@ static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 int err = -ENOMEM; unsigned int i; - syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL); + syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); if (!syms) goto error; - buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL); + buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); if (!buf) goto error; @@ -2464,7 +2464,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; size = cnt * sizeof(*addrs); - addrs = kvmalloc(size, GFP_KERNEL); + addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; @@ -2489,7 +2489,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { - cookies = kvmalloc(size, GFP_KERNEL); + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; -- cgit v1.2.3 From f858c2b2ca04fc7ead291821a793638ae120c11d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 6 Jun 2022 09:52:51 +0200 Subject: bpf: Fix calling global functions from BPF_PROG_TYPE_EXT programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verifier allows programs to call global functions as long as their argument types match, using BTF to check the function arguments. One of the allowed argument types to such global functions is PTR_TO_CTX; however the check for this fails on BPF_PROG_TYPE_EXT functions because the verifier uses the wrong type to fetch the vmlinux BTF ID for the program context type. This failure is seen when an XDP program is loaded using libxdp (which loads it as BPF_PROG_TYPE_EXT and attaches it to a global XDP type program). Fix the issue by passing in the target program type instead of the BPF_PROG_TYPE_EXT type to bpf_prog_get_ctx() when checking function argument compatibility. The first Fixes tag refers to the latest commit that touched the code in question, while the second one points to the code that first introduced the global function call verification. v2: - Use resolve_prog_type() Fixes: 3363bd0cfbb8 ("bpf: Extend kfunc with PTR_TO_CTX, PTR_TO_MEM argument support") Fixes: 51c39bb1d5d1 ("bpf: Introduce function-by-function verification") Reported-by: Simon Sundberg Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220606075253.28422-1-toke@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7bccaa4646e5..63d0ac7dfe2f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6054,6 +6054,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); @@ -6171,7 +6172,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } /* rest of the arguments can be anything, like normal kfunc */ - } else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { + } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ -- cgit v1.2.3 From 2b8c612c6102f751e6e3e1bd425f64e9d3d3f638 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 6 Jun 2022 19:56:40 +0300 Subject: kernel/reboot: Fix powering off using a non-syscall code paths There are other methods of powering off machine than the reboot syscall. Previously we missed to cover those methods and it created power-off regression for some machines, like the PowerPC e500. Fix this problem by moving the legacy sys-off handler registration to the latest phase of power-off process and making the kernel_can_power_off() check the legacy pm_power_off presence. Tested-by: Michael Ellerman # ppce500 Reported-by: Michael Ellerman # ppce500 Fixes: da007f171fc9 ("kernel/reboot: Change registration order of legacy power-off handler") Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- kernel/reboot.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 3b19b123efec..b5a71d1ff603 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -320,6 +320,7 @@ static struct sys_off_handler platform_sys_off_handler; static struct sys_off_handler *alloc_sys_off_handler(int priority) { struct sys_off_handler *handler; + gfp_t flags; /* * Platforms like m68k can't allocate sys_off handler dynamically @@ -330,7 +331,12 @@ static struct sys_off_handler *alloc_sys_off_handler(int priority) if (handler->cb_data) return ERR_PTR(-EBUSY); } else { - handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (system_state > SYSTEM_RUNNING) + flags = GFP_ATOMIC; + else + flags = GFP_KERNEL; + + handler = kzalloc(sizeof(*handler), flags); if (!handler) return ERR_PTR(-ENOMEM); } @@ -440,7 +446,7 @@ void unregister_sys_off_handler(struct sys_off_handler *handler) { int err; - if (!handler) + if (IS_ERR_OR_NULL(handler)) return; if (handler->blocking) @@ -615,7 +621,23 @@ static void do_kernel_power_off_prepare(void) */ void do_kernel_power_off(void) { + struct sys_off_handler *sys_off = NULL; + + /* + * Register sys-off handlers for legacy PM callback. This allows + * legacy PM callbacks temporary co-exist with the new sys-off API. + * + * TODO: Remove legacy handlers once all legacy PM users will be + * switched to the sys-off based APIs. + */ + if (pm_power_off) + sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + legacy_pm_power_off, NULL); + atomic_notifier_call_chain(&power_off_handler_list, 0, NULL); + + unregister_sys_off_handler(sys_off); } /** @@ -626,7 +648,8 @@ void do_kernel_power_off(void) */ bool kernel_can_power_off(void) { - return !atomic_notifier_call_chain_is_empty(&power_off_handler_list); + return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) || + pm_power_off; } EXPORT_SYMBOL_GPL(kernel_can_power_off); @@ -661,7 +684,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { struct pid_namespace *pid_ns = task_active_pid_ns(current); - struct sys_off_handler *sys_off = NULL; char buffer[256]; int ret = 0; @@ -686,21 +708,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if (ret) return ret; - /* - * Register sys-off handlers for legacy PM callback. This allows - * legacy PM callbacks temporary co-exist with the new sys-off API. - * - * TODO: Remove legacy handlers once all legacy PM users will be - * switched to the sys-off based APIs. - */ - if (pm_power_off) { - sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, - SYS_OFF_PRIO_DEFAULT, - legacy_pm_power_off, NULL); - if (IS_ERR(sys_off)) - return PTR_ERR(sys_off); - } - /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ @@ -758,7 +765,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, break; } mutex_unlock(&system_transition_mutex); - unregister_sys_off_handler(sys_off); return ret; } -- cgit v1.2.3 From 668a9fe5c6a1bcac6b65d5e9b91a9eca86f782a3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 8 Jun 2022 14:45:35 +0100 Subject: genirq: PM: Use runtime PM for chained interrupts When requesting an interrupt, we correctly call into the runtime PM framework to guarantee that the underlying interrupt controller is up and running. However, we fail to do so for chained interrupt controllers, as the mux interrupt is not requested along the same path. Augment __irq_do_set_handler() to call into the runtime PM code in this case, making sure the PM flow is the same for all interrupts. Reported-by: Lucas Stach Tested-by: Liu Ying Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/26973cddee5f527ea17184c0f3fccb70bc8969a0.camel@pengutronix.de --- kernel/irq/chip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e6b8e564b37f..886789dcee43 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1006,8 +1006,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); - if (is_chained) + if (is_chained) { desc->action = NULL; + WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc))); + } desc->depth = 1; } desc->handle_irq = handle; @@ -1033,6 +1035,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; + WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc))); irq_activate_and_startup(desc, IRQ_RESEND); } } -- cgit v1.2.3 From 04193d590b390ec7a0592630f46d559ec6564ba1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Jun 2022 22:41:55 +0200 Subject: sched: Fix balance_push() vs __sched_setscheduler() The purpose of balance_push() is to act as a filter on task selection in the case of CPU hotplug, specifically when taking the CPU out. It does this by (ab)using the balance callback infrastructure, with the express purpose of keeping all the unlikely/odd cases in a single place. In order to serve its purpose, the balance_push_callback needs to be (exclusively) on the callback list at all times (noting that the callback always places itself back on the list the moment it runs, also noting that when the CPU goes down, regular balancing concerns are moot, so ignoring them is fine). And here-in lies the problem, __sched_setscheduler()'s use of splice_balance_callbacks() takes the callbacks off the list across a lock-break, making it possible for, an interleaving, __schedule() to see an empty list and not get filtered. Fixes: ae7927023243 ("sched: Optimize finish_lock_switch()") Reported-by: Jing-Ting Wu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jing-Ting Wu Link: https://lkml.kernel.org/r/20220519134706.GH2578@worktop.programming.kicks-ass.net --- kernel/sched/core.c | 36 +++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 5 +++++ 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfa7452ca92e..da0bf6fe9ecd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4798,25 +4798,55 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) static void balance_push(struct rq *rq); +/* + * balance_push_callback is a right abuse of the callback interface and plays + * by significantly different rules. + * + * Where the normal balance_callback's purpose is to be ran in the same context + * that queued it (only later, when it's safe to drop rq->lock again), + * balance_push_callback is specifically targeted at __schedule(). + * + * This abuse is tolerated because it places all the unlikely/odd cases behind + * a single test, namely: rq->balance_callback == NULL. + */ struct callback_head balance_push_callback = { .next = NULL, .func = (void (*)(struct callback_head *))balance_push, }; -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct callback_head * +__splice_balance_callbacks(struct rq *rq, bool split) { struct callback_head *head = rq->balance_callback; + if (likely(!head)) + return NULL; + lockdep_assert_rq_held(rq); - if (head) + /* + * Must not take balance_push_callback off the list when + * splice_balance_callbacks() and balance_callbacks() are not + * in the same rq->lock section. + * + * In that case it would be possible for __schedule() to interleave + * and observe the list empty. + */ + if (split && head == &balance_push_callback) + head = NULL; + else rq->balance_callback = NULL; return head; } +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + return __splice_balance_callbacks(rq, true); +} + static void __balance_callbacks(struct rq *rq) { - do_balance_callbacks(rq, splice_balance_callbacks(rq)); + do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } static inline void balance_callbacks(struct rq *rq, struct callback_head *head) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 01259611beb9..47b89a0fc6e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1693,6 +1693,11 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_rq_held(rq); + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; -- cgit v1.2.3 From 4051a81774d6d8e28192742c26999d6f29bc0e68 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 17 May 2022 11:16:14 +0200 Subject: locking/lockdep: Use sched_clock() for random numbers Since the rewrote of prandom_u32(), in the commit mentioned below, the function uses sleeping locks which extracing random numbers and filling the batch. This breaks lockdep on PREEMPT_RT because lock_pin_lock() disables interrupts while calling __lock_pin_lock(). This can't be moved earlier because the main user of the function (rq_pin_lock()) invokes that function after disabling interrupts in order to acquire the lock. The cookie does not require random numbers as its goal is to provide a random value in order to notice unexpected "unlock + lock" sites. Use sched_clock() to provide random numbers. Fixes: a0103f4d86f88 ("random32: use real rng for non-deterministic randomness") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YoNn3pTkm5+QzE5k@linutronix.de --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81e87280513e..f06b91ca6482 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5432,7 +5432,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) * be guessable and still allows some pin nesting in * our u32 pin_count. */ - cookie.val = 1 + (prandom_u32() >> 16); + cookie.val = 1 + (sched_clock() & 0xffff); hlock->pin_count += cookie.val; return cookie; } -- cgit v1.2.3 From 57cd6d157eb479f0a8e820fd36b7240845c8a937 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 31 May 2022 10:59:10 -0700 Subject: cfi: Fix __cfi_slowpath_diag RCU usage with cpuidle RCU_NONIDLE usage during __cfi_slowpath_diag can result in an invalid RCU state in the cpuidle code path: WARNING: CPU: 1 PID: 0 at kernel/rcu/tree.c:613 rcu_eqs_enter+0xe4/0x138 ... Call trace: rcu_eqs_enter+0xe4/0x138 rcu_idle_enter+0xa8/0x100 cpuidle_enter_state+0x154/0x3a8 cpuidle_enter+0x3c/0x58 do_idle.llvm.6590768638138871020+0x1f4/0x2ec cpu_startup_entry+0x28/0x2c secondary_start_kernel+0x1b8/0x220 __secondary_switched+0x94/0x98 Instead, call rcu_irq_enter/exit to wake up RCU only when needed and disable interrupts for the entire CFI shadow/module check when we do. Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20220531175910.890307-1-samitolvanen@google.com Fixes: cf68fffb66d6 ("add support for Clang CFI") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/cfi.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cfi.c b/kernel/cfi.c index 9594cfd1cf2c..08102d19ec15 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -281,6 +281,8 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr) static inline cfi_check_fn find_check_fn(unsigned long ptr) { cfi_check_fn fn = NULL; + unsigned long flags; + bool rcu_idle; if (is_kernel_text(ptr)) return __cfi_check; @@ -290,13 +292,21 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) * the shadow and __module_address use RCU, so we need to wake it * up if necessary. */ - RCU_NONIDLE({ - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); + rcu_idle = !rcu_is_watching(); + if (rcu_idle) { + local_irq_save(flags); + rcu_irq_enter(); + } + + if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) + fn = find_shadow_check_fn(ptr); + if (!fn) + fn = find_module_check_fn(ptr); - if (!fn) - fn = find_module_check_fn(ptr); - }); + if (rcu_idle) { + rcu_irq_exit(); + local_irq_restore(flags); + } return fn; } -- cgit v1.2.3 From d1a374a1aeb7e31191448e225ed2f9c5e894f280 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 15 Jun 2022 09:51:51 +0530 Subject: bpf: Limit maximum modifier chain length in btf_check_type_tags On processing a module BTF of module built for an older kernel, we might sometimes find that some type points to itself forming a loop. If such a type is a modifier, btf_check_type_tags's while loop following modifier chain will be caught in an infinite loop. Fix this by defining a maximum chain length and bailing out if we spin any longer than that. Fixes: eb596b090558 ("bpf: Ensure type tags precede modifiers in BTF") Reported-by: Daniel Borkmann Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220615042151.2266537-1-memxor@gmail.com --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 63d0ac7dfe2f..eb12d4f705cc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4815,6 +4815,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env, n = btf_nr_types(btf); for (i = start_id; i < n; i++) { const struct btf_type *t; + int chain_limit = 32; u32 cur_id = i; t = btf_type_by_id(btf, i); @@ -4827,6 +4828,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, in_tags = btf_type_is_type_tag(t); while (btf_type_is_modifier(t)) { + if (!chain_limit--) { + btf_verifier_log(env, "Max chain length or cycle detected"); + return -ELOOP; + } if (btf_type_is_type_tag(t)) { if (!in_tags) { btf_verifier_log(env, "Type tags don't precede modifiers"); -- cgit v1.2.3 From c3230283e2819a69dad2cf7a63143fde8bab8b5c Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 15 Jun 2022 18:28:04 +0200 Subject: printk: Block console kthreads when direct printing will be required There are known situations when the console kthreads are not reliable or does not work in principle, for example, early boot, panic, shutdown. For these situations there is the direct (legacy) mode when printk() tries to get console_lock() and flush the messages directly. It works very well during the early boot when the console kthreads are not available at all. It gets more complicated in the other situations when console kthreads might be actively printing and block console_trylock() in printk(). The same problem is in the legacy code as well. Any console_lock() owner could block console_trylock() in printk(). It is solved by a trick that the current console_lock() owner is responsible for printing all pending messages. It is actually the reason why there is the risk of softlockups and why the console kthreads were introduced. The console kthreads use the same approach. They are responsible for printing the messages by definition. So that they handle the messages anytime when they are awake and see new ones. The global console_lock is available when there is nothing to do. It should work well when the problematic context is correctly detected and printk() switches to the direct mode. But it seems that it is not enough in practice. There are reports that the messages are not printed during panic() or shutdown() even though printk() tries to use the direct mode here. The problem seems to be that console kthreads become active in these situation as well. They steel the job before other CPUs are stopped. Then they are stopped in the middle of the job and block the global console_lock. First part of the solution is to block console kthreads when the system is in a problematic state and requires the direct printk() mode. Link: https://lore.kernel.org/r/20220610205038.GA3050413@paulmck-ThinkPad-P17-Gen-1 Link: https://lore.kernel.org/r/CAMdYzYpF4FNTBPZsEFeWRuEwSies36QM_As8osPWZSr2q-viEA@mail.gmail.com Suggested-by: John Ogness Tested-by: Paul E. McKenney Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220615162805.27962-2-pmladek@suse.com --- kernel/printk/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ea3dd55709e7..45c6c2b0b104 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3729,7 +3729,9 @@ static bool printer_should_wake(struct console *con, u64 seq) return true; if (con->blocked || - console_kthreads_atomically_blocked()) { + console_kthreads_atomically_blocked() || + system_state > SYSTEM_RUNNING || + oops_in_progress) { return false; } -- cgit v1.2.3 From b87f02307d3cfbda768520f0687c51ca77e14fc3 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 15 Jun 2022 18:28:05 +0200 Subject: printk: Wait for the global console lock when the system is going down There are reports that the console kthreads block the global console lock when the system is going down, for example, reboot, panic. First part of the solution was to block kthreads in these problematic system states so they stopped handling newly added messages. Second part of the solution is to wait when for the kthreads when they are actively printing. It solves the problem when a message was printed before the system entered the problematic state and the kthreads managed to step in. A busy waiting has to be used because panic() can be called in any context and in an unknown state of the scheduler. There must be a timeout because the kthread might get stuck or sleeping and never release the lock. The timeout 10s is an arbitrary value inspired by the softlockup timeout. Link: https://lore.kernel.org/r/20220610205038.GA3050413@paulmck-ThinkPad-P17-Gen-1 Link: https://lore.kernel.org/r/CAMdYzYpF4FNTBPZsEFeWRuEwSies36QM_As8osPWZSr2q-viEA@mail.gmail.com Signed-off-by: Petr Mladek Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220615162805.27962-3-pmladek@suse.com --- include/linux/printk.h | 5 +++++ kernel/panic.c | 2 ++ kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 4 ++++ kernel/printk/printk_safe.c | 32 ++++++++++++++++++++++++++++++++ kernel/reboot.c | 2 ++ 6 files changed, 47 insertions(+) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index cd26aab0ab2a..c1e07c0652c7 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -174,6 +174,7 @@ extern void printk_prefer_direct_enter(void); extern void printk_prefer_direct_exit(void); extern bool pr_flush(int timeout_ms, bool reset_on_progress); +extern void try_block_console_kthreads(int timeout_ms); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -238,6 +239,10 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) return true; } +static inline void try_block_console_kthreads(int timeout_ms) +{ +} + static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/panic.c b/kernel/panic.c index 6737b2332275..fe73d18ecdf0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -273,6 +273,7 @@ void panic(const char *fmt, ...) * unfortunately means it may not be hardened to work in a * panic situation. */ + try_block_console_kthreads(10000); smp_send_stop(); } else { /* @@ -280,6 +281,7 @@ void panic(const char *fmt, ...) * kmsg_dump, we will need architecture dependent extra * works in addition to stopping other CPUs. */ + try_block_console_kthreads(10000); crash_smp_send_stop(); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index d947ca6c84f9..e7d8578860ad 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -20,6 +20,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +extern bool block_console_kthreads; + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 45c6c2b0b104..b095fb5f5f61 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -250,6 +250,9 @@ static atomic_t console_kthreads_active = ATOMIC_INIT(0); #define console_kthread_printing_exit() \ atomic_dec(&console_kthreads_active) +/* Block console kthreads to avoid processing new messages. */ +bool block_console_kthreads; + /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -3730,6 +3733,7 @@ static bool printer_should_wake(struct console *con, u64 seq) if (con->blocked || console_kthreads_atomically_blocked() || + block_console_kthreads || system_state > SYSTEM_RUNNING || oops_in_progress) { return false; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index ef0f9a2044da..caac4de1ea59 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include "internal.h" @@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); + +/** + * try_block_console_kthreads() - Try to block console kthreads and + * make the global console_lock() avaialble + * + * @timeout_ms: The maximum time (in ms) to wait. + * + * Prevent console kthreads from starting processing new messages. Wait + * until the global console_lock() become available. + * + * Context: Can be called in any context. + */ +void try_block_console_kthreads(int timeout_ms) +{ + block_console_kthreads = true; + + /* Do not wait when the console lock could not be safely taken. */ + if (this_cpu_read(printk_context) || in_nmi()) + return; + + while (timeout_ms > 0) { + if (console_trylock()) { + console_unlock(); + return; + } + + udelay(1000); + timeout_ms -= 1; + } +} diff --git a/kernel/reboot.c b/kernel/reboot.c index 4177645e74d6..310363685502 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -74,6 +74,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; + try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } @@ -262,6 +263,7 @@ static void kernel_shutdown_prepare(enum system_states state) blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; + try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } -- cgit v1.2.3 From ef79c396c664be99d0c5660dc75fe863c1e20315 Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Wed, 15 Jun 2022 17:44:31 +0200 Subject: audit: free module name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reset the type of the record last as the helper `audit_free_module()` depends on it. unreferenced object 0xffff888153b707f0 (size 16): comm "modprobe", pid 1319, jiffies 4295110033 (age 1083.016s) hex dump (first 16 bytes): 62 69 6e 66 6d 74 5f 6d 69 73 63 00 6b 6b 6b a5 binfmt_misc.kkk. backtrace: [] kstrdup+0x2b/0x50 [] __audit_log_kern_module+0x4d/0xf0 [] load_module+0x9d4/0x2e10 [] __do_sys_finit_module+0x114/0x1b0 [] do_syscall_64+0x34/0x80 [] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Cc: stable@vger.kernel.org Fixes: 12c5e81d3fd0 ("audit: prepare audit_context for use in calling contexts beyond syscalls") Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f3a2abd6d1a1..3a8c9d744800 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1014,10 +1014,10 @@ static void audit_reset_context(struct audit_context *ctx) ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); - ctx->type = 0; audit_free_module(ctx); ctx->fds[0] = -1; audit_proctitle_free(ctx); + ctx->type = 0; /* reset last for audit_free_*() */ } static inline struct audit_context *audit_alloc_context(enum audit_state state) -- cgit v1.2.3 From d25c83c6606ffc3abdf0868136ad3399f648ad70 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 15 Mar 2022 11:24:44 +0100 Subject: kthread: make it clear that kthread_create_on_node() might be terminated by any fatal signal The comments in kernel/kthread.c create a feeling that only SIGKILL is able to terminate the creation of kernel kthreads by kthread_create()/_on_node()/_on_cpu() APIs. In reality, wait_for_completion_killable() might be killed by any fatal signal that does not have a custom handler: (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) static inline void signal_wake_up(struct task_struct *t, bool resume) { signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { [...] /* * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) ...) { if (!sig_kernel_coredump(sig)) { [...] do { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); return; } } } Update the comments in kernel/kthread.c to make this more obvious. The motivation for this change was debugging why a module initialization failed. The module was being loaded from initrd. It "magically" failed when systemd was switching to the real root. The clean up operations sent SIGTERM to various pending processed that were started from initrd. Link: https://lkml.kernel.org/r/20220315102444.2380-1-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Kees Cook Cc: Marco Elver Cc: Jens Axboe Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kthread.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 544fd4097406..3c677918d8f2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -340,7 +340,7 @@ static int kthread(void *_create) self = to_kthread(current); - /* If user was SIGKILLed, I release the structure. */ + /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create); @@ -398,7 +398,7 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { - /* If user was SIGKILLed, I release the structure. */ + /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); if (!done) { @@ -440,9 +440,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), */ if (unlikely(wait_for_completion_killable(&done))) { /* - * If I was SIGKILLed before kthreadd (or new kernel thread) - * calls complete(), leave the cleanup of this structure to - * that thread. + * If I was killed by a fatal signal before kthreadd (or new + * kernel thread) calls complete(), leave the cleanup of this + * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); @@ -876,7 +876,7 @@ fail_task: * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) - * when the worker was SIGKILLed. + * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) @@ -925,7 +925,7 @@ EXPORT_SYMBOL(kthread_create_worker); * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) - * when the worker was SIGKILLed. + * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, -- cgit v1.2.3 From eb1b2985fe5c5f02e43e4c0d47bbe7ed835007f3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:16 +0200 Subject: ftrace: Keep address offset in ftrace_lookup_symbols We want to store the resolved address on the same index as the symbol string, because that's the user (bpf kprobe link) code assumption. Also making sure we don't store duplicates that might be present in kallsyms. Acked-by: Song Liu Acked-by: Steven Rostedt (Google) Fixes: bed0d9a50dac ("ftrace: Add ftrace_lookup_symbols function") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/ftrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e750fe141a60..601ccf1b2f09 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name, struct module *mod, unsigned long addr) { struct kallsyms_data *args = data; + const char **sym; + int idx; - if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp)) + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) return 0; addr = ftrace_location(addr); if (!addr) return 0; - args->addrs[args->found++] = addr; + args->addrs[idx] = addr; + args->found++; return args->found == args->cnt ? 1 : 0; } @@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int err; + memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; args.cnt = cnt; -- cgit v1.2.3 From eb5fb0325698d05f0bf78d322de82c451a3685a2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:17 +0200 Subject: bpf: Force cookies array to follow symbols sorting When user specifies symbols and cookies for kprobe_multi link interface it's very likely the cookies will be misplaced and returned to wrong functions (via get_attach_cookie helper). The reason is that to resolve the provided functions we sort them before passing them to ftrace_lookup_symbols, but we do not do the same sort on the cookie values. Fixing this by using sort_r function with custom swap callback that swaps cookie values as well. Fixes: 0236fec57a15 ("bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 60 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a13e6ac6327..88589d74a892 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, kprobe_multi_link_prog_run(link, entry_ip, regs); } -static int symbols_cmp(const void *a, const void *b) +static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; @@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b) return strcmp(*str_a, *str_b); } +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; + +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; + + swap(*name_a, *name_b); + + /* If defined, swap also related cookies. */ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2468,38 +2490,46 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!addrs) return -ENOMEM; + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + if (uaddrs) { if (copy_from_user(addrs, uaddrs, size)) { err = -EFAULT; goto error; } } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; struct user_syms us; err = copy_user_syms(&us, usyms, cnt); if (err) goto error; - sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL); + if (cookies) + data.funcs = us.syms; + + sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + err = ftrace_lookup_symbols(us.syms, cnt, addrs); free_user_syms(&us); if (err) goto error; } - ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); - if (ucookies) { - cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); - if (!cookies) { - err = -ENOMEM; - goto error; - } - if (copy_from_user(cookies, ucookies, size)) { - err = -EFAULT; - goto error; - } - } - link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; -- cgit v1.2.3 From 5cf9c91ba927119fc6606b938b1895bb2459d3bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 09:48:25 +0200 Subject: block: serialize all debugfs operations using q->debugfs_mutex Various places like I/O schedulers or the QOS infrastructure try to register debugfs files on demans, which can race with creating and removing the main queue debugfs directory. Use the existing debugfs_mutex to serialize all debugfs operations that rely on q->debugfs_dir or the directories hanging off it. To make the teardown code a little simpler declare all debugfs dentry pointers and not just the main one uncoditionally in blkdev.h. Move debugfs_mutex next to the dentries that it protects and document what it is used for. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220614074827.458955-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 25 ++++++++++++++++++++----- block/blk-mq-debugfs.h | 5 ----- block/blk-mq-sched.c | 11 +++++++++++ block/blk-rq-qos.c | 2 ++ block/blk-rq-qos.h | 7 ++++++- block/blk-sysfs.c | 20 +++++++++----------- include/linux/blkdev.h | 8 ++++---- kernel/trace/blktrace.c | 3 --- 8 files changed, 52 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7e4136a60e1c..f0fcfe1387cb 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -711,11 +711,6 @@ void blk_mq_debugfs_register(struct request_queue *q) } } -void blk_mq_debugfs_unregister(struct request_queue *q) -{ - q->sched_debugfs_dir = NULL; -} - static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { @@ -746,6 +741,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; @@ -773,6 +770,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. @@ -790,6 +789,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) void blk_mq_debugfs_unregister_sched(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } @@ -811,6 +812,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { + lockdep_assert_held(&rqos->q->debugfs_mutex); + + if (!rqos->q->debugfs_dir) + return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; } @@ -820,6 +825,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) struct request_queue *q = rqos->q; const char *dir_name = rq_qos_id_to_name(rqos->id); + lockdep_assert_held(&q->debugfs_mutex); + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; @@ -835,6 +842,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->rqos_debugfs_dir); q->rqos_debugfs_dir = NULL; } @@ -844,6 +853,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs @@ -863,6 +874,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { + lockdep_assert_held(&hctx->queue->debugfs_mutex); + + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 69918f4170d6..771d45832878 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); void blk_mq_debugfs_register(struct request_queue *q); -void blk_mq_debugfs_unregister(struct request_queue *q); void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); @@ -42,10 +41,6 @@ static inline void blk_mq_debugfs_register(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister(struct request_queue *q) -{ -} - static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eb3c65a21362..a4f7c101b53b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -594,7 +594,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err_free_map_and_rqs; + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); + mutex_unlock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { @@ -607,7 +609,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } return 0; @@ -648,14 +652,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched_hctx(hctx); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } + + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index e83af7bc7591..249a6f05dd3b 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -294,7 +294,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_queue_rqos(q); + mutex_unlock(&q->debugfs_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 68267007da1c..0e46052b018a 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); - if (rqos->ops->debugfs_attrs) + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) @@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); } typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 88bd41d4cb59..6e4801b217a7 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -779,14 +779,13 @@ static void blk_release_queue(struct kobject *kobj) if (queue_is_mq(q)) blk_mq_release(q); - blk_trace_shutdown(q); mutex_lock(&q->debugfs_mutex); + blk_trace_shutdown(q); debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); - if (queue_is_mq(q)) - blk_mq_debugfs_unregister(q); - bioset_exit(&q->bio_split); if (blk_queue_has_srcu(q)) @@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + if (queue_is_mq(q)) + __blk_mq_register_dev(dev, q); + mutex_lock(&q->sysfs_lock); + mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); - mutex_unlock(&q->debugfs_mutex); - - if (queue_is_mq(q)) { - __blk_mq_register_dev(dev, q); + if (queue_is_mq(q)) blk_mq_debugfs_register(q); - } - - mutex_lock(&q->sysfs_lock); + mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk, NULL); if (ret) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb6e3c31b3b7..73c886eba8e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -482,7 +482,6 @@ struct request_queue { #endif /* CONFIG_BLK_DEV_ZONED */ int node; - struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif @@ -526,11 +525,12 @@ struct request_queue { struct bio_set bio_split; struct dentry *debugfs_dir; - -#ifdef CONFIG_BLK_DEBUG_FS struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; -#endif + /* + * Serializes all debugfs metadata operations using the above dentries. + */ + struct mutex debugfs_mutex; bool mq_sysfs_init_done; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 10a32b0f2deb..fe04c6f96ca5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -770,14 +770,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - - mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From c0f3bb4054ef036e5f67e27f2e3cad9e6512cf00 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 8 Jun 2022 01:11:12 +0900 Subject: rethook: Reject getting a rethook if RCU is not watching Since the rethook_recycle() will involve the call_rcu() for reclaiming the rethook_instance, the rethook must be set up at the RCU available context (non idle). This rethook_recycle() in the rethook trampoline handler is inevitable, thus the RCU available check must be done before setting the rethook trampoline. This adds a rcu_is_watching() check in the rethook_try_get() so that it will return NULL if it is called when !rcu_is_watching(). Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Daniel Borkmann Acked-by: Steven Rostedt (Google) Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/165461827269.280167.7379263615545598958.stgit@devnote2 --- kernel/trace/rethook.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index b56833700d23..c69d82273ce7 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + fn = freelist_try_get(&rh->pool); if (!fn) return NULL; -- cgit v1.2.3 From cc72b72073ac982a954d3b43519ca1c28f03c27c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 28 May 2022 00:55:39 +0900 Subject: tracing/kprobes: Check whether get_kretprobe() returns NULL in kretprobe_dispatcher() There is a small chance that get_kretprobe(ri) returns NULL in kretprobe_dispatcher() when another CPU unregisters the kretprobe right after __kretprobe_trampoline_handler(). To avoid this issue, kretprobe_dispatcher() checks the get_kretprobe() return value again. And if it is NULL, it returns soon because that kretprobe is under unregistering process. This issue has been introduced when the kretprobe is decoupled from the struct kretprobe_instance by commit d741bf41d7c7 ("kprobes: Remove kretprobe hash"). Before that commit, the struct kretprob_instance::rp directly points the kretprobe and it is never be NULL. Link: https://lkml.kernel.org/r/165366693881.797669.16926184644089588731.stgit@devnote2 Reported-by: Yonghong Song Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") Cc: Peter Zijlstra Cc: Ingo Molnar Cc: bpf Cc: Kernel Team Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_kprobe.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 93507330462c..a245ea673715 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1718,8 +1718,17 @@ static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); - struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return 0; + tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) -- cgit v1.2.3 From f4b0d318097e45cbac5e14976f8bb56aa2cef504 Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 2 Jun 2022 22:06:13 +0800 Subject: tracing: Simplify conditional compilation code in tracing_set_tracer() Two conditional compilation directives "#ifdef CONFIG_TRACER_MAX_TRACE" are used consecutively, and no other code in between. Simplify conditional the compilation code and only use one "#ifdef CONFIG_TRACER_MAX_TRACE". Link: https://lkml.kernel.org/r/20220602140613.545069-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c95992e2c71..a8cfac0611bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6424,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) synchronize_rcu(); free_snapshot(tr); } -#endif -#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) -- cgit v1.2.3 From 12c3e0c92fd7cb3d3b698d84fdde7dccb6ba8822 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Sun, 12 Jun 2022 07:42:32 -0700 Subject: tracing/uprobes: Remove unwanted initialization in __trace_uprobe_create() Remove the unwanted initialization of variable 'ret'. This fixes the clang scan warning: Value stored to 'ret' is never read [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220612144232.145209-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_uprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..c3dc4f859a6b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -546,7 +546,6 @@ static int __trace_uprobe_create(int argc, const char **argv) bool is_return = false; int i, ret; - ret = 0; ref_ctr_offset = 0; switch (argv[0][0]) { -- cgit v1.2.3 From 202773260023b56e868d09d13d3a417028f1ff5b Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 17 Jun 2022 15:24:02 +0300 Subject: PM: hibernate: Use kernel_can_power_off() Use new kernel_can_power_off() API instead of legacy pm_power_off global variable to fix regressed hibernation to disk where machine no longer powers off when it should because ACPI power driver transitioned to the new sys-off based API and it doesn't use pm_power_off anymore. Fixes: 98f30d0ecf79 ("ACPI: power: Switch to sys-off handler API") Tested-by: Ken Moffat Reported-by: Ken Moffat Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 20a66bf9f465..89c71fce225d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -665,7 +665,7 @@ static void power_down(void) hibernation_platform_enter(); fallthrough; case HIBERNATION_SHUTDOWN: - if (pm_power_off) + if (kernel_can_power_off()) kernel_power_off(); break; } -- cgit v1.2.3 From 3be4562584bba603f33863a00c1c32eecf772ee6 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 22 Jun 2022 12:14:24 -0700 Subject: dma-direct: use the correct size for dma_set_encrypted() The third parameter of dma_set_encrypted() is a size in bytes rather than the number of pages. Fixes: 4d0564785bb0 ("dma-direct: factor out dma_set_{de,en}crypted helpers") Signed-off-by: Dexuan Cui Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e978f36e6be8..8d0b68a17042 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -357,7 +357,7 @@ void dma_direct_free(struct device *dev, size_t size, } else { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - if (dma_set_encrypted(dev, cpu_addr, 1 << page_order)) + if (dma_set_encrypted(dev, cpu_addr, size)) return; } @@ -392,7 +392,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir) { - unsigned int page_order = get_order(size); void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ @@ -400,7 +399,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - if (dma_set_encrypted(dev, vaddr, 1 << page_order)) + if (dma_set_encrypted(dev, vaddr, size)) return; __dma_direct_free_pages(dev, page, size); } -- cgit v1.2.3 From 20fb0c8272bbb102d15bdd11aa64f828619dd7cc Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:52 +0200 Subject: Revert "printk: Wait for the global console lock when the system is going down" This reverts commit b87f02307d3cfbda768520f0687c51ca77e14fc3. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-2-pmladek@suse.com --- include/linux/printk.h | 5 ----- kernel/panic.c | 2 -- kernel/printk/internal.h | 2 -- kernel/printk/printk.c | 4 ---- kernel/printk/printk_safe.c | 32 -------------------------------- kernel/reboot.c | 2 -- 6 files changed, 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index c1e07c0652c7..cd26aab0ab2a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -174,7 +174,6 @@ extern void printk_prefer_direct_enter(void); extern void printk_prefer_direct_exit(void); extern bool pr_flush(int timeout_ms, bool reset_on_progress); -extern void try_block_console_kthreads(int timeout_ms); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -239,10 +238,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) return true; } -static inline void try_block_console_kthreads(int timeout_ms) -{ -} - static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/panic.c b/kernel/panic.c index fe73d18ecdf0..6737b2332275 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -273,7 +273,6 @@ void panic(const char *fmt, ...) * unfortunately means it may not be hardened to work in a * panic situation. */ - try_block_console_kthreads(10000); smp_send_stop(); } else { /* @@ -281,7 +280,6 @@ void panic(const char *fmt, ...) * kmsg_dump, we will need architecture dependent extra * works in addition to stopping other CPUs. */ - try_block_console_kthreads(10000); crash_smp_send_stop(); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index e7d8578860ad..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -20,8 +20,6 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -extern bool block_console_kthreads; - __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b095fb5f5f61..45c6c2b0b104 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -250,9 +250,6 @@ static atomic_t console_kthreads_active = ATOMIC_INIT(0); #define console_kthread_printing_exit() \ atomic_dec(&console_kthreads_active) -/* Block console kthreads to avoid processing new messages. */ -bool block_console_kthreads; - /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -3733,7 +3730,6 @@ static bool printer_should_wake(struct console *con, u64 seq) if (con->blocked || console_kthreads_atomically_blocked() || - block_console_kthreads || system_state > SYSTEM_RUNNING || oops_in_progress) { return false; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index caac4de1ea59..ef0f9a2044da 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -8,9 +8,7 @@ #include #include #include -#include #include -#include #include "internal.h" @@ -52,33 +50,3 @@ asmlinkage int vprintk(const char *fmt, va_list args) return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); - -/** - * try_block_console_kthreads() - Try to block console kthreads and - * make the global console_lock() avaialble - * - * @timeout_ms: The maximum time (in ms) to wait. - * - * Prevent console kthreads from starting processing new messages. Wait - * until the global console_lock() become available. - * - * Context: Can be called in any context. - */ -void try_block_console_kthreads(int timeout_ms) -{ - block_console_kthreads = true; - - /* Do not wait when the console lock could not be safely taken. */ - if (this_cpu_read(printk_context) || in_nmi()) - return; - - while (timeout_ms > 0) { - if (console_trylock()) { - console_unlock(); - return; - } - - udelay(1000); - timeout_ms -= 1; - } -} diff --git a/kernel/reboot.c b/kernel/reboot.c index 310363685502..4177645e74d6 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -74,7 +74,6 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; - try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } @@ -263,7 +262,6 @@ static void kernel_shutdown_prepare(enum system_states state) blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; - try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } -- cgit v1.2.3 From 05c96b3713aa2a406d0c4ef0505bf80a6748fedb Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:53 +0200 Subject: Revert "printk: Block console kthreads when direct printing will be required" This reverts commit c3230283e2819a69dad2cf7a63143fde8bab8b5c. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-3-pmladek@suse.com --- kernel/printk/printk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 45c6c2b0b104..ea3dd55709e7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3729,9 +3729,7 @@ static bool printer_should_wake(struct console *con, u64 seq) return true; if (con->blocked || - console_kthreads_atomically_blocked() || - system_state > SYSTEM_RUNNING || - oops_in_progress) { + console_kthreads_atomically_blocked()) { return false; } -- cgit v1.2.3 From 007eeab7e9f03a3108c300c03e11a6c151e430c9 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:54 +0200 Subject: Revert "printk: remove @console_locked" This reverts commit ab406816fca009349b89cbde885daf68a8c77e33. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-4-pmladek@suse.com --- kernel/printk/printk.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ea3dd55709e7..dfd1a19b95d6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -340,7 +340,15 @@ static void console_kthreads_unblock(void) console_kthreads_blocked = false; } -static int console_suspended; +/* + * This is used for debugging the mess that is the VT code by + * keeping track if we have the console semaphore held. It's + * definitely not the perfect debug tool (we don't know if _WE_ + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console semaphore held). + */ +static int console_locked, console_suspended; /* * Array of consoles built from command line options (console=) @@ -2711,6 +2719,7 @@ void console_lock(void) if (console_suspended) return; console_kthreads_block(); + console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); @@ -2735,26 +2744,15 @@ int console_trylock(void) up_console_sem(); return 0; } + console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); -/* - * This is used to help to make sure that certain paths within the VT code are - * running with the console lock held. It is definitely not the perfect debug - * tool (it is not known if the VT code is the task holding the console lock), - * but it helps tracking those weird code paths in the console code such as - * when the console is suspended: where the console is not locked but no - * console printing may occur. - * - * Note: This returns true when the console is suspended but is not locked. - * This is intentional because the VT code must consider that situation - * the same as if the console was locked. - */ int is_console_locked(void) { - return (console_kthreads_blocked || atomic_read(&console_kthreads_active)); + return (console_locked || atomic_read(&console_kthreads_active)); } EXPORT_SYMBOL(is_console_locked); @@ -2810,6 +2808,8 @@ static inline bool console_is_usable(struct console *con) static void __console_unlock(void) { + console_locked = 0; + /* * Depending on whether console_lock() or console_trylock() was used, * appropriately allow the kthread printers to continue. @@ -3127,6 +3127,7 @@ void console_unblank(void) } else console_lock(); + console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) -- cgit v1.2.3 From 2d9ef940f89e0ab4fde7ba6f769d82f2a450c35a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:55 +0200 Subject: Revert "printk: extend console_lock for per-console locking" This reverts commit 8e274732115f63c1d09136284431b3555bd5cc56. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-5-pmladek@suse.com --- include/linux/console.h | 15 --- kernel/printk/printk.c | 261 +++++++++++------------------------------------- 2 files changed, 56 insertions(+), 220 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 143653090c48..9a251e70c090 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,7 +16,6 @@ #include #include -#include struct vc_data; struct console_font_op; @@ -155,20 +154,6 @@ struct console { u64 seq; unsigned long dropped; struct task_struct *thread; - bool blocked; - - /* - * The per-console lock is used by printing kthreads to synchronize - * this console with callers of console_lock(). This is necessary in - * order to allow printing kthreads to run in parallel to each other, - * while each safely accessing the @blocked field and synchronizing - * against direct printing via console_lock/console_unlock. - * - * Note: For synchronizing against direct printing via - * console_trylock/console_unlock, see the static global - * variable @console_kthreads_active. - */ - struct mutex lock; void *data; struct console *next; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dfd1a19b95d6..ae489dc685a1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -223,33 +223,6 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; -/* - * Used to synchronize printing kthreads against direct printing via - * console_trylock/console_unlock. - * - * Values: - * -1 = console kthreads atomically blocked (via global trylock) - * 0 = no kthread printing, console not locked (via trylock) - * >0 = kthread(s) actively printing - * - * Note: For synchronizing against direct printing via - * console_lock/console_unlock, see the @lock variable in - * struct console. - */ -static atomic_t console_kthreads_active = ATOMIC_INIT(0); - -#define console_kthreads_atomic_tryblock() \ - (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) -#define console_kthreads_atomic_unblock() \ - atomic_cmpxchg(&console_kthreads_active, -1, 0) -#define console_kthreads_atomically_blocked() \ - (atomic_read(&console_kthreads_active) == -1) - -#define console_kthread_printing_tryenter() \ - atomic_inc_unless_negative(&console_kthreads_active) -#define console_kthread_printing_exit() \ - atomic_dec(&console_kthreads_active) - /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -297,49 +270,6 @@ static bool panic_in_progress(void) return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } -/* - * Tracks whether kthread printers are all blocked. A value of true implies - * that the console is locked via console_lock() or the console is suspended. - * Writing to this variable requires holding @console_sem. - */ -static bool console_kthreads_blocked; - -/* - * Block all kthread printers from a schedulable context. - * - * Requires holding @console_sem. - */ -static void console_kthreads_block(void) -{ - struct console *con; - - for_each_console(con) { - mutex_lock(&con->lock); - con->blocked = true; - mutex_unlock(&con->lock); - } - - console_kthreads_blocked = true; -} - -/* - * Unblock all kthread printers from a schedulable context. - * - * Requires holding @console_sem. - */ -static void console_kthreads_unblock(void) -{ - struct console *con; - - for_each_console(con) { - mutex_lock(&con->lock); - con->blocked = false; - mutex_unlock(&con->lock); - } - - console_kthreads_blocked = false; -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2673,6 +2603,13 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + + /* + * While suspended, new records may have been added to the + * ringbuffer. Wake up the kthread printers to print them. + */ + wake_up_klogd(); + pr_flush(1000, true); } @@ -2691,14 +2628,9 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - else { - /* - * If a new CPU comes online, the conditions for - * printer_should_wake() may have changed for some - * kthread printer with !CON_ANYTIME. - */ - wake_up_klogd(); - } + + /* Wake kthread printers. Some may have become usable. */ + wake_up_klogd(); } return 0; } @@ -2718,7 +2650,6 @@ void console_lock(void) down_console_sem(); if (console_suspended) return; - console_kthreads_block(); console_locked = 1; console_may_schedule = 1; } @@ -2740,10 +2671,6 @@ int console_trylock(void) up_console_sem(); return 0; } - if (!console_kthreads_atomic_tryblock()) { - up_console_sem(); - return 0; - } console_locked = 1; console_may_schedule = 0; return 1; @@ -2752,7 +2679,7 @@ EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { - return (console_locked || atomic_read(&console_kthreads_active)); + return console_locked; } EXPORT_SYMBOL(is_console_locked); @@ -2796,7 +2723,7 @@ static inline bool __console_is_usable(short flags) * Check if the given console is currently capable and allowed to print * records. * - * Requires holding the console_lock. + * Requires the console_lock. */ static inline bool console_is_usable(struct console *con) { @@ -2809,22 +2736,6 @@ static inline bool console_is_usable(struct console *con) static void __console_unlock(void) { console_locked = 0; - - /* - * Depending on whether console_lock() or console_trylock() was used, - * appropriately allow the kthread printers to continue. - */ - if (console_kthreads_blocked) - console_kthreads_unblock(); - else - console_kthreads_atomic_unblock(); - - /* - * New records may have arrived while the console was locked. - * Wake the kthread printers to print them. - */ - wake_up_klogd(); - up_console_sem(); } @@ -2842,19 +2753,17 @@ static void __console_unlock(void) * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the - * console_lock. Otherwise it is set to false. A NULL pointer may be provided - * to disable allowing the console_lock to be taken over by a printk waiter. + * console_lock. Otherwise it is set to false. * * Returns false if the given console has no next record to print, otherwise * true. * - * Requires the console_lock if @handover is non-NULL. - * Requires con->lock otherwise. + * Requires the console_lock. */ -static bool __console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) { - static atomic_t panic_console_dropped = ATOMIC_INIT(0); + static int panic_console_dropped; struct printk_info info; struct printk_record r; unsigned long flags; @@ -2863,8 +2772,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); - if (handover) - *handover = false; + *handover = false; if (!prb_read_valid(prb, con->seq, &r)) return false; @@ -2872,8 +2780,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex if (con->seq != r.info->seq) { con->dropped += r.info->seq - con->seq; con->seq = r.info->seq; - if (panic_in_progress() && - atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) { + if (panic_in_progress() && panic_console_dropped++ > 10) { suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } @@ -2895,61 +2802,31 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } - if (handover) { - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - /* don't trace irqsoff print latency */ - stop_critical_timings(); - } + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + stop_critical_timings(); /* don't trace print latency */ call_console_driver(con, write_text, len, dropped_text); + start_critical_timings(); con->seq++; - if (handover) { - start_critical_timings(); - *handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - } + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); skip: return true; } -/* - * Print a record for a given console, but allow another printk() caller to - * take over the console_lock and continue printing. - * - * Requires the console_lock, but depending on @handover after the call, the - * caller may no longer have the console_lock. - * - * See __console_emit_next_record() for argument and return details. - */ -static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) -{ - /* - * Handovers are only supported if threaded printers are atomically - * blocked. The context taking over the console_lock may be atomic. - */ - if (!console_kthreads_atomically_blocked()) { - *handover = false; - handover = NULL; - } - - return __console_emit_next_record(con, text, ext_text, dropped_text, handover); -} - /* * Print out all remaining records to all consoles. * @@ -3001,11 +2878,13 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove if (con->flags & CON_EXTENDED) { /* Extended consoles do not print "dropped messages". */ - progress = console_emit_next_record_transferable(con, &text[0], - &ext_text[0], NULL, handover); + progress = console_emit_next_record(con, &text[0], + &ext_text[0], NULL, + handover); } else { - progress = console_emit_next_record_transferable(con, &text[0], - NULL, &dropped_text[0], handover); + progress = console_emit_next_record(con, &text[0], + NULL, &dropped_text[0], + handover); } if (*handover) return false; @@ -3120,10 +2999,6 @@ void console_unblank(void) if (oops_in_progress) { if (down_trylock_console_sem() != 0) return; - if (!console_kthreads_atomic_tryblock()) { - up_console_sem(); - return; - } } else console_lock(); @@ -3206,6 +3081,10 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + + /* Wake the newly enabled kthread printer. */ + wake_up_klogd(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3407,8 +3286,6 @@ void register_console(struct console *newcon) newcon->dropped = 0; newcon->thread = NULL; - newcon->blocked = true; - mutex_init(&newcon->lock); if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ @@ -3709,19 +3586,6 @@ static void printk_fallback_preferred_direct(void) console_unlock(); } -/* - * Print a record for a given console, not allowing another printk() caller - * to take over. This is appropriate for contexts that do not have the - * console_lock. - * - * See __console_emit_next_record() for argument and return details. - */ -static bool console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text) -{ - return __console_emit_next_record(con, text, ext_text, dropped_text, NULL); -} - static bool printer_should_wake(struct console *con, u64 seq) { short flags; @@ -3729,10 +3593,8 @@ static bool printer_should_wake(struct console *con, u64 seq) if (kthread_should_stop() || !printk_kthreads_available) return true; - if (con->blocked || - console_kthreads_atomically_blocked()) { + if (console_suspended) return false; - } /* * This is an unsafe read from con->flags, but a false positive is @@ -3753,6 +3615,7 @@ static int printk_kthread_func(void *data) struct console *con = data; char *dropped_text = NULL; char *ext_text = NULL; + bool handover; u64 seq = 0; char *text; int error; @@ -3802,27 +3665,15 @@ static int printk_kthread_func(void *data) if (error) continue; - error = mutex_lock_interruptible(&con->lock); - if (error) - continue; + console_lock(); - if (con->blocked || - !console_kthread_printing_tryenter()) { - /* Another context has locked the console_lock. */ - mutex_unlock(&con->lock); + if (console_suspended) { + up_console_sem(); continue; } - /* - * Although this context has not locked the console_lock, it - * is known that the console_lock is not locked and it is not - * possible for any other context to lock the console_lock. - * Therefore it is safe to read con->flags. - */ - - if (!__console_is_usable(con->flags)) { - console_kthread_printing_exit(); - mutex_unlock(&con->lock); + if (!console_is_usable(con)) { + __console_unlock(); continue; } @@ -3835,13 +3686,13 @@ static int printk_kthread_func(void *data) * which can conditionally invoke cond_resched(). */ console_may_schedule = 0; - console_emit_next_record(con, text, ext_text, dropped_text); + console_emit_next_record(con, text, ext_text, dropped_text, &handover); + if (handover) + continue; seq = con->seq; - console_kthread_printing_exit(); - - mutex_unlock(&con->lock); + __console_unlock(); } con_printk(KERN_INFO, con, "printing thread stopped\n"); -- cgit v1.2.3 From 5831788afb17b89c5b531fb60cbd798613ccbb63 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:56 +0200 Subject: Revert "printk: add kthread console printers" This reverts commit 09c5ba0aa2fcfdadb17d045c3ee6f86d69270df7. This reverts commit b87f02307d3cfbda768520f0687c51ca77e14fc3. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-6-pmladek@suse.com --- include/linux/console.h | 2 - kernel/printk/printk.c | 329 ++++-------------------------------------------- 2 files changed, 22 insertions(+), 309 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 9a251e70c090..8c1686e2c233 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,8 +153,6 @@ struct console { uint ospeed; u64 seq; unsigned long dropped; - struct task_struct *thread; - void *data; struct console *next; }; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ae489dc685a1..5a6273e56b4e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -361,13 +361,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); -/* - * A flag to signify if printk_activate_kthreads() has already started the - * kthread printers. If true, any later registered consoles must start their - * own kthread directly. The flag is write protected by the console_lock. - */ -static bool printk_kthreads_available; - #ifdef CONFIG_PRINTK static atomic_t printk_prefer_direct = ATOMIC_INIT(0); @@ -397,39 +390,6 @@ void printk_prefer_direct_exit(void) WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); } -/* - * Calling printk() always wakes kthread printers so that they can - * flush the new message to their respective consoles. Also, if direct - * printing is allowed, printk() tries to flush the messages directly. - * - * Direct printing is allowed in situations when the kthreads - * are not available or the system is in a problematic state. - * - * See the implementation about possible races. - */ -static inline bool allow_direct_printing(void) -{ - /* - * Checking kthread availability is a possible race because the - * kthread printers can become permanently disabled during runtime. - * However, doing that requires holding the console_lock, so any - * pending messages will be direct printed by console_unlock(). - */ - if (!printk_kthreads_available) - return true; - - /* - * Prefer direct printing when the system is in a problematic state. - * The context that sets this state will always see the updated value. - * The other contexts do not care. Anyway, direct printing is just a - * best effort. The direct output is only possible when console_lock - * is not already taken and no kthread printers are actively printing. - */ - return (system_state > SYSTEM_RUNNING || - oops_in_progress || - atomic_read(&printk_prefer_direct)); -} - DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -2320,10 +2280,10 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ - if (!in_sched && allow_direct_printing()) { + if (!in_sched) { /* * The caller may be holding system-critical or - * timing-sensitive locks. Disable preemption during direct + * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. @@ -2366,8 +2326,6 @@ EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); -static void printk_start_kthread(struct console *con); - #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 @@ -2401,8 +2359,6 @@ static void call_console_driver(struct console *con, const char *text, size_t le } static bool suppress_message_printing(int level) { return false; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } -static void printk_start_kthread(struct console *con) { } -static bool allow_direct_printing(void) { return true; } #endif /* CONFIG_PRINTK */ @@ -2603,13 +2559,6 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); - - /* - * While suspended, new records may have been added to the - * ringbuffer. Wake up the kthread printers to print them. - */ - wake_up_klogd(); - pr_flush(1000, true); } @@ -2628,9 +2577,6 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - - /* Wake kthread printers. Some may have become usable. */ - wake_up_klogd(); } return 0; } @@ -2702,9 +2648,18 @@ static bool abandon_console_lock_in_panic(void) return atomic_read(&panic_cpu) != raw_smp_processor_id(); } -static inline bool __console_is_usable(short flags) +/* + * Check if the given console is currently capable and allowed to print + * records. + * + * Requires the console_lock. + */ +static inline bool console_is_usable(struct console *con) { - if (!(flags & CON_ENABLED)) + if (!(con->flags & CON_ENABLED)) + return false; + + if (!con->write) return false; /* @@ -2713,26 +2668,12 @@ static inline bool __console_is_usable(short flags) * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && - !(flags & CON_ANYTIME)) + !(con->flags & CON_ANYTIME)) return false; return true; } -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_lock. - */ -static inline bool console_is_usable(struct console *con) -{ - if (!con->write) - return false; - - return __console_is_usable(con->flags); -} - static void __console_unlock(void) { console_locked = 0; @@ -2845,8 +2786,8 @@ skip: * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this - * is not the panic CPU or direct printing is not preferred). Regardless the - * reason, the caller should assume it is not useful to immediately try again. + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. * * Requires the console_lock. */ @@ -2863,10 +2804,6 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove *handover = false; do { - /* Let the kthread printers do the work if they can. */ - if (!allow_direct_printing()) - return false; - any_progress = false; for_each_console(con) { @@ -3081,10 +3018,6 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); - - /* Wake the newly enabled kthread printer. */ - wake_up_klogd(); - __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3285,8 +3218,6 @@ void register_console(struct console *newcon) nr_ext_console_drivers++; newcon->dropped = 0; - newcon->thread = NULL; - if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); @@ -3296,10 +3227,6 @@ void register_console(struct console *newcon) /* Begin with next message. */ newcon->seq = prb_next_seq(prb); } - - if (printk_kthreads_available) - printk_start_kthread(newcon); - console_unlock(); console_sysfs_notify(); @@ -3326,7 +3253,6 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { - struct task_struct *thd; struct console *con; int res; @@ -3367,20 +3293,7 @@ int unregister_console(struct console *console) console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; - - /* - * console->thread can only be cleared under the console lock. But - * stopping the thread must be done without the console lock. The - * task that clears @thread is the task that stops the kthread. - */ - thd = console->thread; - console->thread = NULL; - console_unlock(); - - if (thd) - kthread_stop(thd); - console_sysfs_notify(); if (console->exit) @@ -3476,20 +3389,6 @@ static int __init printk_late_init(void) } late_initcall(printk_late_init); -static int __init printk_activate_kthreads(void) -{ - struct console *con; - - console_lock(); - printk_kthreads_available = true; - for_each_console(con) - printk_start_kthread(con); - console_unlock(); - - return 0; -} -early_initcall(printk_activate_kthreads); - #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) @@ -3564,180 +3463,11 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) } EXPORT_SYMBOL(pr_flush); -static void __printk_fallback_preferred_direct(void) -{ - printk_prefer_direct_enter(); - pr_err("falling back to preferred direct printing\n"); - printk_kthreads_available = false; -} - -/* - * Enter preferred direct printing, but never exit. Mark console threads as - * unavailable. The system is then forever in preferred direct printing and - * any printing threads will exit. - * - * Must *not* be called under console_lock. Use - * __printk_fallback_preferred_direct() if already holding console_lock. - */ -static void printk_fallback_preferred_direct(void) -{ - console_lock(); - __printk_fallback_preferred_direct(); - console_unlock(); -} - -static bool printer_should_wake(struct console *con, u64 seq) -{ - short flags; - - if (kthread_should_stop() || !printk_kthreads_available) - return true; - - if (console_suspended) - return false; - - /* - * This is an unsafe read from con->flags, but a false positive is - * not a problem. Worst case it would allow the printer to wake up - * although it is disabled. But the printer will notice that when - * attempting to print and instead go back to sleep. - */ - flags = data_race(READ_ONCE(con->flags)); - - if (!__console_is_usable(flags)) - return false; - - return prb_read_valid(prb, seq, NULL); -} - -static int printk_kthread_func(void *data) -{ - struct console *con = data; - char *dropped_text = NULL; - char *ext_text = NULL; - bool handover; - u64 seq = 0; - char *text; - int error; - - text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); - if (!text) { - con_printk(KERN_ERR, con, "failed to allocate text buffer\n"); - printk_fallback_preferred_direct(); - goto out; - } - - if (con->flags & CON_EXTENDED) { - ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); - if (!ext_text) { - con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n"); - printk_fallback_preferred_direct(); - goto out; - } - } else { - dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); - if (!dropped_text) { - con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n"); - printk_fallback_preferred_direct(); - goto out; - } - } - - con_printk(KERN_INFO, con, "printing thread started\n"); - - for (;;) { - /* - * Guarantee this task is visible on the waitqueue before - * checking the wake condition. - * - * The full memory barrier within set_current_state() of - * prepare_to_wait_event() pairs with the full memory barrier - * within wq_has_sleeper(). - * - * This pairs with __wake_up_klogd:A. - */ - error = wait_event_interruptible(log_wait, - printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */ - - if (kthread_should_stop() || !printk_kthreads_available) - break; - - if (error) - continue; - - console_lock(); - - if (console_suspended) { - up_console_sem(); - continue; - } - - if (!console_is_usable(con)) { - __console_unlock(); - continue; - } - - /* - * Even though the printk kthread is always preemptible, it is - * still not allowed to call cond_resched() from within - * console drivers. The task may become non-preemptible in the - * console driver call chain. For example, vt_console_print() - * takes a spinlock and then can call into fbcon_redraw(), - * which can conditionally invoke cond_resched(). - */ - console_may_schedule = 0; - console_emit_next_record(con, text, ext_text, dropped_text, &handover); - if (handover) - continue; - - seq = con->seq; - - __console_unlock(); - } - - con_printk(KERN_INFO, con, "printing thread stopped\n"); -out: - kfree(dropped_text); - kfree(ext_text); - kfree(text); - - console_lock(); - /* - * If this kthread is being stopped by another task, con->thread will - * already be NULL. That is fine. The important thing is that it is - * NULL after the kthread exits. - */ - con->thread = NULL; - console_unlock(); - - return 0; -} - -/* Must be called under console_lock. */ -static void printk_start_kthread(struct console *con) -{ - /* - * Do not start a kthread if there is no write() callback. The - * kthreads assume the write() callback exists. - */ - if (!con->write) - return; - - con->thread = kthread_run(printk_kthread_func, con, - "pr/%s%d", con->name, con->index); - if (IS_ERR(con->thread)) { - con->thread = NULL; - con_printk(KERN_ERR, con, "unable to start printing thread\n"); - __printk_fallback_preferred_direct(); - return; - } -} - /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_DIRECT_OUTPUT 0x02 +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); @@ -3745,14 +3475,10 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_DIRECT_OUTPUT) { - printk_prefer_direct_enter(); - + if (pending & PRINTK_PENDING_OUTPUT) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - - printk_prefer_direct_exit(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -3777,11 +3503,10 @@ static void __wake_up_klogd(int val) * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * - * This pairs with devkmsg_read:A, syslog_print:A, and - * printk_kthread_func:A. + * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ - (val & PRINTK_PENDING_DIRECT_OUTPUT)) { + (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } @@ -3799,17 +3524,7 @@ void defer_console_output(void) * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ - int val = PRINTK_PENDING_WAKEUP; - - /* - * Make sure that some context will print the messages when direct - * printing is allowed. This happens in situations when the kthreads - * may not be as reliable or perhaps unusable. - */ - if (allow_direct_printing()) - val |= PRINTK_PENDING_DIRECT_OUTPUT; - - __wake_up_klogd(val); + __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) -- cgit v1.2.3 From 07a22b61946f0b80065b0ddcc703b715f84355f5 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:57 +0200 Subject: Revert "printk: add functions to prefer direct printing" This reverts commit 2bb2b7b57f81255c13f4395ea911d6bdc70c9fe2. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-7-pmladek@suse.com --- drivers/tty/sysrq.c | 2 -- include/linux/printk.h | 11 ----------- kernel/hung_task.c | 11 +---------- kernel/panic.c | 4 ---- kernel/printk/printk.c | 28 ---------------------------- kernel/rcu/tree_stall.h | 2 -- kernel/reboot.c | 14 +------------- kernel/watchdog.c | 4 ---- kernel/watchdog_hld.c | 4 ---- 9 files changed, 2 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 2884cd638d64..bbfd004449b5 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -578,7 +578,6 @@ void __handle_sysrq(int key, bool check_mask) rcu_sysrq_start(); rcu_read_lock(); - printk_prefer_direct_enter(); /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. We do not @@ -620,7 +619,6 @@ void __handle_sysrq(int key, bool check_mask) pr_cont("\n"); console_loglevel = orig_log_level; } - printk_prefer_direct_exit(); rcu_read_unlock(); rcu_sysrq_end(); diff --git a/include/linux/printk.h b/include/linux/printk.h index cd26aab0ab2a..091fba7283e1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,9 +170,6 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit -extern void printk_prefer_direct_enter(void); -extern void printk_prefer_direct_exit(void); - extern bool pr_flush(int timeout_ms, bool reset_on_progress); /* @@ -225,14 +222,6 @@ static inline void printk_deferred_exit(void) { } -static inline void printk_prefer_direct_enter(void) -{ -} - -static inline void printk_prefer_direct_exit(void) -{ -} - static inline bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 02a65d554340..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -127,8 +127,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - printk_prefer_direct_enter(); - if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", @@ -144,8 +142,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; - - printk_prefer_direct_exit(); } touch_nmi_watchdog(); @@ -208,17 +204,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); - if (hung_task_show_lock) { - printk_prefer_direct_enter(); + if (hung_task_show_lock) debug_show_all_locks(); - printk_prefer_direct_exit(); - } if (hung_task_show_all_bt) { hung_task_show_all_bt = false; - printk_prefer_direct_enter(); trigger_all_cpu_backtrace(); - printk_prefer_direct_exit(); } if (hung_task_call_panic) diff --git a/kernel/panic.c b/kernel/panic.c index 6737b2332275..8355b19676f8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -579,8 +579,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - printk_prefer_direct_enter(); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -610,8 +608,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); - - printk_prefer_direct_exit(); } #ifndef __WARN_FLAGS diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5a6273e56b4e..b49c6ff6dca0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -362,34 +362,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK -static atomic_t printk_prefer_direct = ATOMIC_INIT(0); - -/** - * printk_prefer_direct_enter - cause printk() calls to attempt direct - * printing to all enabled consoles - * - * Since it is not possible to call into the console printing code from any - * context, there is no guarantee that direct printing will occur. - * - * This globally effects all printk() callers. - * - * Context: Any context. - */ -void printk_prefer_direct_enter(void) -{ - atomic_inc(&printk_prefer_direct); -} - -/** - * printk_prefer_direct_exit - restore printk() behavior - * - * Context: Any context. - */ -void printk_prefer_direct_exit(void) -{ - WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); -} - DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4995c078cff9..a001e1e7a992 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -647,7 +647,6 @@ static void print_cpu_stall(unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ - printk_prefer_direct_enter(); trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); @@ -685,7 +684,6 @@ static void print_cpu_stall(unsigned long gps) */ set_tsk_need_resched(current); set_preempt_need_resched(); - printk_prefer_direct_exit(); } static void check_cpu_stall(struct rcu_data *rdp) diff --git a/kernel/reboot.c b/kernel/reboot.c index 4177645e74d6..6bcc5d6a6572 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -447,11 +447,9 @@ static int __orderly_reboot(void) ret = run_cmd(reboot_cmd); if (ret) { - printk_prefer_direct_enter(); pr_warn("Failed to start orderly reboot: forcing the issue\n"); emergency_sync(); kernel_restart(NULL); - printk_prefer_direct_exit(); } return ret; @@ -464,7 +462,6 @@ static int __orderly_poweroff(bool force) ret = run_cmd(poweroff_cmd); if (ret && force) { - printk_prefer_direct_enter(); pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* @@ -474,7 +471,6 @@ static int __orderly_poweroff(bool force) */ emergency_sync(); kernel_power_off(); - printk_prefer_direct_exit(); } return ret; @@ -532,8 +528,6 @@ EXPORT_SYMBOL_GPL(orderly_reboot); */ static void hw_failure_emergency_poweroff_func(struct work_struct *work) { - printk_prefer_direct_enter(); - /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off @@ -550,8 +544,6 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) */ pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); emergency_restart(); - - printk_prefer_direct_exit(); } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, @@ -590,13 +582,11 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) { static atomic_t allow_proceed = ATOMIC_INIT(1); - printk_prefer_direct_enter(); - pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); /* Shutdown should be initiated only once. */ if (!atomic_dec_and_test(&allow_proceed)) - goto out; + return; /* * Queue a backup emergency shutdown in the event of @@ -604,8 +594,6 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); -out: - printk_prefer_direct_exit(); } EXPORT_SYMBOL_GPL(hw_protection_shutdown); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 40024e03d422..9166220457bc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -424,8 +424,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); - printk_prefer_direct_enter(); - pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -444,8 +442,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); - - printk_prefer_direct_exit(); } return HRTIMER_RESTART; diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 701f35f0e2d4..247bf0b1582c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,8 +135,6 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - printk_prefer_direct_enter(); - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); @@ -157,8 +155,6 @@ static void watchdog_overflow_callback(struct perf_event *event, if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); - printk_prefer_direct_exit(); - __this_cpu_write(hard_watchdog_warn, true); return; } -- cgit v1.2.3