From b2b018ef48675a9a524fa9791ea7d67fdac405f7 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 1 Dec 2015 20:40:54 -0600 Subject: livepatch: add old_sympos as disambiguator field to klp_func Currently, patching objects with duplicate symbol names fail because the creation of the sysfs function directory collides with the previous attempt. Appending old_addr to the function name is problematic as it reveals the address of the function being patch to a normal user. Using the symbol's occurrence in kallsyms to postfix the function name in the sysfs directory solves the issue of having consistent unique names and ensuring that the address is not exposed to a normal user. In addition, using the symbol position as the user's method to disambiguate symbols instead of addr allows for disambiguating symbols in modules as well for both function addresses and for relocs. This also simplifies much of the code. Special handling for kASLR is no longer needed and can be removed. The klp_find_verify_func_addr function can be replaced by klp_find_object_symbol, and klp_verify_vmlinux_symbol and its callback can be removed completely. In cases of duplicate symbols, old_sympos will be used to disambiguate instead of old_addr. By default old_sympos will be 0, and patching will only succeed if the symbol is unique. Specifying a positive value will ensure that occurrence of the symbol in kallsyms for the patched object will be used for patching if it is valid. In addition, make old_addr an internal structure field not to be specified by the user. Finally, remove klp_find_verify_func_addr as it can be replaced by klp_find_object_symbol directly. Support for symbol position disambiguation for relocations is added in the next patch in this series. Signed-off-by: Chris J Arges Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 72 +++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index db545cbcdb89..e416f96e938d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -135,13 +135,8 @@ struct klp_find_arg { const char *objname; const char *name; unsigned long addr; - /* - * If count == 0, the symbol was not found. If count == 1, a unique - * match was found and addr is set. If count > 1, there is - * unresolvable ambiguity among "count" number of symbols with the same - * name in the same object. - */ unsigned long count; + unsigned long pos; }; static int klp_find_callback(void *data, const char *name, @@ -158,37 +153,48 @@ static int klp_find_callback(void *data, const char *name, if (args->objname && strcmp(args->objname, mod->name)) return 0; - /* - * args->addr might be overwritten if another match is found - * but klp_find_object_symbol() handles this and only returns the - * addr if count == 1. - */ args->addr = addr; args->count++; + /* + * Finish the search when the symbol is found for the desired position + * or the position is not defined for a non-unique symbol. + */ + if ((args->pos && (args->count == args->pos)) || + (!args->pos && (args->count > 1))) + return 1; + return 0; } static int klp_find_object_symbol(const char *objname, const char *name, - unsigned long *addr) + unsigned long sympos, unsigned long *addr) { struct klp_find_arg args = { .objname = objname, .name = name, .addr = 0, - .count = 0 + .count = 0, + .pos = sympos, }; mutex_lock(&module_mutex); kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); - if (args.count == 0) + /* + * Ensure an address was found. If sympos is 0, ensure symbol is unique; + * otherwise ensure the symbol position count matches sympos. + */ + if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); - else if (args.count > 1) + else if (args.count > 1 && sympos == 0) { pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", args.count, name, objname); - else { + } else if (sympos != args.count && sympos > 0) { + pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", + sympos, name, objname ? objname : "vmlinux"); + } else { *addr = args.addr; return 0; } @@ -236,27 +242,6 @@ static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) return 0; } -static int klp_find_verify_func_addr(struct klp_object *obj, - struct klp_func *func) -{ - int ret; - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old_addr accordingly */ - if (kaslr_enabled() && func->old_addr) - func->old_addr += kaslr_offset(); -#endif - - if (!func->old_addr || klp_is_module(obj)) - ret = klp_find_object_symbol(obj->name, func->old_name, - &func->old_addr); - else - ret = klp_verify_vmlinux_symbol(func->old_name, - func->old_addr); - - return ret; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -276,8 +261,11 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, } preempt_enable(); - /* otherwise check if it's in another .o within the patch module */ - return klp_find_object_symbol(pmod->name, name, addr); + /* + * Check if it's in another .o within the patch module. This also + * checks that the external symbol is unique. + */ + return klp_find_object_symbol(pmod->name, name, 0, addr); } static int klp_write_object_relocations(struct module *pmod, @@ -313,7 +301,7 @@ static int klp_write_object_relocations(struct module *pmod, else ret = klp_find_object_symbol(obj->mod->name, reloc->name, - &reloc->val); + 0, &reloc->val); if (ret) return ret; } @@ -756,7 +744,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, } klp_for_each_func(obj, func) { - ret = klp_find_verify_func_addr(obj, func); + ret = klp_find_object_symbol(obj->name, func->old_name, + func->old_sympos, + &func->old_addr); if (ret) return ret; } -- cgit v1.2.3 From 064c89df6247cd829a7880cc8a87b7ed2cdfccd8 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 1 Dec 2015 20:40:55 -0600 Subject: livepatch: add sympos as disambiguator field to klp_reloc In cases of duplicate symbols, sympos will be used to disambiguate instead of val. By default sympos will be 0, and patching will only succeed if the symbol is unique. Specifying a positive value will ensure that occurrence of the symbol in kallsyms for the patched object will be used for patching if it is valid. For external relocations sympos is not supported. Remove klp_verify_callback, klp_verify_args and klp_verify_vmlinux_symbol as they are no longer used. From the klp_reloc structure remove val, as it can be refactored as a local variable in klp_write_object_relocations. Signed-off-by: Chris J Arges Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 5 ++- kernel/livepatch/core.c | 84 +++++++++++------------------------------------ 2 files changed, 21 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index b60e8abab0ab..a8828652f794 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -67,8 +67,7 @@ struct klp_func { /** * struct klp_reloc - relocation structure for live patching * @loc: address where the relocation will be written - * @val: address of the referenced symbol (optional, - * vmlinux patches only) + * @sympos: position in kallsyms to disambiguate symbols (optional) * @type: ELF relocation type * @name: name of the referenced symbol (for lookup/verification) * @addend: offset from the referenced symbol @@ -76,7 +75,7 @@ struct klp_func { */ struct klp_reloc { unsigned long loc; - unsigned long val; + unsigned long sympos; unsigned long type; const char *name; int addend; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index e416f96e938d..e842534d3493 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -203,45 +203,6 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -struct klp_verify_args { - const char *name; - const unsigned long addr; -}; - -static int klp_verify_callback(void *data, const char *name, - struct module *mod, unsigned long addr) -{ - struct klp_verify_args *args = data; - - if (!mod && - !strcmp(args->name, name) && - args->addr == addr) - return 1; - - return 0; -} - -static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) -{ - struct klp_verify_args args = { - .name = name, - .addr = addr, - }; - int ret; - - mutex_lock(&module_mutex); - ret = kallsyms_on_each_symbol(klp_verify_callback, &args); - mutex_unlock(&module_mutex); - - if (!ret) { - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", - name, addr); - return -EINVAL; - } - - return 0; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -272,6 +233,7 @@ static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { int ret; + unsigned long val; struct klp_reloc *reloc; if (WARN_ON(!klp_is_object_loaded(obj))) @@ -281,35 +243,27 @@ static int klp_write_object_relocations(struct module *pmod, return -EINVAL; for (reloc = obj->relocs; reloc->name; reloc++) { - if (!klp_is_module(obj)) { - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old value accordingly */ - if (kaslr_enabled()) - reloc->val += kaslr_offset(); -#endif - ret = klp_verify_vmlinux_symbol(reloc->name, - reloc->val); - if (ret) - return ret; - } else { - /* module, reloc->val needs to be discovered */ - if (reloc->external) - ret = klp_find_external_symbol(pmod, - reloc->name, - &reloc->val); - else - ret = klp_find_object_symbol(obj->mod->name, - reloc->name, - 0, &reloc->val); - if (ret) - return ret; - } + /* discover the address of the referenced symbol */ + if (reloc->external) { + if (reloc->sympos > 0) { + pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", + reloc->name); + return -EINVAL; + } + ret = klp_find_external_symbol(pmod, reloc->name, &val); + } else + ret = klp_find_object_symbol(obj->name, + reloc->name, + reloc->sympos, + &val); + if (ret) + return ret; + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - reloc->val + reloc->addend); + val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, reloc->val, ret); + reloc->name, val, ret); return ret; } } -- cgit v1.2.3 From 444f9e99a840c4050c0530cfef81801a21a59f4c Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 1 Dec 2015 20:40:56 -0600 Subject: livepatch: function,sympos scheme in livepatch sysfs directory The following directory structure will allow for cases when the same function name exists in a single object. /sys/kernel/livepatch/// The sympos number corresponds to the nth occurrence of the symbol name in kallsyms for the patched object. An example of patching multiple symbols can be found here: https://github.com/dynup/kpatch/issues/493 Signed-off-by: Chris J Arges Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 6 +++++- kernel/livepatch/core.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index 5bf42a840b22..da87f43aec58 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -33,7 +33,7 @@ Description: The object directory contains subdirectories for each function that is patched within the object. -What: /sys/kernel/livepatch/// +What: /sys/kernel/livepatch/// Date: Nov 2014 KernelVersion: 3.19.0 Contact: live-patching@vger.kernel.org @@ -41,4 +41,8 @@ Description: The function directory contains attributes regarding the properties and state of the patched function. + The directory name contains the patched function name and a + sympos number corresponding to the nth occurrence of the symbol + name in kallsyms for the patched object. + There are currently no such attributes. diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index e842534d3493..94893e844e44 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -535,7 +535,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch// - * /sys/kernel/livepatch/// + * /sys/kernel/livepatch/// */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -680,8 +680,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; + /* The format for the sysfs directory is where sympos + * is the nth occurrence of this symbol in kallsyms for the patched + * object. If the user selects 0 for old_sympos, then 1 will be used + * since a unique symbol will be the first occurrence. + */ return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s", func->old_name); + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* parts of the initialization that is done only when the object is loaded */ -- cgit v1.2.3 From 20ef10c1b3068f105004e247d8e7dd8120fa4b9a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 26 Nov 2015 09:42:08 +1030 Subject: module: Use the same logic for setting and unsetting RO/NX When setting a module's RO and NX permissions, set_section_ro_nx() is used, but when clearing them, unset_module_{init,core}_ro_nx() are used. The unset functions don't have the same checks the set function has for partial page protections. It's probably harmless, but it's still confusingly asymmetrical. Instead, use the same logic to do both. Also add some new set_module_{init,core}_ro_nx() helper functions for more symmetry with the unset functions. Signed-off-by: Josh Poimboeuf Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 57 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8f051a106676..14b224967e7b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1886,7 +1886,9 @@ void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, static void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, - unsigned long total_size) + unsigned long total_size, + int (*set_ro)(unsigned long start, int num_pages), + int (*set_nx)(unsigned long start, int num_pages)) { /* begin and end PFNs of the current subsection */ unsigned long begin_pfn; @@ -1898,7 +1900,7 @@ static void set_section_ro_nx(void *base, * - Do not protect last partial page. */ if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); + set_page_attributes(base, base + ro_size, set_ro); /* * Set NX permissions for module data: @@ -1909,28 +1911,36 @@ static void set_section_ro_nx(void *base, begin_pfn = PFN_UP((unsigned long)base + text_size); end_pfn = PFN_UP((unsigned long)base + total_size); if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + set_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); } } +static void set_module_core_ro_nx(struct module *mod) +{ + set_section_ro_nx(mod->module_core, mod->core_text_size, + mod->core_ro_size, mod->core_size, + set_memory_ro, set_memory_nx); +} + static void unset_module_core_ro_nx(struct module *mod) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); + set_section_ro_nx(mod->module_core, mod->core_text_size, + mod->core_ro_size, mod->core_size, + set_memory_rw, set_memory_x); +} + +static void set_module_init_ro_nx(struct module *mod) +{ + set_section_ro_nx(mod->module_init, mod->init_text_size, + mod->init_ro_size, mod->init_size, + set_memory_ro, set_memory_nx); } static void unset_module_init_ro_nx(struct module *mod) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); + set_section_ro_nx(mod->module_init, mod->init_text_size, + mod->init_ro_size, mod->init_size, + set_memory_rw, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1979,7 +1989,8 @@ void set_all_modules_text_ro(void) mutex_unlock(&module_mutex); } #else -static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } +static void set_module_core_ro_nx(struct module *mod) { } +static void set_module_init_ro_nx(struct module *mod) { } static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif @@ -3373,17 +3384,9 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + /* Set RO and NX regions */ + set_module_init_ro_nx(mod); + set_module_core_ro_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ -- cgit v1.2.3 From c65abf358f211c3f88c8ed714dff25775ab49fc1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 26 Nov 2015 09:43:08 +1030 Subject: gcov: use within_module() helper. An exact mapping would be within_module_core(), but at this stage (MODULE_STATE_GOING) the init section is empty, and this is clearer. Reviewed-by: Peter Oberparleiter Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/gcov/base.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb6c1..2f9df37940a0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -123,11 +123,6 @@ void gcov_enable_events(void) } #ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within(info, mod->module_core, mod->core_size)) { + if (within_module((unsigned long)info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); -- cgit v1.2.3 From 7523e4dc5057e157212b4741abd6256e03404cf1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 26 Nov 2015 09:44:08 +1030 Subject: module: use a structure to encapsulate layout. Makes it easier to handle init vs core cleanly, though the change is fairly invasive across random architectures. It simplifies the rbtree code immediately, however, while keeping the core data together in the same cachline (now iff the rbtree code is enabled). Acked-by: Peter Zijlstra Reviewed-by: Josh Poimboeuf Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- arch/alpha/kernel/module.c | 2 +- arch/arc/kernel/unwind.c | 4 +- arch/arm/kernel/module-plts.c | 2 +- arch/avr32/kernel/module.c | 12 +-- arch/ia64/kernel/module.c | 14 +-- arch/metag/kernel/module.c | 4 +- arch/mips/kernel/vpe.c | 6 +- arch/parisc/kernel/module.c | 32 +++---- arch/powerpc/kernel/module_32.c | 6 +- arch/s390/kernel/module.c | 22 ++--- arch/x86/kernel/livepatch.c | 6 +- include/linux/module.h | 64 ++++++------- kernel/debug/kdb/kdb_main.c | 4 +- kernel/module.c | 199 +++++++++++++++++++--------------------- 14 files changed, 178 insertions(+), 199 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/module.c b/arch/alpha/kernel/module.c index 2fd00b7077e4..936bc8f89a67 100644 --- a/arch/alpha/kernel/module.c +++ b/arch/alpha/kernel/module.c @@ -160,7 +160,7 @@ apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, /* The small sections were sorted to the end of the segment. The following should definitely cover them. */ - gp = (u64)me->module_core + me->core_size - 0x8000; + gp = (u64)me->core_layout.base + me->core_layout.size - 0x8000; got = sechdrs[me->arch.gotsecindex].sh_addr; for (i = 0; i < n; i++) { diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index 93c6ea52b671..e0034a6656ef 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -372,8 +372,8 @@ void *unwind_add_table(struct module *module, const void *table_start, return NULL; init_unwind_table(table, module->name, - module->module_core, module->core_size, - module->module_init, module->init_size, + module->core_layout.base, module->core_layout.size, + module->init_layout.base, module->init_layout.size, table_start, table_size, NULL, 0); diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c index 097e2e201b9f..0c7efc3446c0 100644 --- a/arch/arm/kernel/module-plts.c +++ b/arch/arm/kernel/module-plts.c @@ -32,7 +32,7 @@ struct plt_entries { static bool in_init(const struct module *mod, u32 addr) { - return addr - (u32)mod->module_init < mod->init_size; + return addr - (u32)mod->init_layout.base < mod->init_layout.size; } u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c index 164efa009e5b..2b4c54c04cb6 100644 --- a/arch/avr32/kernel/module.c +++ b/arch/avr32/kernel/module.c @@ -118,9 +118,9 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, * Increase core size to make room for GOT and set start * offset for GOT. */ - module->core_size = ALIGN(module->core_size, 4); - module->arch.got_offset = module->core_size; - module->core_size += module->arch.got_size; + module->core_layout.size = ALIGN(module->core_layout.size, 4); + module->arch.got_offset = module->core_layout.size; + module->core_layout.size += module->arch.got_size; return 0; @@ -177,7 +177,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, if (!info->got_initialized) { Elf32_Addr *gotent; - gotent = (module->module_core + gotent = (module->core_layout.base + module->arch.got_offset + info->got_offset); *gotent = relocation; @@ -255,8 +255,8 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, */ pr_debug("GOTPC: PC=0x%x, got_offset=0x%lx, core=0x%p\n", relocation, module->arch.got_offset, - module->module_core); - relocation -= ((unsigned long)module->module_core + module->core_layout.base); + relocation -= ((unsigned long)module->core_layout.base + module->arch.got_offset); *location = relocation; break; diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index b15933c31b2f..6ab0ae7d6535 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -486,13 +486,13 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, static inline int in_init (const struct module *mod, uint64_t addr) { - return addr - (uint64_t) mod->module_init < mod->init_size; + return addr - (uint64_t) mod->init_layout.base < mod->init_layout.size; } static inline int in_core (const struct module *mod, uint64_t addr) { - return addr - (uint64_t) mod->module_core < mod->core_size; + return addr - (uint64_t) mod->core_layout.base < mod->core_layout.size; } static inline int @@ -675,7 +675,7 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, break; case RV_BDREL: - val -= (uint64_t) (in_init(mod, val) ? mod->module_init : mod->module_core); + val -= (uint64_t) (in_init(mod, val) ? mod->init_layout.base : mod->core_layout.base); break; case RV_LTV: @@ -810,15 +810,15 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind * addresses have been selected... */ uint64_t gp; - if (mod->core_size > MAX_LTOFF) + if (mod->core_layout.size > MAX_LTOFF) /* * This takes advantage of fact that SHF_ARCH_SMALL gets allocated * at the end of the module. */ - gp = mod->core_size - MAX_LTOFF / 2; + gp = mod->core_layout.size - MAX_LTOFF / 2; else - gp = mod->core_size / 2; - gp = (uint64_t) mod->module_core + ((gp + 7) & -8); + gp = mod->core_layout.size / 2; + gp = (uint64_t) mod->core_layout.base + ((gp + 7) & -8); mod->arch.gp = gp; DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp); } diff --git a/arch/metag/kernel/module.c b/arch/metag/kernel/module.c index 986331cd0a52..bb8dfba9a763 100644 --- a/arch/metag/kernel/module.c +++ b/arch/metag/kernel/module.c @@ -176,8 +176,8 @@ static uint32_t do_plt_call(void *location, Elf32_Addr val, tramp[1] = 0xac000001 | ((val & 0x0000ffff) << 3); /* Init, or core PLT? */ - if (location >= mod->module_core - && location < mod->module_core + mod->core_size) + if (location >= mod->core_layout.base + && location < mod->core_layout.base + mod->core_layout.size) entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; else entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index 9067b651c7a2..544ea21bfef9 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -205,11 +205,11 @@ static void layout_sections(struct module *mod, const Elf_Ehdr *hdr, || s->sh_entsize != ~0UL) continue; s->sh_entsize = - get_offset((unsigned long *)&mod->core_size, s); + get_offset((unsigned long *)&mod->core_layout.size, s); } if (m == 0) - mod->core_text_size = mod->core_size; + mod->core_layout.text_size = mod->core_layout.size; } } @@ -641,7 +641,7 @@ static int vpe_elfload(struct vpe *v) layout_sections(&mod, hdr, sechdrs, secstrings); } - v->load_addr = alloc_progmem(mod.core_size); + v->load_addr = alloc_progmem(mod.core_layout.size); if (!v->load_addr) return -ENOMEM; diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 3c63a820fcda..b9d75d9fa9ac 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -42,9 +42,9 @@ * We are not doing SEGREL32 handling correctly. According to the ABI, we * should do a value offset, like this: * if (in_init(me, (void *)val)) - * val -= (uint32_t)me->module_init; + * val -= (uint32_t)me->init_layout.base; * else - * val -= (uint32_t)me->module_core; + * val -= (uint32_t)me->core_layout.base; * However, SEGREL32 is used only for PARISC unwind entries, and we want * those entries to have an absolute address, and not just an offset. * @@ -100,14 +100,14 @@ * or init pieces the location is */ static inline int in_init(struct module *me, void *loc) { - return (loc >= me->module_init && - loc <= (me->module_init + me->init_size)); + return (loc >= me->init_layout.base && + loc <= (me->init_layout.base + me->init_layout.size)); } static inline int in_core(struct module *me, void *loc) { - return (loc >= me->module_core && - loc <= (me->module_core + me->core_size)); + return (loc >= me->core_layout.base && + loc <= (me->core_layout.base + me->core_layout.size)); } static inline int in_local(struct module *me, void *loc) @@ -367,13 +367,13 @@ int module_frob_arch_sections(CONST Elf_Ehdr *hdr, } /* align things a bit */ - me->core_size = ALIGN(me->core_size, 16); - me->arch.got_offset = me->core_size; - me->core_size += gots * sizeof(struct got_entry); + me->core_layout.size = ALIGN(me->core_layout.size, 16); + me->arch.got_offset = me->core_layout.size; + me->core_layout.size += gots * sizeof(struct got_entry); - me->core_size = ALIGN(me->core_size, 16); - me->arch.fdesc_offset = me->core_size; - me->core_size += fdescs * sizeof(Elf_Fdesc); + me->core_layout.size = ALIGN(me->core_layout.size, 16); + me->arch.fdesc_offset = me->core_layout.size; + me->core_layout.size += fdescs * sizeof(Elf_Fdesc); me->arch.got_max = gots; me->arch.fdesc_max = fdescs; @@ -391,7 +391,7 @@ static Elf64_Word get_got(struct module *me, unsigned long value, long addend) BUG_ON(value == 0); - got = me->module_core + me->arch.got_offset; + got = me->core_layout.base + me->arch.got_offset; for (i = 0; got[i].addr; i++) if (got[i].addr == value) goto out; @@ -409,7 +409,7 @@ static Elf64_Word get_got(struct module *me, unsigned long value, long addend) #ifdef CONFIG_64BIT static Elf_Addr get_fdesc(struct module *me, unsigned long value) { - Elf_Fdesc *fdesc = me->module_core + me->arch.fdesc_offset; + Elf_Fdesc *fdesc = me->core_layout.base + me->arch.fdesc_offset; if (!value) { printk(KERN_ERR "%s: zero OPD requested!\n", me->name); @@ -427,7 +427,7 @@ static Elf_Addr get_fdesc(struct module *me, unsigned long value) /* Create new one */ fdesc->addr = value; - fdesc->gp = (Elf_Addr)me->module_core + me->arch.got_offset; + fdesc->gp = (Elf_Addr)me->core_layout.base + me->arch.got_offset; return (Elf_Addr)fdesc; } #endif /* CONFIG_64BIT */ @@ -839,7 +839,7 @@ register_unwind_table(struct module *me, table = (unsigned char *)sechdrs[me->arch.unwind_section].sh_addr; end = table + sechdrs[me->arch.unwind_section].sh_size; - gp = (Elf_Addr)me->module_core + me->arch.got_offset; + gp = (Elf_Addr)me->core_layout.base + me->arch.got_offset; DEBUGP("register_unwind_table(), sect = %d at 0x%p - 0x%p (gp=0x%lx)\n", me->arch.unwind_section, table, end, gp); diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index c94d2e018d84..2c01665eb410 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -188,8 +188,8 @@ static uint32_t do_plt_call(void *location, pr_debug("Doing plt for call to 0x%x at 0x%x\n", val, (unsigned int)location); /* Init, or core PLT? */ - if (location >= mod->module_core - && location < mod->module_core + mod->core_size) + if (location >= mod->core_layout.base + && location < mod->core_layout.base + mod->core_layout.size) entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; else entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; @@ -296,7 +296,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, } #ifdef CONFIG_DYNAMIC_FTRACE module->arch.tramp = - do_plt_call(module->module_core, + do_plt_call(module->core_layout.base, (unsigned long)ftrace_caller, sechdrs, module); #endif diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 0c1a679314dd..7873e171457c 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -159,11 +159,11 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, /* Increase core size by size of got & plt and set start offsets for got and plt. */ - me->core_size = ALIGN(me->core_size, 4); - me->arch.got_offset = me->core_size; - me->core_size += me->arch.got_size; - me->arch.plt_offset = me->core_size; - me->core_size += me->arch.plt_size; + me->core_layout.size = ALIGN(me->core_layout.size, 4); + me->arch.got_offset = me->core_layout.size; + me->core_layout.size += me->arch.got_size; + me->arch.plt_offset = me->core_layout.size; + me->core_layout.size += me->arch.plt_size; return 0; } @@ -279,7 +279,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, if (info->got_initialized == 0) { Elf_Addr *gotent; - gotent = me->module_core + me->arch.got_offset + + gotent = me->core_layout.base + me->arch.got_offset + info->got_offset; *gotent = val; info->got_initialized = 1; @@ -302,7 +302,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, rc = apply_rela_bits(loc, val, 0, 64, 0); else if (r_type == R_390_GOTENT || r_type == R_390_GOTPLTENT) { - val += (Elf_Addr) me->module_core - loc; + val += (Elf_Addr) me->core_layout.base - loc; rc = apply_rela_bits(loc, val, 1, 32, 1); } break; @@ -315,7 +315,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ if (info->plt_initialized == 0) { unsigned int *ip; - ip = me->module_core + me->arch.plt_offset + + ip = me->core_layout.base + me->arch.plt_offset + info->plt_offset; ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ ip[1] = 0x100a0004; @@ -334,7 +334,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, val - loc + 0xffffUL < 0x1ffffeUL) || (r_type == R_390_PLT32DBL && val - loc + 0xffffffffULL < 0x1fffffffeULL))) - val = (Elf_Addr) me->module_core + + val = (Elf_Addr) me->core_layout.base + me->arch.plt_offset + info->plt_offset; val += rela->r_addend - loc; @@ -356,7 +356,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTOFF32: /* 32 bit offset to GOT. */ case R_390_GOTOFF64: /* 64 bit offset to GOT. */ val = val + rela->r_addend - - ((Elf_Addr) me->module_core + me->arch.got_offset); + ((Elf_Addr) me->core_layout.base + me->arch.got_offset); if (r_type == R_390_GOTOFF16) rc = apply_rela_bits(loc, val, 0, 16, 0); else if (r_type == R_390_GOTOFF32) @@ -366,7 +366,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, break; case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */ - val = (Elf_Addr) me->module_core + me->arch.got_offset + + val = (Elf_Addr) me->core_layout.base + me->arch.got_offset + rela->r_addend - loc; if (r_type == R_390_GOTPC) rc = apply_rela_bits(loc, val, 1, 32, 0); diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index d1d35ccffed3..bcc06e82a593 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -41,8 +41,8 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, int ret, numpages, size = 4; bool readonly; unsigned long val; - unsigned long core = (unsigned long)mod->module_core; - unsigned long core_size = mod->core_size; + unsigned long core = (unsigned long)mod->core_layout.base; + unsigned long core_size = mod->core_layout.size; switch (type) { case R_X86_64_NONE: @@ -72,7 +72,7 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, readonly = false; #ifdef CONFIG_DEBUG_SET_MODULE_RONX - if (loc < core + mod->core_ro_size) + if (loc < core + mod->core_layout.ro_size) readonly = true; #endif diff --git a/include/linux/module.h b/include/linux/module.h index 3a19c79918e0..6e68e8cf4d0d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -302,6 +302,28 @@ struct mod_tree_node { struct latch_tree_node node; }; +struct module_layout { + /* The actual code + data. */ + void *base; + /* Total size. */ + unsigned int size; + /* The size of the executable code. */ + unsigned int text_size; + /* Size of RO section of the module (text+rodata) */ + unsigned int ro_size; + +#ifdef CONFIG_MODULES_TREE_LOOKUP + struct mod_tree_node mtn; +#endif +}; + +#ifdef CONFIG_MODULES_TREE_LOOKUP +/* Only touch one cacheline for common rbtree-for-core-layout case. */ +#define __module_layout_align ____cacheline_aligned +#else +#define __module_layout_align +#endif + struct module { enum module_state state; @@ -366,37 +388,9 @@ struct module { /* Startup function. */ int (*init)(void); - /* - * If this is non-NULL, vfree() after init() returns. - * - * Cacheline align here, such that: - * module_init, module_core, init_size, core_size, - * init_text_size, core_text_size and mtn_core::{mod,node[0]} - * are on the same cacheline. - */ - void *module_init ____cacheline_aligned; - - /* Here is the actual code + data, vfree'd on unload. */ - void *module_core; - - /* Here are the sizes of the init and core sections */ - unsigned int init_size, core_size; - - /* The size of the executable code in each section. */ - unsigned int init_text_size, core_text_size; - -#ifdef CONFIG_MODULES_TREE_LOOKUP - /* - * We want mtn_core::{mod,node[0]} to be in the same cacheline as the - * above entries such that a regular lookup will only touch one - * cacheline. - */ - struct mod_tree_node mtn_core; - struct mod_tree_node mtn_init; -#endif - - /* Size of RO sections of the module (text+rodata) */ - unsigned int init_ro_size, core_ro_size; + /* Core layout: rbtree is accessed frequently, so keep together. */ + struct module_layout core_layout __module_layout_align; + struct module_layout init_layout; /* Arch-specific module values */ struct mod_arch_specific arch; @@ -505,15 +499,15 @@ bool is_module_text_address(unsigned long addr); static inline bool within_module_core(unsigned long addr, const struct module *mod) { - return (unsigned long)mod->module_core <= addr && - addr < (unsigned long)mod->module_core + mod->core_size; + return (unsigned long)mod->core_layout.base <= addr && + addr < (unsigned long)mod->core_layout.base + mod->core_layout.size; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { - return (unsigned long)mod->module_init <= addr && - addr < (unsigned long)mod->module_init + mod->init_size; + return (unsigned long)mod->init_layout.base <= addr && + addr < (unsigned long)mod->init_layout.base + mod->init_layout.size; } static inline bool within_module(unsigned long addr, const struct module *mod) diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4121345498e0..2a20c0dfdafc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv) continue; kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); + mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); + kdb_printf(" 0x%p", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/module.c b/kernel/module.c index 14b224967e7b..a0a3d6d9d5e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -108,13 +108,6 @@ static LIST_HEAD(modules); * Use a latched RB-tree for __module_address(); this allows us to use * RCU-sched lookups of the address from any context. * - * Because modules have two address ranges: init and core, we need two - * latch_tree_nodes entries. Therefore we need the back-pointer from - * mod_tree_node. - * - * Because init ranges are short lived we mark them unlikely and have placed - * them outside the critical cacheline in struct module. - * * This is conditional on PERF_EVENTS || TRACING because those can really hit * __module_address() hard by doing a lot of stack unwinding; potentially from * NMI context. @@ -122,24 +115,16 @@ static LIST_HEAD(modules); static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->module_init; - - return (unsigned long)mod->module_core; + return (unsigned long)layout->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->init_size; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->core_size; + return (unsigned long)layout->size; } static __always_inline bool @@ -197,23 +182,23 @@ static void __mod_tree_remove(struct mod_tree_node *node) */ static void mod_tree_insert(struct module *mod) { - mod->mtn_core.mod = mod; - mod->mtn_init.mod = mod; + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->mtn_core); - if (mod->init_size) - __mod_tree_insert(&mod->mtn_init); + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); } static void mod_tree_remove_init(struct module *mod) { - if (mod->init_size) - __mod_tree_remove(&mod->mtn_init); + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); } static void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->mtn_core); + __mod_tree_remove(&mod->core_layout.mtn); mod_tree_remove_init(mod); } @@ -267,9 +252,9 @@ static void __mod_update_bounds(void *base, unsigned int size) static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->module_core, mod->core_size); - if (mod->init_size) - __mod_update_bounds(mod->module_init, mod->init_size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } #ifdef CONFIG_KGDB_KDB @@ -1214,7 +1199,7 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_size); + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); } static struct module_attribute modinfo_coresize = @@ -1223,7 +1208,7 @@ static struct module_attribute modinfo_coresize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_size); + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); } static struct module_attribute modinfo_initsize = @@ -1917,29 +1902,29 @@ static void set_section_ro_nx(void *base, static void set_module_core_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_core, mod->core_text_size, - mod->core_ro_size, mod->core_size, + set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, + mod->core_layout.ro_size, mod->core_layout.size, set_memory_ro, set_memory_nx); } static void unset_module_core_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_core, mod->core_text_size, - mod->core_ro_size, mod->core_size, + set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, + mod->core_layout.ro_size, mod->core_layout.size, set_memory_rw, set_memory_x); } static void set_module_init_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_init, mod->init_text_size, - mod->init_ro_size, mod->init_size, + set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, + mod->init_layout.ro_size, mod->init_layout.size, set_memory_ro, set_memory_nx); } static void unset_module_init_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_init, mod->init_text_size, - mod->init_ro_size, mod->init_size, + set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, + mod->init_layout.ro_size, mod->init_layout.size, set_memory_rw, set_memory_x); } @@ -1952,14 +1937,14 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, + if ((mod->core_layout.base) && (mod->core_layout.text_size)) { + set_page_attributes(mod->core_layout.base, + mod->core_layout.base + mod->core_layout.text_size, set_memory_rw); } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, + if ((mod->init_layout.base) && (mod->init_layout.text_size)) { + set_page_attributes(mod->init_layout.base, + mod->init_layout.base + mod->init_layout.text_size, set_memory_rw); } } @@ -1975,14 +1960,14 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, + if ((mod->core_layout.base) && (mod->core_layout.text_size)) { + set_page_attributes(mod->core_layout.base, + mod->core_layout.base + mod->core_layout.text_size, set_memory_ro); } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, + if ((mod->init_layout.base) && (mod->init_layout.text_size)) { + set_page_attributes(mod->init_layout.base, + mod->init_layout.base + mod->init_layout.text_size, set_memory_ro); } } @@ -2047,16 +2032,16 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); + module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2259,20 +2244,20 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || strstarts(sname, ".init")) continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; break; case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); + mod->core_layout.size = debug_align(mod->core_layout.size); break; } } @@ -2288,21 +2273,21 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !strstarts(sname, ".init")) continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; break; case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); break; } } @@ -2477,7 +2462,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, info->index.sym) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + symsect->sh_name); @@ -2494,16 +2479,16 @@ static void layout_symtab(struct module *mod, struct load_info *info) } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - mod->core_size = debug_align(mod->core_size); + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } @@ -2524,8 +2509,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; + mod->core_symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_strtab = s = mod->core_layout.base + info->stroffs; src = mod->symtab; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || @@ -2975,7 +2960,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc(mod->core_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2985,11 +2970,11 @@ static int move_module(struct module *mod, struct load_info *info) if (!ptr) return -ENOMEM; - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; - if (mod->init_size) { - ptr = module_alloc(mod->init_size); + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -2998,13 +2983,13 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); return -ENOMEM; } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; } else - mod->module_init = NULL; + mod->init_layout.base = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -3016,10 +3001,10 @@ static int move_module(struct module *mod, struct load_info *info) continue; if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init + dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + shdr->sh_entsize; + dest = mod->core_layout.base + shdr->sh_entsize; if (shdr->sh_type != SHT_NOBITS) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); @@ -3081,12 +3066,12 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); set_fs(old_fs); } @@ -3144,8 +3129,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); - module_memfree(mod->module_core); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3232,7 +3217,7 @@ static noinline int do_init_module(struct module *mod) ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->module_init; + freeinit->module_init = mod->init_layout.base; /* * We want to find out whether @mod uses async during init. Clear @@ -3292,10 +3277,10 @@ static noinline int do_init_module(struct module *mod) mod_tree_remove_init(mod); unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we @@ -3575,7 +3560,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); module_deallocate(mod, info); free_copy: @@ -3653,9 +3638,9 @@ static const char *get_ksymbol(struct module *mod, /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; else - nextval = (unsigned long)mod->module_core+mod->core_text_size; + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ @@ -3902,7 +3887,7 @@ static int m_show(struct seq_file *m, void *p) return 0; seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); + mod->name, mod->init_layout.size + mod->core_layout.size); print_unload_info(m, mod); /* Informative for users. */ @@ -3911,7 +3896,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); + seq_printf(m, " 0x%pK", mod->core_layout.base); /* Taints info */ if (mod->taints) @@ -4054,8 +4039,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) mod = NULL; } return mod; -- cgit v1.2.3 From 85c898db6327353d38f3dd428457384cf81f83f8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 26 Nov 2015 09:45:08 +1030 Subject: module: clean up RO/NX handling. Modules have three sections: text, rodata and writable data. The code handled the case where these overlapped, however they never can: debug_align() ensures they are always page-aligned. This is why we got away with manually traversing the pages in set_all_modules_text_rw() without rounding. We create three helper functions: frob_text(), frob_rodata() and frob_writable_data(). We then call these explicitly at every point, so it's clear what we're doing. We also expose module_enable_ro() and module_disable_ro() for livepatch to use. Reviewed-by: Josh Poimboeuf Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- include/linux/module.h | 4 ++ kernel/module.c | 168 +++++++++++++++++++++++-------------------------- 2 files changed, 81 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 6e68e8cf4d0d..4560d8f1545d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -762,9 +762,13 @@ extern int module_sysfs_initialized; #ifdef CONFIG_DEBUG_SET_MODULE_RONX extern void set_all_modules_text_rw(void); extern void set_all_modules_text_ro(void); +extern void module_enable_ro(const struct module *mod); +extern void module_disable_ro(const struct module *mod); #else static inline void set_all_modules_text_rw(void) { } static inline void set_all_modules_text_ro(void) { } +static inline void module_enable_ro(const struct module *mod) { } +static inline void module_disable_ro(const struct module *mod) { } #endif #ifdef CONFIG_GENERIC_BUG diff --git a/kernel/module.c b/kernel/module.c index a0a3d6d9d5e8..77212128f34a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -80,15 +80,6 @@ # define debug_align(X) (X) #endif -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -1858,74 +1849,75 @@ static void mod_sysfs_teardown(struct module *mod) /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [writable data] + * text_size -----^ ^ ^ + * ro_size ------------------------| | + * size -------------------------------------------| + * + * These values are always page-aligned (as is base) */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); - - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); } -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size, - int (*set_ro)(unsigned long start, int num_pages), - int (*set_nx)(unsigned long start, int num_pages)) +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; - - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_ro); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); +} - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. - */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->size - layout->ro_size) >> PAGE_SHIFT); } -static void set_module_core_ro_nx(struct module *mod) +/* livepatching wants to disable read-only so it can frob module. */ +void module_disable_ro(const struct module *mod) { - set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, - mod->core_layout.ro_size, mod->core_layout.size, - set_memory_ro, set_memory_nx); + frob_text(&mod->core_layout, set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); + frob_rodata(&mod->init_layout, set_memory_rw); } -static void unset_module_core_ro_nx(struct module *mod) +void module_enable_ro(const struct module *mod) { - set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, - mod->core_layout.ro_size, mod->core_layout.size, - set_memory_rw, set_memory_x); + frob_text(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); } -static void set_module_init_ro_nx(struct module *mod) +static void module_enable_nx(const struct module *mod) { - set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, - mod->init_layout.ro_size, mod->init_layout.size, - set_memory_ro, set_memory_nx); + frob_rodata(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); } -static void unset_module_init_ro_nx(struct module *mod) +static void module_disable_nx(const struct module *mod) { - set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, - mod->init_layout.ro_size, mod->init_layout.size, - set_memory_rw, set_memory_x); + frob_rodata(&mod->core_layout, set_memory_x); + frob_writable_data(&mod->core_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_x); + frob_writable_data(&mod->init_layout, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1937,16 +1929,9 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->core_layout.base) && (mod->core_layout.text_size)) { - set_page_attributes(mod->core_layout.base, - mod->core_layout.base + mod->core_layout.text_size, - set_memory_rw); - } - if ((mod->init_layout.base) && (mod->init_layout.text_size)) { - set_page_attributes(mod->init_layout.base, - mod->init_layout.base + mod->init_layout.text_size, - set_memory_rw); - } + + frob_text(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); } mutex_unlock(&module_mutex); } @@ -1960,24 +1945,25 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->core_layout.base) && (mod->core_layout.text_size)) { - set_page_attributes(mod->core_layout.base, - mod->core_layout.base + mod->core_layout.text_size, - set_memory_ro); - } - if ((mod->init_layout.base) && (mod->init_layout.text_size)) { - set_page_attributes(mod->init_layout.base, - mod->init_layout.base + mod->init_layout.text_size, - set_memory_ro); - } + + frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); } mutex_unlock(&module_mutex); } + +static void disable_ro_nx(const struct module_layout *layout) +{ + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_rodata(layout, set_memory_x); + frob_writable_data(layout, set_memory_x); +} + #else -static void set_module_core_ro_nx(struct module *mod) { } -static void set_module_init_ro_nx(struct module *mod) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } +static void disable_ro_nx(const struct module_layout *layout) { } +static void module_enable_nx(const struct module *mod) { } +static void module_disable_nx(const struct module *mod) { } #endif void __weak module_memfree(void *module_region) @@ -2029,8 +2015,8 @@ static void free_module(struct module *mod) synchronize_sched(); mutex_unlock(&module_mutex); - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); + /* This may be empty, but that's OK */ + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); kfree(mod->args); @@ -2040,7 +2026,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); + disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU @@ -3275,7 +3261,7 @@ static noinline int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif mod_tree_remove_init(mod); - unset_module_init_ro_nx(mod); + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; @@ -3370,8 +3356,8 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); /* Set RO and NX regions */ - set_module_init_ro_nx(mod); - set_module_core_ro_nx(mod); + module_enable_ro(mod); + module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3536,8 +3522,8 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); /* we can't deallocate the module until we clear memory protection */ - unset_module_init_ro_nx(mod); - unset_module_core_ro_nx(mod); + module_disable_ro(mod); + module_disable_nx(mod); ddebug_cleanup: dynamic_debug_remove(info->debug); -- cgit v1.2.3 From e0224418516b4d8a6c2160574bac18447c354ef0 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Thu, 26 Nov 2015 13:18:06 +1030 Subject: module: keep percpu symbols in module's symtab Currently, percpu symbols from .data..percpu ELF section of a module are not copied over and stored in final symtab array of struct module. Consequently such symbol cannot be returned via kallsyms API (for example kallsyms_lookup_name). This can be especially confusing when the percpu symbol is exported. Only its __ksymtab et al. are present in its symtab. The culprit is in layout_and_allocate() function where SHF_ALLOC flag is dropped for .data..percpu section. There is in fact no need to copy the section to final struct module, because kernel module loader allocates extra percpu section by itself. Unfortunately only symbols from SHF_ALLOC sections are copied due to a check in is_core_symbol(). The patch changes is_core_symbol() function to copy over also percpu symbols (their st_shndx points to .data..percpu ELF section). We do it only if CONFIG_KALLSYMS_ALL is set to be consistent with the rest of the function (ELF section is SHF_ALLOC but !SHF_EXECINSTR). Finally elf_type() returns type 'a' for a percpu symbol because its address is absolute. Signed-off-by: Miroslav Benes Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 77212128f34a..912e891e0e2f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2383,7 +2383,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2412,7 +2412,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2421,6 +2421,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2458,7 +2463,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } @@ -2500,7 +2506,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) src = mod->symtab; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_strtab; s += strlcpy(s, &mod->strtab[src[i].st_name], -- cgit v1.2.3 From b56b36ee6751abe7fb3890681e86fc8de2122953 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 3 Dec 2015 16:33:26 -0600 Subject: livepatch: Cleanup module page permission changes Calling set_memory_rw() and set_memory_ro() for every iteration of the loop in klp_write_object_relocations() is messy, inefficient, and error-prone. Change all the read-only pages to read-write before the loop and convert them back to read-only again afterwards. Suggested-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- arch/x86/kernel/livepatch.c | 25 ++----------------------- kernel/livepatch/core.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index bcc06e82a593..92fc1a51f994 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -20,8 +20,6 @@ #include #include -#include -#include #include #include @@ -38,8 +36,7 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value) { - int ret, numpages, size = 4; - bool readonly; + size_t size = 4; unsigned long val; unsigned long core = (unsigned long)mod->core_layout.base; unsigned long core_size = mod->core_layout.size; @@ -69,23 +66,5 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, /* loc does not point to any symbol inside the module */ return -EINVAL; - readonly = false; - -#ifdef CONFIG_DEBUG_SET_MODULE_RONX - if (loc < core + mod->core_layout.ro_size) - readonly = true; -#endif - - /* determine if the relocation spans a page boundary */ - numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; - - if (readonly) - set_memory_rw(loc & PAGE_MASK, numpages); - - ret = probe_kernel_write((void *)loc, &val, size); - - if (readonly) - set_memory_ro(loc & PAGE_MASK, numpages); - - return ret; + return probe_kernel_write((void *)loc, &val, size); } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 94893e844e44..bc2c85c064c1 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,7 @@ #include #include #include +#include /** * struct klp_ops - structure for tracking registered ftrace ops structs @@ -232,7 +233,7 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret; + int ret = 0; unsigned long val; struct klp_reloc *reloc; @@ -242,13 +243,16 @@ static int klp_write_object_relocations(struct module *pmod, if (WARN_ON(!obj->relocs)) return -EINVAL; + module_disable_ro(pmod); + for (reloc = obj->relocs; reloc->name; reloc++) { /* discover the address of the referenced symbol */ if (reloc->external) { if (reloc->sympos > 0) { pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", reloc->name); - return -EINVAL; + ret = -EINVAL; + goto out; } ret = klp_find_external_symbol(pmod, reloc->name, &val); } else @@ -257,18 +261,20 @@ static int klp_write_object_relocations(struct module *pmod, reloc->sympos, &val); if (ret) - return ret; + goto out; ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", reloc->name, val, ret); - return ret; + goto out; } } - return 0; +out: + module_enable_ro(pmod); + return ret; } static void notrace klp_ftrace_handler(unsigned long ip, -- cgit v1.2.3 From 90a545e981267e917b9d698ce07affd69787db87 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 Nov 2015 15:49:03 -0800 Subject: restrict /dev/mem to idle io memory ranges This effectively promotes IORESOURCE_BUSY to IORESOURCE_EXCLUSIVE semantics by default. If userspace really believes it is safe to access the memory region it can also perform the extra step of disabling an active driver. This protects device address ranges with read side effects and otherwise directs userspace to use the driver. Persistent memory presents a large "mistake surface" to /dev/mem as now accidental writes can corrupt a filesystem. In general if a device driver is busily using a memory region it already informs other parts of the kernel to not touch it via request_mem_region(). /dev/mem should honor the same safety restriction by default. Debugging a device driver from userspace becomes more difficult with this enabled. Any application using /dev/mem or mmap of sysfs pci resources will now need to perform the extra step of either: 1/ Disabling the driver, for example: echo > /dev/bus//drivers//unbind 2/ Rebooting with "iomem=relaxed" on the command line 3/ Recompiling with CONFIG_IO_STRICT_DEVMEM=n Traditional users of /dev/mem like dosemu are unaffected because the first 1MB of memory is not subject to the IO_STRICT_DEVMEM restriction. Legacy X configurations use /dev/mem to talk to graphics hardware, but that functionality has since moved to kernel graphics drivers. Cc: Arnd Bergmann Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Acked-by: Kees Cook Acked-by: Ingo Molnar Signed-off-by: Dan Williams --- kernel/resource.c | 11 +++++++++-- lib/Kconfig.debug | 23 ++++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..09c0597840b0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 289dfcbc14eb..073496dea848 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1869,9 +1869,26 @@ config STRICT_DEVMEM enabled, even in this case there are restrictions on /dev/mem use due to the cache aliasing requirements. + If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem + file only allows userspace access to PCI space and the BIOS code and + data regions. This is sufficient for dosemu and X and all common + users of /dev/mem. + + If in doubt, say Y. + +config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM + default STRICT_DEVMEM + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that + range. Accidental access to this is obviously disastrous, but + specific access can be used by people debugging kernel drivers. + If this option is switched on, the /dev/mem file only allows - userspace access to PCI space and the BIOS code and data regions. - This is sufficient for dosemu and X and all common users of - /dev/mem. + userspace access to *idle* io-memory ranges (see /proc/iomem) This + may break traditional users of /dev/mem (dosemu, legacy X, etc...) + if the driver using a given range cannot be disabled. If in doubt, say Y. -- cgit v1.2.3 From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- drivers/staging/lustre/lustre/llite/super25.c | 3 ++- fs/9p/v9fs.c | 2 +- fs/adfs/super.c | 2 +- fs/affs/super.c | 2 +- fs/afs/super.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/bfs/inode.c | 2 +- fs/block_dev.c | 2 +- fs/btrfs/inode.c | 3 ++- fs/ceph/super.c | 4 ++-- fs/cifs/cifsfs.c | 2 +- fs/coda/inode.c | 6 +++--- fs/dcache.c | 5 +++-- fs/ecryptfs/main.c | 6 ++++-- fs/efs/super.c | 6 +++--- fs/exofs/super.c | 4 ++-- fs/ext2/super.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/super.c | 5 +++-- fs/fat/inode.c | 2 +- fs/file.c | 7 ++++--- fs/fuse/inode.c | 4 ++-- fs/gfs2/main.c | 3 ++- fs/hfs/super.c | 4 ++-- fs/hfsplus/super.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/super.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/inode.c | 2 +- fs/isofs/inode.c | 2 +- fs/jffs2/super.c | 2 +- fs/jfs/super.c | 2 +- fs/logfs/inode.c | 3 ++- fs/minix/inode.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/nilfs2/super.c | 3 ++- fs/ntfs/super.c | 4 ++-- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/ocfs2/super.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/proc/inode.c | 3 ++- fs/qnx4/inode.c | 2 +- fs/qnx6/inode.c | 2 +- fs/reiserfs/super.c | 3 ++- fs/romfs/super.c | 4 ++-- fs/squashfs/super.c | 3 ++- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 4 ++-- fs/udf/super.c | 3 ++- fs/ufs/super.c | 2 +- fs/xfs/kmem.h | 1 + fs/xfs/xfs_super.c | 4 ++-- include/linux/thread_info.h | 5 +++-- ipc/mqueue.c | 2 +- kernel/cred.c | 4 ++-- kernel/delayacct.c | 2 +- kernel/fork.c | 22 +++++++++++++--------- kernel/pid.c | 2 +- mm/nommu.c | 2 +- mm/rmap.c | 6 ++++-- mm/shmem.c | 2 +- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 65 files changed, 114 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 11634fa7ab3c..ad4840f86be1 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -767,7 +767,7 @@ static int __init spufs_init(void) ret = -ENOMEM; spufs_inode_cache = kmem_cache_create("spufs_inode_cache", sizeof(struct spufs_inode_info), 0, - SLAB_HWCACHE_ALIGN, spufs_init_once); + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, spufs_init_once); if (!spufs_inode_cache) goto out; diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c index 7a9fafc67693..86c371ef71ea 100644 --- a/drivers/staging/lustre/lustre/llite/super25.c +++ b/drivers/staging/lustre/lustre/llite/super25.c @@ -106,7 +106,8 @@ static int __init init_lustre_lite(void) rc = -ENOMEM; ll_inode_cachep = kmem_cache_create("lustre_inode_cache", sizeof(struct ll_inode_info), - 0, SLAB_HWCACHE_ALIGN, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, + NULL); if (ll_inode_cachep == NULL) goto out_cache; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6caca025019d..072e7599583a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 4d4a0df8344f..c9fdfb112933 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -271,7 +271,7 @@ static int __init init_inodecache(void) adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/super.c b/fs/affs/super.c index 8836df5f1e11..2a6713b6b9f4 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -132,7 +132,7 @@ static int __init init_inodecache(void) affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (affs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fb4a5129f7d..81afefe7d8a6 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -91,7 +91,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 25250fa87086..cc0e08252913 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -434,7 +434,7 @@ befs_init_inodecache(void) befs_inode_cachep = kmem_cache_create("befs_inode_cache", sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (befs_inode_cachep == NULL) { pr_err("%s: Couldn't initialize inode slabcache\n", __func__); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f430..1e5c896f6b79 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static int __init init_inodecache(void) bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/block_dev.c b/fs/block_dev.c index d878e4860fb7..d33071dd683e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -595,7 +595,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b8856e182ae..394017831692 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9161,7 +9161,8 @@ int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f446afada328..ca4d5e8457f1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -639,8 +639,8 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); if (ceph_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b7fcb3151103..c4c1169814b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1092,7 +1092,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cac1390b87a3..57e81cbba0fa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -74,9 +74,9 @@ static void init_once(void *foo) int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", - sizeof(struct coda_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct coda_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 8d38cd07b207..b4539e84e577 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -3415,7 +3416,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4f4d0474bee9..e25b6b06bacf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -663,6 +663,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; + unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -684,6 +685,7 @@ static struct ecryptfs_cache_info { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), + .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { @@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor); + *(info->cache) = kmem_cache_create(info->name, info->size, 0, + SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/efs/super.c b/fs/efs/super.c index c8411a30f7da..cb68dac4f9d3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -94,9 +94,9 @@ static void init_once(void *foo) static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", - sizeof(struct efs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct efs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b5e1..6658a50530a0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -194,8 +194,8 @@ static int init_inodecache(void) { exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - exofs_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc902..2a188413a2b0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -203,7 +203,7 @@ static int __init init_inodecache(void) ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6e5a..f1b56ff01208 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -966,7 +966,7 @@ static int __init init_inodecache(void) ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3bf990b80026..6134832baaaf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1541,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3698..6aece96df19f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/file.c b/fs/file.c index 1aed0add16a2..1fbc5c0555a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | + __GFP_NOWARN | __GFP_NORETRY); if (data != NULL) return data; } - return vmalloc(size); + return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) @@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..4d69d5c0bedc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 1d709d496364..f99f8e94de3f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4574fdd3d421..1ca95c232bb5 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -483,8 +483,8 @@ static int __init init_hfs_fs(void) int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", - sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once); + sizeof(struct hfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7302d96ae8bf..5d54490a136d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void) int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", - HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f49be23e78aa..cfaa18c7a337 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a561591896bd..458cf463047b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -261,7 +261,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d8f51ee8126b..f6820ecf0a11 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1322,7 +1322,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; diff --git a/fs/inode.c b/fs/inode.c index 4230f66b7410..e491e54d2430 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1883,7 +1883,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 61abdc4920da..bcd2d41b318a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -94,7 +94,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e3176a1..bb080c272149 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8f9176caf098..900925b5eb8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -898,7 +898,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 0fce46d62b9c..db9cfc598883 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -409,7 +409,8 @@ const struct super_operations logfs_super_operations = { int logfs_init_inode_cache(void) { logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + sizeof(struct logfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, logfs_init_once); if (!logfs_inode_cache) return -ENOMEM; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index cb1789ca1ee6..f975d667c539 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -91,7 +91,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index ce1eb3f9dfe8..1af15fcbe57b 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -82,7 +82,7 @@ static int init_inodecache(void) ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bdb4dc7b4ecd..48fd8a87affe 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1933,7 +1933,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7343844e6b6..7f5d3d9f1c37 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1416,7 +1416,8 @@ static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d1a853585b53..2f77f8dfb861 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..03768bb3aab1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c62d761d6bf0..faa1365097bc 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1766,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 15e4500cda3e..b61b883c8ff8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -443,7 +443,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d0e9b9b6223e..42305ddcbaa0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,8 @@ void __init proc_init_inodecache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), init_once); } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index f37b3deb01b4..3a67cfb142d8 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -365,7 +365,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 9728b5499e1d..47bb1de07155 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -625,7 +625,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc3bf..05db7473bcb5 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -626,7 +626,8 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb894e78a821..6b00ca357c58 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -619,8 +619,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index dded920cbc8f..5e79bfa4f260 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -419,7 +419,8 @@ static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 07ac18c355e7..d62c423a5a2d 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -346,7 +346,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fd90c079537..a233ba913be4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2248,8 +2248,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, - &inode_slab_ctor); + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/udf/super.c b/fs/udf/super.c index 81155b9b445b..9c64a3ca9837 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -179,7 +179,8 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | + SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec02ca..442fd52ebffe 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1427,7 +1427,7 @@ static int __init init_inodecache(void) ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc068..d1c66e465ca5 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) #define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN #define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT #define KM_ZONE_SPREAD SLAB_MEM_SPREAD +#define KM_ZONE_ACCOUNT SLAB_ACCOUNT #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b35775752b74..59c9b7bd958d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1714,8 +1714,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, - xfs_fs_inode_init_once); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | + KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..b4c2a485b28a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -56,9 +56,10 @@ extern long do_no_restart_syscall(struct restart_block *parm); #ifdef __KERNEL__ #ifdef CONFIG_DEBUG_STACK_USAGE -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ + __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) #endif /* diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 161a1807e6ef..f4617cf07069 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1438,7 +1438,7 @@ static int __init init_mqueue_fs(void) mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", sizeof(struct mqueue_inode_info), 0, - SLAB_HWCACHE_ALIGN, init_once); + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once); if (mqueue_inode_cachep == NULL) return -ENOMEM; diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..0c0cd8a62285 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d783f..435c14a45118 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/fork.c b/kernel/fork.c index 6774e6b2e96d..51915842f1c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -1848,16 +1848,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1867,8 +1870,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..f4ad91b746f1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } diff --git a/mm/nommu.c b/mm/nommu.c index 92be862c859b..fbf6f0f1d6c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,7 +560,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); - vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/rmap.c b/mm/rmap.c index b577fbb98d4b..3c3f1d21f075 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -428,8 +428,10 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); - anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, + SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 5813b7fa85b6..9e60093aca3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3064,7 +3064,7 @@ static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); return 0; } diff --git a/net/socket.c b/net/socket.c index 91c2de6f5020..c044d1e8508c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -294,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34558..14f45bf0410c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; -- cgit v1.2.3 From eca56ff906bdd0239485e8b47154a6e73dd9a2f3 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:26 -0800 Subject: mm, shmem: add internal shmem resident memory accounting Currently looking at /proc//status or statm, there is no way to distinguish shmem pages from pages mapped to a regular file (shmem pages are mapped to /dev/zero), even though their implication in actual memory use is quite different. The internal accounting currently counts shmem pages together with regular files. As a preparation to extend the userspace interfaces, this patch adds MM_SHMEMPAGES counter to mm_rss_stat to account for shmem pages separately from MM_FILEPAGES. The next patch will expose it to userspace - this patch doesn't change the exported values yet, by adding up MM_SHMEMPAGES to MM_FILEPAGES at places where MM_FILEPAGES was used before. The only user-visible change after this patch is the OOM killer message that separates the reported "shmem-rss" from "file-rss". [vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 5 +---- fs/proc/task_mmu.c | 3 ++- include/linux/mm.h | 18 +++++++++++++++++- include/linux/mm_types.h | 7 ++++--- kernel/events/uprobes.c | 2 +- mm/memory.c | 30 ++++++++++-------------------- mm/oom_kill.c | 5 +++-- mm/rmap.c | 12 +++--------- 8 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 63b039899a5e..aa34af0a0b26 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -603,10 +603,7 @@ static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm) else if (is_migration_entry(entry)) { struct page *page = migration_entry_to_page(entry); - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } free_swap_and_cache(entry); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a03759bda38..45eb24145978 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -83,7 +83,8 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad7793788..a8ab1fc0e9bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1361,10 +1361,26 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) atomic_long_dec(&mm->rss_stat.count[member]); } +/* Optimized variant when page is already known not to be PageAnon */ +static inline int mm_counter_file(struct page *page) +{ + if (PageSwapBacked(page)) + return MM_SHMEMPAGES; + return MM_FILEPAGES; +} + +static inline int mm_counter(struct page *page) +{ + if (PageAnon(page)) + return MM_ANONPAGES; + return mm_counter_file(page); +} + static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_ANONPAGES); + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492a114f..207890be93c8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -369,9 +369,10 @@ struct core_state { }; enum { - MM_FILEPAGES, - MM_ANONPAGES, - MM_SWAPENTS, + MM_FILEPAGES, /* Resident file mapping pages */ + MM_ANONPAGES, /* Resident anonymous pages */ + MM_SWAPENTS, /* Anonymous swap entries */ + MM_SHMEMPAGES, /* Resident shared memory pages */ NR_MM_COUNTERS }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..bb0669169716 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } diff --git a/mm/memory.c b/mm/memory.c index c387430f06c3..f7026c035940 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -832,10 +832,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -874,10 +871,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; } out_set_pte: @@ -1113,9 +1107,8 @@ again: tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else { + + if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); @@ -1123,8 +1116,8 @@ again: if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); - rss[MM_FILEPAGES]--; } + rss[mm_counter(page)]--; page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); @@ -1146,11 +1139,7 @@ again: struct page *page; page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; + rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); @@ -1460,7 +1449,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2097,7 +2086,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, MM_FILEPAGES); + dec_mm_counter_fast(mm, + mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { @@ -2820,7 +2810,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); } set_pte_at(vma->vm_mm, address, pte, entry); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c12680993ff3..dc490c06941b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES))); + K(get_mm_counter(victim->mm, MM_FILEPAGES)), + K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); task_unlock(victim); /* diff --git a/mm/rmap.c b/mm/rmap.c index 3c3f1d21f075..622756c16ac8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1364,10 +1364,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHuge(page)) { hugetlb_count_sub(1 << compound_order(page), mm); } else { - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); @@ -1377,10 +1374,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * interest anymore. Simply discard the pte, vmscan * will take care of the rest. */ - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { swp_entry_t entry; pte_t swp_pte; @@ -1420,7 +1414,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); } else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); page_remove_rmap(page); page_cache_release(page); -- cgit v1.2.3 From d07e22597d1d355829b7b18ac19afa912cf758d1 Mon Sep 17 00:00:00 2001 From: Daniel Cashman Date: Thu, 14 Jan 2016 15:19:53 -0800 Subject: mm: mmap: add new /proc tunable for mmap_base ASLR Address Space Layout Randomization (ASLR) provides a barrier to exploitation of user-space processes in the presence of security vulnerabilities by making it more difficult to find desired code/data which could help an attack. This is done by adding a random offset to the location of regions in the process address space, with a greater range of potential offset values corresponding to better protection/a larger search-space for brute force, but also to greater potential for fragmentation. The offset added to the mmap_base address, which provides the basis for the majority of the mappings for a process, is set once on process exec in arch_pick_mmap_layout() and is done via hard-coded per-arch values, which reflect, hopefully, the best compromise for all systems. The trade-off between increased entropy in the offset value generation and the corresponding increased variability in address space fragmentation is not absolute, however, and some platforms may tolerate higher amounts of entropy. This patch introduces both new Kconfig values and a sysctl interface which may be used to change the amount of entropy used for offset generation on a system. The direct motivation for this change was in response to the libstagefright vulnerabilities that affected Android, specifically to information provided by Google's project zero at: http://googleprojectzero.blogspot.com/2015/09/stagefrightened.html The attack presented therein, by Google's project zero, specifically targeted the limited randomness used to generate the offset added to the mmap_base address in order to craft a brute-force-based attack. Concretely, the attack was against the mediaserver process, which was limited to respawning every 5 seconds, on an arm device. The hard-coded 8 bits used resulted in an average expected success rate of defeating the mmap ASLR after just over 10 minutes (128 tries at 5 seconds a piece). With this patch, and an accompanying increase in the entropy value to 16 bits, the same attack would take an average expected time of over 45 hours (32768 tries), which makes it both less feasible and more likely to be noticed. The introduced Kconfig and sysctl options are limited by per-arch minimum and maximum values, the minimum of which was chosen to match the current hard-coded value and the maximum of which was chosen so as to give the greatest flexibility without generating an invalid mmap_base address, generally a 3-4 bits less than the number of bits in the user-space accessible virtual address space. When decided whether or not to change the default value, a system developer should consider that mmap_base address could be placed anywhere up to 2^(value) bits away from the non-randomized location, which would introduce variable-sized areas above and below the mmap_base address such that the maximum vm_area_struct size may be reduced, preventing very large allocations. This patch (of 4): ASLR only uses as few as 8 bits to generate the random offset for the mmap base address on 32 bit architectures. This value was chosen to prevent a poorly chosen value from dividing the address space in such a way as to prevent large allocations. This may not be an issue on all platforms. Allow the specification of a minimum number of bits so that platforms desiring greater ASLR protection may determine where to place the trade-off. Signed-off-by: Daniel Cashman Cc: Russell King Acked-by: Kees Cook Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Don Zickus Cc: Eric W. Biederman Cc: Heinrich Schuchardt Cc: Josh Poimboeuf Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Thomas Gleixner Cc: David Rientjes Cc: Mark Salyzyn Cc: Jeff Vander Stoep Cc: Nick Kralevich Cc: Catalin Marinas Cc: Will Deacon Cc: "H. Peter Anvin" Cc: Hector Marco-Gisbert Cc: Borislav Petkov Cc: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 29 +++++++++++++++++++ arch/Kconfig | 68 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 11 ++++++++ kernel/sysctl.c | 22 +++++++++++++++ mm/mmap.c | 12 ++++++++ 5 files changed, 142 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f72370b440b1..ee763f3d3b52 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -42,6 +42,8 @@ Currently, these files are in /proc/sys/vm: - min_slab_ratio - min_unmapped_ratio - mmap_min_addr +- mmap_rnd_bits +- mmap_rnd_compat_bits - nr_hugepages - nr_overcommit_hugepages - nr_trim_pages (only if CONFIG_MMU=n) @@ -485,6 +487,33 @@ against future potential kernel bugs. ============================================================== +mmap_rnd_bits: + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations on architectures which support +tuning address space randomization. This value will be bounded +by the architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_bits tunable + +============================================================== + +mmap_rnd_compat_bits: + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations for applications run in +compatibility mode on architectures which support tuning address +space randomization. This value will be bounded by the +architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_compat_bits tunable + +============================================================== + nr_hugepages Change the minimum size of the hugepage pool. diff --git a/arch/Kconfig b/arch/Kconfig index 4e949e58b192..ba1b626bca00 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -511,6 +511,74 @@ config ARCH_HAS_ELF_RANDOMIZE - arch_mmap_rnd() - arch_randomize_brk() +config HAVE_ARCH_MMAP_RND_BITS + bool + help + An arch should select this symbol if it supports setting a variable + number of bits for use in establishing the base address for mmap + allocations, has MMU enabled and provides values for both: + - ARCH_MMAP_RND_BITS_MIN + - ARCH_MMAP_RND_BITS_MAX + +config ARCH_MMAP_RND_BITS_MIN + int + +config ARCH_MMAP_RND_BITS_MAX + int + +config ARCH_MMAP_RND_BITS_DEFAULT + int + +config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT + default ARCH_MMAP_RND_BITS_MIN + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to + determine the random offset to the base address of vma regions + resulting from mmap allocations. This value will be bounded + by the architecture's minimum and maximum supported values. + + This value can be changed after boot using the + /proc/sys/vm/mmap_rnd_bits tunable + +config HAVE_ARCH_MMAP_RND_COMPAT_BITS + bool + help + An arch should select this symbol if it supports running applications + in compatibility mode, supports setting a variable number of bits for + use in establishing the base address for mmap allocations, has MMU + enabled and provides values for both: + - ARCH_MMAP_RND_COMPAT_BITS_MIN + - ARCH_MMAP_RND_COMPAT_BITS_MAX + +config ARCH_MMAP_RND_COMPAT_BITS_MIN + int + +config ARCH_MMAP_RND_COMPAT_BITS_MAX + int + +config ARCH_MMAP_RND_COMPAT_BITS_DEFAULT + int + +config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT + default ARCH_MMAP_RND_COMPAT_BITS_MIN + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to + determine the random offset to the base address of vma regions + resulting from mmap allocations for compatible applications This + value will be bounded by the architecture's minimum and maximum + supported values. + + This value can be changed after boot using the + /proc/sys/vm/mmap_rnd_compat_bits tunable + config HAVE_COPY_THREAD_TLS bool help diff --git a/include/linux/mm.h b/include/linux/mm.h index a8ab1fc0e9bc..d396753c0577 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -51,6 +51,17 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +extern const int mmap_rnd_bits_min; +extern const int mmap_rnd_bits_max; +extern int mmap_rnd_bits __read_mostly; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +extern const int mmap_rnd_compat_bits_min; +extern const int mmap_rnd_compat_bits_max; +extern int mmap_rnd_compat_bits __read_mostly; +#endif + #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5faf89ac9ec0..c810f8afdb7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; diff --git a/mm/mmap.c b/mm/mmap.c index c311bfd8005b..f32b84ad621a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -58,6 +58,18 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -- cgit v1.2.3 From 0eb77e9880321915322d42913c3b53241739c8aa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jan 2016 15:21:40 -0800 Subject: vmstat: make vmstat_updater deferrable again and shut down on idle Currently the vmstat updater is not deferrable as a result of commit ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update"). This in turn can cause multiple interruptions of the applications because the vmstat updater may run at Make vmstate_update deferrable again and provide a function that folds the differentials when the processor is going to idle mode thus addressing the issue of the above commit in a clean way. Note that the shepherd thread will continue scanning the differentials from another processor and will reenable the vmstat workers if it detects any changes. Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 2 ++ kernel/sched/idle.c | 1 + mm/vmstat.c | 69 ++++++++++++++++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3e5d9075960f..73fae8c4a5fb 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); @@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } +static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..2489140a7c51 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index c54fd2924f25..83a003bc3cae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1449,7 +1468,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { -- cgit v1.2.3 From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 14 Jan 2016 15:22:07 -0800 Subject: mm: rework virtual memory accounting When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which testing the RLIMIT_DATA value to figure out if we're allowed to assign new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been commited that RLIMIT_DATA in a form it's implemented now doesn't do anything useful because most of user-space libraries use mmap() syscall for dynamic memory allocations. Linus suggested to convert RLIMIT_DATA rlimit into something suitable for anonymous memory accounting. But in this patch we go further, and the changes are bundled together as: * keep vma counting if CONFIG_PROC_FS=n, will be used for limits * replace mm->shared_vm with better defined mm->data_vm * account anonymous executable areas as executable * account file-backed growsdown/up areas as stack * drop struct file* argument from vm_stat_account * enforce RLIMIT_DATA for size of data areas This way code looks cleaner: now code/stack/data classification depends only on vm_flags state: VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc) VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk) VM_WRITE & ~VM_SHARED & !stack -> data (VmData) The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called "shared", but that might be strange beast like readonly-private or VM_IO area. - RLIMIT_AS limits whole address space "VmSize" - RLIMIT_STACK limits stack "VmStk" (but each vma individually) - RLIMIT_DATA now limits "VmData" Signed-off-by: Konstantin Khlebnikov Signed-off-by: Cyrill Gorcunov Cc: Quentin Casasnovas Cc: Vegard Nossum Acked-by: Linus Torvalds Cc: Willy Tarreau Cc: Andy Lutomirski Cc: Kees Cook Cc: Vladimir Davydov Cc: Pavel Emelyanov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 +-- fs/proc/task_mmu.c | 7 +++--- include/linux/mm.h | 13 +++------- include/linux/mm_types.h | 2 +- kernel/fork.c | 5 ++-- mm/debug.c | 4 +-- mm/mmap.c | 63 +++++++++++++++++++++++----------------------- mm/mprotect.c | 8 ++++-- mm/mremap.c | 7 +++--- 9 files changed, 54 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 60e02f7747ff..9cd607b06964 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2332,8 +2332,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ insert_vm_struct(mm, vma); - vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma_pages(vma)); up_write(&task->mm->mmap_sem); /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 46d9619d0aea..a353b4c6e86e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -23,7 +23,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -44,7 +44,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -76,7 +75,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) anon << (PAGE_SHIFT-10), file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, @@ -97,7 +96,7 @@ unsigned long task_statm(struct mm_struct *mm, get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/include/linux/mm.h b/include/linux/mm.h index ec9d4559514d..839d9e9a1c38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1929,7 +1929,9 @@ extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); -extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); +extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); +extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); + extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, @@ -2147,15 +2149,6 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); -#else -static inline void vm_stat_account(struct mm_struct *mm, - unsigned long flags, struct file *file, long pages) -{ - mm->total_vm += pages; -} -#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_PAGEALLOC extern bool _debug_pagealloc_enabled; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 207890be93c8..6bc9a0ce2253 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -427,7 +427,7 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ unsigned long pinned_vm; /* Refcount permanently increased */ - unsigned long shared_vm; /* Shared pages (files) */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; diff --git a/kernel/fork.c b/kernel/fork.c index 51915842f1c0..2e391c754ae7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; diff --git a/mm/debug.c b/mm/debug.c index 668aa35191ca..5d2072ed8d5e 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm) mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, diff --git a/mm/mmap.c b/mm/mmap.c index f32b84ad621a..b3f00b616b81 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1220,24 +1220,6 @@ none: return NULL; } -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *mm, unsigned long flags, - struct file *file, long pages) -{ - const unsigned long stack_flags - = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); - - mm->total_vm += pages; - - if (file) { - mm->shared_vm += pages; - if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) - mm->exec_vm += pages; - } else if (flags & stack_flags) - mm->stack_vm += pages; -} -#endif /* CONFIG_PROC_FS */ - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1556,7 +1538,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; /* @@ -1565,7 +1547,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ nr_pages = count_vma_pages_range(mm, addr, addr + len); - if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } @@ -1664,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, out: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) @@ -2111,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long new_start, actual_size; /* address space limit tests */ - if (!may_expand_vm(mm, grow)) + if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -2208,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2284,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; @@ -2399,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vm_stat_account(mm, vma->vm_flags, -nrpages); vma = remove_vma(vma); } while (vma); vm_unacct_memory(nr_accounted); @@ -2769,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) } /* Check against address space limits *after* clearing old maps... */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2804,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; @@ -2995,9 +2977,28 @@ out: * Return true if the calling process may expand its vm space by the passed * number of pages */ -int may_expand_vm(struct mm_struct *mm, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { - return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT; + if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) + return false; + + if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS & + (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE) + return mm->data_vm + npages <= rlimit(RLIMIT_DATA); + + return true; +} + +void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) +{ + mm->total_vm += npages; + + if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC) + mm->exec_vm += npages; + else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) + mm->stack_vm += npages; + else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + mm->data_vm += npages; } static int special_mapping_fault(struct vm_area_struct *vma, @@ -3079,7 +3080,7 @@ static struct vm_area_struct *__install_special_mapping( if (ret) goto out; - mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8eaab00..c764402c464f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { + /* Check space limits when area turns into data. */ + if (!may_expand_vm(mm, newflags, nrpages) && + may_expand_vm(mm, oldflags, nrpages)) + return -ENOMEM; if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; @@ -334,8 +338,8 @@ success: populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index de824e72c3e8..e55b157865d5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); /* Tell pfnmap has moved from this vma */ if (unlikely(vma->vm_flags & VM_PFNMAP)) @@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EAGAIN); } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + if (!may_expand_vm(mm, vma->vm_flags, + (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { @@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; locked = true; -- cgit v1.2.3