From 9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Oct 2016 08:04:03 +0800 Subject: sched/fair: Fix sched domains NULL dereference in select_idle_sibling() Commit: 10e2f1acd01 ("sched/core: Rewrite and improve select_idle_siblings()") ... improved select_idle_sibling(), but also triggered a regression (crash) during CPU-hotplug: BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 IP: [] select_idle_sibling+0x1c2/0x4f0 Call Trace: select_task_rq_fair+0x749/0x930 ? select_task_rq_fair+0xb4/0x930 ? __lock_is_held+0x54/0x70 try_to_wake_up+0x19a/0x5b0 default_wake_function+0x12/0x20 autoremove_wake_function+0x12/0x40 __wake_up_common+0x55/0x90 __wake_up+0x39/0x50 wake_up_klogd_work_func+0x40/0x60 irq_work_run_list+0x57/0x80 irq_work_run+0x2c/0x30 smp_irq_work_interrupt+0x2e/0x40 irq_work_interrupt+0x96/0xa0 ? _raw_spin_unlock_irqrestore+0x45/0x80 try_to_wake_up+0x4a/0x5b0 wake_up_state+0x10/0x20 __kthread_unpark+0x67/0x70 kthread_unpark+0x22/0x30 cpuhp_online_idle+0x3e/0x70 cpu_startup_entry+0x6a/0x450 start_secondary+0x154/0x180 This can be reproduced by running the ftrace test case of kselftest: the test case hot-unplugs a CPU, and that CPU is attached to the NULL sched-domain during scheduler teardown. Step 2 of the select_idle_siblings() rewrite reads: | Step 2) tracks the average cost of the scan and compares this to the | average idle time guestimate for the CPU doing the wakeup. If the CPU doing the wakeup is the CPU that is being hot-unplugged, the NULL sched domain is dereferenced to acquire the average cost of the scan. Fix this by failing the search for an idle CPU in the LLC if this sched domain is NULL. Tested-by: Catalin Marinas Signed-off-by: Wanpeng Li Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1475971443-3187-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 502e95a6e927..8b03fb5d1b9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5471,13 +5471,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { - struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - u64 avg_idle = this_rq()->avg_idle; - u64 avg_cost = this_sd->avg_scan_cost; + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; int cpu, wrap; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. -- cgit v1.2.3 From a705e07b9c80df27b6bb12f7a4cd4cf4ed2f728b Mon Sep 17 00:00:00 2001 From: Joonas Lahtinen Date: Wed, 12 Oct 2016 13:18:56 +0300 Subject: cpu/hotplug: Use distinct name for cpu_hotplug.dep_map Use a distinctive name for cpu_hotplug.dep_map to avoid the actual cpu_hotplug.lock appearing as cpu_hotplug.lock#2 in lockdep splats. Signed-off-by: Joonas Lahtinen Reviewed-by: Chris Wilson Acked-by: Gautham R. Shenoy Cc: Andrew Morton Cc: Daniel Vetter Cc: Gautham R . 
Shenoy Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; -- cgit v1.2.3 From 54e23845e965898f65f76aba79fa9db76d830fa9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 11:47:02 +0200 Subject: alarmtimer: Remove unused but set variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the set but unused variable base in alarm_clock_get to fix the following warning when building with 'W=1': kernel/time/alarmtimer.c: In function ‘alarm_timer_create’: kernel/time/alarmtimer.c:545:21: warning: variable ‘base’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: John Stultz Link: http://lkml.kernel.org/r/20161017094702.10873-1-tklauser@distanz.ch Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } -- cgit v1.2.3 From b5a9b340789b2b24c6896bcf7a065c31a4db671c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 19 Oct 2016 14:45:23 +0200 Subject: sched/fair: Fix incorrect task group ->load_avg A scheduler performance regression has been reported by Joseph Salisbury, which he bisected back to: 3d30544f0212 ("sched/fair: Apply more PELT fixes) The regression triggers when several levels of task groups are involved (read: SystemD) and cpu_possible_mask != cpu_present_mask. The root cause is that group entity's load (tg_child->se[i]->avg.load_avg) is initialized to scale_load_down(se->load.weight). During the creation of a child task group, its group entities on possible CPUs are attached to parent's cfs_rq (tg_parent) and their loads are added to the parent's load (tg_parent->load_avg) with update_tg_load_avg(). But only the load on online CPUs will then be updated to reflect real load, whereas load on other CPUs will stay at the initial value. The result is a tg_parent->load_avg that is higher than the real load, the weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is smaller than it should be, and the task group gets a less running time than what it could expect. ( This situation can be detected with /proc/sched_debug. The ".tg_load_avg" of the task group will be much higher than sum of ".tg_load_avg_contrib" of online cfs_rqs of the task group. 
) The load of group entities doesn't have to be initialized to anything other than 0, because their load will increase when entities are attached. Reported-by: Joseph Salisbury Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: # 4.8.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: joonwoop@codeaurora.org Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes") Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76ee7de1859d..d941c97dfbc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway -- cgit v1.2.3 From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' parameters from get_user_pages_remote() and replaces them with 'gup_flags', to make the use of FOLL_FORCE explicit in callers, as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. 
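As an illustration of the new calling convention, here is a minimal sketch; the local variable names are made up, but the flags and the updated signature are the ones introduced by this patch:

	unsigned int gup_flags = 0;

	if (write)
		gup_flags |= FOLL_WRITE;
	if (force)
		gup_flags |= FOLL_FORCE;

	ret = get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
				    pages, NULL);

Callers that never need to override page protections simply leave FOLL_FORCE out, which is what makes the (potentially dangerous) forced access visible at the call site.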
Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 7 +++++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 6 +++++- drivers/infiniband/core/umem_odp.c | 7 +++++-- fs/exec.c | 9 +++++++-- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 6 ++++-- mm/gup.c | 22 +++++++--------------- mm/memory.c | 6 +++++- security/tomoyo/domain.c | 2 +- 9 files changed, 40 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 5ce3603e6eac..0370b842d9cc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -748,19 +748,22 @@ static struct page **etnaviv_gem_userptr_do_get_pages( int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT; struct page **pvec; uintptr_t ptr; + unsigned int flags = 0; pvec = drm_malloc_ab(npages, sizeof(struct page *)); if (!pvec) return ERR_PTR(-ENOMEM); + if (!etnaviv_obj->userptr.ro) + flags |= FOLL_WRITE; + pinned = 0; ptr = etnaviv_obj->userptr.ptr; down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - !etnaviv_obj->userptr.ro, 0, - pvec + pinned, NULL); + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index e537930c64b5..c6f780f5abc9 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -508,6 +508,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) pvec = drm_malloc_gfp(npages, sizeof(struct page *), GFP_TEMPORARY); if (pvec != NULL) { struct mm_struct *mm = obj->userptr.mm->mm; + unsigned int flags = 0; + + if (!obj->userptr.read_only) + flags |= FOLL_WRITE; ret = -EFAULT; if (atomic_inc_not_zero(&mm->mm_users)) { @@ -517,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) (work->task, mm, obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, - !obj->userptr.read_only, 0, + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 75077a018675..1f0fe3217f23 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, u64 off; int j, k, ret = 0, start_idx, npages = 0; u64 base_virt_addr; + unsigned int flags = 0; if (access_mask == 0) return -EINVAL; @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, goto out_put_task; } + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; k = start_idx; @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, - 0, local_page_list, NULL); + flags, local_page_list, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f7b137..4e497b9ee71e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page 
*get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index 30bb5d9631bb..ecc4be7b67e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1276,7 +1276,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index bbc2e76cdd77..7aa113c2d373 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -915,9 +915,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). + * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -946,9 +944,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. 
* * get_user_pages is typically used for fewer-copy IO operations, to get a * handle on the memory by some means other than accesses via the user virtual @@ -965,18 +963,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - unsigned int flags = FOLL_TOUCH | FOLL_REMOTE; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, flags); + NULL, false, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); diff --git a/mm/memory.c b/mm/memory.c index fc1987dfd8cc..20a9adb7b36e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3873,6 +3873,10 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, { struct vm_area_struct *vma; void *old_buf = buf; + unsigned int flags = FOLL_FORCE; + + if (write) + flags |= FOLL_WRITE; down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ @@ -3882,7 +3886,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - write, 1, &page, &vma); + flags, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index ade7c6cad172..682b73af7766 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - 0, 1, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit v1.2.3 From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:20 +0100 Subject: mm: replace access_process_vm() write parameter with gup_flags This removes the 'write' argument from access_process_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. 
Signed-off-by: Lorenzo Stoakes Acked-by: Jesper Nilsson Acked-by: Michal Hocko Acked-by: Michael Ellerman Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 9 ++++++--- arch/blackfin/kernel/ptrace.c | 5 +++-- arch/cris/arch-v32/kernel/ptrace.c | 4 ++-- arch/ia64/kernel/ptrace.c | 14 +++++++++----- arch/m32r/kernel/ptrace.c | 15 ++++++++++----- arch/mips/kernel/ptrace32.c | 5 +++-- arch/powerpc/kernel/ptrace32.c | 5 +++-- arch/score/kernel/ptrace.c | 10 ++++++---- arch/sparc/kernel/ptrace_64.c | 24 ++++++++++++++++-------- arch/x86/kernel/step.c | 3 ++- arch/x86/um/ptrace_32.c | 3 ++- arch/x86/um/ptrace_64.c | 3 ++- include/linux/mm.h | 3 ++- kernel/ptrace.c | 16 ++++++++++------ mm/memory.c | 8 ++------ mm/nommu.c | 6 +++--- mm/util.c | 5 +++-- 17 files changed, 84 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index d9ee81769899..940dfb406591 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -157,14 +157,16 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data) static inline int read_int(struct task_struct *task, unsigned long addr, int * data) { - int copied = access_process_vm(task, addr, data, sizeof(int), 0); + int copied = access_process_vm(task, addr, data, sizeof(int), + FOLL_FORCE); return (copied == sizeof(int)) ? 0 : -EIO; } static inline int write_int(struct task_struct *task, unsigned long addr, int data) { - int copied = access_process_vm(task, addr, &data, sizeof(int), 1); + int copied = access_process_vm(task, addr, &data, sizeof(int), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(int)) ? 0 : -EIO; } @@ -281,7 +283,8 @@ long arch_ptrace(struct task_struct *child, long request, /* When I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. 
*/ case PTRACE_PEEKDATA: - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), + FOLL_FORCE); ret = -EIO; if (copied != sizeof(tmp)) break; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 8b8fe671b1a6..8d79286ee4e8 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -271,7 +271,7 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &tmp, - to_copy, 0); + to_copy, FOLL_FORCE); if (copied) break; @@ -324,7 +324,8 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &data, - to_copy, 1); + to_copy, + FOLL_FORCE | FOLL_WRITE); break; case BFIN_MEM_ACCESS_DMA: if (safe_dma_memcpy(paddr, &data, to_copy)) diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index f085229cf870..f0df654ac6fc 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request, /* The trampoline page is globally mapped, no page table to traverse.*/ tmp = *(unsigned long*)addr; } else { - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -279,7 +279,7 @@ static int insn_size(struct task_struct *child, unsigned long pc) int opsize = 0; /* Read the opcode at pc (do what PTRACE_PEEKTEXT would do). */ - copied = access_process_vm(child, pc, &opcode, sizeof(opcode), 0); + copied = access_process_vm(child, pc, &opcode, sizeof(opcode), FOLL_FORCE); if (copied != sizeof(opcode)) return 0; diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 6f54d511cc50..31aa8c0f68e1 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -453,7 +453,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, return 0; } } - copied = access_process_vm(child, addr, &ret, sizeof(ret), 0); + copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE); if (copied != sizeof(ret)) return -EIO; *val = ret; @@ -489,7 +489,8 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack, *ia64_rse_skip_regs(krbs, regnum) = val; } } - } else if (access_process_vm(child, addr, &val, sizeof(val), 1) + } else if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; return 0; @@ -543,7 +544,8 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, ret = ia64_peek(child, sw, user_rbs_end, addr, &val); if (ret < 0) return ret; - if (access_process_vm(child, addr, &val, sizeof(val), 1) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; } @@ -559,7 +561,8 @@ ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw, /* now copy word for word from user rbs to kernel rbs: */ for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { - if (access_process_vm(child, addr, &val, sizeof(val), 0) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE) != sizeof(val)) return -EIO; @@ -1156,7 +1159,8 @@ arch_ptrace (struct task_struct *child, long request, case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: /* read word at location addr */ - if 
(access_process_vm(child, addr, &data, sizeof(data), 0) + if (access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE) != sizeof(data)) return -EIO; /* ensure return value is not mistaken for error code */ diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index 51f5e9aa4901..c145605a981f 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -493,7 +493,8 @@ unregister_all_debug_traps(struct task_struct *child) int i; for (i = 0; i < p->nr_trap; i++) - access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), 1); + access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), + FOLL_FORCE | FOLL_WRITE); p->nr_trap = 0; } @@ -537,7 +538,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) unsigned long next_insn, code; unsigned long addr = next_pc & ~3; - if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), 0) + if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), + FOLL_FORCE) != sizeof(next_insn)) { return -1; /* error */ } @@ -546,7 +548,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) if (register_debug_trap(child, next_pc, next_insn, &code)) { return -1; /* error */ } - if (access_process_vm(child, addr, &code, sizeof(code), 1) + if (access_process_vm(child, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE) != sizeof(code)) { return -1; /* error */ } @@ -562,7 +565,8 @@ withdraw_debug_trap(struct pt_regs *regs) addr = (regs->bpc - 2) & ~3; regs->bpc -= 2; if (unregister_debug_trap(current, addr, &code)) { - access_process_vm(current, addr, &code, sizeof(code), 1); + access_process_vm(current, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE); invalidate_cache(); } } @@ -589,7 +593,8 @@ void user_enable_single_step(struct task_struct *child) /* Compute next pc. 
*/ pc = get_stack_long(child, PT_BPC); - if (access_process_vm(child, pc&~3, &insn, sizeof(insn), 0) + if (access_process_vm(child, pc&~3, &insn, sizeof(insn), + FOLL_FORCE) != sizeof(insn)) return; diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 283b5a1967d1..7e71a4e0281b 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -70,7 +70,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *) (unsigned long) data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &data, - sizeof(data), 1) == sizeof(data)) + sizeof(data), + FOLL_FORCE | FOLL_WRITE) == sizeof(data)) break; ret = -EIO; break; diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index f52b7db327c8..010b7b310237 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -74,7 +74,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *)data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 1) == sizeof(tmp)) + sizeof(tmp), + FOLL_FORCE | FOLL_WRITE) == sizeof(tmp)) break; ret = -EIO; break; diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 55836188b217..4f7314d5f334 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -131,7 +131,7 @@ read_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -142,7 +142,7 @@ read_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -153,7 +153,8 @@ write_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? -EIO : 0; } @@ -164,7 +165,8 @@ write_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? 
-EIO : 0; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9ddc4928a089..ac082dd8c67d 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -127,7 +127,8 @@ static int get_from_target(struct task_struct *target, unsigned long uaddr, if (copy_from_user(kbuf, (void __user *) uaddr, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 0); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE); if (len2 != len) return -EFAULT; } @@ -141,7 +142,8 @@ static int set_to_target(struct task_struct *target, unsigned long uaddr, if (copy_to_user((void __user *) uaddr, kbuf, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 1); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE | FOLL_WRITE); if (len2 != len) return -EFAULT; } @@ -505,7 +507,8 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - k, sizeof(*k), 0) + k, sizeof(*k), + FOLL_FORCE) != sizeof(*k)) return -EFAULT; k++; @@ -531,12 +534,14 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; @@ -615,7 +620,8 @@ static int genregs32_set(struct task_struct *target, (unsigned long) ®_window[pos], (void *) k, - sizeof(*k), 1) + sizeof(*k), + FOLL_FORCE | FOLL_WRITE) != sizeof(*k)) return -EFAULT; k++; @@ -642,13 +648,15 @@ static int genregs32_set(struct task_struct *target, if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c9a073866ca7..a23ce84a3f6c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -57,7 +57,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) unsigned char opcode[15]; unsigned long addr = convert_ip_to_linear(child, regs); - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + copied = access_process_vm(child, addr, opcode, sizeof(opcode), + FOLL_FORCE); for (i = 0; i < copied; i++) { switch (opcode[i]) { /* popf and iret */ diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 5766ead6fdb9..60a5a5a85505 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -36,7 +36,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. 
*/ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk(KERN_ERR "is_syscall : failed to read " "instruction from 0x%lx\n", addr); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 0b5c184dd5b3..e30202b1716e 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -212,7 +212,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. */ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk("is_syscall : failed to read instruction from " "0x%lx\n", addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index f31bf9058587..ffbd72979ee7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1266,7 +1266,8 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 
0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/mm/memory.c b/mm/memory.c index bac2d994850e..e18c57bdc75c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3951,20 +3951,16 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; - unsigned int flags = FOLL_FORCE; mm = get_task_mm(tsk); if (!mm) return 0; - if (write) - flags |= FOLL_WRITE; - - ret = __access_remote_vm(tsk, mm, addr, buf, len, flags); + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 93d5bb53fc63..db5fd1795298 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1861,7 +1861,8 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Access another process' address space. * - source/target buffer must be kernel space */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags) { struct mm_struct *mm; @@ -1872,8 +1873,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, - write ? FOLL_WRITE : 0); + len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/util.c b/mm/util.c index 4c685bde5ebc..952cbe7ad7b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -624,7 +624,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen) len = buflen; - res = access_process_vm(task, arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then @@ -639,7 +639,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, - buffer+res, len, 0); + buffer+res, len, + FOLL_FORCE); res = strnlen(buffer, res); } } -- cgit v1.2.3 From 8835ca59dac2bc1e0136791abf3ccd51588803ce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 19 Oct 2016 09:11:24 -0700 Subject: printk: suppress empty continuation lines We have a fairly common pattern where you print several things as continuations on one single line in a loop, and then at the end you do printk(KERN_CONT "\n"); to flush the buffered output. But if the output was flushed by something else (concurrent printk activity, or just system logging), we don't want that final flushing to just print an empty line. So just suppress empty continuation lines when they couldn't be merged into the line they are a continuation of. 
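For illustration, the pattern described above looks roughly like this (a made-up example; only the printk()/KERN_CONT usage is the point):

	static void print_names(const char * const *names, int n)
	{
		int i;

		printk(KERN_INFO "found:");
		for (i = 0; i < n; i++)
			printk(KERN_CONT " %s", names[i]);
		printk(KERN_CONT "\n");
	}

If the buffered line has already been flushed by something else, there is nothing for that trailing "\n" to merge into; before this change it would be printed as an empty line of its own, afterwards it is silently dropped.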
Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5e397315473..de08fc90baaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c cont_flush(); } + /* Skip empty continuation lines that couldn't be added - they just flush */ + if (!text_len && (lflags & LOG_CONT)) + return 0; + /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { if (cont_add(facility, level, lflags, text, text_len)) -- cgit v1.2.3 From 3118dac501bc0317de099db81618d589503351e1 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 6 Oct 2016 23:06:43 +0530 Subject: kernel/irq: Export irq_set_parent() The TPS65217 driver grew interrupt support which uses irq_set_parent(). While it's not yet clear why this is used in the first place, building the driver as a module fails with: ERROR: ".irq_set_parent" [drivers/mfd/tps65217.ko] undefined! The correctness of the driver change is still investigated, but for now it's less trouble to export irq_set_parent() than dealing with the build wreckage. [ tglx: Rewrote changelog and made the export GPL ] Fixes: 6556bdacf646 ("mfd: tps65217: Add support for IRQs") Signed-off-by: Sudip Mukherjee Cc: Sudip Mukherjee Cc: Marcin Niestroj Cc: Grygorii Strashko Cc: Tony Lindgren Cc: Lee Jones Link: http://lkml.kernel.org/r/1475775403-27207-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c5f1a5db654..9c4d30483264 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) irq_put_desc_unlock(desc, flags); return 0; } +EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* -- cgit v1.2.3 From f660f6066716b700148f60dba3461e65efff3123 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Oct 2016 15:10:51 +0300 Subject: softirq: Display IRQ_POLL for irq-poll statistics This library was moved to the generic area and was renamed to irq-poll. Hence, update proc/softirqs output accordingly. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1bf81ef91375..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; -- cgit v1.2.3 From 1adb469b9b76276d7e5ea36a20a24c47d6618a0b Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 21 Oct 2016 16:24:09 +0100 Subject: PM / suspend: Fix missing KERN_CONT for suspend message Commit 4bcc595ccd80 (printk: reinstate KERN_CONT for printing continuation lines) exposed a missing KERN_CONT from one of the messages shown on entering suspend. With v4.9-rc1, the 'done.' 
shown after syncing the filesystems no longer appears as a continuation but a new message with its own timestamp. [ 9.259566] PM: Syncing filesystems ... [ 9.264119] done. Fix this by adding the KERN_CONT log level for the 'done.' part of the message seen after syncing filesystems. While we are at it, convert these suspend printks to pr_info and pr_cont, respectively. Signed-off-by: Jon Hunter Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif -- cgit v1.2.3 From b831275a3553c32091222ac619cfddd73a5553fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:41:56 +0200 Subject: timers: Plug locking race vs. timer migration Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the timer flags. As a consequence the compiler is allowed to reload the flags between the initial check for TIMER_MIGRATION and the following timer base computation and the spin lock of the base. While this has not been observed (yet), we need to make sure that it never happens. Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Peter Zijlstra --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..0d4b91c5a374 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -943,7 +943,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); -- cgit v1.2.3 From 4da9152a4308dcbf611cde399c695c359fc9145f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:55:10 +0200 Subject: timers: Lock base for same bucket optimization Linus stumbled over the unlocked modification of the timer expiry value in mod_timer() which is an optimization for timers which stay in the same bucket - due to the bucket granularity - despite their expiry time getting updated. The optimization itself still makes sense even if we take the lock, because in case that the bucket stays the same, we avoid the pointless queue/enqueue dance. Make the check and the modification of timer->expires protected by the base lock and shuffle the remaining code around so we can keep the lock held when we actually have to requeue the timer to a different bucket. 
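A sketch of the kind of caller that hits this path (purely illustrative; the structure and timeout names are invented) is a timeout that gets re-armed over and over and usually lands in the same wheel bucket:

	mod_timer(&conn->idle_timer, jiffies + IDLE_TIMEOUT);

called on every packet. With this change the bucket-index check and the timer->expires update for that case are done under the base lock rather than on an unlocked timer.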
Fixes: f00c0afdfa62 ("timers: Implement optimization for same expiry time in mod_timer()") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Peter Zijlstra --- kernel/time/timer.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0d4b91c5a374..ccf913038f9f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -971,6 +971,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -979,13 +981,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -995,14 +1000,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1035,9 +1040,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); -- cgit v1.2.3 From 041ad7bc758db259bb960ef795197dd14aab19a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2016 11:07:35 +0000 Subject: timers: Prevent base clock rewind when forwarding clock Ashton and Michael reported, that kernel versions 4.8 and later suffer from USB timeouts which are caused by the timer wheel rework. This is caused by a bug in the base clock forwarding mechanism, which leads to timers expiring early. The scenario which leads to this is: run_timers() while (jiffies >= base->clk) { collect_expired_timers(); base->clk++; expire_timers(); } So base->clk = jiffies + 1. Now the cpu goes idle: idle() get_next_timer_interrupt() nextevt = __next_time_interrupt(); if (time_after(nextevt, base->clk)) base->clk = jiffies; jiffies has not advanced since run_timers(), so this assignment effectively decrements base->clk by one. base->clk is the index into the timer wheel arrays. 
So let's assume the following state after the base->clk increment in run_timers(): jiffies = 0 base->clk = 1 A timer gets enqueued with an expiry delta of 63 ticks (which is the case with the USB timeout and HZ=250) so the resulting bucket index is: base->clk + delta = 1 + 63 = 64 The timer goes into the first wheel level. The array size is 64 so it ends up in bucket 0, which is correct as it takes 63 ticks to advance base->clk to index into bucket 0 again. If the cpu goes idle before jiffies advance, then the bug in the forwarding mechanism sets base->clk back to 0, so the next invocation of run_timers() at the next tick will index into bucket 0 and therefore expire the timer 62 ticks too early. Instead of blindly setting base->clk to jiffies we must make the forwarding conditional on jiffies > base->clk, but we cannot use jiffies for this as we might run into the following issue: if (time_after(jiffies, base->clk) { if (time_after(nextevt, base->clk)) base->clk = jiffies; jiffies can increment between the check and the assigment far enough to advance beyond nextevt. So we need to use a stable value for checking. get_next_timer_interrupt() has the basej argument which is the jiffies value snapshot taken in the calling code. So we can just that. Thanks to Ashton for bisecting and providing trace data! Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Reported-by: Ashton Holmes Reported-by: Michael Thayer Signed-off-by: Thomas Gleixner Cc: Michal Necasek Cc: Peter Zijlstra Cc: knut.osmundsen@oracle.com Cc: stable@vger.kernel.org Cc: stern@rowland.harvard.edu Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161022110552.175308322@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ccf913038f9f..7c446fb5163a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1523,12 +1523,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; -- cgit v1.2.3 From 6bad6bccf2d717f652d37e63cf261eaa23466009 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2016 11:07:37 +0000 Subject: timers: Prevent base clock corruption when forwarding When a timer is enqueued we try to forward the timer base clock. This mechanism has two issues: 1) Forwarding a remote base unlocked The forwarding function is called from get_target_base() with the current timer base lock held. But if the new target base is a different base than the current base (can happen with NOHZ, sigh!) then the forwarding is done on an unlocked base. This can lead to corruption of base->clk. Solution is simple: Invoke the forwarding after the target base is locked. 
2) Possible corruption due to jiffies advancing This is similar to the issue in get_net_timer_interrupt() which was fixed in the previous patch. jiffies can advance between check and assignement and therefore advancing base->clk beyond the next expiry value. So we need to read jiffies into a local variable once and do the checks and assignment with the local copy. Fixes: a683f390b93f("timers: Forward the wheel clock whenever possible") Reported-by: Ashton Holmes Reported-by: Michael Thayer Signed-off-by: Thomas Gleixner Cc: Michal Necasek Cc: Peter Zijlstra Cc: knut.osmundsen@oracle.com Cc: stable@vger.kernel.org Cc: stern@rowland.harvard.edu Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161022110552.253640125@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 7c446fb5163a..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -1037,6 +1031,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance -- cgit v1.2.3 From f5d6d2da0d9098a4aa0ebcc187aa0fc167045d6b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 26 Oct 2016 13:37:04 +0200 Subject: sched/fair: Remove unused but set variable 'rq' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit: 8663e24d56dc ("sched/fair: Reorder cgroup creation code") ... the variable 'rq' in alloc_fair_sched_group() is set but no longer used. 
Remove it to fix the following GCC warning when building with 'W=1': kernel/sched/fair.c:8842:13: warning: variable ‘rq’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026113704.8981-1-tklauser@distanz.ch Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d941c97dfbc3..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8839,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8854,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.3 From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. 
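To make the constraint concrete, the kind of usage that has to keep working is waiting on a bit inside an object with no struct page behind it, for example an on-stack flag word (a minimal sketch; the bit number and the start_async_work() helper are invented):

	unsigned long flags = 0;

	set_bit(0, &flags);
	start_async_work(&flags);	/* completer will clear bit 0 */
	wait_on_bit(&flags, 0, TASK_UNINTERRUPTIBLE);

with the completing side doing:

	clear_bit(0, &flags);
	smp_mb__after_atomic();
	wake_up_bit(&flags, 0);

Since bit_waitqueue() now hashes the (word, bit) pair into a single static table, no virt_to_page()/page_zone() lookup of 'flags' is needed, so words on a vmalloc'ed stack work fine.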
Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 30 +------------ kernel/sched/core.c | 16 +++++++ kernel/sched/wait.c | 10 ----- mm/filemap.c | 4 +- mm/memory_hotplug.c | 28 ------------ mm/page_alloc.c | 115 +------------------------------------------------ 6 files changed, 21 insertions(+), 182 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7f2ae99e5daf..0f088f3a2fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,33 +440,7 @@ struct zone { seqlock_t span_seqlock; #endif - /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. - */ - wait_queue_head_t *wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) @@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) static inline bool zone_is_initialized(struct zone *zone) { - return !!zone->wait_table; + return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 
5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..c7fe2f16503f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..b18dab401be6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2158,20 +2144,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..de7c6e43b1c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif } -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. 
- * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. 
- */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } -- cgit v1.2.3 From b274c0bb394c6a69ac12feac7c2db81f5aff5a55 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Oct 2016 17:46:21 -0700 Subject: kcov: properly check if we are in an interrupt in_interrupt() returns a nonzero value when we are either in an interrupt or have bh disabled via local_bh_disable(). Since we are interested in only ignoring coverage from actual interrupts, do a proper check instead of just calling in_interrupt(). As a result of this change, kcov will start to collect coverage from within local_bh_disable()/local_bh_enable() sections. Link: http://lkml.kernel.org/r/1476115803-20712-1-git-send-email-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Nicolai Stange Cc: Andrey Ryabinin Cc: Kees Cook Cc: James Morse Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..30e6d05aa5a9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v1.2.3 From 0933840acf7b65d6d30a5b6089d882afea57aca3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Oct 2016 13:10:11 +0200 Subject: perf/core: Protect PMU device removal with a 'pmu_bus_running' check, to fix CONFIG_DEBUG_TEST_DRIVER_REMOVE=y kernel panic CAI Qian reported a crash in the PMU uncore device removal code, enabled by the CONFIG_DEBUG_TEST_DRIVER_REMOVE=y option: https://marc.info/?l=linux-kernel&m=147688837328451 The reason for the crash is that perf_pmu_unregister() tries to remove a PMU device which is not added at this point. We add PMU devices only after pmu_bus is registered, which happens in the perf_event_sysfs_init() call and sets the 'pmu_bus_running' flag. The fix is to get the 'pmu_bus_running' flag state at the point the PMU is taken out of the PMU list and remove the device later only if it's set. 
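
The fix follows a common teardown pattern: snapshot the "was the device side ever registered" state under the same lock that serializes registration, then only undo the device part if that snapshot says it happened. A simplified userspace sketch of the pattern is below; the pthread mutex, pmu_unregister() and the printf scaffolding are illustrative stand-ins for the perf core internals, while pmus_lock and pmu_bus_running mirror the names used in the patch:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t pmus_lock = PTHREAD_MUTEX_INITIALIZER;
  static bool pmu_bus_running;                /* set once the device bus exists */

  struct pmu { const char *name; };

  static void pmu_unregister(struct pmu *pmu)
  {
          bool remove_device;

          pthread_mutex_lock(&pmus_lock);
          remove_device = pmu_bus_running;    /* decide under the lock ...          */
          /* ... where the PMU would also be unlinked from the list                 */
          pthread_mutex_unlock(&pmus_lock);

          if (remove_device)                  /* only undo what was actually done   */
                  printf("%s: removing device\n", pmu->name);
          else
                  printf("%s: bus never registered, skipping device removal\n", pmu->name);
  }

  int main(void)
  {
          struct pmu p = { .name = "uncore" };

          pmu_unregister(&p);                 /* early unregister: no device yet    */
          pmu_bus_running = true;
          pmu_unregister(&p);                 /* normal case: device is torn down   */
          return 0;
  }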
Reported-by: CAI Qian Tested-by: CAI Qian Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rob Herring Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161020111011.GA13361@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..a5d2e62faf7e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8855,7 +8855,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8869,10 +8872,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); -- cgit v1.2.3 From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! 
other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 ++++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 9781c69eae57..03d089b3ed72 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -275,7 +275,7 @@ int hw_breakpoint_handler(struct die_args *args) if (!stepped) { WARN(1, "Unable to handle hardware breakpoint. 
Breakpoint at " "0x%lx will be disabled.", info->address); - perf_event_disable(bp); + perf_event_disable_inatomic(bp); goto out; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 060d0ede88df..4741ecdb9817 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); +extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index a5d2e62faf7e..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); -- cgit v1.2.3
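
As a closing illustration of the deferral pattern the last patch relies on: code running in a context that must not sleep only marks the event as pending and lets another execution context perform the potentially sleeping disable later. The sketch below is a userspace analogy, not the kernel mechanism; the worker thread, usleep() polling and event_disable_may_sleep() stand in for the irq_work queueing and the real perf_event_disable() path, while pending_disable mirrors the field used in the patch:

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <unistd.h>

  struct event {
          atomic_int pending_disable;
  };

  /* Stand-in for the full perf_event_disable() path, which may sleep. */
  static void event_disable_may_sleep(struct event *e)
  {
          (void)e;
          printf("event disabled from a sleepable context\n");
  }

  /* Worker thread plays the role of the irq_work callback. */
  static void *deferred_work(void *arg)
  {
          struct event *e = arg;

          while (!atomic_load(&e->pending_disable))
                  usleep(1000);               /* poll here; real irq_work is event driven */
          event_disable_may_sleep(e);
          return NULL;
  }

  /* Safe from atomic context: no locks taken, nothing sleeps. */
  static void event_disable_inatomic(struct event *e)
  {
          atomic_store(&e->pending_disable, 1);
  }

  int main(void)
  {
          struct event e = { .pending_disable = 0 };
          pthread_t worker;

          pthread_create(&worker, NULL, deferred_work, &e);
          event_disable_inatomic(&e);         /* what the breakpoint handler now does */
          pthread_join(worker, NULL);
          return 0;
  }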