From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. [sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..59e3dcc5b8f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; -- cgit v1.2.3 From f98bafa06a28fdfdd5c49f820f4d6560f636fc46 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:34 -0700 Subject: memcg: kill CONFIG_MM_OWNER CONFIG_MM_OWNER makes no sense. It is not user-selectable, it is only selected by CONFIG_MEMCG automatically. So we can kill this option in init/Kconfig and do s/CONFIG_MM_OWNER/CONFIG_MEMCG/ globally. Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- include/linux/sched.h | 4 ++-- init/Kconfig | 7 ------- kernel/exit.c | 4 ++-- kernel/fork.c | 4 ++-- 5 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..de1627232af0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,7 +406,7 @@ struct mm_struct { spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in diff --git a/include/linux/sched.h b/include/linux/sched.h index 70f67e4e6156..2f2dd7d932a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2967,7 +2967,7 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else @@ -2978,7 +2978,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) diff --git a/init/Kconfig b/init/Kconfig index 4a1822a1a680..0a2f09a80e90 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -933,7 +933,6 @@ config RESOURCE_COUNTERS config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS - select MM_OWNER select EVENTFD help Provides a memory resource controller that manages both anonymous @@ -951,9 +950,6 @@ config MEMCG disable memory resource controller and you can avoid overheads. (and lose benefits of memory resource controller) - This config option also selects MM_OWNER config option, which - could in turn add some fork/exit overhead. - config MEMCG_SWAP bool "Memory Resource Controller Swap Extension" depends on MEMCG && SWAP @@ -1179,9 +1175,6 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. -config MM_OWNER - bool - config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..da1b838de8a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -352,7 +352,7 @@ int disallow_signal(int sig) EXPORT_SYMBOL(disallow_signal); -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -434,7 +434,7 @@ assign_new_owner: task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 59e3dcc5b8f2..0d53eb0dfb6f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Initialize POSIX timer handling for a single task. -- cgit v1.2.3 From 4e52365f279564cef0ddd41db5237f0471381093 Mon Sep 17 00:00:00 2001 From: Matthew Dempsky Date: Fri, 6 Jun 2014 14:36:42 -0700 Subject: ptrace: fix fork event messages across pid namespaces When tracing a process in another pid namespace, it's important for fork event messages to contain the child's pid as seen from the tracer's pid namespace, not the parent's. Otherwise, the tracer won't be able to correlate the fork event with later SIGTRAP signals it receives from the child. We still risk a race condition if a ptracer from a different pid namespace attaches after we compute the pid_t value. However, sending a bogus fork event message in this unlikely scenario is still a vast improvement over the status quo where we always send bogus fork event messages to debuggers in a different pid namespace than the forking process. Signed-off-by: Matthew Dempsky Acked-by: Oleg Nesterov Cc: Kees Cook Cc: Julien Tinnes Cc: Roland McGrath Cc: Jan Kratochvil Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 32 ++++++++++++++++++++++++++++++++ kernel/fork.c | 10 +++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 07d0df6bf768..077904c8b70d 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -5,6 +5,7 @@ #include /* For struct task_struct. */ #include /* for IS_ERR_VALUE */ #include /* For BUG_ON. */ +#include /* For task_active_pid_ns. */ #include /* @@ -128,6 +129,37 @@ static inline void ptrace_event(int event, unsigned long message) } } +/** + * ptrace_event_pid - possibly stop for a ptrace event notification + * @event: %PTRACE_EVENT_* value to report + * @pid: process identifier for %PTRACE_GETEVENTMSG to return + * + * Check whether @event is enabled and, if so, report @event and @pid + * to the ptrace parent. @pid is reported as the pid_t seen from the + * the ptrace parent's pid namespace. + * + * Called without locks. + */ +static inline void ptrace_event_pid(int event, struct pid *pid) +{ + /* + * FIXME: There's a potential race if a ptracer in a different pid + * namespace than parent attaches between computing message below and + * when we acquire tasklist_lock in ptrace_stop(). If this happens, + * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. + */ + unsigned long message = 0; + struct pid_namespace *ns; + + rcu_read_lock(); + ns = task_active_pid_ns(rcu_dereference(current->parent)); + if (ns) + message = pid_nr_ns(pid, ns); + rcu_read_unlock(); + + ptrace_event(event, message); +} + /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task diff --git a/kernel/fork.c b/kernel/fork.c index 0d53eb0dfb6f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags, */ if (!IS_ERR(p)) { struct completion vfork; + struct pid *pid; trace_sched_process_fork(current, p); - nr = task_pid_vnr(p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags, /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) - ptrace_event(trace, nr); + ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); } else { nr = PTR_ERR(p); } -- cgit v1.2.3 From 4af4206be2bd1933cae20c2b6fb2058dbc887f7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:58:54 +0200 Subject: tracing: Fix syscall_*regfunc() vs copy_process() race syscall_regfunc() and syscall_unregfunc() should set/clear TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race with copy_process() and miss the new child which was not added to the process/thread lists yet. Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT under tasklist. Link: http://lkml.kernel.org/p/20140413185854.GB20668@redhat.com Cc: stable@vger.kernel.org # 2.6.33 Fixes: a871bd33a6c0 "tracing: Add syscall tracepoints" Acked-by: Frederic Weisbecker Acked-by: Paul E. McKenney Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- include/trace/syscall.h | 15 +++++++++++++++ kernel/fork.c | 2 ++ 2 files changed, 17 insertions(+) (limited to 'kernel/fork.c') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index fed853f3d7aa..9674145e2f6a 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -32,4 +33,18 @@ struct syscall_metadata { struct ftrace_event_call *exit_event; }; +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) +static inline void syscall_tracepoint_update(struct task_struct *p) +{ + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + else + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); +} +#else +static inline void syscall_tracepoint_update(struct task_struct *p) +{ +} +#endif + #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1fc952..6a13c46cd87d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); + syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) -- cgit v1.2.3