From 11e17ae423778f48c84da6a2e215f140610e1973 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 22 Mar 2022 14:21:49 +0800 Subject: bpf: Use swap() instead of open coding it Clean the following coccicheck warning: ./kernel/trace/bpf_trace.c:2263:34-35: WARNING opportunity for swap(). ./kernel/trace/bpf_trace.c:2264:40-41: WARNING opportunity for swap(). Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220322062149.109180-1-jiapeng.chong@linux.alibaba.com --- kernel/trace/bpf_trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fa2ebc07f60..836f021cb609 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2254,15 +2254,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; - unsigned long tmp1; - u64 tmp2; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ - tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; - tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); } static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) -- cgit v1.2.3 From 8e4e83b2278bdfb55cb2b13de07cf0a721ce8af7 Mon Sep 17 00:00:00 2001 From: Wei Xiao Date: Wed, 23 Feb 2022 19:11:53 +0800 Subject: ftrace: move sysctl_ftrace_enabled to ftrace.c This moves ftrace_enabled to trace/ftrace.c. We move sysctls to places where features actually belong to improve the readability of the code and reduce the risk of code merge conflicts. At the same time, the proc-sysctl maintainers do not want to know what sysctl knobs you wish to add for your owner piece of code, we just care about the core logic. Signed-off-by: Wei Xiao Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- include/linux/ftrace.h | 3 --- kernel/sysctl.c | 9 --------- kernel/trace/ftrace.c | 22 +++++++++++++++++++++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..088b915853dd 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -101,9 +101,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; -extern int -ftrace_enable_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 10a551f8fcab..21172d3dad6e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1751,15 +1751,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif #ifdef CONFIG_STACK_TRACER { .procname = "stack_tracer_enabled", diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..a5efbbc289b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7921,7 +7921,8 @@ static bool is_permanent_ops_registered(void) return false; } -int +#ifdef CONFIG_SYSCTL +static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -7964,3 +7965,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static struct ctl_table ftrace_sysctls[] = { + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, + {} +}; + +static int __init ftrace_sysctl_init(void) +{ + register_sysctl_init("kernel", ftrace_sysctls); + return 0; +} +late_initcall(ftrace_sysctl_init); +#endif -- cgit v1.2.3 From 5d79fa0d33258d8e79064316ce8fae37e27bd34b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 7 Apr 2022 15:46:12 +0800 Subject: ftrace: Fix build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SYSCTL and CONFIG_DYNAMIC_FTRACE is n, build warns: kernel/trace/ftrace.c:7912:13: error: ‘is_permanent_ops_registered’ defined but not used [-Werror=unused-function] static bool is_permanent_ops_registered(void) ^~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/trace/ftrace.c:89:12: error: ‘last_ftrace_enabled’ defined but not used [-Werror=unused-variable] static int last_ftrace_enabled; ^~~~~~~~~~~~~~~~~~~ Move is_permanent_ops_registered() to ifdef block and mark last_ftrace_enabled as __maybe_unused to fix this. Fixes: 7cde53da38a3 ("ftrace: move sysctl_ftrace_enabled to ftrace.c") Signed-off-by: YueHaibing Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a5efbbc289b4..db8d553728b6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -86,7 +86,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = { /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; -static int last_ftrace_enabled; +static int __maybe_unused last_ftrace_enabled; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; @@ -7909,6 +7909,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +#ifdef CONFIG_SYSCTL static bool is_permanent_ops_registered(void) { struct ftrace_ops *op; @@ -7921,7 +7922,6 @@ static bool is_permanent_ops_registered(void) return false; } -#ifdef CONFIG_SYSCTL static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From 055eb95533273bc334794dbc598400d10800528f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 14 Apr 2022 09:12:33 -0700 Subject: bpf: Move rcu lock management out of BPF_PROG_RUN routines Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") switched a bunch of BPF_PROG_RUN macros to inline routines. This changed the semantic a bit. Due to arguments expansion of macros, it used to be: rcu_read_lock(); array = rcu_dereference(cgrp->bpf.effective[atype]); ... Now, with with inline routines, we have: array_rcu = rcu_dereference(cgrp->bpf.effective[atype]); /* array_rcu can be kfree'd here */ rcu_read_lock(); array = rcu_dereference(array_rcu); I'm assuming in practice rcu subsystem isn't fast enough to trigger this but let's use rcu API properly. Also, rename to lower caps to not confuse with macros. Additionally, drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY. See [1] for more context. [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u v2 - keep rcu locks inside by passing cgroup_bpf Fixes: 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220414161233.170780-1-sdf@google.com --- drivers/media/rc/bpf-lirc.c | 8 ++- include/linux/bpf.h | 115 +++------------------------------------- kernel/bpf/cgroup.c | 124 ++++++++++++++++++++++++++++++++++++++------ kernel/trace/bpf_trace.c | 5 +- 4 files changed, 124 insertions(+), 128 deletions(-) (limited to 'kernel/trace') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 3eff08d7b8e5..fe17c7f98e81 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -216,8 +216,12 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) raw->bpf_sample = sample; - if (raw->progs) - BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run); + if (raw->progs) { + rcu_read_lock(); + bpf_prog_run_array(rcu_dereference(raw->progs), + &raw->bpf_sample, bpf_prog_run); + rcu_read_unlock(); + } } /* diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb5298735ce..7bf441563ffc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1221,7 +1221,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); + * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. @@ -1315,83 +1315,22 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline int -BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval, u32 *ret_flags) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - u32 func_ret; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - func_ret = run_prog(prog, ctx); - if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - *(ret_flags) |= (func_ret >> 1); - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - -static __always_inline int -BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - static __always_inline u32 -BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); + if (unlikely(!array)) - goto out; + return ret; + + migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -1400,50 +1339,10 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, item++; } bpf_reset_run_ctx(old_run_ctx); -out: - rcu_read_unlock(); migrate_enable(); return ret; } -/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs - * so BPF programs can request cwr for TCP packets. - * - * Current cgroup skb programs can only return 0 or 1 (0 to drop the - * packet. This macro changes the behavior so the low order bit - * indicates whether the packet should be dropped (0) or not (1) - * and the next bit is a congestion notification bit. This could be - * used by TCP to call tcp_enter_cwr() - * - * Hence, new allowed return values of CGROUP EGRESS BPF programs are: - * 0: drop packet - * 1: keep packet - * 2: drop packet and cn - * 3: keep packet and cn - * - * This macro then converts it to one of the NET_XMIT or an error - * code that is then interpreted as drop packet (and no cn): - * 0: NET_XMIT_SUCCESS skb should be transmitted - * 1: NET_XMIT_DROP skb should be dropped and cn - * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -err skb should be dropped - */ -#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ - ({ \ - u32 _flags = 0; \ - bool _cn; \ - u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ - _cn = _flags & BPF_RET_SET_CN; \ - if (_ret && !IS_ERR_VALUE((long)_ret)) \ - _ret = -EFAULT; \ - if (!_ret) \ - _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ - else \ - _ret = (_cn ? NET_XMIT_DROP : _ret); \ - _ret; \ - }) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 128028efda64..0cb6211fcb58 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,6 +22,72 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* __always_inline is necessary to prevent indirect call through run_prog + * function pointer. + */ +static __always_inline int +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval, u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 func_ret; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + *(ret_flags) |= (func_ret >> 1); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + +static __always_inline int +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -1075,11 +1141,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); if (atype == CGROUP_INET_EGRESS) { - ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); + u32 flags = 0; + bool cn; + + ret = bpf_prog_run_array_cg_flags( + &cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0, &flags); + + /* Return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * The returned value is then converted to one of the NET_XMIT + * or an error code that is then interpreted as drop packet + * (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -err skb should be dropped + */ + + cn = flags & BPF_RET_SET_CN; + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; + if (!ret) + ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); + else + ret = (cn ? NET_XMIT_DROP : ret); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0); if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; } @@ -1109,8 +1202,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1155,8 +1247,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0, flags); + return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype, + &ctx, bpf_prog_run, 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1182,8 +1274,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, + 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1200,8 +1292,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0); rcu_read_unlock(); return ret; @@ -1366,8 +1457,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1459,7 +1549,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, &ctx, bpf_prog_run, 0); release_sock(sk); @@ -1559,7 +1649,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, &ctx, bpf_prog_run, retval); release_sock(sk); @@ -1608,7 +1698,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, &ctx, bpf_prog_run, retval); if (ret < 0) return ret; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b26f3da943de..f15b826f9899 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -129,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); + rcu_read_lock(); + ret = bpf_prog_run_array(rcu_dereference(call->prog_array), + ctx, bpf_prog_run); + rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3 From f8b7d2b4c192118c37ab24c0540d1134dd0104d8 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 15 Apr 2022 14:29:57 -0700 Subject: ftrace: fix building with SYSCTL=n but DYNAMIC_FTRACE=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One can enable dyanmic tracing but disable sysctls. When this is doen we get the compile kernel warning: CC kernel/trace/ftrace.o kernel/trace/ftrace.c:3086:13: warning: ‘ftrace_shutdown_sysctl’ defined but not used [-Wunused-function] 3086 | static void ftrace_shutdown_sysctl(void) | ^~~~~~~~~~~~~~~~~~~~~~ kernel/trace/ftrace.c:3068:13: warning: ‘ftrace_startup_sysctl’ defined but not used [-Wunused-function] 3068 | static void ftrace_startup_sysctl(void) When CONFIG_DYNAMIC_FTRACE=n the ftrace_startup_sysctl() and routines ftrace_shutdown_sysctl() still compiles, so these are actually more just used for when SYSCTL=y. Fix this then by just moving these routines to when sysctls are enabled. Fixes: 7cde53da38a3 ("ftrace: move sysctl_ftrace_enabled to ftrace.c") Reported-by: kernel test robot Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/trace/ftrace.c | 71 ++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index db8d553728b6..d9424fd9a183 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3065,40 +3065,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_startup_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* Force update next time */ - saved_ftrace_func = NULL; - /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) { - command = FTRACE_UPDATE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_START_FUNC_RET; - ftrace_startup_enable(command); - } -} - -static void ftrace_shutdown_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) { - command = FTRACE_DISABLE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_STOP_FUNC_RET; - ftrace_run_update_code(command); - } -} - static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; unsigned long ftrace_number_of_pages; @@ -7267,9 +7233,6 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_all(int command) { } -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) - static void ftrace_update_trampoline(struct ftrace_ops *ops) { } @@ -7910,6 +7873,40 @@ int unregister_ftrace_function(struct ftrace_ops *ops) EXPORT_SYMBOL_GPL(unregister_ftrace_function); #ifdef CONFIG_SYSCTL +static void ftrace_startup_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } +} + +static void ftrace_shutdown_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } +} + static bool is_permanent_ops_registered(void) { struct ftrace_ops *op; -- cgit v1.2.3 From 8fd7c2144d1292f15c901211750dee021ed5079a Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 21 Apr 2022 11:38:43 -0700 Subject: ftrace: fix building with SYSCTL=y but DYNAMIC_FTRACE=n Ok so hopefully this is the last of it. 0day picked up a build failure [0] when SYSCTL=y but DYNAMIC_FTRACE=n. This can be fixed by just declaring an empty routine for the calls moved just recently. [0] https://lkml.kernel.org/r/202204161203.6dSlgKJX-lkp@intel.com Reported-by: kernel test robot Fixes: f8b7d2b4c192 ("ftrace: fix building with SYSCTL=n but DYNAMIC_FTRACE=y") Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d9424fd9a183..1f89039f0feb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7873,6 +7873,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops) EXPORT_SYMBOL_GPL(unregister_ftrace_function); #ifdef CONFIG_SYSCTL + +#ifdef CONFIG_DYNAMIC_FTRACE static void ftrace_startup_sysctl(void) { int command; @@ -7906,6 +7908,10 @@ static void ftrace_shutdown_sysctl(void) ftrace_run_update_code(command); } } +#else +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ static bool is_permanent_ops_registered(void) { -- cgit v1.2.3 From 217d8c05ec6295947b0b71bc784012d112f0032e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 7 Feb 2022 05:12:16 -0800 Subject: tracing: Cleanup double word in comment Remove the second 'is' and 'to'. Link: https://lkml.kernel.org/r/20220207131216.2059997-1-trix@redhat.com Signed-off-by: Tom Rix Signed-off-by: Steven Rostedt (Google) --- kernel/trace/pid_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index a2ef1d18126a..95106d02b32d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1, /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test - * @pid: The pid to to see if set in the list. + * @pid: The pid to see if set in the list. * - * Tests if @pid is is set in the @pid_list. This is usually called + * Tests if @pid is set in the @pid_list. This is usually called * from the scheduler when a task is scheduled. Its pid is checked * if it should be traced or not. * -- cgit v1.2.3 From b8cc44a4d3c19296dfd1be1a018a8523e09ab919 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:04 -0600 Subject: tracing: Remove logic for registering multiple event triggers at a time Code for registering triggers assumes it's possible to register more than one trigger at a time. In fact, it's unimplemented and there doesn't seem to be a reason to do that. Remove the n_registered param from event_trigger_register() and fix up callers. Doing so simplifies the logic in event_trigger_register to the point that it just becomes a wrapper calling event_command.reg(). It also removes the problematic call to event_command.unreg() in case of failure. A new function, event_trigger_unregister() is also added for callers to call themselves. The changes to trace_events_hist.c simply allow compilation; a separate patch follows which updates the hist triggers to work correctly with the new changes. Link: https://lkml.kernel.org/r/6149fec7a139d93e84fa4535672fb5bef88006b0.1644010575.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 9 ++-- kernel/trace/trace_events_hist.c | 17 +++---- kernel/trace/trace_events_trigger.c | 96 +++++++++++++------------------------ 3 files changed, 45 insertions(+), 77 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 07d990270e2a..2bc8036b0659 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1629,10 +1629,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops, extern int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *trigger, - struct event_trigger_data *trigger_data, - int *n_registered); + struct event_trigger_data *trigger_data); +extern void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 44db5ba9cabb..b18d00905eae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6287,7 +6287,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6296,13 +6296,10 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of triggers registered, - * but if it didn't register any it returns zero. Consider no - * triggers registered a failure too. - */ - if (!ret) { + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; + if (ret == 0) { if (!(attrs->pause || attrs->cont || attrs->clear)) ret = -ENOENT; goto out_free; @@ -6331,15 +6328,13 @@ enable: se = find_synth_event(se_name); if (se) se->ref++; - /* Just return zero, not the number of registered triggers */ - ret = 0; out: if (ret == 0) hist_err_clear(); return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 7eb9d04f1c2e..ea13679054a8 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -587,13 +587,12 @@ static int register_trigger(char *glob, } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; @@ -927,48 +926,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops, * @cmd_ops: The event_command operations for the trigger * @file: The event file for the trigger's event * @glob: The trigger command string, with optional remove(!) operator - * @cmd: The cmd string - * @param: The param string * @trigger_data: The trigger_data for the trigger - * @n_registered: optional outparam, the number of triggers registered * * Register an event trigger. The @cmd_ops are used to call the - * cmd_ops->reg() function which actually does the registration. The - * cmd_ops->reg() function returns the number of triggers registered, - * which is assigned to n_registered, if n_registered is non-NULL. + * cmd_ops->reg() function which actually does the registration. * * Return: 0 on success, errno otherwise */ int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *param, - struct event_trigger_data *trigger_data, - int *n_registered) + struct event_trigger_data *trigger_data) { - int ret; - - if (n_registered) - *n_registered = 0; - - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) { - if (n_registered) - *n_registered = ret; - /* Just return zero, not the number of enabled functions */ - ret = 0; - } + return cmd_ops->reg(glob, trigger_data, file); +} - return ret; +/** + * event_trigger_unregister - unregister an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @trigger_data: The trigger_data for the trigger + * + * Unregister an event trigger. The @cmd_ops are used to call the + * cmd_ops->unreg() function which actually does the unregistration. + */ +void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data) +{ + cmd_ops->unreg(glob, trigger_data, file); } /* @@ -1027,7 +1015,7 @@ event_trigger_parse(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); ret = 0; goto out; @@ -1062,17 +1050,10 @@ event_trigger_parse(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) - ret = 0; + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_ops, trigger_data); @@ -1854,7 +1835,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, trigger_data->private_data = enable_data; if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1901,19 +1882,11 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) goto out_disable; - /* Just return zero, not the number of enabled functions */ - ret = 0; + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1959,13 +1932,12 @@ int event_enable_register_trigger(char *glob, } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; -- cgit v1.2.3 From 476705419518a2f383b6695782ba1f285a2959b9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:05 -0600 Subject: tracing: Remove redundant trigger_ops params Since event_trigger_data contains the .ops trigger_ops field, there's no reason to pass the trigger_ops separately. Remove it as a param from functions whenever event_trigger_data is passed. Link: https://lkml.kernel.org/r/9856c9bc81bde57077f5b8d6f8faa47156c6354a.1644010575.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 14 ++++------- kernel/trace/trace_eprobe.c | 7 ++---- kernel/trace/trace_events_hist.c | 29 ++++++++++------------- kernel/trace/trace_events_trigger.c | 46 +++++++++++++++---------------------- 4 files changed, 36 insertions(+), 60 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2bc8036b0659..f3e4cd1bb60e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1573,10 +1573,8 @@ struct enable_trigger_data { }; extern int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, - struct event_trigger_data *data); -extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param); @@ -1587,8 +1585,7 @@ extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); -extern int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); @@ -1687,12 +1684,9 @@ struct event_trigger_ops { struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_ops *ops, - struct event_trigger_data *data); - void (*free)(struct event_trigger_ops *ops, - struct event_trigger_data *data); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); int (*print)(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data); }; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 541aa13581b9..9b3dfc9da067 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -511,20 +511,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) * functions are just stubs to fulfill what is needed to use the trigger * infrastructure. */ -static int eprobe_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int eprobe_trigger_init(struct event_trigger_data *data) { return 0; } -static void eprobe_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void eprobe_trigger_free(struct event_trigger_data *data) { } static int eprobe_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { /* Do not print eprobe event triggers */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b18d00905eae..40a7e5582dfb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5252,7 +5252,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5484,7 +5484,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5621,7 +5621,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) } static int event_hist_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5729,8 +5728,7 @@ static int event_hist_trigger_print(struct seq_file *m, return 0; } -static int event_hist_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5758,8 +5756,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) } } -static void event_hist_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_free(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5788,25 +5785,23 @@ static struct event_trigger_ops event_hist_trigger_ops = { .free = event_hist_trigger_free, }; -static int event_hist_trigger_named_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_named_init(struct event_trigger_data *data) { data->ref++; save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(ops, data->named_data); + event_hist_trigger_init(data->named_data); return 0; } -static void event_hist_trigger_named_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_named_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; - event_hist_trigger_free(ops, data->named_data); + event_hist_trigger_free(data->named_data); data->ref--; if (!data->ref) { @@ -5993,7 +5988,7 @@ static int hist_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -6111,7 +6106,7 @@ static void hist_unregister_trigger(char *glob, } if (unregistered && test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); if (hist_data->enable_timestamps) { if (!hist_data->remove || unregistered) @@ -6164,7 +6159,7 @@ static void hist_unreg_all(struct trace_event_file *file) if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } @@ -6458,7 +6453,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index ea13679054a8..32202175875d 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -188,7 +188,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); return 0; } @@ -432,7 +432,6 @@ event_trigger_print(const char *name, struct seq_file *m, /** * event_trigger_init - Generic event_trigger_ops @init implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -442,8 +441,7 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_data *data) { data->ref++; return 0; @@ -451,7 +449,6 @@ int event_trigger_init(struct event_trigger_ops *ops, /** * event_trigger_free - Generic event_trigger_ops @free implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. @@ -460,8 +457,7 @@ int event_trigger_init(struct event_trigger_ops *ops, * implementations. */ static void -event_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +event_trigger_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; @@ -515,7 +511,7 @@ clear_event_triggers(struct trace_array *tr) trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); if (data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } } } @@ -581,7 +577,7 @@ static int register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -629,7 +625,7 @@ static void unregister_trigger(char *glob, } if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } /* @@ -1049,14 +1045,14 @@ event_trigger_parse(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ - event_trigger_init(trigger_ops, trigger_data); + event_trigger_init(trigger_data); ret = event_trigger_register(cmd_ops, file, glob, trigger_data); if (ret) goto out_free; /* Down the counter of trigger_data or free it if not used anymore */ - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); out: return ret; @@ -1382,16 +1378,14 @@ traceoff_count_trigger(struct event_trigger_data *data, } static int -traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceon", m, (void *)data->count, data->filter_str); } static int -traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceoff", m, (void *)data->count, data->filter_str); @@ -1502,8 +1496,7 @@ register_snapshot_trigger(char *glob, } static int -snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("snapshot", m, (void *)data->count, data->filter_str); @@ -1598,8 +1591,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, } static int -stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("stacktrace", m, (void *)data->count, data->filter_str); @@ -1689,7 +1681,6 @@ event_enable_count_trigger(struct event_trigger_data *data, } int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1714,8 +1705,7 @@ int event_enable_trigger_print(struct seq_file *m, return 0; } -void event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1843,7 +1833,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, } /* Up the trigger_data count to make sure nothing frees it on failure */ - event_trigger_init(trigger_ops, trigger_data); + event_trigger_init(trigger_data); if (trigger) { number = strsep(&trigger, ":"); @@ -1887,7 +1877,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, if (ret) goto out_disable; - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); out: return ret; @@ -1898,7 +1888,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); kfree(enable_data); goto out; } @@ -1926,7 +1916,7 @@ int event_enable_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -1969,7 +1959,7 @@ void event_enable_unregister_trigger(char *glob, } if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } static struct event_trigger_ops * -- cgit v1.2.3 From e1f187d09e11f50a50cbb02ae277d4e8bdfc7db8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:06 -0600 Subject: tracing: Have existing event_command.parse() implementations use helpers Simplify the existing event_command.parse() implementations by having them make use of the helper functions previously introduced. Link: https://lkml.kernel.org/r/b353e3427a81f9d3adafd98fd7d73e78a8209f43.1644010576.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 3 +- kernel/trace/trace_eprobe.c | 3 +- kernel/trace/trace_events_hist.c | 64 ++++++--------- kernel/trace/trace_events_trigger.c | 150 ++++++++++-------------------------- 4 files changed, 69 insertions(+), 151 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3e4cd1bb60e..ff816fb41e48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1577,7 +1577,8 @@ extern int event_enable_trigger_print(struct seq_file *m, extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); extern int event_enable_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 9b3dfc9da067..b045fa9f276c 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -546,7 +546,8 @@ static struct event_trigger_ops eprobe_trigger_ops = { static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { return -1; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 40a7e5582dfb..1b71bdb0bcb1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2785,7 +2785,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, static struct event_command trigger_hist_cmd; static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -6166,17 +6167,17 @@ static void hist_unreg_all(struct trace_event_file *file) static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; struct hist_trigger_attrs *attrs; - struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; + char *param, *filter, *p, *start; struct synth_event *se; const char *se_name; - bool remove = false; - char *trigger, *p, *start; + bool remove; int ret = 0; lockdep_assert_held(&event_mutex); @@ -6185,31 +6186,30 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (strlen(glob)) { hist_err_clear(); - last_cmd_set(file, param); + last_cmd_set(file, param_and_filter); } - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - if (glob[0] == '!') - remove = true; + if (event_trigger_empty_param(param_and_filter)) + return -EINVAL; /* * separate the trigger from the filter (k:v [if filter]) * allowing for whitespace in the trigger */ - p = trigger = param; + p = param = param_and_filter; do { p = strstr(p, "if"); if (!p) break; - if (p == param) + if (p == param_and_filter) return -EINVAL; if (*(p - 1) != ' ' && *(p - 1) != '\t') { p++; continue; } - if (p >= param + strlen(param) - (sizeof("if") - 1) - 1) + if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1) return -EINVAL; if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { p++; @@ -6219,24 +6219,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, } while (1); if (!p) - param = NULL; + filter = NULL; else { *(p - 1) = '\0'; - param = strstrip(p); - trigger = strstrip(trigger); + filter = strstrip(p); + param = strstrip(param); } /* * To simplify arithmetic expression parsing, replace occurrences of * '.sym-offset' modifier with '.symXoffset' */ - start = strstr(trigger, ".sym-offset"); + start = strstr(param, ".sym-offset"); while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); } - attrs = parse_hist_trigger_attrs(file->tr, trigger); + attrs = parse_hist_trigger_attrs(file->tr, param); if (IS_ERR(attrs)) return PTR_ERR(attrs); @@ -6249,29 +6249,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; } - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); - - trigger_data->private_data = hist_data; - - /* if param is non-empty, it's supposed to be a filter */ - if (param && cmd_ops->set_filter) { - ret = cmd_ops->set_filter(param, trigger_data, file); - if (ret < 0) - goto out_free; - } + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); + if (ret < 0) + goto out_free; if (remove) { if (!have_hist_trigger_match(trigger_data, file)) @@ -6298,8 +6284,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (!(attrs->pause || attrs->cont || attrs->clear)) ret = -ENOENT; goto out_free; - } else if (ret < 0) - goto out_free; + } if (get_named_trigger_data(trigger_data)) goto enable; @@ -6331,8 +6316,7 @@ enable: out_unreg: event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); remove_hist_vars(hist_data); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 32202175875d..62c44dab8f46 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -965,7 +965,7 @@ void event_trigger_unregister(struct event_command *cmd_ops, * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger * @cmd: The cmd portion of the string used to register the trigger - * @param: The params portion of the string used to register the trigger + * @param_and_filter: The param and filter portion of the string used to register the trigger * * Common implementation for event command parsing and trigger * instantiation. @@ -978,72 +978,39 @@ void event_trigger_unregister(struct event_command *cmd_ops, static int event_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; - char *trigger = NULL; - char *number; + char *param, *filter; + bool remove; int ret; - /* separate the trigger from the filter (t:n [if filter]) */ - if (param && isdigit(param[0])) { - trigger = strsep(¶m, " \t"); - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - } + remove = event_trigger_check_remove(glob); - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, false); + if (ret) + return ret; ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); if (!trigger_data) goto out; - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - trigger_data->private_data = file; - INIT_LIST_HEAD(&trigger_data->list); - INIT_LIST_HEAD(&trigger_data->named_list); - - if (glob[0] == '!') { + if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); ret = 0; goto out; } - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_data); @@ -1057,8 +1024,7 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); kfree(trigger_data); goto out; } @@ -1752,39 +1718,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; struct trace_array *tr = file->tr; + char *param, *filter; + bool enable, remove; const char *system; const char *event; bool hist = false; - char *trigger; - char *number; - bool enable; int ret; - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - /* separate the trigger from the filter (s:e:n [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) + if (event_trigger_empty_param(param_and_filter)) return -EINVAL; - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - system = strsep(&trigger, ":"); - if (!trigger) + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, true); + if (ret) + return ret; + + system = strsep(¶m, ":"); + if (!param) return -EINVAL; - event = strsep(&trigger, ":"); + event = strsep(¶m, ":"); ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); @@ -1800,31 +1760,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, #else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; #endif - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) - goto out; enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); - if (!enable_data) { - kfree(trigger_data); + if (!enable_data) goto out; - } - - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data->private_data = enable_data; - if (glob[0] == '!') { + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + if (!trigger_data) { + kfree(enable_data); + goto out; + } + + if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); kfree(enable_data); @@ -1835,33 +1787,14 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, /* Up the trigger_data count to make sure nothing frees it on failure */ event_trigger_init(trigger_data); - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { @@ -1880,16 +1813,15 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); out: return ret; - out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: trace_event_put_ref(event_enable_file->event_call); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); event_trigger_free(trigger_data); kfree(enable_data); + goto out; } -- cgit v1.2.3 From a7e6b7dcfb19988ad2968a1fafd29b600abbf133 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 4 Feb 2022 16:12:07 -0600 Subject: tracing: Separate hist state updates from hist registration hist_register_trigger() handles both new hist registration as well as existing hist registration through event_command.reg(). Adding a new function, existing_hist_update_only(), that checks and updates existing histograms and exits after doing so allows the confusing logic in event_hist_trigger_parse() to be simplified. Link: https://lkml.kernel.org/r/211b2cd3e3d7e00f4f8ad45ef8b33063da6a7e05.1644010576.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 66 +++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b71bdb0bcb1..998dfe2162fc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5929,6 +5929,48 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } +static bool existing_hist_update_only(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool updated = false; + + if (!hist_data->attrs->pause && !hist_data->attrs->cont && + !hist_data->attrs->clear) + goto out; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) + goto out; + } + } + + if (hist_data->attrs->name && !named_data) + goto out; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + updated = true; + goto out; + } + } + out: + return updated; +} + static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) @@ -5957,19 +5999,11 @@ static int hist_register_trigger(char *glob, list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) - continue; - if (hist_data->attrs->pause) - test->paused = true; - else if (hist_data->attrs->cont) - test->paused = false; - else if (hist_data->attrs->clear) - hist_clear(test); - else { + if (hist_trigger_match(data, test, named_data, false)) { hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; + goto out; } - goto out; } } new: @@ -6008,8 +6042,6 @@ static int hist_register_trigger(char *glob, if (named_data) destroy_hist_data(hist_data); - - ret++; out: return ret; } @@ -6277,14 +6309,12 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret) + if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - if (ret == 0) { - if (!(attrs->pause || attrs->cont || attrs->clear)) - ret = -ENOENT; + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) goto out_free; - } if (get_named_trigger_data(trigger_data)) goto enable; -- cgit v1.2.3 From cf2adec7479d875973fe10782f8bf20377628ef2 Mon Sep 17 00:00:00 2001 From: Oscar Shiang Date: Thu, 17 Feb 2022 00:50:06 +0800 Subject: tracing: Fix inconsistent style of mini-HOWTO Each description should start with a hyphen and a space. Insert spaces to fix it. Link: https://lkml.kernel.org/r/TYCP286MB19130AA4A9C6FC5A8793DED2A1359@TYCP286MB1913.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Oscar Shiang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4de111fa18f..4f9c499368c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5475,7 +5475,7 @@ static const char readme_msg[] = " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" - " trace_clock\t\t-change the clock used to order events\n" + " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" @@ -5484,7 +5484,7 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif - "\n timestamp_mode\t-view the mode used to timestamp events\n" + "\n timestamp_mode\t- view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" -- cgit v1.2.3 From 3b57d8477cd0f427198930706958f1312cc3594b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 18 Feb 2022 18:08:49 +0800 Subject: tracing: Fix kernel-doc Fix the following W=1 kernel warnings: kernel/trace/trace.c:1181: warning: expecting prototype for tracing_snapshot_cond_data(). Prototype was for tracing_cond_snapshot_data() instead. Link: https://lkml.kernel.org/r/20220218100849.122038-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f9c499368c4..aceeeea21c11 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1174,7 +1174,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** - * tracing_snapshot_cond_data - get the user data associated with a snapshot + * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using -- cgit v1.2.3 From adaa0a9f06d127b6a0a1b929ec0971df40993a9b Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sat, 2 Apr 2022 15:20:15 +0800 Subject: tracing: Fix tracing_map_sort_entries() kernel-doc comment Add the description of @n_sort_keys and make @sort_key -> @sort_keys in tracing_map_sort_entries() kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. kernel/trace/tracing_map.c:1073: warning: Function parameter or member 'sort_keys' not described in 'tracing_map_sort_entries' kernel/trace/tracing_map.c:1073: warning: Function parameter or member 'n_sort_keys' not described in 'tracing_map_sort_entries' kernel/trace/tracing_map.c:1073: warning: Excess function parameter 'sort_key' description in 'tracing_map_sort_entries' Link: https://lkml.kernel.org/r/20220402072015.45864-1-yang.lee@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9628b5571846..9901708ce6b8 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1045,7 +1045,8 @@ static void sort_secondary(struct tracing_map *map, /** * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map * @map: The tracing_map - * @sort_key: The sort key to use for sorting + * @sort_keys: The sort key to use for sorting + * @n_sort_keys: hitcount, always have at least one * @sort_entries: outval: pointer to allocated and sorted array of entries * * tracing_map_sort_entries() sorts the current set of entries in the -- cgit v1.2.3 From cb1c45fb68b8a4285ccf750842b1136f26cfe267 Mon Sep 17 00:00:00 2001 From: Jeff Xie Date: Sun, 10 Apr 2022 22:50:25 +0800 Subject: tracing: Make tp_printk work on syscall tracepoints Currently the tp_printk option has no effect on syscall tracepoint. When adding the kernel option parameter tp_printk, then: echo 1 > /sys/kernel/debug/tracing/events/syscalls/enable When running any application, no trace information is printed on the terminal. Now added printk for syscall tracepoints. Link: https://lkml.kernel.org/r/20220410145025.681144-1-xiehuan09@gmail.com Signed-off-by: Jeff Xie Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f755bde42fd0..b69e207012c9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ @@ -296,9 +296,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct trace_event_file *trace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; unsigned long args[6]; int syscall_nr; int size; @@ -321,20 +319,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->enter_event->event.type, size, trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; syscall_get_arguments(current, regs, args); memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,9 +337,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_file *trace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -364,20 +356,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->exit_event->event.type, sizeof(*entry), - trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry)); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static int reg_event_syscall_enter(struct trace_event_file *file, -- cgit v1.2.3 From 97a5d2e5e35f119b6d7a37c7538773ae7af82dd5 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Mon, 18 Apr 2022 01:56:29 +0700 Subject: tracing: Return -EINVAL if WARN_ON(!glob) triggered in event_hist_trigger_parse() If `WARN_ON(!glob)` is ever triggered, we will still continue executing the next lines. This will trigger the more serious problem, a NULL pointer dereference bug. Just return -EINVAL if @glob is NULL. Link: https://lkml.kernel.org/r/20220417185630.199062-2-ammarfaizi2@gnuweeb.org Cc: Ingo Molnar Cc: GNU/Weeb Mailing List Signed-off-by: Ammar Faizi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 998dfe2162fc..80c25be23c45 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6214,7 +6214,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); - WARN_ON(!glob); + if (WARN_ON(!glob)) + return -EINVAL; if (strlen(glob)) { hist_err_clear(); -- cgit v1.2.3 From 69686fcbdcc0b6fed3b8d0907e641f282df2f827 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Mon, 18 Apr 2022 01:56:30 +0700 Subject: tracing: Change `if (strlen(glob))` to `if (glob[0])` No need to traverse to the end of string. If the first byte is not a NUL char, it's guaranteed `if (strlen(glob))` is true. Link: https://lkml.kernel.org/r/20220417185630.199062-3-ammarfaizi2@gnuweeb.org Cc: Ingo Molnar Cc: GNU/Weeb Mailing List Signed-off-by: Ammar Faizi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 80c25be23c45..fe10179893c1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6217,7 +6217,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (WARN_ON(!glob)) return -EINVAL; - if (strlen(glob)) { + if (glob[0]) { hist_err_clear(); last_cmd_set(file, param_and_filter); } -- cgit v1.2.3 From 12025abdc8539ed9d5014e2d647a3fd1bd3de5cd Mon Sep 17 00:00:00 2001 From: Jun Miao Date: Tue, 19 Apr 2022 09:39:10 +0800 Subject: tracing: Fix sleeping function called from invalid context on RT kernel When setting bootparams="trace_event=initcall:initcall_start tp_printk=1" in the cmdline, the output_printk() was called, and the spin_lock_irqsave() was called in the atomic and irq disable interrupt context suitation. On the PREEMPT_RT kernel, these locks are replaced with sleepable rt-spinlock, so the stack calltrace will be triggered. Fix it by raw_spin_lock_irqsave when PREEMPT_RT and "trace_event=initcall:initcall_start tp_printk=1" enabled. BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0 preempt_count: 2, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: [] try_to_wake_up+0x7e/0xba0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt17+ #19 34c5812404187a875f32bee7977f7367f9679ea7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl+0x60/0x8c dump_stack+0x10/0x12 __might_resched.cold+0x11d/0x155 rt_spin_lock+0x40/0x70 trace_event_buffer_commit+0x2fa/0x4c0 ? map_vsyscall+0x93/0x93 trace_event_raw_event_initcall_start+0xbe/0x110 ? perf_trace_initcall_finish+0x210/0x210 ? probe_sched_wakeup+0x34/0x40 ? ttwu_do_wakeup+0xda/0x310 ? trace_hardirqs_on+0x35/0x170 ? map_vsyscall+0x93/0x93 do_one_initcall+0x217/0x3c0 ? trace_event_raw_event_initcall_level+0x170/0x170 ? push_cpu_stop+0x400/0x400 ? cblist_init_generic+0x241/0x290 kernel_init_freeable+0x1ac/0x347 ? _raw_spin_unlock_irq+0x65/0x80 ? rest_init+0xf0/0xf0 kernel_init+0x1e/0x150 ret_from_fork+0x22/0x30 Link: https://lkml.kernel.org/r/20220419013910.894370-1-jun.miao@intel.com Signed-off-by: Jun Miao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aceeeea21c11..27bb486c3f97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2835,7 +2835,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) @@ -2863,14 +2863,14 @@ static void output_printk(struct trace_event_buffer *fbuffer) event = &fbuffer->trace_file->event_call->event; - spin_lock_irqsave(&tracepoint_iter_lock, flags); + raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); + raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(struct ctl_table *table, int write, -- cgit v1.2.3 From 4ee51101e93f0c8d83876ae5c0c26ca14934629e Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Sun, 24 Apr 2022 21:19:32 +0800 Subject: tracing: Use WARN instead of printk and WARN_ON Use `WARN(cond, ...)` instead of `if (cond)` + `printk(...)` + `WARN_ON(1)`. Link: https://lkml.kernel.org/r/20220424131932.3606-1-guozhengkui@vivo.com Suggested-by: Steven Rostedt Signed-off-by: Guo Zhengkui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8aa493d25c73..d89e3f7e26eb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -778,9 +778,8 @@ int register_trace_event(struct trace_event *event) list_add_tail(&event->list, list); - } else if (event->type > __TRACE_LAST_TYPE) { - printk(KERN_WARNING "Need to add type to trace.h\n"); - WARN_ON(1); + } else if (WARN(event->type > __TRACE_LAST_TYPE, + "Need to add type to trace.h")) { goto out; } else { /* Is this event already used */ @@ -1571,13 +1570,8 @@ __init static int init_events(void) for (i = 0; events[i]; i++) { event = events[i]; - ret = register_trace_event(event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - event->type); - WARN_ON_ONCE(1); - } + WARN_ONCE(!ret, "event %d failed to register", event->type); } return 0; -- cgit v1.2.3 From ed888241a0ab9a3606bd986be6f0abafa1afdf19 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 26 Apr 2022 15:06:28 +0800 Subject: ring-buffer: Simplify if-if to if-else Use if and else instead of if(A) and if (!A). Link: https://lkml.kernel.org/r/20220426070628.167565-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05dfc7a12d3d..655d6db3e3c3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6011,10 +6011,10 @@ static __init int test_ringbuffer(void) pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); - if (total_lost) + if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); - if (!total_lost) { + } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; -- cgit v1.2.3 From ef9188bcc6ca1d8a2ad83e826b548e6820721061 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Tue, 26 Apr 2022 20:24:06 +0800 Subject: tracing: Avoid adding tracer option before update_tracer_options To prepare for support asynchronous tracer_init_tracefs initcall, avoid calling create_trace_option_files before __update_tracer_options. Otherwise, create_trace_option_files will show warning because some tracers in trace_types list are already in tr->topts. For example, hwlat_tracer call register_tracer in late_initcall, and global_trace.dir is already created in tracing_init_dentry, hwlat_tracer will be put into tr->topts. Then if the __update_tracer_options is executed after hwlat_tracer registered, create_trace_option_files find that hwlat_tracer is already in tr->topts. Link: https://lkml.kernel.org/r/20220426122407.17042-2-mark-pk.tsai@mediatek.com Link: https://lore.kernel.org/lkml/20220322133339.GA32582@xsang-OptiPlex-9020/ Reported-by: kernel test robot Signed-off-by: Mark-PK Tsai Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 27bb486c3f97..7275173c55d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6332,12 +6332,18 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } +static bool tracer_options_updated; + static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ if (!tr->dir) return; + /* Only create trace option files after update_tracer_options finish */ + if (!tracer_options_updated) + return; + create_trace_option_files(tr, t); } @@ -9176,6 +9182,7 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { mutex_lock(&trace_types_lock); + tracer_options_updated = true; __update_tracer_options(tr); mutex_unlock(&trace_types_lock); } -- cgit v1.2.3 From 6621a7004684bfcff5af4c8e4d37989941f42a6b Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Tue, 26 Apr 2022 20:24:07 +0800 Subject: tracing: make tracer_init_tracefs initcall asynchronous Move trace_eval_init() to subsys_initcall to make it start earlier. And to avoid tracer_init_tracefs being blocked by trace_event_sem which trace_eval_init() hold [1], queue tracer_init_tracefs() to eval_map_wq to let the two works being executed sequentially. It can speed up the initialization of kernel as result of making tracer_init_tracefs asynchronous. On my arm64 platform, it reduce ~20ms of 125ms which total time do_initcalls spend. Link: https://lkml.kernel.org/r/20220426122407.17042-3-mark-pk.tsai@mediatek.com [1]: https://lore.kernel.org/r/68d7b3327052757d0cd6359a6c9015a85b437232.camel@pengutronix.de Signed-off-by: Mark-PK Tsai Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7275173c55d0..400d3e9fe9ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9615,6 +9615,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static struct workqueue_struct *eval_map_wq __initdata; static struct work_struct eval_map_work __initdata; +static struct work_struct tracerfs_init_work __initdata; static void __init eval_map_work_func(struct work_struct *work) { @@ -9640,6 +9641,8 @@ static int __init trace_eval_init(void) return 0; } +subsys_initcall(trace_eval_init); + static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ @@ -9722,15 +9725,8 @@ static struct notifier_block trace_module_nb = { }; #endif /* CONFIG_MODULES */ -static __init int tracer_init_tracefs(void) +static __init void tracer_init_tracefs_work_func(struct work_struct *work) { - int ret; - - trace_access_lock_init(); - - ret = tracing_init_dentry(); - if (ret) - return 0; event_trace_init(); @@ -9752,8 +9748,6 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); - trace_eval_init(); - trace_create_eval_file(NULL); #ifdef CONFIG_MODULES @@ -9768,6 +9762,24 @@ static __init int tracer_init_tracefs(void) create_trace_instances(NULL); update_tracer_options(&global_trace); +} + +static __init int tracer_init_tracefs(void) +{ + int ret; + + trace_access_lock_init(); + + ret = tracing_init_dentry(); + if (ret) + return 0; + + if (eval_map_wq) { + INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); + queue_work(eval_map_wq, &tracerfs_init_work); + } else { + tracer_init_tracefs_work_func(NULL); + } return 0; } -- cgit v1.2.3 From 6695da58f9445f386516ed53a3e870f534d2645d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 27 Apr 2022 15:33:39 -0400 Subject: ring-buffer: Have absolute time stamps handle large numbers There's an absolute timestamp event in the ring buffer, but this only saves 59 bits of the timestamp, as the 5 MSB is used for meta data (stating it is an absolute time stamp). This was never an issue as all the clocks currently in use never used those 5 MSB. But now there's a new clock (TAI) that does. To handle this case, when reading an absolute timestamp, a previous full timestamp is passed in, and the 5 MSB of that timestamp is OR'd to the absolute timestamp (if any of the 5 MSB are set), and then to test for overflow, if the new result is smaller than the passed in previous timestamp, then 1 << 59 is added to it. All the extra processing is done on the reader "slow" path, with the exception of the "too big delta" check, and the reading of timestamps for histograms. Note, libtraceevent will need to be updated to handle this case as well. But this is not a user space regression, as user space was never able to handle any timestamps that used more than 59 bits. Link: https://lore.kernel.org/all/20220426175338.3807ca4f@gandalf.local.home/ Link: https://lkml.kernel.org/r/20220427153339.16c33f75@gandalf.local.home Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Kurt Kanzenbach Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 49 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 655d6db3e3c3..3a0c7ed0e93f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -29,6 +29,14 @@ #include +/* + * The "absolute" timestamp in the buffer is only 59 bits. + * If a clock has the 5 MSBs set, it needs to be saved and + * reinserted. + */ +#define TS_MSB (0xf8ULL << 56) +#define ABS_TS_MASK (~TS_MSB) + static void update_pages_handler(struct work_struct *work); /* @@ -783,6 +791,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, } #endif +/* + * The absolute time stamp drops the 5 MSBs and some clocks may + * require them. The rb_fix_abs_ts() will take a previous full + * time stamp, and add the 5 MSB of that time stamp on to the + * saved absolute time stamp. Then they are compared in case of + * the unlikely event that the latest time stamp incremented + * the 5 MSB. + */ +static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) +{ + if (save_ts & TS_MSB) { + abs |= save_ts & TS_MSB; + /* Check for overflow */ + if (unlikely(abs < save_ts)) + abs += 1ULL << 59; + } + return abs; +} static inline u64 rb_time_stamp(struct trace_buffer *buffer); @@ -811,8 +837,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, u64 ts; /* If the event includes an absolute time, then just use that */ - if (event->type_len == RINGBUF_TYPE_TIME_STAMP) - return rb_event_time_stamp(event); + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + ts = rb_event_time_stamp(event); + return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); + } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); @@ -2754,8 +2782,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { + /* + * Some timers can use more than 59 bits, and when a timestamp + * is added to the buffer, it will lose those bits. + */ + if (abs && (info->ts & TS_MSB)) { + info->delta &= ABS_TS_MASK; + /* did the clock go backwards */ - if (info->before == info->after && info->before > info->ts) { + } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; @@ -3304,7 +3339,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3380,7 +3415,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); break; case RINGBUF_TYPE_PADDING: @@ -4367,6 +4402,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; @@ -4397,6 +4433,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; @@ -4650,6 +4687,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4741,6 +4779,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } -- cgit v1.2.3 From f03f2abce4f3944b9576bc4ad28b75890e907d7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 27 Apr 2022 17:08:12 -0400 Subject: ring-buffer: Have 32 bit time stamps use all 64 bits When the new logic was made to handle deltas of events from interrupts that interrupted other events, it required 64 bit local atomics. Unfortunately, 64 bit local atomics are expensive on 32 bit architectures. Thus, commit 10464b4aa605e ("ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit") created a type of seq lock timer for 32 bits. It used two 32 bit local atomics, but required 2 bits from them each for synchronization, making it only 60 bits. Add a new "msb" field to hold the extra 4 bits that are cut off. Link: https://lore.kernel.org/all/20220426175338.3807ca4f@gandalf.local.home/ Link: https://lkml.kernel.org/r/20220427170812.53cc7139@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3a0c7ed0e93f..d59b6a328b7f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -476,6 +476,7 @@ struct rb_time_struct { local_t cnt; local_t top; local_t bottom; + local_t msb; }; #else #include @@ -577,7 +578,6 @@ struct ring_buffer_iter { * For the ring buffer, 64 bit required operations for the time is * the following: * - * - Only need 59 bits (uses 60 to make it even). * - Reads may fail if it interrupted a modification of the time stamp. * It will succeed if it did not interrupt another write even if * the read itself is interrupted by a write. @@ -602,6 +602,7 @@ struct ring_buffer_iter { */ #define RB_TIME_SHIFT 30 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) +#define RB_TIME_MSB_SHIFT 60 static inline int rb_time_cnt(unsigned long val) { @@ -621,7 +622,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom) static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) { - unsigned long top, bottom; + unsigned long top, bottom, msb; unsigned long c; /* @@ -633,6 +634,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); + msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); @@ -641,7 +643,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) if (*cnt != rb_time_cnt(bottom)) return false; - *ret = rb_time_val(top, bottom); + /* The shift to msb will lose its cnt bits */ + *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); return true; } @@ -657,10 +660,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); } -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, + unsigned long *msb) { *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); + *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); } static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) @@ -671,15 +676,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long static void rb_time_set(rb_time_t *t, u64 val) { - unsigned long cnt, top, bottom; + unsigned long cnt, top, bottom, msb; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); /* Writes always succeed with a valid number even if it gets interrupted. */ do { cnt = local_inc_return(&t->cnt); rb_time_val_set(&t->top, top, cnt); rb_time_val_set(&t->bottom, bottom, cnt); + rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); } while (cnt != local_read(&t->cnt)); } @@ -694,8 +700,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - unsigned long cnt, top, bottom; - unsigned long cnt2, top2, bottom2; + unsigned long cnt, top, bottom, msb; + unsigned long cnt2, top2, bottom2, msb2; u64 val; /* The cmpxchg always fails if it interrupted an update */ @@ -711,16 +717,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) cnt2 = cnt + 1; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); top = rb_time_val_cnt(top, cnt); bottom = rb_time_val_cnt(bottom, cnt); - rb_time_split(set, &top2, &bottom2); + rb_time_split(set, &top2, &bottom2, &msb2); top2 = rb_time_val_cnt(top2, cnt2); bottom2 = rb_time_val_cnt(bottom2, cnt2); if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) return false; + if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) + return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) -- cgit v1.2.3 From c575afe21ccc47c5561d11493ede81ab7d072267 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Thu, 14 Apr 2022 11:18:04 +0200 Subject: tracing: Introduce trace clock tai A fast/NMI safe accessor for CLOCK_TAI has been introduced. Use it for adding the additional trace clock "tai". Link: https://lkml.kernel.org/r/20220414091805.89667-3-kurt@linutronix.de Signed-off-by: Kurt Kanzenbach Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 400d3e9fe9ff..087740f63d92 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1542,6 +1542,7 @@ static struct { { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3 From 1da27a25054f7660f7bb27ad4ccf036ee6ba51e9 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:31 +0200 Subject: tracing: Remove usage of list iterator after the loop body In preparation to limit the scope of the list iterator variable to the traversal loop, use a dedicated pointer to point to the found element [1]. Before, the code implicitly used the head when no element was found when using &pos->list. Since the new variable is only set if an element was found, the head needs to be used explicitly if the variable is NULL. Link: https://lkml.kernel.org/r/20220427170734.819891-2-jakobkoschel@gmail.com Cc: Ingo Molnar Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d89e3f7e26eb..67f47ea27921 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { - struct trace_event *e; + struct trace_event *e = NULL, *iter; int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { @@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list) * We used up all possible max events, * lets see if somebody freed one. */ - list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != next) + list_for_each_entry(iter, &ftrace_event_list, list) { + if (iter->type != next) { + e = iter; break; + } next++; } @@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list) if (next > TRACE_EVENT_TYPE_MAX) return 0; - *list = &e->list; + if (e) + *list = &e->list; + else + *list = &ftrace_event_list; return next; } -- cgit v1.2.3 From 99d8ae4ec8a90ecf6c2a6a141bd4db4a3bc5146c Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:32 +0200 Subject: tracing: Remove usage of list iterator variable after the loop In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to point to the found element [1]. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220427170734.819891-3-jakobkoschel@gmail.com Cc: Ingo Molnar Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e11e167b7809..e4a442060707 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1723,9 +1723,9 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { + struct trace_subsystem_dir *dir = NULL, *iter_dir; + struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; - struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ - struct trace_array *tr; int ret; if (tracing_is_disabled()) @@ -1734,10 +1734,12 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - list_for_each_entry(dir, &tr->systems, list) { - if (dir == inode->i_private) { + list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { + list_for_each_entry(iter_dir, &iter_tr->systems, list) { + if (iter_dir == inode->i_private) { /* Don't open systems with no events */ + tr = iter_tr; + dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; @@ -1753,9 +1755,6 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (!system) return -ENODEV; - /* Some versions of gcc think dir can be uninitialized here */ - WARN_ON(!dir); - /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); -- cgit v1.2.3 From 45e333ce2ad5cbb0ee05686336de09058c6af8ca Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:33 +0200 Subject: tracing: Replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. Link: https://lkml.kernel.org/r/20220427170734.819891-4-jakobkoschel@gmail.com Cc: Ingo Molnar Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 15 +++++++-------- kernel/trace/trace_events_trigger.c | 24 +++++++++++------------- 2 files changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fe10179893c1..038dc591545d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6117,20 +6117,19 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { + struct event_trigger_data *test = NULL, *iter, *named_data = NULL; struct hist_trigger_data *hist_data = data->private_data; - struct event_trigger_data *test, *named_data = NULL; - bool unregistered = false; lockdep_assert_held(&event_mutex); if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, iter, named_data, false)) continue; - unregistered = true; + test = iter; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -6138,11 +6137,11 @@ static void hist_unregister_trigger(char *glob, } } - if (unregistered && test->ops->free) + if (test && test->ops->free) test->ops->free(test); if (hist_data->enable_timestamps) { - if (!hist_data->remove || unregistered) + if (!hist_data->remove || test) tracing_set_filter_buffering(file->tr, false); } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 62c44dab8f46..21592bed2e89 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -609,14 +609,13 @@ static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { - struct event_trigger_data *data; - bool unregistered = false; + struct event_trigger_data *data = NULL, *iter; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { - unregistered = true; + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -624,7 +623,7 @@ static void unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) + if (data && data->ops->free) data->ops->free(data); } @@ -1870,19 +1869,18 @@ void event_enable_unregister_trigger(char *glob, struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; + struct event_trigger_data *data = NULL, *iter; struct enable_trigger_data *enable_data; - struct event_trigger_data *data; - bool unregistered = false; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - enable_data = data->private_data; + list_for_each_entry(iter, &file->triggers, list) { + enable_data = iter->private_data; if (enable_data && - (data->cmd_ops->trigger_type == + (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { - unregistered = true; + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -1890,7 +1888,7 @@ void event_enable_unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) + if (data && data->ops->free) data->ops->free(data); } -- cgit v1.2.3 From ba27d8555867b0e02e15709f4ddb79aec5cf2efc Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Wed, 27 Apr 2022 19:07:34 +0200 Subject: tracing: Remove check of list iterator against head past the loop body When list_for_each_entry() completes the iteration over the whole list without breaking the loop, the iterator value will be a bogus pointer computed based on the head element. While it is safe to use the pointer to determine if it was computed based on the head element, either with list_entry_is_head() or &pos->member == head, using the iterator variable after the loop should be avoided. In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to point to the found element [1]. Link: https://lkml.kernel.org/r/20220427170734.819891-5-jakobkoschel@gmail.com Cc: Ingo Molnar Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 20 ++++++++++++-------- kernel/trace/trace_eprobe.c | 14 ++++++++------ kernel/trace/trace_events.c | 12 ++++++------ 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..5c465e70d146 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4560,8 +4560,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops, void *data) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; @@ -4580,11 +4580,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) { + if (!probe) { probe = kzalloc(sizeof(*probe), GFP_KERNEL); if (!probe) { mutex_unlock(&ftrace_lock); @@ -4702,9 +4704,9 @@ int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; @@ -4732,11 +4734,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) + if (!probe) goto err_unlock_ftrace; ret = -EINVAL; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b045fa9f276c..7d4478525c66 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -648,7 +648,7 @@ static struct trace_event_functions eprobe_funcs = { static int disable_eprobe(struct trace_eprobe *ep, struct trace_array *tr) { - struct event_trigger_data *trigger; + struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; struct eprobe_data *edata; @@ -656,14 +656,16 @@ static int disable_eprobe(struct trace_eprobe *ep, if (!file) return -ENOENT; - list_for_each_entry(trigger, &file->triggers, list) { - if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + list_for_each_entry(iter, &file->triggers, list) { + if (!(iter->flags & EVENT_TRIGGER_FL_PROBE)) continue; - edata = trigger->private_data; - if (edata->ep == ep) + edata = iter->private_data; + if (edata->ep == ep) { + trigger = iter; break; + } } - if (list_entry_is_head(trigger, &file->triggers, list)) + if (!trigger) return -ENODEV; list_del_rcu(&trigger->list); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e4a442060707..78f313b7b315 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2279,8 +2279,8 @@ static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { + struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ @@ -2294,13 +2294,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } /* Now see if the system itself exists. */ - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + system = NULL; + list_for_each_entry(iter, &event_subsystems, list) { + if (strcmp(iter->name, name) == 0) { + system = iter; break; + } } - /* Reset system variable when not found */ - if (&system->list == &event_subsystems) - system = NULL; dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) -- cgit v1.2.3 From bed0d9a50dacee6fcf785c555cfb0d2573355afc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:13 +0200 Subject: ftrace: Add ftrace_lookup_symbols function Adding ftrace_lookup_symbols function that resolves array of symbols with single pass over kallsyms. The user provides array of string pointers with count and pointer to allocated array for resolved values. int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) It iterates all kallsyms symbols and tries to loop up each in provided symbols array with bsearch. The symbols array needs to be sorted by name for this reason. We also check each symbol to pass ftrace_location, because this API will be used for fprobe symbols resolving. Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 6 +++++ kernel/kallsyms.c | 1 + kernel/trace/ftrace.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..820500430eae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -303,6 +303,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct ftrace_regs *fregs); + +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs); #else /* !CONFIG_FUNCTION_TRACER */ /* * (un)register_ftrace_function must be a macro since the ops parameter @@ -313,6 +315,10 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } +static inline int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FUNCTION_TRACER */ struct ftrace_func_entry { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fdfd308bebc4..fbdf8d3279ac 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * These will be re-linked against their real values diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..07d87c7a525d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7964,3 +7964,65 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct kallsyms_data { + unsigned long *addrs; + const char **syms; + size_t cnt; + size_t found; +}; + +static int kallsyms_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct kallsyms_data *args = data; + + if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp)) + return 0; + + addr = ftrace_location(addr); + if (!addr) + return 0; + + args->addrs[args->found++] = addr; + return args->found == args->cnt ? 1 : 0; +} + +/** + * ftrace_lookup_symbols - Lookup addresses for array of symbols + * + * @sorted_syms: array of symbols pointers symbols to resolve, + * must be alphabetically sorted + * @cnt: number of symbols/addresses in @syms/@addrs arrays + * @addrs: array for storing resulting addresses + * + * This function looks up addresses for array of symbols provided in + * @syms array (must be alphabetically sorted) and stores them in + * @addrs array, which needs to be big enough to store at least @cnt + * addresses. + * + * This function returns 0 if all provided symbols are found, + * -ESRCH otherwise. + */ +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + struct kallsyms_data args; + int err; + + args.addrs = addrs; + args.syms = sorted_syms; + args.cnt = cnt; + args.found = 0; + err = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (err < 0) + return err; + return args.found == args.cnt ? 0 : -ESRCH; +} -- cgit v1.2.3 From 8be9253344a1d8cd91b22655e55de41e3288aaf9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:14 +0200 Subject: fprobe: Resolve symbols with ftrace_lookup_symbols Using ftrace_lookup_symbols to speed up symbols lookup in register_fprobe_syms API. Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/fprobe.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 89d9f994ebb0..aac63ca9c3d1 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -85,39 +85,31 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, } NOKPROBE_SYMBOL(fprobe_exit_handler); +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + /* Convert ftrace location address from symbols */ static unsigned long *get_ftrace_locations(const char **syms, int num) { - unsigned long addr, size; unsigned long *addrs; - int i; /* Convert symbols to symbol address */ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) return ERR_PTR(-ENOMEM); - for (i = 0; i < num; i++) { - addr = kallsyms_lookup_name(syms[i]); - if (!addr) /* Maybe wrong symbol */ - goto error; - - /* Convert symbol address to ftrace location. */ - if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size) - goto error; + /* ftrace_lookup_symbols expects sorted symbols */ + sort(syms, num, sizeof(*syms), symbols_cmp, NULL); - addr = ftrace_location_range(addr, addr + size - 1); - if (!addr) /* No dynamic ftrace there. */ - goto error; + if (!ftrace_lookup_symbols(syms, num, addrs)) + return addrs; - addrs[i] = addr; - } - - return addrs; - -error: kfree(addrs); - return ERR_PTR(-ENOENT); } -- cgit v1.2.3 From 0236fec57a15dc2a068dfe4488e0c2ab4559b1ec Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:15 +0200 Subject: bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link Using kallsyms_lookup_names function to speed up symbols lookup in kprobe multi link attachment and replacing with it the current kprobe_multi_resolve_syms function. This speeds up bpftrace kprobe attachment: # perf stat -r 5 -e cycles ./src/bpftrace -e 'kprobe:x* { } i:ms:1 { exit(); }' ... 6.5681 +- 0.0225 seconds time elapsed ( +- 0.34% ) After: # perf stat -r 5 -e cycles ./src/bpftrace -e 'kprobe:x* { } i:ms:1 { exit(); }' ... 0.5661 +- 0.0275 seconds time elapsed ( +- 4.85% ) Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 112 ++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 46 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f15b826f9899..7fd11c17558d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2229,6 +2229,59 @@ struct bpf_kprobe_multi_run_ctx { unsigned long entry_ip; }; +struct user_syms { + const char **syms; + char *buf; +}; + +static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) +{ + unsigned long __user usymbol; + const char **syms = NULL; + char *buf = NULL, *p; + int err = -ENOMEM; + unsigned int i; + + syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL); + if (!syms) + goto error; + + buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL); + if (!buf) + goto error; + + for (p = buf, i = 0; i < cnt; i++) { + if (__get_user(usymbol, usyms + i)) { + err = -EFAULT; + goto error; + } + err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + syms[i] = p; + p += err + 1; + } + + us->syms = syms; + us->buf = buf; + return 0; + +error: + if (err) { + kvfree(syms); + kvfree(buf); + } + return err; +} + +static void free_user_syms(struct user_syms *us) +{ + kvfree(us->syms); + kvfree(us->buf); +} + static void bpf_kprobe_multi_link_release(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; @@ -2349,53 +2402,12 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, kprobe_multi_link_prog_run(link, entry_ip, regs); } -static int -kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt, - unsigned long *addrs) +static int symbols_cmp(const void *a, const void *b) { - unsigned long addr, size; - const char __user **syms; - int err = -ENOMEM; - unsigned int i; - char *func; - - size = cnt * sizeof(*syms); - syms = kvzalloc(size, GFP_KERNEL); - if (!syms) - return -ENOMEM; + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; - func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - goto error; - - if (copy_from_user(syms, usyms, size)) { - err = -EFAULT; - goto error; - } - - for (i = 0; i < cnt; i++) { - err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); - if (err == KSYM_NAME_LEN) - err = -E2BIG; - if (err < 0) - goto error; - err = -EINVAL; - addr = kallsyms_lookup_name(func); - if (!addr) - goto error; - if (!kallsyms_lookup_size_offset(addr, &size, NULL)) - goto error; - addr = ftrace_location_range(addr, addr + size - 1); - if (!addr) - goto error; - addrs[i] = addr; - } - - err = 0; -error: - kvfree(syms); - kfree(func); - return err; + return strcmp(*str_a, *str_b); } int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -2441,7 +2453,15 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } } else { - err = kprobe_multi_resolve_syms(usyms, cnt, addrs); + struct user_syms us; + + err = copy_user_syms(&us, usyms, cnt); + if (err) + goto error; + + sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL); + err = ftrace_lookup_symbols(us.syms, cnt, addrs); + free_user_syms(&us); if (err) goto error; } -- cgit v1.2.3 From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:21 -0700 Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm. Pass a cookie along with BPF_LINK_CREATE requests. Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie. The cookie of a bpf_tracing_link is available by calling bpf_get_attach_cookie when running the BPF program of the attached link. The value of a cookie will be set at bpf_tramp_run_ctx by the trampoline of the link. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 +++-- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/bpf_lsm.c | 17 +++++++++++++++++ kernel/bpf/syscall.c | 12 ++++++++---- kernel/bpf/trampoline.c | 7 +++++-- kernel/trace/bpf_trace.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 8 files changed, 69 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1fbc5cf1c7a7..a2b6d197c226 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1769,9 +1769,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); struct bpf_prog *p = l->link.prog; + u64 cookie = l->cookie; - /* mov rdi, 0 */ - emit_mov_imm64(&prog, BPF_REG_1, 0, 0); + /* mov rdi, cookie */ + emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); /* Prepare struct bpf_tramp_run_ctx. * diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 256fb802e580..aba7ded56436 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,7 @@ struct bpf_link_ops { struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; + u64 cookie; }; struct bpf_tracing_link { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3d032ea1b6a3..bc7f89948f54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1490,6 +1490,15 @@ union bpf_attr { __aligned_u64 addrs; __aligned_u64 cookies; } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; }; } link_create; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 064eccba641d..c1351df9f7ee 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -117,6 +117,21 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto = { + .func = bpf_get_attach_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -141,6 +156,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; case BPF_FUNC_ima_file_hash: return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d48165fccf49..72e53489165d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2921,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, - u32 btf_id) + u32 btf_id, + u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -2986,6 +2987,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -3271,7 +3273,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -4524,7 +4526,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_EXT: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, - attr->link_create.target_btf_id); + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -4539,7 +4542,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, - attr->link_create.target_btf_id); + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index baf1b65d523e..0e9b3aefc34a 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex); bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type ptype = prog->type; - return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN; + return (ptype == BPF_PROG_TYPE_TRACING && + (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN)) || + (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void *bpf_jit_alloc_exec_page(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fd11c17558d..2eaac094caf8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1091,6 +1091,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { + .func = bpf_get_attach_cookie_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { #ifndef CONFIG_X86 @@ -1719,6 +1734,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3d032ea1b6a3..bc7f89948f54 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1490,6 +1490,15 @@ union bpf_attr { __aligned_u64 addrs; __aligned_u64 cookies; } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; }; } link_create; -- cgit v1.2.3 From 07343110b293456d30393e89b86c4dee1ac051c8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Wed, 11 May 2022 17:38:53 +0800 Subject: bpf: add bpf_map_lookup_percpu_elem for percpu map Add new ebpf helpers bpf_map_lookup_percpu_elem. The implementation method is relatively simple, refer to the implementation method of map_lookup_elem of percpu map, increase the parameters of cpu, and obtain it according to the specified cpu. Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220511093854.411-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/arraymap.c | 15 +++++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/hashtab.c | 32 ++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 18 ++++++++++++++++++ kernel/bpf/verifier.c | 17 +++++++++++++++-- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 9 files changed, 103 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ded8711457f..5061ccd8b2dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -89,6 +89,7 @@ struct bpf_map_ops { int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); int (*map_pop_elem)(struct bpf_map *map, void *value); int (*map_peek_elem)(struct bpf_map *map, void *value); + void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -2184,6 +2185,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc7f89948f54..0210f85131b3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5164,6 +5164,14 @@ union bpf_attr { * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5361,6 +5369,7 @@ union bpf_attr { FN(skb_set_tstamp), \ FN(ima_file_hash), \ FN(kptr_xchg), \ + FN(map_lookup_percpu_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 724613da6576..fe40d3b9458f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -243,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (cpu >= nr_cpu_ids) + return NULL; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); +} + int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -725,6 +739,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13e9dbeeedf3..76f68d0a7ae8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2619,6 +2619,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 705841279d16..17fb69c0e0dc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2199,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + else + return NULL; +} + static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -2211,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; @@ -2300,6 +2330,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, @@ -2318,6 +2349,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3e709fed5306..d5f104a39092 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -119,6 +119,22 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, }; +BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); +} + +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { + .func = bpf_map_lookup_percpu_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, @@ -1420,6 +1436,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c27fee73a2cb..05c1b6656824 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6137,6 +6137,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; + case BPF_FUNC_map_lookup_percpu_elem: + if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) @@ -6750,7 +6756,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && - func_id != BPF_FUNC_redirect_map) + func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { @@ -13810,7 +13817,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem)) { + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -13859,6 +13867,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { @@ -13886,6 +13896,9 @@ patch_map_ops_generic: case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); continue; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + continue; } goto patch_call_imm; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2eaac094caf8..7141ca8a1c2d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1197,6 +1197,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc7f89948f54..0210f85131b3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5164,6 +5164,14 @@ union bpf_attr { * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5361,6 +5369,7 @@ union bpf_attr { FN(skb_set_tstamp), \ FN(ima_file_hash), \ FN(kptr_xchg), \ + FN(map_lookup_percpu_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 May 2022 16:30:10 -0700 Subject: bpf: Add bpf_skc_to_mptcp_sock_proto This patch implements a new struct bpf_func_proto, named bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP, and a new helper bpf_skc_to_mptcp_sock(), which invokes another new helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct mptcp_sock from a given subflow socket. v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c, remove build check for CONFIG_BPF_JIT v5: Drop EXPORT_SYMBOL (Martin) Co-developed-by: Nicolas Rybowski Co-developed-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com --- include/linux/bpf.h | 1 + include/linux/btf_ids.h | 3 ++- include/net/mptcp.h | 6 ++++++ include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 1 + kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 18 ++++++++++++++++++ net/mptcp/Makefile | 2 ++ net/mptcp/bpf.c | 21 +++++++++++++++++++++ scripts/bpf_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 7 +++++++ 11 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 net/mptcp/bpf.c (limited to 'kernel/trace') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c107392b0ba7..a3ef078401cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2231,6 +2231,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bc5d9cc34e4c..335a19092368 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -178,7 +178,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) enum { #define BTF_SOCK_TYPE(name, str) name, diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8b1afd6f5cc4..2ba09de955c7 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -284,4 +284,10 @@ static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif +#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); +#else +static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } +#endif + #endif /* __NET_MPTCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0210f85131b3..56688bee20d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5172,6 +5172,12 @@ union bpf_attr { * Return * Map value associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5370,6 +5376,7 @@ union bpf_attr { FN(ima_file_hash), \ FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ + FN(skc_to_mptcp_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b59581026f8..14e8c17d3d8d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -509,6 +509,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7141ca8a1c2d..10b157a6d73e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1705,6 +1705,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; + case BPF_FUNC_skc_to_mptcp_sock: + return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: diff --git a/net/core/filter.c b/net/core/filter.c index fe0da529d00f..5af58eb48587 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -78,6 +78,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -11281,6 +11282,20 @@ const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); @@ -11323,6 +11338,9 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e54daceac58b..99dddf08ca73 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -10,3 +10,5 @@ obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o + +obj-$(CONFIG_BPF_SYSCALL) += bpf.o diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c new file mode 100644 index 000000000000..5a0a84ad94af --- /dev/null +++ b/net/mptcp/bpf.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Tessares SA. + * Copyright (c) 2022, SUSE. + * + * Author: Nicolas Rybowski + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include +#include "protocol.h" + +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + return mptcp_sk(mptcp_subflow_ctx(sk)->conn); + + return NULL; +} diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 096625242475..d5452f7eb996 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -633,6 +633,7 @@ class PrinterHelpers(Printer): 'struct socket', 'struct file', 'struct bpf_timer', + 'struct mptcp_sock', ] known_types = { '...', @@ -682,6 +683,7 @@ class PrinterHelpers(Printer): 'struct socket', 'struct file', 'struct bpf_timer', + 'struct mptcp_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0210f85131b3..56688bee20d9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5172,6 +5172,12 @@ union bpf_attr { * Return * Map value associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5370,6 +5376,7 @@ union bpf_attr { FN(ima_file_hash), \ FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ + FN(skc_to_mptcp_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 499f12168aebd6da8fa32c9b7d6203ca9b5eb88d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Apr 2022 14:56:32 -0400 Subject: tracing: Have event format check not flag %p* on __get_dynamic_array() The print fmt check against trace events to make sure that the format does not use pointers that may be freed from the time of the trace to the time the event is read, gives a false positive on %pISpc when reading data that was saved in __get_dynamic_array() when it is perfectly fine to do so, as the data being read is on the ring buffer. Link: https://lore.kernel.org/all/20220407144524.2a592ed6@canb.auug.org.au/ Cc: stable@vger.kernel.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78f313b7b315..d5913487821a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -392,12 +392,6 @@ static void test_event_printk(struct trace_event_call *call) if (!(dereference_flags & (1ULL << arg))) goto next_arg; - /* Check for __get_sockaddr */; - if (str_has_prefix(fmt + i, "__get_sockaddr(")) { - dereference_flags &= ~(1ULL << arg); - goto next_arg; - } - /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); @@ -413,7 +407,14 @@ static void test_event_printk(struct trace_event_call *call) a = strchr(fmt + i, '&'); if ((a && (a < r)) || test_field(r, call)) dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); } + next_arg: i--; arg++; -- cgit v1.2.3 From e35c2d8e22745751cf304ec3fe39616643db2e0a Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Wed, 27 Apr 2022 11:41:19 +0800 Subject: tracing: Reset the function filter after completing trampoline/graph selftest The direct trampoline and graph coexistence test sets global_ops to trace only 'trace_selftest_dynamic_test_func', but does not reset it after the test is completed, resulting in the function filter being set already after the system starts. Although it can be reset through the tracefs interface, it is more or less confusing to the user, and we should reset it to trace all functions after the trampoline/graph test completes. Link: https://lkml.kernel.org/r/20220427034119.24668-1-lihuafei1@huawei.com Link: https://lore.kernel.org/all/20220418073958.104029-1-lihuafei1@huawei.com/ Fixes: 130c08065848 ("tracing: Add trampoline/graph selftest") Signed-off-by: Li Huafei Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index abcadbe933bb..a2d301f58ced 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -895,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace, ret = -1; goto out; } + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); #endif /* Don't test dynamic tracing, the function tracer already did */ -- cgit v1.2.3 From e4931b824a6f36cae9cc5632709f015cfa748a25 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 14 Jan 2022 21:10:52 +0800 Subject: tracing: Use trace_create_file() to simplify creation of tracefs entries Creating tracefs entries with tracefs_create_file() followed by pr_warn() is tedious and repetitive, we can use trace_create_file() to simplify this process and make the code more readable. Link: https://lkml.kernel.org/r/20220114131052.534382-1-ytcoode@gmail.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 9 +++------ kernel/trace/trace_dynevent.c | 9 ++------- kernel/trace/trace_events.c | 29 ++++++++++------------------- kernel/trace/trace_kprobe.c | 15 ++++----------- kernel/trace/trace_recursion_record.c | 7 ++----- 5 files changed, 21 insertions(+), 48 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5c465e70d146..6c5b12669e06 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -952,7 +952,6 @@ static struct tracer_stat function_stats __initdata = { static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; - struct dentry *entry; char *name; int ret; int cpu; @@ -983,11 +982,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", - TRACE_MODE_WRITE, d_tracer, NULL, - &ftrace_profile_fops); - if (!entry) - pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); + trace_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); } #else /* CONFIG_FUNCTION_PROFILER */ diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e34e8182ee4b..076b447a1b88 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -255,19 +255,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *entry; int ret; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, - NULL, &dynamic_events_ops); - - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'dynamic_events' entry\n"); + trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, + NULL, &dynamic_events_ops); return 0; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d5913487821a..9a88de9d7815 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3546,12 +3546,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) { - pr_warn("Could not create tracefs 'set_event' entry\n"); + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) return -ENOMEM; - } d_events = tracefs_create_dir("events", parent); if (!d_events) { @@ -3566,16 +3564,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_pid' entry\n"); + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); - entry = tracefs_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ trace_create_file("header_page", TRACE_MODE_READ, d_events, @@ -3790,17 +3784,14 @@ static __init int event_trace_init_fields(void) __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *entry; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", TRACE_MODE_READ, - NULL, tr, &ftrace_avail_fops); - if (!entry) - pr_warn("Could not create tracefs 'available_events' entry\n"); + trace_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 47cebef78532..93507330462c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1907,25 +1907,18 @@ core_initcall(init_kprobe_trace_early); static __init int init_kprobe_trace(void) { int ret; - struct dentry *entry; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, - NULL, NULL, &kprobe_events_ops); - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'kprobe_events' entry\n"); + trace_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, - NULL, NULL, &kprobe_profile_ops); - - if (!entry) - pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); + trace_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); setup_boot_kprobe_events(); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index 4d4b78c8ca25..a520b11afb0d 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = { __init static int create_recursed_functions(void) { - struct dentry *dentry; - dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, - NULL, NULL, &recursed_functions_fops); - if (!dentry) - pr_warn("WARNING: Failed to create recursed_functions\n"); + trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); return 0; } -- cgit v1.2.3 From 2889c658b2fbc7ad4c5d541734fdc1d97b130753 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 20 Jan 2022 06:59:49 +0000 Subject: ftrace: Deal with error return code of the ftrace_process_locs() function The ftrace_process_locs() function may return -ENOMEM error code, which should be handled by the callers. Link: https://lkml.kernel.org/r/20220120065949.1813231-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c5b12669e06..737e895e2aad 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6864,11 +6864,16 @@ void ftrace_module_enable(struct module *mod) void ftrace_module_init(struct module *mod) { + int ret; + if (ftrace_disabled || !mod->num_ftrace_callsites) return; - ftrace_process_locs(mod, mod->ftrace_callsites, - mod->ftrace_callsites + mod->num_ftrace_callsites); + ret = ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); + if (ret) + pr_warn("ftrace: failed to allocate entries for module '%s' functions\n", + mod->name); } static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, @@ -7201,15 +7206,19 @@ void __init ftrace_init(void) pr_info("ftrace: allocating %ld entries in %ld pages\n", count, count / ENTRIES_PER_PAGE + 1); - last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); + if (ret) { + pr_warn("ftrace: failed to allocate entries for functions\n"); + goto failed; + } pr_info("ftrace: allocated %ld pages with %ld groups\n", ftrace_number_of_pages, ftrace_number_of_groups); + last_ftrace_enabled = ftrace_enabled = 1; + set_ftrace_early_filters(); return; -- cgit v1.2.3 From cb24693d94ceaf658944ad2e922203c0503775d2 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 21 Jan 2022 09:56:23 +0000 Subject: tracing: Use strim() to remove whitespace instead of doing it manually The tracing_set_trace_write() function just removes the trailing whitespace from the user supplied tracer name, but the leading whitespace should also be removed. In addition, if the user supplied tracer name contains only a few whitespace characters, the first one will not be removed using the current method, which results it a single whitespace character left in the buf. To fix all of these issues, we use strim() to correctly remove both the leading and trailing whitespace. Link: https://lkml.kernel.org/r/20220121095623.1826679-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 087740f63d92..498ae22d4ffa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6461,7 +6461,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; - int i; + char *name; size_t ret; int err; @@ -6475,11 +6475,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + name = strim(buf); - err = tracing_set_tracer(tr, buf); + err = tracing_set_tracer(tr, name); if (err) return err; -- cgit v1.2.3 From 99696a2592bca641eb88cc9a80c90e591afebd0f Mon Sep 17 00:00:00 2001 From: Keita Suzuki Date: Mon, 25 Apr 2022 06:37:38 +0000 Subject: tracing: Fix potential double free in create_var_ref() In create_var_ref(), init_var_ref() is called to initialize the fields of variable ref_field, which is allocated in the previous function call to create_hist_field(). Function init_var_ref() allocates the corresponding fields such as ref_field->system, but frees these fields when the function encounters an error. The caller later calls destroy_hist_field() to conduct error handling, which frees the fields and the variable itself. This results in double free of the fields which are already freed in the previous function. Fix this by storing NULL to the corresponding fields when they are freed in init_var_ref(). Link: https://lkml.kernel.org/r/20220425063739.3859998-1-keitasuzuki.park@sslab.ics.keio.ac.jp Fixes: 067fe038e70f ("tracing: Add variable reference handling to hist triggers") CC: stable@vger.kernel.org Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Signed-off-by: Keita Suzuki Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 038dc591545d..c6a65738feb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2093,8 +2093,11 @@ static int init_var_ref(struct hist_field *ref_field, return err; free: kfree(ref_field->system); + ref_field->system = NULL; kfree(ref_field->event_name); + ref_field->event_name = NULL; kfree(ref_field->name); + ref_field->name = NULL; goto out; } -- cgit v1.2.3 From b27f266f74fbda4ee36c2b2b04d15992860cf23b Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 3 May 2022 14:05:46 +0900 Subject: tracing: Fix return value of trace_pid_write() Setting set_event_pid with trailing whitespace lead to endless write system calls like below. $ strace echo "123 " > /sys/kernel/debug/tracing/set_event_pid execve("/usr/bin/echo", ["echo", "123 "], ...) = 0 ... write(1, "123 \n", 5) = 4 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 write(1, "\n", 1) = 0 .... This is because, the result of trace_get_user's are not returned when it read at least one pid. To fix it, update read variable even if parser->idx == 0. The result of applied patch is below. $ strace echo "123 " > /sys/kernel/debug/tracing/set_event_pid execve("/usr/bin/echo", ["echo", "123 "], ...) = 0 ... write(1, "123 \n", 5) = 5 close(1) = 0 Link: https://lkml.kernel.org/r/20220503050546.288911-1-vvghjk1234@gmail.com Cc: Ingo Molnar Cc: Baik Song An Cc: Hong Yeon Kim Cc: Taeung Song Cc: linuxgeek@linuxgeek.io Cc: stable@vger.kernel.org Fixes: 4909010788640 ("tracing: Add set_event_pid directory for future use") Signed-off-by: Wonhyuk Yang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 498ae22d4ffa..4825883b2ffd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -721,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); - if (ret < 0 || !trace_parser_loaded(&parser)) + if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; + if (!trace_parser_loaded(&parser)) + break; + ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -753,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); - read = ret; pid_list = NULL; } -- cgit v1.2.3 From aa748949b4e665f473bc5abdc5f66029cb5f5522 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 10 May 2022 11:45:23 +0200 Subject: tracing/timerlat: Notify IRQ new max latency only if stop tracing is set Currently, the notification of a new max latency is sent from timerlat's IRQ handler anytime a new max latency is found. While this behavior is not wrong, the send IPI overhead itself will increase the thread latency and that is not the desired effect (tracing overhead). Moreover, the thread will notify a new max latency again because the thread latency as it is always higher than the IRQ latency that woke it up. The only case in which it is helpful to notify a new max latency from IRQ is when stop tracing (for the IRQ) is set, as in this case, the thread will not be dispatched. Notify a new max latency from the IRQ handler only if stop tracing is set for the IRQ handler. Link: https://lkml.kernel.org/r/2c2d9a56c0886c8402ba320de32856cbbb10c2bb.1652175637.git.bristot@kernel.org Cc: Juri Lelli Cc: Ingo Molnar Reported-by: Clark Williams Fixes: a955d7eac177 ("trace: Add timerlat tracer") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e9ae1f33a7f0..6494ca27ea6f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1578,11 +1578,12 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) trace_timerlat_sample(&s); - notify_new_max_latency(diff); - - if (osnoise_data.stop_tracing) - if (time_to_us(diff) >= osnoise_data.stop_tracing) + if (osnoise_data.stop_tracing) { + if (time_to_us(diff) >= osnoise_data.stop_tracing) { osnoise_stop_tracing(); + notify_new_max_latency(diff); + } + } wake_up_process(tlat->kthread); -- cgit v1.2.3 From 4dd2aea24ed7613735664feadc9879d37f718c23 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 10 May 2022 11:45:24 +0200 Subject: tracing/timerlat: Print stacktrace in the IRQ handler if needed If print_stack and stop_tracing_us are set, and stop_tracing_us is hit with latency higher than or equal to print_stack, print the stack at the IRQ handler as it is useful to define the root cause for the IRQ latency. Link: https://lkml.kernel.org/r/fd04530ce98ae9270e41bb124ee5bf67b05ecfed.1652175637.git.bristot@kernel.org Cc: Juri Lelli Cc: Clark Williams Cc: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/timerlat-tracer.rst | 5 +++-- kernel/trace/trace_osnoise.c | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst index 64d1fe6e9b93..d643c95c01eb 100644 --- a/Documentation/trace/timerlat-tracer.rst +++ b/Documentation/trace/timerlat-tracer.rst @@ -74,8 +74,9 @@ directory. The timerlat configs are: - stop_tracing_total_us: stop the system tracing if a timer latency at the *thread* context is higher than the configured value happens. Writing 0 disables this option. - - print_stack: save the stack of the IRQ occurrence, and print - it after the *thread context* event". + - print_stack: save the stack of the IRQ occurrence. The stack is printed + after the *thread context* event, or at the IRQ handler if *stop_tracing_us* + is hit. timerlat and osnoise ---------------------------- diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 6494ca27ea6f..9b204ee3c6f5 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1580,6 +1580,19 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { + + /* + * At this point, if stop_tracing is set and <= print_stack, + * print_stack is set and would be printed in the thread handler. + * + * Thus, print the stack trace as it is helpful to define the + * root cause of an IRQ latency. + */ + if (osnoise_data.stop_tracing <= osnoise_data.print_stack) { + timerlat_save_stack(0); + timerlat_dump_stack(time_to_us(diff)); + } + osnoise_stop_tracing(); notify_new_max_latency(diff); } -- cgit v1.2.3 From 9c556e5a4dd5cfc0939a0575577d0517118f98af Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 10 May 2022 11:45:25 +0200 Subject: tracing/timerlat: Do not wakeup the thread if the trace stops at the IRQ There is no need to wakeup the timerlat/ thread if stop tracing is hit at the timerlat's IRQ handler. Return before waking up timerlat's thread. Link: https://lkml.kernel.org/r/b392356c91b56aedd2b289513cc56a84cf87e60d.1652175637.git.bristot@kernel.org Cc: Juri Lelli Cc: Clark Williams Cc: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 9b204ee3c6f5..035ec8b84e12 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1595,6 +1595,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) osnoise_stop_tracing(); notify_new_max_latency(diff); + + return HRTIMER_NORESTART; } } -- cgit v1.2.3 From 2d601b98643dd2846e2958d931826e7b7af44969 Mon Sep 17 00:00:00 2001 From: liqiong Date: Thu, 12 May 2022 22:32:30 +0800 Subject: tracing: Change "char *" string form to "char []" The "char []" string form declares a single variable. It is better than "char *" which creates two variables in the final assembly. Link: https://lkml.kernel.org/r/20220512143230.28796-1-liqiong@nfschina.com Signed-off-by: liqiong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4825883b2ffd..e38a7ca4cdd0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4252,7 +4252,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; + static const char space[] = " "; int prec = tgid ? 12 : 2; print_event_info(buf, m); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c6a65738feb3..48e82e141d54 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4165,7 +4165,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } -static const char *no_comm = "(no comm)"; +static const char no_comm[] = "(no comm)"; static u64 hist_field_execname(struct hist_field *hist_field, struct tracing_map_elt *elt, -- cgit v1.2.3 From 2decd16f47e3df3234b5486fe89a9aa5a1102af1 Mon Sep 17 00:00:00 2001 From: liqiong Date: Fri, 13 May 2022 15:52:21 +0800 Subject: tracing: Cleanup code by removing init "char *name" The pointer is assigned to "type->name" anyway. no need to initialize with "preemption". Link: https://lkml.kernel.org/r/20220513075221.26275-1-liqiong@nfschina.com Signed-off-by: liqiong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e38a7ca4cdd0..f400800bc910 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4276,9 +4276,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct tracer *type = iter->trace; unsigned long entries; unsigned long total; - const char *name = "preemption"; - - name = type->name; + const char *name = type->name; get_total_entries(buf, &total, &entries); -- cgit v1.2.3 From 3a2bfec0b02f2226ff3376a5d2ff604d799bd7ea Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 18 May 2022 10:36:40 +0800 Subject: ftrace: Remove return value of ftrace_arch_modify_*() All instances of the function ftrace_arch_modify_prepare() and ftrace_arch_modify_post_process() return zero. There's no point in checking their return value. Just have them be void functions. Link: https://lkml.kernel.org/r/20220518023639.4065-1-kunyu@nfschina.com Signed-off-by: Li kunyu Signed-off-by: Steven Rostedt (Google) --- arch/arm/kernel/ftrace.c | 6 ++---- arch/riscv/kernel/ftrace.c | 6 ++---- arch/s390/kernel/ftrace.c | 3 +-- arch/x86/kernel/ftrace.c | 6 ++---- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 16 ++++------------ 6 files changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 83cc068586bc..a0b6d1e3812f 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -79,16 +79,14 @@ static unsigned long __ref adjust_address(struct dyn_ftrace *rec, return (unsigned long)&ftrace_regs_caller_from_init; } -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) { - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) { /* Make sure any TLB misses during machine stop are cleared. */ flush_tlb_all(); - return 0; } static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr, diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 4716f4cdc038..2086f6585773 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -12,16 +12,14 @@ #include #ifdef CONFIG_DYNAMIC_FTRACE -int ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { mutex_lock(&text_mutex); - return 0; } -int ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { mutex_unlock(&text_mutex); - return 0; } static int ftrace_check_current_call(unsigned long hook_pos, diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1852d46babb1..416b5a94353d 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -225,14 +225,13 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) { /* * Flush any pre-fetched instructions on all * CPUs to make the new code visible. */ text_poke_sync_lock(); - return 0; } #ifdef CONFIG_MODULES diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1e31c7d21597..73d2719ed12c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -37,7 +37,7 @@ static int ftrace_poke_late = 0; -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { /* @@ -47,10 +47,9 @@ int ftrace_arch_code_modify_prepare(void) */ mutex_lock(&text_mutex); ftrace_poke_late = 1; - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { /* @@ -61,7 +60,6 @@ int ftrace_arch_code_modify_post_process(void) text_poke_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); - return 0; } static const char *ftrace_nop_replace(void) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..a5f74f6e7e4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -449,8 +449,8 @@ static inline void stack_tracer_enable(void) { } #ifdef CONFIG_DYNAMIC_FTRACE -int ftrace_arch_code_modify_prepare(void); -int ftrace_arch_code_modify_post_process(void); +void ftrace_arch_code_modify_prepare(void); +void ftrace_arch_code_modify_post_process(void); enum ftrace_bug_type { FTRACE_BUG_UNKNOWN, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 737e895e2aad..852c731cbd9a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2704,18 +2704,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) * archs can override this function if they must do something * before the modifying code is performed. */ -int __weak ftrace_arch_code_modify_prepare(void) +void __weak ftrace_arch_code_modify_prepare(void) { - return 0; } /* * archs can override this function if they must do something * after the modifying code is performed. */ -int __weak ftrace_arch_code_modify_post_process(void) +void __weak ftrace_arch_code_modify_post_process(void) { - return 0; } void ftrace_modify_all_code(int command) @@ -2801,12 +2799,7 @@ void __weak arch_ftrace_update_code(int command) static void ftrace_run_update_code(int command) { - int ret; - - ret = ftrace_arch_code_modify_prepare(); - FTRACE_WARN_ON(ret); - if (ret) - return; + ftrace_arch_code_modify_prepare(); /* * By default we use stop_machine() to modify the code. @@ -2816,8 +2809,7 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - ret = ftrace_arch_code_modify_post_process(); - FTRACE_WARN_ON(ret); + ftrace_arch_code_modify_post_process(); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, -- cgit v1.2.3 From 50c697819d59c5013a66728940015348919a0c0c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:31 +0200 Subject: ftrace: Fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20220521111145.81697-81-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 852c731cbd9a..fb8f08b4bd41 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -119,7 +119,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */ void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); -- cgit v1.2.3 From 154827f8e53d8c492b3fb0cb757fbcadb5d516b5 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Sat, 21 May 2022 23:18:26 -0700 Subject: tracing: Initialize integer variable to prevent garbage return value Initialize the integer variable to 0 to fix the clang scan warning: Undefined or garbage value returned to caller [core.uninitialized.UndefReturn] return ret; Link: https://lkml.kernel.org/r/20220522061826.1751-1-gautammenghani201@gmail.com Cc: stable@vger.kernel.org Fixes: 8993665abcce ("tracing/boot: Support multiple handlers for per-event histogram") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Gautam Menghani Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0580287d7a0d..778200dd8ede 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, { struct xbc_node *node; const char *p, *handler; - int ret; + int ret = 0; handler = xbc_node_get_data(hnode); -- cgit v1.2.3 From bb5eb8f3b329789fbf22c85328dcf696a3e97ffb Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Mon, 23 May 2022 06:30:33 +0000 Subject: tracing: Disable kcov on trace_preemptirq.c Functions in trace_preemptirq.c could be invoked from early interrupt code that bypasses kcov trace function's in_task() check. Disable kcov on this file to reduce random code coverage. Link: https://lkml.kernel.org/r/20220523063033.1778974-1-liu3101@purdue.edu Acked-by: Dmitry Vyukov Signed-off-by: Congyu Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d77cd8032213..0d261774d6f3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +# Functions in this file could be invoked from early interrupt +# code and produce random code coverage. +KCOV_INSTRUMENT_trace_preemptirq.o := n + CFLAGS_bpf_trace.o := -I$(src) CFLAGS_trace_benchmark.o := -I$(src) -- cgit v1.2.3 From 0a54f556b035e5217e21724d82dd98ce695bd6d6 Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 24 May 2022 14:39:37 +0800 Subject: tracing: Fix comments of create_filter() The name in comments of parameter "filter_string" in function create_filter is annotated as "filter_str", just fix it. Link: https://lkml.kernel.org/r/20220524063937.52873-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b458a9afa2c0..4b1057ab9d96 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1816,7 +1816,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * create_filter - create a filter for a trace_event_call * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for - * @filter_str: filter string + * @filter_string: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) * Must be a pointer that references a NULL pointer. -- cgit v1.2.3 From 7d54c15cb89a29a5f59e5ffc9ee62e6591769ef1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 24 May 2022 10:08:39 -0700 Subject: ftrace: Clean up hash direct_functions on register failures We see the following GPF when register_ftrace_direct fails: [ ] general protection fault, probably for non-canonical address \ 0x200000000000010: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [...] [ ] RIP: 0010:ftrace_find_rec_direct+0x53/0x70 [ ] Code: 48 c1 e0 03 48 03 42 08 48 8b 10 31 c0 48 85 d2 74 [...] [ ] RSP: 0018:ffffc9000138bc10 EFLAGS: 00010206 [ ] RAX: 0000000000000000 RBX: ffffffff813e0df0 RCX: 000000000000003b [ ] RDX: 0200000000000000 RSI: 000000000000000c RDI: ffffffff813e0df0 [ ] RBP: ffffffffa00a3000 R08: ffffffff81180ce0 R09: 0000000000000001 [ ] R10: ffffc9000138bc18 R11: 0000000000000001 R12: ffffffff813e0df0 [ ] R13: ffffffff813e0df0 R14: ffff888171b56400 R15: 0000000000000000 [ ] FS: 00007fa9420c7780(0000) GS:ffff888ff6a00000(0000) knlGS:000000000 [ ] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ ] CR2: 000000000770d000 CR3: 0000000107d50003 CR4: 0000000000370ee0 [ ] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ ] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ ] Call Trace: [ ] [ ] register_ftrace_direct+0x54/0x290 [ ] ? render_sigset_t+0xa0/0xa0 [ ] bpf_trampoline_update+0x3f5/0x4a0 [ ] ? 0xffffffffa00a3000 [ ] bpf_trampoline_link_prog+0xa9/0x140 [ ] bpf_tracing_prog_attach+0x1dc/0x450 [ ] bpf_raw_tracepoint_open+0x9a/0x1e0 [ ] ? find_held_lock+0x2d/0x90 [ ] ? lock_release+0x150/0x430 [ ] __sys_bpf+0xbd6/0x2700 [ ] ? lock_is_held_type+0xd8/0x130 [ ] __x64_sys_bpf+0x1c/0x20 [ ] do_syscall_64+0x3a/0x80 [ ] entry_SYSCALL_64_after_hwframe+0x44/0xae [ ] RIP: 0033:0x7fa9421defa9 [ ] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 9 f8 [...] [ ] RSP: 002b:00007ffed743bd78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 [ ] RAX: ffffffffffffffda RBX: 00000000069d2480 RCX: 00007fa9421defa9 [ ] RDX: 0000000000000078 RSI: 00007ffed743bd80 RDI: 0000000000000011 [ ] RBP: 00007ffed743be00 R08: 0000000000bb7270 R09: 0000000000000000 [ ] R10: 00000000069da210 R11: 0000000000000246 R12: 0000000000000001 [ ] R13: 00007ffed743c4b0 R14: 00000000069d2480 R15: 0000000000000001 [ ] [ ] Modules linked in: klp_vm(OK) [ ] ---[ end trace 0000000000000000 ]--- One way to trigger this is: 1. load a livepatch that patches kernel function xxx; 2. run bpftrace -e 'kfunc:xxx {}', this will fail (expected for now); 3. repeat #2 => gpf. This is because the entry is added to direct_functions, but not removed. Fix this by remove the entry from direct_functions when register_ftrace_direct fails. Also remove the last trailing space from ftrace.c, so we don't have to worry about it anymore. Link: https://lkml.kernel.org/r/20220524170839.900849-1-song@kernel.org Cc: stable@vger.kernel.org Fixes: 763e34e74bb7 ("ftrace: Add register_ftrace_direct()") Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb8f08b4bd41..d653ef4febc5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4454,7 +4454,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to remove the data from * * Returns the data if it is found, otherwise NULL. - * Note, if the data pointer is used as the data itself, (see + * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. */ @@ -5188,8 +5188,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); - if (ret) - remove_hash_entry(direct_functions, entry); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { ret = register_ftrace_function(&direct_ops); @@ -5198,6 +5196,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } if (ret) { + remove_hash_entry(direct_functions, entry); kfree(entry); if (!direct->count) { list_del_rcu(&direct->next); -- cgit v1.2.3 From 8d4a21b5ac9dad5d5221c60060b3ac28d22f57ca Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 26 May 2022 15:29:57 +0800 Subject: tracing: Fix comments for event_trigger_separate_filter() The parameter name in comments of event_trigger_separate_filter() is inconsistent with actual parameter name, fix it. Link: https://lkml.kernel.org/r/20220526072957.165655-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 21592bed2e89..cb866c3141af 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -738,15 +738,15 @@ bool event_trigger_empty_param(const char *param) /** * event_trigger_separate_filter - separate an event trigger from a filter - * @param: The param string containing trigger and possibly filter - * @trigger: outparam, will be filled with a pointer to the trigger + * @param_and_filter: String containing trigger and possibly filter + * @param: outparam, will be filled with a pointer to the trigger * @filter: outparam, will be filled with a pointer to the filter * @param_required: Specifies whether or not the param string is required * * Given a param string of the form '[trigger] [if filter]', this * function separates the filter from the trigger and returns the - * trigger in *trigger and the filter in *filter. Either the *trigger - * or the *filter may be set to NULL by this function - if not set to + * trigger in @param and the filter in @filter. Either the @param + * or the @filter may be set to NULL by this function - if not set to * NULL, they will contain strings corresponding to the trigger and * filter. * -- cgit v1.2.3 From b39181f7c6907dc66ff937b74758671fa6ba430c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 26 May 2022 14:19:12 -0400 Subject: ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function If an unused weak function was traced, it's call to fentry will still exist, which gets added into the __mcount_loc table. Ftrace will use kallsyms to retrieve the name for each location in __mcount_loc to display it in the available_filter_functions and used to enable functions via the name matching in set_ftrace_filter/notrace. Enabling these functions do nothing but enable an unused call to ftrace_caller. If a traced weak function is overridden, the symbol of the function would be used for it, which will either created duplicate names, or if the previous function was not traced, it would be incorrectly be listed in available_filter_functions as a function that can be traced. This became an issue with BPF[1] as there are tooling that enables the direct callers via ftrace but then checks to see if the functions were actually enabled. The case of one function that was marked notrace, but was followed by an unused weak function that was traced. The unused function's call to fentry was added to the __mcount_loc section, and kallsyms retrieved the untraced function's symbol as the weak function was overridden. Since the untraced function would not get traced, the BPF check would detect this and fail. The real fix would be to fix kallsyms to not show addresses of weak functions as the function before it. But that would require adding code in the build to add function size to kallsyms so that it can know when the function ends instead of just using the start of the next known symbol. In the mean time, this is a work around. Add a FTRACE_MCOUNT_MAX_OFFSET macro that if defined, ftrace will ignore any function that has its call to fentry/mcount that has an offset from the symbol that is greater than FTRACE_MCOUNT_MAX_OFFSET. If CONFIG_HAVE_FENTRY is defined for x86, define FTRACE_MCOUNT_MAX_OFFSET to zero (unless IBT is enabled), which will have ftrace ignore all locations that are not at the start of the function (or one after the ENDBR instruction). A worker thread is added at boot up to scan all the ftrace record entries, and will mark any that fail the FTRACE_MCOUNT_MAX_OFFSET test as disabled. They will still appear in the available_filter_functions file as: __ftrace_invalid_address___ (showing the offset that caused it to be invalid). This is required for tools that use libtracefs (like trace-cmd does) that scan the available_filter_functions and enable set_ftrace_filter and set_ftrace_notrace using indexes of the function listed in the file (this is a speedup, as enabling thousands of files via names is an O(n^2) operation and can take minutes to complete, where the indexing takes less than a second). The invalid functions cannot be removed from available_filter_functions as the names there correspond to the ftrace records in the array that manages them (and the indexing depends on this). [1] https://lore.kernel.org/all/20220412094923.0abe90955e5db486b7bca279@kernel.org/ Link: https://lkml.kernel.org/r/20220526141912.794c2786@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 7 +++ kernel/trace/ftrace.c | 141 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 024d9797646e..b5ef474be858 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -9,6 +9,13 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +/* Ignore unused weak functions which will have non zero offsets */ +#ifdef CONFIG_HAVE_FENTRY +# include +/* Add offset for endbr64 if IBT enabled */ +# define FTRACE_MCOUNT_MAX_OFFSET ENDBR_INSN_SIZE +#endif + #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d653ef4febc5..c5088c76a108 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,6 +45,8 @@ #include "trace_output.h" #include "trace_stat.h" +#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" + #define FTRACE_WARN_ON(cond) \ ({ \ int ___r = cond; \ @@ -3654,6 +3656,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, seq_printf(m, " ->%pS", ptr); } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +/* + * Weak functions can still have an mcount/fentry that is saved in + * the __mcount_loc section. These can be detected by having a + * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the + * symbol found by kallsyms is not the function that the mcount/fentry + * is part of. The offset is much greater in these cases. + * + * Test the record to make sure that the ip points to a valid kallsyms + * and if not, mark it disabled. + */ +static int test_for_valid_rec(struct dyn_ftrace *rec) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned long offset; + const char *ret; + + ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str); + + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + rec->flags |= FTRACE_FL_DISABLED; + return 0; + } + return 1; +} + +static struct workqueue_struct *ftrace_check_wq __initdata; +static struct work_struct ftrace_check_work __initdata; + +/* + * Scan all the mcount/fentry entries to make sure they are valid. + */ +static __init void ftrace_check_work_func(struct work_struct *work) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + test_for_valid_rec(rec); + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static int __init ftrace_check_for_weak_functions(void) +{ + INIT_WORK(&ftrace_check_work, ftrace_check_work_func); + + ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0); + + queue_work(ftrace_check_wq, &ftrace_check_work); + return 0; +} + +static int __init ftrace_check_sync(void) +{ + /* Make sure the ftrace_check updates are finished */ + if (ftrace_check_wq) + destroy_workqueue(ftrace_check_wq); + return 0; +} + +late_initcall_sync(ftrace_check_sync); +subsys_initcall(ftrace_check_for_weak_functions); + +static int print_rec(struct seq_file *m, unsigned long ip) +{ + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(ip, NULL, &offset, &modname, str); + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld", + FTRACE_INVALID_FUNCTION, offset); + ret = NULL; + } + + seq_puts(m, str); + if (modname) + seq_printf(m, " [%s]", modname); + return ret == NULL ? -1 : 0; +} +#else +static inline int test_for_valid_rec(struct dyn_ftrace *rec) +{ + return 1; +} + +static inline int print_rec(struct seq_file *m, unsigned long ip) +{ + seq_printf(m, "%ps", (void *)ip); + return 0; +} +#endif + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -3678,7 +3779,13 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps", (void *)rec->ip); + if (print_rec(m, rec->ip)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); + seq_putc(m, '\n'); + return 0; + } + if (iter->flags & FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; @@ -3996,6 +4103,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, return 0; } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + unsigned long offset; + + kallsyms_lookup(ip, NULL, &offset, modname, str); + if (offset > FTRACE_MCOUNT_MAX_OFFSET) + return -1; + return 0; +} +#else +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + kallsyms_lookup(ip, NULL, NULL, modname, str); + return 0; +} +#endif + static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) @@ -4003,7 +4128,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, char str[KSYM_SYMBOL_LEN]; char *modname; - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + if (lookup_ip(rec->ip, &modname, str)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING && + !(rec->flags & FTRACE_FL_DISABLED)); + return 0; + } if (mod_g) { int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; @@ -6819,6 +6949,13 @@ void ftrace_module_enable(struct module *mod) !within_module_init(rec->ip, mod)) break; + /* Weak functions should still be ignored */ + if (!test_for_valid_rec(rec)) { + /* Clear all other flags. Should not be enabled anyway */ + rec->flags = FTRACE_FL_DISABLED; + continue; + } + cnt = 0; /* -- cgit v1.2.3 From ff979b2a9d9779382030023bfc4e3b1989c8c314 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 6 May 2022 11:27:37 +0800 Subject: ftrace/fgraph: fix increased missing-prototypes warnings After commit e999995c84c3 ("ftrace: cleanup ftrace_graph_caller enable and disable") merged into the linux-next tree, the kernel test robot (lkp@intel.com) has send out report that there are increased missing-prototypes warnings caused by that commit. COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 \ O=build_dir ARCH=sh SHELL=/bin/bash kernel/trace/ warning: no previous prototype for 'ftrace_enable_ftrace_graph_caller' [-Wmissing-prototypes] warning: no previous prototype for 'ftrace_disable_ftrace_graph_caller' [-Wmissing-prototypes] warning: no previous prototype for 'ftrace_return_to_handler' [-Wmissing-prototypes] warning: no previous prototype for 'ftrace_graph_sleep_time_control' [-Wmissing-prototypes] BTW there are so many missing-prototypes warnings if build kernel with "W=1". The increased warnings for 'ftrace_[enable,disable]_ftrace_graph_caller' is caused by CONFIG_FUNCTION_GRAPH_TRACER && !CONFIG_DYNAMIC_FTRACE, so the declarations in can't be seen in fgraph.c. And this warning can't reproduce on x86_64 since x86_64 select HAVE_FUNCTION_GRAPH_TRACER only when DYNAMIC_FTRACE, so fgraph.c will always see the declarations in . This patch fix the increased warnings by put the definitions in CONFIG_DYNAMIC_FTRACE although there are no real problems exist. Signed-off-by: Chengming Zhou Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20220506032737.23375-1-zhouchengming@bytedance.com Signed-off-by: Catalin Marinas --- kernel/trace/fgraph.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 289311680c29..2cd374294be7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -30,6 +30,7 @@ int ftrace_graph_active; /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; +#ifdef CONFIG_DYNAMIC_FTRACE /* * archs can override this function if they must do something * to enable hook for graph tracer. @@ -47,6 +48,7 @@ int __weak ftrace_disable_ftrace_graph_caller(void) { return 0; } +#endif /** * ftrace_graph_stop - set to permanently disable function graph tracing -- cgit v1.2.3 From fd58f7df2415ef747782e01f94880fefad1247cf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 May 2022 13:24:05 +0300 Subject: bpf: Use safer kvmalloc_array() where possible The kvmalloc_array() function is safer because it has a check for integer overflows. These sizes come from the user and I was not able to see any bounds checking so an integer overflow seems like a realistic concern. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Signed-off-by: Dan Carpenter Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/Yo9VRVMeHbALyjUH@kili Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 10b157a6d73e..7a13e6ac6327 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2263,11 +2263,11 @@ static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 int err = -ENOMEM; unsigned int i; - syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL); + syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); if (!syms) goto error; - buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL); + buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); if (!buf) goto error; @@ -2464,7 +2464,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; size = cnt * sizeof(*addrs); - addrs = kvmalloc(size, GFP_KERNEL); + addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; @@ -2489,7 +2489,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { - cookies = kvmalloc(size, GFP_KERNEL); + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; -- cgit v1.2.3 From eb1b2985fe5c5f02e43e4c0d47bbe7ed835007f3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:16 +0200 Subject: ftrace: Keep address offset in ftrace_lookup_symbols We want to store the resolved address on the same index as the symbol string, because that's the user (bpf kprobe link) code assumption. Also making sure we don't store duplicates that might be present in kallsyms. Acked-by: Song Liu Acked-by: Steven Rostedt (Google) Fixes: bed0d9a50dac ("ftrace: Add ftrace_lookup_symbols function") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/ftrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e750fe141a60..601ccf1b2f09 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name, struct module *mod, unsigned long addr) { struct kallsyms_data *args = data; + const char **sym; + int idx; - if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp)) + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) return 0; addr = ftrace_location(addr); if (!addr) return 0; - args->addrs[args->found++] = addr; + args->addrs[idx] = addr; + args->found++; return args->found == args->cnt ? 1 : 0; } @@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int err; + memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; args.cnt = cnt; -- cgit v1.2.3 From eb5fb0325698d05f0bf78d322de82c451a3685a2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:17 +0200 Subject: bpf: Force cookies array to follow symbols sorting When user specifies symbols and cookies for kprobe_multi link interface it's very likely the cookies will be misplaced and returned to wrong functions (via get_attach_cookie helper). The reason is that to resolve the provided functions we sort them before passing them to ftrace_lookup_symbols, but we do not do the same sort on the cookie values. Fixing this by using sort_r function with custom swap callback that swaps cookie values as well. Fixes: 0236fec57a15 ("bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 60 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a13e6ac6327..88589d74a892 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, kprobe_multi_link_prog_run(link, entry_ip, regs); } -static int symbols_cmp(const void *a, const void *b) +static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; @@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b) return strcmp(*str_a, *str_b); } +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; + +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; + + swap(*name_a, *name_b); + + /* If defined, swap also related cookies. */ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2468,38 +2490,46 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!addrs) return -ENOMEM; + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + if (uaddrs) { if (copy_from_user(addrs, uaddrs, size)) { err = -EFAULT; goto error; } } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; struct user_syms us; err = copy_user_syms(&us, usyms, cnt); if (err) goto error; - sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL); + if (cookies) + data.funcs = us.syms; + + sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + err = ftrace_lookup_symbols(us.syms, cnt, addrs); free_user_syms(&us); if (err) goto error; } - ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); - if (ucookies) { - cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); - if (!cookies) { - err = -ENOMEM; - goto error; - } - if (copy_from_user(cookies, ucookies, size)) { - err = -EFAULT; - goto error; - } - } - link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; -- cgit v1.2.3 From 5cf9c91ba927119fc6606b938b1895bb2459d3bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 09:48:25 +0200 Subject: block: serialize all debugfs operations using q->debugfs_mutex Various places like I/O schedulers or the QOS infrastructure try to register debugfs files on demans, which can race with creating and removing the main queue debugfs directory. Use the existing debugfs_mutex to serialize all debugfs operations that rely on q->debugfs_dir or the directories hanging off it. To make the teardown code a little simpler declare all debugfs dentry pointers and not just the main one uncoditionally in blkdev.h. Move debugfs_mutex next to the dentries that it protects and document what it is used for. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220614074827.458955-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 25 ++++++++++++++++++++----- block/blk-mq-debugfs.h | 5 ----- block/blk-mq-sched.c | 11 +++++++++++ block/blk-rq-qos.c | 2 ++ block/blk-rq-qos.h | 7 ++++++- block/blk-sysfs.c | 20 +++++++++----------- include/linux/blkdev.h | 8 ++++---- kernel/trace/blktrace.c | 3 --- 8 files changed, 52 insertions(+), 29 deletions(-) (limited to 'kernel/trace') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7e4136a60e1c..f0fcfe1387cb 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -711,11 +711,6 @@ void blk_mq_debugfs_register(struct request_queue *q) } } -void blk_mq_debugfs_unregister(struct request_queue *q) -{ - q->sched_debugfs_dir = NULL; -} - static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { @@ -746,6 +741,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; @@ -773,6 +770,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. @@ -790,6 +789,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) void blk_mq_debugfs_unregister_sched(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } @@ -811,6 +812,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { + lockdep_assert_held(&rqos->q->debugfs_mutex); + + if (!rqos->q->debugfs_dir) + return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; } @@ -820,6 +825,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) struct request_queue *q = rqos->q; const char *dir_name = rq_qos_id_to_name(rqos->id); + lockdep_assert_held(&q->debugfs_mutex); + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; @@ -835,6 +842,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->rqos_debugfs_dir); q->rqos_debugfs_dir = NULL; } @@ -844,6 +853,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs @@ -863,6 +874,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { + lockdep_assert_held(&hctx->queue->debugfs_mutex); + + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 69918f4170d6..771d45832878 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); void blk_mq_debugfs_register(struct request_queue *q); -void blk_mq_debugfs_unregister(struct request_queue *q); void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); @@ -42,10 +41,6 @@ static inline void blk_mq_debugfs_register(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister(struct request_queue *q) -{ -} - static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eb3c65a21362..a4f7c101b53b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -594,7 +594,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err_free_map_and_rqs; + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); + mutex_unlock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { @@ -607,7 +609,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } return 0; @@ -648,14 +652,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched_hctx(hctx); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } + + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index e83af7bc7591..249a6f05dd3b 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -294,7 +294,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_queue_rqos(q); + mutex_unlock(&q->debugfs_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 68267007da1c..0e46052b018a 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); - if (rqos->ops->debugfs_attrs) + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) @@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); } typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 88bd41d4cb59..6e4801b217a7 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -779,14 +779,13 @@ static void blk_release_queue(struct kobject *kobj) if (queue_is_mq(q)) blk_mq_release(q); - blk_trace_shutdown(q); mutex_lock(&q->debugfs_mutex); + blk_trace_shutdown(q); debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); - if (queue_is_mq(q)) - blk_mq_debugfs_unregister(q); - bioset_exit(&q->bio_split); if (blk_queue_has_srcu(q)) @@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + if (queue_is_mq(q)) + __blk_mq_register_dev(dev, q); + mutex_lock(&q->sysfs_lock); + mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); - mutex_unlock(&q->debugfs_mutex); - - if (queue_is_mq(q)) { - __blk_mq_register_dev(dev, q); + if (queue_is_mq(q)) blk_mq_debugfs_register(q); - } - - mutex_lock(&q->sysfs_lock); + mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk, NULL); if (ret) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb6e3c31b3b7..73c886eba8e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -482,7 +482,6 @@ struct request_queue { #endif /* CONFIG_BLK_DEV_ZONED */ int node; - struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif @@ -526,11 +525,12 @@ struct request_queue { struct bio_set bio_split; struct dentry *debugfs_dir; - -#ifdef CONFIG_BLK_DEBUG_FS struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; -#endif + /* + * Serializes all debugfs metadata operations using the above dentries. + */ + struct mutex debugfs_mutex; bool mq_sysfs_init_done; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 10a32b0f2deb..fe04c6f96ca5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -770,14 +770,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - - mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From c0f3bb4054ef036e5f67e27f2e3cad9e6512cf00 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 8 Jun 2022 01:11:12 +0900 Subject: rethook: Reject getting a rethook if RCU is not watching Since the rethook_recycle() will involve the call_rcu() for reclaiming the rethook_instance, the rethook must be set up at the RCU available context (non idle). This rethook_recycle() in the rethook trampoline handler is inevitable, thus the RCU available check must be done before setting the rethook trampoline. This adds a rcu_is_watching() check in the rethook_try_get() so that it will return NULL if it is called when !rcu_is_watching(). Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Daniel Borkmann Acked-by: Steven Rostedt (Google) Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/165461827269.280167.7379263615545598958.stgit@devnote2 --- kernel/trace/rethook.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index b56833700d23..c69d82273ce7 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + fn = freelist_try_get(&rh->pool); if (!fn) return NULL; -- cgit v1.2.3 From cc72b72073ac982a954d3b43519ca1c28f03c27c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 28 May 2022 00:55:39 +0900 Subject: tracing/kprobes: Check whether get_kretprobe() returns NULL in kretprobe_dispatcher() There is a small chance that get_kretprobe(ri) returns NULL in kretprobe_dispatcher() when another CPU unregisters the kretprobe right after __kretprobe_trampoline_handler(). To avoid this issue, kretprobe_dispatcher() checks the get_kretprobe() return value again. And if it is NULL, it returns soon because that kretprobe is under unregistering process. This issue has been introduced when the kretprobe is decoupled from the struct kretprobe_instance by commit d741bf41d7c7 ("kprobes: Remove kretprobe hash"). Before that commit, the struct kretprob_instance::rp directly points the kretprobe and it is never be NULL. Link: https://lkml.kernel.org/r/165366693881.797669.16926184644089588731.stgit@devnote2 Reported-by: Yonghong Song Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") Cc: Peter Zijlstra Cc: Ingo Molnar Cc: bpf Cc: Kernel Team Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_kprobe.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 93507330462c..a245ea673715 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1718,8 +1718,17 @@ static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); - struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return 0; + tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) -- cgit v1.2.3 From f4b0d318097e45cbac5e14976f8bb56aa2cef504 Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 2 Jun 2022 22:06:13 +0800 Subject: tracing: Simplify conditional compilation code in tracing_set_tracer() Two conditional compilation directives "#ifdef CONFIG_TRACER_MAX_TRACE" are used consecutively, and no other code in between. Simplify conditional the compilation code and only use one "#ifdef CONFIG_TRACER_MAX_TRACE". Link: https://lkml.kernel.org/r/20220602140613.545069-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c95992e2c71..a8cfac0611bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6424,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) synchronize_rcu(); free_snapshot(tr); } -#endif -#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) -- cgit v1.2.3 From 12c3e0c92fd7cb3d3b698d84fdde7dccb6ba8822 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Sun, 12 Jun 2022 07:42:32 -0700 Subject: tracing/uprobes: Remove unwanted initialization in __trace_uprobe_create() Remove the unwanted initialization of variable 'ret'. This fixes the clang scan warning: Value stored to 'ret' is never read [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220612144232.145209-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_uprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..c3dc4f859a6b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -546,7 +546,6 @@ static int __trace_uprobe_create(int argc, const char **argv) bool is_return = false; int i, ret; - ret = 0; ref_ctr_offset = 0; switch (argv[0][0]) { -- cgit v1.2.3