From 8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:46 +0000 Subject: bpf: implement sleepable uprobes by chaining gps uprobes work by raising a trap, setting a task flag from within the interrupt handler, and processing the actual work for the uprobe on the way back to userspace. As a result, uprobe handlers already execute in a might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe programs is therefore on the bpf side. Namely, the bpf_prog_array attached to the uprobe is protected by normal rcu. In order for uprobe bpf programs to become sleepable, it has to be protected by the tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period). Therefore, the free path for bpf_prog_array now chains a tasks_trace and normal grace periods one after the other. Users who iterate under tasks_trace read section would be safe, as would users who iterate under normal read sections (from non-sleepable locations). The downside is that the tasks_trace latency affects all perf_event-attached bpf programs (and not just uprobe ones). This is deemed safe given the possible attach rates for kprobe/uprobe/tp programs. Separately, non-sleepable programs need access to dynamically sized rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes an rcu read section, in addition to the overarching tasks_trace section. Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 4 ++-- kernel/trace/trace_uprobe.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 10b157a6d73e..d1c22594dbf9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); @@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); } bpf_prog_put(event->prog); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..0282c119b1b2 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_probe.h" @@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (bpf_prog_array_valid(call)) { u32 ret; - preempt_disable(); - ret = trace_call_bpf(call, regs); - preempt_enable(); + ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run); if (!ret) return; } -- cgit v1.2.3 From aca80dd95e20f1fa0daa212afc83c9fa0ad239e5 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Mon, 20 Jun 2022 21:47:55 +0000 Subject: uprobe: gate bpf call behind BPF_EVENTS The call into bpf from uprobes needs to be gated now that it doesn't use the trace_events.h helpers. Randy found this as a randconfig build failure on linux-next [1]. [1]: https://lore.kernel.org/linux-next/2de99180-7d55-2fdf-134d-33198c27cc58@infradead.org/ Reported-by: Randy Dunlap Signed-off-by: Delyan Kratunov Tested-by: Randy Dunlap Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/cb8bfbbcde87ed5d811227a393ef4925f2aadb7b.camel@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/trace_uprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0282c119b1b2..326235fd2346 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1344,6 +1344,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; +#ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { u32 ret; @@ -1351,6 +1352,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (!ret) return; } +#endif /* CONFIG_BPF_EVENTS */ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); -- cgit v1.2.3 From cc5c516df028b221d94c65c47c5ae8d20f61b6f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:45 +0200 Subject: block: simplify blktrace sysfs attribute creation Add the trace attributes to the default gendisk attributes, just like we already do for partitions. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 11 +---------- block/blk.h | 2 ++ block/genhd.c | 3 +++ block/partitions/core.c | 1 - include/linux/blktrace_api.h | 10 ---------- kernel/trace/blktrace.c | 11 ----------- 6 files changed, 6 insertions(+), 32 deletions(-) (limited to 'kernel/trace') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9b211e519de8..5f3f73115988 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -810,21 +810,14 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - ret = blk_trace_init_sysfs(dev); - if (ret) - return ret; - mutex_lock(&q->sysfs_dir_lock); ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) { - blk_trace_remove_sysfs(dev); + if (ret < 0) goto unlock; - } ret = sysfs_create_group(&q->kobj, &queue_attr_group); if (ret) { - blk_trace_remove_sysfs(dev); kobject_del(&q->kobj); kobject_put(&dev->kobj); goto unlock; @@ -890,7 +883,6 @@ put_dev: mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; @@ -931,7 +923,6 @@ void blk_unregister_queue(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); blk_crypto_sysfs_unregister(q); - blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); diff --git a/block/blk.h b/block/blk.h index 1a0d3e6a4a63..74d59435870c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -452,6 +452,8 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; +extern struct attribute_group blk_trace_attr_group; + long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/genhd.c b/block/genhd.c index bf9be06af2c8..b1fb7e058b9c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1134,6 +1134,9 @@ static struct attribute_group disk_attr_group = { static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/block/partitions/core.c b/block/partitions/core.c index 8a0ec929023b..7dc487f5b03c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include "check.h" diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 623e22492afa..f6f9b544365a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -77,10 +77,6 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); -extern void blk_trace_remove_sysfs(struct device *dev); -extern int blk_trace_init_sysfs(struct device *dev); - -extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) @@ -91,13 +87,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) # define blk_add_cgroup_trace_msg(q, cg, fmt, ...) do { } while (0) -# define blk_trace_remove_sysfs(dev) do { } while (0) # define blk_trace_note_message_enabled(q) (false) -static inline int blk_trace_init_sysfs(struct device *dev) -{ - return 0; -} - #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_COMPAT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fe04c6f96ca5..c584effe5fe9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1867,17 +1867,6 @@ out_unlock_bdev: out: return ret ? ret : count; } - -int blk_trace_init_sysfs(struct device *dev) -{ - return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); -} - -void blk_trace_remove_sysfs(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); -} - #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING -- cgit v1.2.3 From 6f0e6c1598b1a3d19fc30db86b6e26d6f881b43d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:26 +0200 Subject: context_tracking: Take IRQ eqs entrypoints over RCU The RCU dynticks counter is going to be merged into the context tracking subsystem. Prepare with moving the IRQ extended quiescent states entrypoints to context tracking. For now those are dumb redirection to existing RCU calls. [ paulmck: Apply Stephen Rothwell feedback from -next. ] [ paulmck: Apply Nathan Chancellor feedback. ] Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- .../RCU/Design/Requirements/Requirements.rst | 10 ++++----- Documentation/RCU/stallwarn.rst | 4 ++-- arch/Kconfig | 2 +- arch/arm64/kernel/entry-common.c | 6 +++--- arch/x86/mm/fault.c | 2 +- drivers/cpuidle/cpuidle-psci.c | 8 ++++---- drivers/cpuidle/cpuidle-riscv-sbi.c | 8 ++++---- include/linux/context_tracking_irq.h | 17 +++++++++++++++ include/linux/context_tracking_state.h | 1 + include/linux/entry-common.h | 10 ++++----- include/linux/rcupdate.h | 5 +++-- include/linux/tracepoint.h | 4 ++-- kernel/cfi.c | 4 ++-- kernel/context_tracking.c | 24 ++++++++++++++++++++-- kernel/cpu_pm.c | 8 ++++---- kernel/entry/common.c | 12 +++++------ kernel/softirq.c | 4 ++-- kernel/trace/trace.c | 6 +++--- 18 files changed, 87 insertions(+), 48 deletions(-) create mode 100644 include/linux/context_tracking_irq.h (limited to 'kernel/trace') diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 04ed8bf27a0e..074810c73936 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -1844,10 +1844,10 @@ that meets this requirement. Furthermore, NMI handlers can be interrupted by what appear to RCU to be normal interrupts. One way that this can happen is for code that -directly invokes rcu_irq_enter() and rcu_irq_exit() to be called +directly invokes ct_irq_enter() and ct_irq_exit() to be called from an NMI handler. This astonishing fact of life prompted the current -code structure, which has rcu_irq_enter() invoking -rcu_nmi_enter() and rcu_irq_exit() invoking rcu_nmi_exit(). +code structure, which has ct_irq_enter() invoking +rcu_nmi_enter() and ct_irq_exit() invoking rcu_nmi_exit(). And yes, I also learned of this requirement the hard way. Loadable Modules @@ -2195,7 +2195,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be: sections, and RCU believes this CPU to be idle, no problem. This sort of thing is used by some architectures for light-weight exception handlers, which can then avoid the overhead of - rcu_irq_enter() and rcu_irq_exit() at exception entry and + ct_irq_enter() and ct_irq_exit() at exception entry and exit, respectively. Some go further and avoid the entireties of irq_enter() and irq_exit(). Just make very sure you are running some of your tests with @@ -2226,7 +2226,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be: +-----------------------------------------------------------------------+ | **Answer**: | +-----------------------------------------------------------------------+ -| One approach is to do ``rcu_irq_exit();rcu_irq_enter();`` every so | +| One approach is to do ``ct_irq_exit();ct_irq_enter();`` every so | | often. But given that long-running interrupt handlers can cause other | | problems, not least for response time, shouldn't you work to keep | | your interrupt handler's runtime within reasonable bounds? | diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index b95bda7755fa..ce1f58a9d954 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -98,11 +98,11 @@ warnings: - A low-level kernel issue that either fails to invoke one of the variants of rcu_user_enter(), rcu_user_exit(), ct_idle_enter(), - ct_idle_exit(), rcu_irq_enter(), or rcu_irq_exit() on the one + ct_idle_exit(), ct_irq_enter(), or ct_irq_exit() on the one hand, or that invokes one of them too many times on the other. Historically, the most frequent issue has been an omission of either irq_enter() or irq_exit(), which in turn invoke - rcu_irq_enter() or rcu_irq_exit(), respectively. Building your + ct_irq_enter() or ct_irq_exit(), respectively. Building your kernel with CONFIG_RCU_EQS_DEBUG=y can help track down these types of issues, which sometimes arise in architecture-specific code. diff --git a/arch/Kconfig b/arch/Kconfig index 154b7b78da09..342642be105f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -782,7 +782,7 @@ config HAVE_CONTEXT_TRACKING_USER Syscalls need to be wrapped inside user_exit()-user_enter(), either optimized behind static key or through the slow path using TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs are already - protected inside rcu_irq_enter/rcu_irq_exit() but preemption or signal + protected inside ct_irq_enter/ct_irq_exit() but preemption or signal handling on irq exit still need to be protected. config HAVE_CONTEXT_TRACKING_USER_OFFSTACK diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 56cefd33eb8e..8dabe9ec10f1 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -41,7 +41,7 @@ static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); + ct_irq_enter(); trace_hardirqs_off_finish(); regs->exit_rcu = true; @@ -76,7 +76,7 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) if (regs->exit_rcu) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); - rcu_irq_exit(); + ct_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); return; } @@ -84,7 +84,7 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) trace_hardirqs_on(); } else { if (regs->exit_rcu) - rcu_irq_exit(); + ct_irq_exit(); } } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fad8faa29d04..971977c438fc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1526,7 +1526,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) /* * Entry handling for valid #PF from kernel mode is slightly - * different: RCU is already watching and rcu_irq_enter() must not + * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index 540105ca0781..57bc3e3ae391 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -69,12 +69,12 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev, return -1; /* Do runtime PM to manage a hierarchical CPU toplogy. */ - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); if (s2idle) dev_pm_genpd_suspend(pd_dev); else pm_runtime_put_sync_suspend(pd_dev); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); state = psci_get_domain_state(); if (!state) @@ -82,12 +82,12 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev, ret = psci_cpu_suspend_enter(state) ? -1 : idx; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); if (s2idle) dev_pm_genpd_resume(pd_dev); else pm_runtime_get_sync(pd_dev); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); cpu_pm_exit(); diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index 1151e5e2ba82..862a2876f1c9 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -116,12 +116,12 @@ static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev, return -1; /* Do runtime PM to manage a hierarchical CPU toplogy. */ - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); if (s2idle) dev_pm_genpd_suspend(pd_dev); else pm_runtime_put_sync_suspend(pd_dev); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); if (sbi_is_domain_state_available()) state = sbi_get_domain_state(); @@ -130,12 +130,12 @@ static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev, ret = sbi_suspend(state) ? -1 : idx; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); if (s2idle) dev_pm_genpd_resume(pd_dev); else pm_runtime_get_sync(pd_dev); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); cpu_pm_exit(); diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h new file mode 100644 index 000000000000..62f62bbd1a50 --- /dev/null +++ b/include/linux/context_tracking_irq.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CONTEXT_TRACKING_IRQ_H +#define _LINUX_CONTEXT_TRACKING_IRQ_H + +#ifdef CONFIG_CONTEXT_TRACKING_IDLE +void ct_irq_enter(void); +void ct_irq_exit(void); +void ct_irq_enter_irqson(void); +void ct_irq_exit_irqson(void); +#else +static inline void ct_irq_enter(void) { } +static inline void ct_irq_exit(void) { } +static inline void ct_irq_enter_irqson(void) { } +static inline void ct_irq_exit_irqson(void) { } +#endif + +#endif diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 2b46afe105a9..9c16a8b2c194 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -4,6 +4,7 @@ #include #include +#include struct context_tracking { /* diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c92ac75d6556..84a466b176cf 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -357,7 +357,7 @@ void irqentry_exit_to_user_mode(struct pt_regs *regs); /** * struct irqentry_state - Opaque object for exception state storage * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the - * exit path has to invoke rcu_irq_exit(). + * exit path has to invoke ct_irq_exit(). * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that * lockdep state is restored correctly on exit from nmi. * @@ -395,12 +395,12 @@ typedef struct irqentry_state { * * For kernel mode entries RCU handling is done conditional. If RCU is * watching then the only RCU requirement is to check whether the tick has - * to be restarted. If RCU is not watching then rcu_irq_enter() has to be - * invoked on entry and rcu_irq_exit() on exit. + * to be restarted. If RCU is not watching then ct_irq_enter() has to be + * invoked on entry and ct_irq_exit() on exit. * - * Avoiding the rcu_irq_enter/exit() calls is an optimization but also + * Avoiding the ct_irq_enter/exit() calls is an optimization but also * solves the problem of kernel mode pagefaults which can schedule, which - * is not possible after invoking rcu_irq_enter() without undoing it. + * is not possible after invoking ct_irq_enter() without undoing it. * * For user mode entries irqentry_enter_from_user_mode() is invoked to * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6ebe754501c3..f1562d91c67d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -29,6 +29,7 @@ #include #include #include +#include #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) @@ -143,9 +144,9 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { } */ #define RCU_NONIDLE(a) \ do { \ - rcu_irq_enter_irqson(); \ + ct_irq_enter_irqson(); \ do { a; } while (0); \ - rcu_irq_exit_irqson(); \ + ct_irq_exit_irqson(); \ } while (0) /* diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 28031b15f878..55717a2eda08 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -200,13 +200,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) */ \ if (rcuidle) { \ __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ - rcu_irq_enter_irqson(); \ + ct_irq_enter_irqson(); \ } \ \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ if (rcuidle) { \ - rcu_irq_exit_irqson(); \ + ct_irq_exit_irqson(); \ srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ } \ \ diff --git a/kernel/cfi.c b/kernel/cfi.c index 08102d19ec15..2046276ee234 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -295,7 +295,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) rcu_idle = !rcu_is_watching(); if (rcu_idle) { local_irq_save(flags); - rcu_irq_enter(); + ct_irq_enter(); } if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) @@ -304,7 +304,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) fn = find_module_check_fn(ptr); if (rcu_idle) { - rcu_irq_exit(); + ct_irq_exit(); local_irq_restore(flags); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index c0b3798d4e94..72bd71a02c44 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -35,6 +35,26 @@ void ct_idle_exit(void) rcu_idle_exit(); } EXPORT_SYMBOL_GPL(ct_idle_exit); + +noinstr void ct_irq_enter(void) +{ + rcu_irq_enter(); +} + +noinstr void ct_irq_exit(void) +{ + rcu_irq_exit(); +} + +void ct_irq_enter_irqson(void) +{ + rcu_irq_enter_irqson(); +} + +void ct_irq_exit_irqson(void) +{ + rcu_irq_exit_irqson(); +} #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER @@ -90,7 +110,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be * any RCU read-side critical section until the next call to - * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * user_exit() or ct_irq_enter(). Let's remove RCU's dependency * on the tick. */ if (state == CONTEXT_USER) { @@ -136,7 +156,7 @@ void ct_user_enter(enum ctx_state state) /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * ct_irq_enter() rcu_user_exit() rcu_user_exit() ct_irq_exit() * This would mess up the dyntick_nesting count though. And rcu_irq_*() * helpers are enough to protect RCU uses inside the exception. So * just return immediately if we detect we are in an IRQ. diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 246efc74e3f3..ba4ba71facf9 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -35,11 +35,11 @@ static int cpu_pm_notify(enum cpu_pm_event event) * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know * this. */ - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +49,11 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev unsigned long flags; int ret; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); return notifier_to_errno(ret); } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 032f164abe7c..667ba5d581ff 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -321,7 +321,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) } /* - * If this entry hit the idle task invoke rcu_irq_enter() whether + * If this entry hit the idle task invoke ct_irq_enter() whether * RCU is watching or not. * * Interrupts can nest when the first interrupt invokes softirq @@ -332,12 +332,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * not nested into another interrupt. * * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * interrupt to invoke ct_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interrupt and eventually claim * quiescent state and end grace periods prematurely. * - * Unconditionally invoke rcu_irq_enter() so RCU state stays + * Unconditionally invoke ct_irq_enter() so RCU state stays * consistent. * * TINY_RCU does not support EQS, so let the compiler eliminate @@ -350,7 +350,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * as in irqentry_enter_from_user_mode(). */ lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); + ct_irq_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -418,7 +418,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); - rcu_irq_exit(); + ct_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); return; } @@ -436,7 +436,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) * was not watching on entry. */ if (state.exit_rcu) - rcu_irq_exit(); + ct_irq_exit(); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 9f0aef8aa9ff..c8a6913c067d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -620,7 +620,7 @@ void irq_enter_rcu(void) */ void irq_enter(void) { - rcu_irq_enter(); + ct_irq_enter(); irq_enter_rcu(); } @@ -672,7 +672,7 @@ void irq_exit_rcu(void) void irq_exit(void) { __irq_exit_rcu(); - rcu_irq_exit(); + ct_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c95992e2c71..fe78a6818126 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3107,15 +3107,15 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, /* * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI - * triggered someplace critical, and rcu_irq_enter() should + * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. */ if (unlikely(in_nmi())) return; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); } /** -- cgit v1.2.3 From 493c1822825f00025d6754ec0632990a27edc6f8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2022 16:40:27 +0200 Subject: context_tracking: Take NMI eqs entrypoints over RCU The RCU dynticks counter is going to be merged into the context tracking subsystem. Prepare with moving the NMI extended quiescent states entrypoints to context tracking. For now those are dumb redirection to existing RCU calls. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Nicolas Saenz Julienne Cc: Marcelo Tosatti Cc: Xiongfeng Wang Cc: Yu Liao Cc: Phil Auld Cc: Paul Gortmaker Cc: Alex Belits Signed-off-by: Paul E. McKenney Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne --- Documentation/RCU/Design/Requirements/Requirements.rst | 2 +- arch/Kconfig | 2 +- arch/arm64/kernel/entry-common.c | 8 ++++---- include/linux/context_tracking_irq.h | 4 ++++ include/linux/hardirq.h | 4 ++-- kernel/context_tracking.c | 10 ++++++++++ kernel/entry/common.c | 4 ++-- kernel/extable.c | 4 ++-- kernel/trace/trace.c | 2 +- 9 files changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 074810c73936..a0f8164c8513 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -1847,7 +1847,7 @@ normal interrupts. One way that this can happen is for code that directly invokes ct_irq_enter() and ct_irq_exit() to be called from an NMI handler. This astonishing fact of life prompted the current code structure, which has ct_irq_enter() invoking -rcu_nmi_enter() and ct_irq_exit() invoking rcu_nmi_exit(). +ct_nmi_enter() and ct_irq_exit() invoking ct_nmi_exit(). And yes, I also learned of this requirement the hard way. Loadable Modules diff --git a/arch/Kconfig b/arch/Kconfig index 342642be105f..f56f7c0e924d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -797,7 +797,7 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK - Critical entry code isn't preemptible (or better yet: not interruptible). - - No use of RCU read side critical sections, unless rcu_nmi_enter() + - No use of RCU read side critical sections, unless ct_nmi_enter() got called. - No use of instrumentation, unless instrumentation_begin() got called. diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 8dabe9ec10f1..c75ca36b4a49 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -161,7 +161,7 @@ static void noinstr arm64_enter_nmi(struct pt_regs *regs) __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); lockdep_hardirq_enter(); - rcu_nmi_enter(); + ct_nmi_enter(); trace_hardirqs_off_finish(); ftrace_nmi_enter(); @@ -182,7 +182,7 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) lockdep_hardirqs_on_prepare(); } - rcu_nmi_exit(); + ct_nmi_exit(); lockdep_hardirq_exit(); if (restore) lockdep_hardirqs_on(CALLER_ADDR0); @@ -199,7 +199,7 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); lockdep_hardirqs_off(CALLER_ADDR0); - rcu_nmi_enter(); + ct_nmi_enter(); trace_hardirqs_off_finish(); } @@ -218,7 +218,7 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) lockdep_hardirqs_on_prepare(); } - rcu_nmi_exit(); + ct_nmi_exit(); if (restore) lockdep_hardirqs_on(CALLER_ADDR0); } diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h index 62f62bbd1a50..c50b5670c4a5 100644 --- a/include/linux/context_tracking_irq.h +++ b/include/linux/context_tracking_irq.h @@ -7,11 +7,15 @@ void ct_irq_enter(void); void ct_irq_exit(void); void ct_irq_enter_irqson(void); void ct_irq_exit_irqson(void); +void ct_nmi_enter(void); +void ct_nmi_exit(void); #else static inline void ct_irq_enter(void) { } static inline void ct_irq_exit(void) { } static inline void ct_irq_enter_irqson(void) { } static inline void ct_irq_exit_irqson(void) { } +static inline void ct_nmi_enter(void) { } +static inline void ct_nmi_exit(void) { } #endif #endif diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 76878b357ffa..345cdbe9c1b7 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -124,7 +124,7 @@ extern void rcu_nmi_exit(void); do { \ __nmi_enter(); \ lockdep_hardirq_enter(); \ - rcu_nmi_enter(); \ + ct_nmi_enter(); \ instrumentation_begin(); \ ftrace_nmi_enter(); \ instrumentation_end(); \ @@ -143,7 +143,7 @@ extern void rcu_nmi_exit(void); instrumentation_begin(); \ ftrace_nmi_exit(); \ instrumentation_end(); \ - rcu_nmi_exit(); \ + ct_nmi_exit(); \ lockdep_hardirq_exit(); \ __nmi_exit(); \ } while (0) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 72bd71a02c44..b8a731f20778 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -55,6 +55,16 @@ void ct_irq_exit_irqson(void) { rcu_irq_exit_irqson(); } + +noinstr void ct_nmi_enter(void) +{ + rcu_nmi_enter(); +} + +noinstr void ct_nmi_exit(void) +{ + rcu_nmi_exit(); +} #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 667ba5d581ff..063068a9ea9b 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -449,7 +449,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); lockdep_hardirq_enter(); - rcu_nmi_enter(); + ct_nmi_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); @@ -469,7 +469,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) } instrumentation_end(); - rcu_nmi_exit(); + ct_nmi_exit(); lockdep_hardirq_exit(); if (irq_state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); diff --git a/kernel/extable.c b/kernel/extable.c index bda5e9761541..71f482581cab 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -114,7 +114,7 @@ int kernel_text_address(unsigned long addr) /* Treat this like an NMI as it can happen anywhere */ if (no_rcu) - rcu_nmi_enter(); + ct_nmi_enter(); if (is_module_text_address(addr)) goto out; @@ -127,7 +127,7 @@ int kernel_text_address(unsigned long addr) ret = 0; out: if (no_rcu) - rcu_nmi_exit(); + ct_nmi_exit(); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fe78a6818126..5fc7f17f5ec7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3105,7 +3105,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } /* - * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), + * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. -- cgit v1.2.3 From 94c255ac676fa922df3498608ead42b0a0c85122 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Mon, 6 Jun 2022 10:30:07 +0800 Subject: tracing/user_events: Fix syntax errors in comments Delete the redundant word 'have'. Link: https://lkml.kernel.org/r/20220606023007.23377-1-wangxiang@cdjrlc.com Signed-off-by: Xiang wangx Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 706e1686b5eb..a6621c52ce45 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -567,7 +567,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) * to allow user_event files to be less locked down. The extreme case * being "other" has read/write access to user_events_data/status. * - * When not locked down, processes may not have have permissions to + * When not locked down, processes may not have permissions to * add/remove calls themselves to tracefs. We need to temporarily * switch to root file permission to allow for this scenario. */ -- cgit v1.2.3 From fb991f1942334b0cbf6aa6a88faa586ba22d3550 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 30 Jun 2022 09:31:52 +0800 Subject: tracing/histograms: Simplify create_hist_fields() When I look into implements of create_hist_fields(), I think there can be following two simplifications: 1. If something wrong happened in parse_var_defs(), free_var_defs() would have been called in it, so no need goto free again after calling it; 2. After calling create_key_fields(), regardless of the value of 'ret', it then always runs into 'out: ', so the judge of 'ret' is redundant. Link: https://lkml.kernel.org/r/20220630013152.164871-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Reviewed-by: Tom Rix Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e87a46794079..fdf784620c28 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4455,7 +4455,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, ret = parse_var_defs(hist_data); if (ret) - goto out; + return ret; ret = create_val_fields(hist_data, file); if (ret) @@ -4466,8 +4466,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, goto out; ret = create_key_fields(hist_data, file); - if (ret) - goto out; + out: free_var_defs(hist_data); -- cgit v1.2.3 From 900d156bac2bc474cf7c7bee4efbc6c83ec5ae58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Jul 2022 07:53:17 +0200 Subject: block: remove bdevname Replace the remaining calls of bdevname with snprintf using the %pg format specifier. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220713055317.1888500-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 23 ----------------------- drivers/md/md.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- fs/ext4/mmp.c | 3 ++- fs/jbd2/journal.c | 6 ++++-- include/linux/blkdev.h | 1 - kernel/trace/blktrace.c | 4 ++-- 8 files changed, 11 insertions(+), 32 deletions(-) (limited to 'kernel/trace') diff --git a/block/genhd.c b/block/genhd.c index 9d30f159c59a..44dfcf67ed96 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -101,29 +101,6 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) } EXPORT_SYMBOL_GPL(set_capacity_and_notify); -/* - * Format the device name of the indicated block device into the supplied buffer - * and return a pointer to that same buffer for convenience. - * - * Note: do not use this in new code, use the %pg specifier to sprintf and - * printk insted. - */ -const char *bdevname(struct block_device *bdev, char *buf) -{ - struct gendisk *hd = bdev->bd_disk; - int partno = bdev->bd_partno; - - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); - - return buf; -} -EXPORT_SYMBOL(bdevname); - static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 076255ec9ba1..4be9d8173071 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2438,7 +2438,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) mdname(mddev), mddev->max_disks); return -EBUSY; } - bdevname(rdev->bdev,b); + snprintf(b, sizeof(b), "%pg", rdev->bdev); strreplace(b, '/', '!'); rdev->mddev = mddev; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 258d4eb2d63c..65cd90f0b2a8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1240,7 +1240,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); if (rdev) - bdevname(rdev->bdev, b); + snprintf(b, sizeof(b), "%pg", rdev->bdev); else strcpy(b, "???"); rcu_read_unlock(); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d589f823feb1..a7dcb1bf6b0a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1164,7 +1164,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, disk = r10_bio->devs[slot].devnum; err_rdev = rcu_dereference(conf->mirrors[disk].rdev); if (err_rdev) - bdevname(err_rdev->bdev, b); + snprintf(b, sizeof(b), "%pg", err_rdev->bdev); else { strcpy(b, "???"); /* This never gets dereferenced */ diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index b7a850b0070b..b221f313ded6 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -371,7 +371,8 @@ skip: EXT4_SB(sb)->s_mmp_bh = bh; BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); - bdevname(bh->b_bdev, mmp->mmp_bdevname); + snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname), + "%pg", bh->b_bdev); /* * Start a kernel thread to update the MMP block periodically. diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0cbeeaec2d1..9015f5fa2862 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1465,7 +1465,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, if (!journal) return NULL; - bdevname(journal->j_dev, journal->j_devname); + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); @@ -1507,7 +1508,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return NULL; journal->j_inode = inode; - bdevname(journal->j_dev, journal->j_devname); + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); jbd2_stats_proc_init(journal); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 22c477fadc0f..2775763c51b9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1457,7 +1457,6 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) int bdev_read_only(struct block_device *bdev); int set_blocksize(struct block_device *bdev, int size); -const char *bdevname(struct block_device *bdev, char *buffer); int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c584effe5fe9..4752bda1b1a0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -736,12 +736,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: - bdevname(bdev, b); + snprintf(b, sizeof(b), "%pg", bdev); ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: - bdevname(bdev, b); + snprintf(b, sizeof(b), "%pg", bdev); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #endif -- cgit v1.2.3 From 22c80aac882f712897b88b7ea8f5a74ea19019df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:36 -0700 Subject: blktrace: Trace remapped requests correctly Trace the remapped operation and its flags instead of only the data direction of remapped operations. This issue was detected by analyzing the warnings reported by sparse related to the new blk_opf_t type. Reviewed-by: Jun'ichi Nomura Cc: Mike Snitzer Cc: Mike Christie Cc: Li Zefan Cc: Chaitanya Kulkarni Fixes: 1b9a9ab78b0a ("blktrace: use op accessors") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-11-bvanassche@acm.org Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4752bda1b1a0..4327b51da403 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1058,7 +1058,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, 0, + req_op(rq), rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } -- cgit v1.2.3 From 919dbca8670d0f7828dfbb2f9b434ac22dca8d2e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:37 -0700 Subject: blktrace: Use the new blk_opf_t type Improve static type checking by using the new blk_opf_t type for a function argument that represents a combination of a request operation and request flags. Rename that argument from 'op' into 'opf' to make its role more clear. Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Li Zefan Cc: Chaitanya Kulkarni Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-12-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 3 ++- kernel/trace/blktrace.c | 51 ++++++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index f6f9b544365a..cfbda114348c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -7,6 +7,7 @@ #include #include #include +#include #if defined(CONFIG_BLK_DEV_IO_TRACE) @@ -105,7 +106,7 @@ struct compat_blk_user_trace_setup { #endif -void blk_fill_rwbs(char *rwbs, unsigned int op); +void blk_fill_rwbs(char *rwbs, blk_opf_t opf); static inline sector_t blk_rq_trace_sector(struct request *rq) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4327b51da403..150058f5daa9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), #define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ +#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* @@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, u64 cgid) + const blk_opf_t opf, u32 what, int error, + int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -227,16 +227,17 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + const enum req_op op = opf & REQ_OP_MASK; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[op_is_write(op) ? WRITE : READ]; - what |= MASK_TC_BIT(op_flags, SYNC); - what |= MASK_TC_BIT(op_flags, RAHEAD); - what |= MASK_TC_BIT(op_flags, META); - what |= MASK_TC_BIT(op_flags, PREFLUSH); - what |= MASK_TC_BIT(op_flags, FUA); + what |= MASK_TC_BIT(opf, SYNC); + what |= MASK_TC_BIT(opf, RAHEAD); + what |= MASK_TC_BIT(opf, META); + what |= MASK_TC_BIT(opf, PREFLUSH); + what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) @@ -842,9 +843,8 @@ static void blk_add_trace_rq(struct request *rq, blk_status_t error, else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, blk_status_to_errno(error), 0, - NULL, cgid); + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, + what, blk_status_to_errno(error), 0, NULL, cgid); rcu_read_unlock(); } @@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL, + bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } @@ -949,7 +949,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); rcu_read_unlock(); } @@ -969,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } rcu_read_unlock(); } @@ -985,8 +985,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, + bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, blk_status_to_errno(bio->bi_status), sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); @@ -1022,7 +1021,7 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, + bio->bi_opf, BLK_TA_REMAP, blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); @@ -1058,7 +1057,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - req_op(rq), rq->cmd_flags, BLK_TA_REMAP, 0, + rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } @@ -1084,7 +1083,7 @@ void blk_add_driver_data(struct request *rq, void *data, size_t len) return; } - __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); @@ -1881,14 +1880,14 @@ out: * caller with resulting string. * **/ -void blk_fill_rwbs(char *rwbs, unsigned int op) +void blk_fill_rwbs(char *rwbs, blk_opf_t opf) { int i = 0; - if (op & REQ_PREFLUSH) + if (opf & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op & REQ_OP_MASK) { + switch (opf & REQ_OP_MASK) { case REQ_OP_WRITE: rwbs[i++] = 'W'; break; @@ -1909,13 +1908,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op) rwbs[i++] = 'N'; } - if (op & REQ_FUA) + if (opf & REQ_FUA) rwbs[i++] = 'F'; - if (op & REQ_RAHEAD) + if (opf & REQ_RAHEAD) rwbs[i++] = 'A'; - if (op & REQ_SYNC) + if (opf & REQ_SYNC) rwbs[i++] = 'S'; - if (op & REQ_META) + if (opf & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit v1.2.3 From 020e3618cc81abf11fe6bffaac27861ff94707ce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jul 2022 11:47:35 -0700 Subject: blktrace: Fix the blk_fill_rwbs() kernel-doc header Reflect recent changes in the blk_fill_rwbs() kernel-doc header. Reported-by: Stephen Rothwell Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Li Zefan Cc: Chaitanya Kulkarni Cc: Stephen Rothwell Fixes: 919dbca8670d ("blktrace: Use the new blk_opf_t type") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220715184735.2326034-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 150058f5daa9..7f5eb295fe19 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1873,11 +1873,11 @@ out: /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. * @rwbs: buffer to be filled - * @op: REQ_OP_XXX for the tracepoint + * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint * * Description: - * Maps the REQ_OP_XXX to character and fills the buffer provided by the - * caller with resulting string. + * Maps each request operation and flag to a single character and fills the + * buffer provided by the caller with resulting string. * **/ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) -- cgit v1.2.3 From f96f644ab97abeed3f7007c953836a574ce928cc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:23 -0700 Subject: ftrace: Add modify_ftrace_direct_multi_nolock This is similar to modify_ftrace_direct_multi, but does not acquire direct_mutex. This is useful when direct_mutex is already locked by the user. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20220720002126.803253-2-song@kernel.org --- include/linux/ftrace.h | 5 +++ kernel/trace/ftrace.c | 86 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 979f6bfa2c25..acb35243ce5d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -340,6 +340,7 @@ unsigned long ftrace_find_rec_direct(unsigned long ip); int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr); #else struct ftrace_ops; @@ -384,6 +385,10 @@ static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned lo { return -ENODEV; } +static inline int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 601ccf1b2f09..5d67dc12231d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5691,22 +5691,8 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); -/** - * modify_ftrace_direct_multi - Modify an existing direct 'multi' call - * to call something else - * @ops: The address of the struct ftrace_ops object - * @addr: The address of the new trampoline to call at @ops functions - * - * This is used to unregister currently registered direct caller and - * register new one @addr on functions registered in @ops object. - * - * Note there's window between ftrace_shutdown and ftrace_startup calls - * where there will be no callbacks called. - * - * Returns: zero on success. Non zero on error, which includes: - * -EINVAL - The @ops object was not properly registered. - */ -int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +static int +__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) { struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; @@ -5717,12 +5703,7 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) int i, size; int err; - if (check_direct_multi(ops)) - return -EINVAL; - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return -EINVAL; - - mutex_lock(&direct_mutex); + lockdep_assert_held_once(&direct_mutex); /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); @@ -5730,7 +5711,7 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) err = register_ftrace_function(&tmp_ops); if (err) - goto out_direct; + return err; /* * Now the ftrace_ops_list_func() is called to do the direct callers. @@ -5754,7 +5735,64 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); - out_direct: + return err; +} + +/** + * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Caller should already have direct_mutex locked, so we don't lock + * direct_mutex here. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr) +{ + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + return __modify_ftrace_direct_multi(ops, addr); +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock); + +/** + * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = __modify_ftrace_direct_multi(ops, addr); mutex_unlock(&direct_mutex); return err; } -- cgit v1.2.3 From 53cd885bc5c3ea283cc9c00ca6446c778f00bfba Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Jul 2022 17:21:24 -0700 Subject: ftrace: Allow IPMODIFY and DIRECT ops on the same function IPMODIFY (livepatch) and DIRECT (bpf trampoline) ops are both important users of ftrace. It is necessary to allow them work on the same function at the same time. First, DIRECT ops no longer specify IPMODIFY flag. Instead, DIRECT flag is handled together with IPMODIFY flag in __ftrace_hash_update_ipmodify(). Then, a callback function, ops_func, is added to ftrace_ops. This is used by ftrace core code to understand whether the DIRECT ops can share with an IPMODIFY ops. To share with IPMODIFY ops, the DIRECT ops need to implement the callback function and adjust the direct trampoline accordingly. If DIRECT ops is attached before the IPMODIFY ops, ftrace core code calls ENABLE_SHARE_IPMODIFY_PEER on the DIRECT ops before registering the IPMODIFY ops. If IPMODIFY ops is attached before the DIRECT ops, ftrace core code calls ENABLE_SHARE_IPMODIFY_SELF in __ftrace_hash_update_ipmodify. Owner of the DIRECT ops may return 0 if the DIRECT trampoline can share with IPMODIFY, so error code otherwise. The error code is propagated to register_ftrace_direct_multi so that onwer of the DIRECT trampoline can handle it properly. For more details, please refer to comment before enum ftrace_ops_cmd. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/ Link: https://lore.kernel.org/all/20220718055449.3960512-1-song@kernel.org/ Link: https://lore.kernel.org/bpf/20220720002126.803253-3-song@kernel.org --- include/linux/ftrace.h | 38 ++++++++ kernel/trace/ftrace.c | 242 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 254 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index acb35243ce5d..0b61371e287b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -208,6 +208,43 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), }; +/* + * FTRACE_OPS_CMD_* commands allow the ftrace core logic to request changes + * to a ftrace_ops. Note, the requests may fail. + * + * ENABLE_SHARE_IPMODIFY_SELF - enable a DIRECT ops to work on the same + * function as an ops with IPMODIFY. Called + * when the DIRECT ops is being registered. + * This is called with both direct_mutex and + * ftrace_lock are locked. + * + * ENABLE_SHARE_IPMODIFY_PEER - enable a DIRECT ops to work on the same + * function as an ops with IPMODIFY. Called + * when the other ops (the one with IPMODIFY) + * is being registered. + * This is called with direct_mutex locked. + * + * DISABLE_SHARE_IPMODIFY_PEER - disable a DIRECT ops to work on the same + * function as an ops with IPMODIFY. Called + * when the other ops (the one with IPMODIFY) + * is being unregistered. + * This is called with direct_mutex locked. + */ +enum ftrace_ops_cmd { + FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF, + FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER, + FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER, +}; + +/* + * For most ftrace_ops_cmd, + * Returns: + * 0 - Success. + * Negative on failure. The return value is dependent on the + * callback. + */ +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); + #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { @@ -250,6 +287,7 @@ struct ftrace_ops { unsigned long trampoline; unsigned long trampoline_size; struct list_head list; + ftrace_ops_func_t ops_func; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5d67dc12231d..bc921a3f7ea8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,6 +1861,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } +static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); + /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -1869,6 +1871,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) * - If the hash is EMPTY_HASH, it hits nothing * - Anything else hits the recs which match the hash entries. + * + * DIRECT ops does not have IPMODIFY flag, but we still need to check it + * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call + * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with + * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate + * the return value to the caller and eventually to the owner of the DIRECT + * ops. */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, @@ -1877,17 +1886,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; int in_old, in_new; + bool is_ipmodify, is_direct; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY; + is_direct = ops->flags & FTRACE_OPS_FL_DIRECT; + + /* neither IPMODIFY nor DIRECT, skip */ + if (!is_ipmodify && !is_direct) + return 0; + + if (WARN_ON_ONCE(is_ipmodify && is_direct)) return 0; /* - * Since the IPMODIFY is a very address sensitive action, we do not - * allow ftrace_ops to set all functions to new hash. + * Since the IPMODIFY and DIRECT are very address sensitive + * actions, we do not allow ftrace_ops to set all functions to new + * hash. */ if (!new_hash || !old_hash) return -EINVAL; @@ -1905,12 +1923,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, continue; if (in_new) { - /* New entries must ensure no others are using it */ - if (rec->flags & FTRACE_FL_IPMODIFY) - goto rollback; - rec->flags |= FTRACE_FL_IPMODIFY; - } else /* Removed entry */ + if (rec->flags & FTRACE_FL_IPMODIFY) { + int ret; + + /* Cannot have two ipmodify on same rec */ + if (is_ipmodify) + goto rollback; + + FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + + /* + * Another ops with IPMODIFY is already + * attached. We are now attaching a direct + * ops. Run SHARE_IPMODIFY_SELF, to check + * whether sharing is supported. + */ + if (!ops->ops_func) + return -EBUSY; + ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + if (ret) + return ret; + } else if (is_ipmodify) { + rec->flags |= FTRACE_FL_IPMODIFY; + } + } else if (is_ipmodify) { rec->flags &= ~FTRACE_FL_IPMODIFY; + } } while_for_each_ftrace_rec(); return 0; @@ -2454,8 +2492,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY - | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* * By declaring the main trampoline as this trampoline @@ -3072,14 +3109,14 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) } /* - * Check if the current ops references the record. + * Check if the current ops references the given ip. * * If the ops traces all functions, then it was already accounted for. * If the ops does not trace the current record function, skip it. * If the ops ignores the function via notrace filter, skip it. */ -static inline bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) { /* If ops isn't enabled, ignore it */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) @@ -3091,16 +3128,29 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) return false; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) return false; return true; } +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return ops_references_ip(ops, rec->ip); +} + static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -5215,6 +5265,8 @@ static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) return direct; } +static int register_ftrace_function_nolock(struct ftrace_ops *ops); + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5286,7 +5338,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { - ret = register_ftrace_function(&direct_ops); + ret = register_ftrace_function_nolock(&direct_ops); if (ret) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } @@ -5545,8 +5597,7 @@ int modify_ftrace_direct(unsigned long ip, } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \ - FTRACE_OPS_FL_SAVE_REGS) +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) static int check_direct_multi(struct ftrace_ops *ops) { @@ -5639,7 +5690,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; - err = register_ftrace_function(ops); + err = register_ftrace_function_nolock(ops); out_remove: if (err) @@ -5709,7 +5760,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; - err = register_ftrace_function(&tmp_ops); + err = register_ftrace_function_nolock(&tmp_ops); if (err) return err; @@ -8003,6 +8054,143 @@ int ftrace_is_dead(void) return ftrace_disabled; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When registering ftrace_ops with IPMODIFY, it is necessary to make sure + * it doesn't conflict with any direct ftrace_ops. If there is existing + * direct ftrace_ops on a kernel function being patched, call + * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing. + * + * @ops: ftrace_ops being registered. + * + * Returns: + * 0 on success; + * Negative on failure. + */ +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i, ret; + + lockdep_assert_held_once(&direct_mutex); + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + if (found_op) { + if (!op->ops_func) + return -EBUSY; + + ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + if (ret) + return ret; + } + } + } + + return 0; +} + +/* + * Similar to prepare_direct_functions_for_ipmodify, clean up after ops + * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT + * ops. + */ +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return; + + mutex_lock(&direct_mutex); + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + /* The cleanup is optional, ignore any errors */ + if (found_op && op->ops_func) + op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + } + } + mutex_unlock(&direct_mutex); +} + +#define lock_direct_mutex() mutex_lock(&direct_mutex) +#define unlock_direct_mutex() mutex_unlock(&direct_mutex) + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + return 0; +} + +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ +} + +#define lock_direct_mutex() do { } while (0) +#define unlock_direct_mutex() do { } while (0) + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/* + * Similar to register_ftrace_function, except we don't lock direct_mutex. + */ +static int register_ftrace_function_nolock(struct ftrace_ops *ops) +{ + int ret; + + ftrace_ops_init(ops); + + mutex_lock(&ftrace_lock); + + ret = ftrace_startup(ops, 0); + + mutex_unlock(&ftrace_lock); + + return ret; +} + /** * register_ftrace_function - register a function for profiling * @ops: ops structure that holds the function for profiling. @@ -8018,14 +8206,15 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; - ftrace_ops_init(ops); - - mutex_lock(&ftrace_lock); - - ret = ftrace_startup(ops, 0); + lock_direct_mutex(); + ret = prepare_direct_functions_for_ipmodify(ops); + if (ret < 0) + goto out_unlock; - mutex_unlock(&ftrace_lock); + ret = register_ftrace_function_nolock(ops); +out_unlock: + unlock_direct_mutex(); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); @@ -8044,6 +8233,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); + cleanup_direct_functions_after_ipmodify(ops); return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_function); -- cgit v1.2.3 From b774926c733850037b15c50f893383aa71bd8695 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:05 +0800 Subject: tracing: eprobe: Add missing log index Add trace_probe_log_set_index(1) to allow report correct error if user input wrong SYSTEM.EVENT format. Link: https://lore.kernel.org/all/1656296348-16111-2-git-send-email-quic_linyyuan@quicinc.com/ Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 7d4478525c66..b805b570305f 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -881,6 +881,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (!is_good_name(event) || !is_good_name(group)) goto parse_error; + trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, sys_event - argv[1]); -- cgit v1.2.3 From f360ea5641dc9473ad485e882c8ac3b1aa2672ff Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:06 +0800 Subject: tracing: eprobe: Remove duplicate is_good_name() operation traceprobe_parse_event_name() already validate SYSTEM and EVENT name, there is no need to call is_good_name() after it. Link: https://lore.kernel.org/all/1656296348-16111-3-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b805b570305f..8979cb9ec37a 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -887,8 +887,6 @@ static int __trace_eprobe_create(int argc, const char *argv[]) sys_event - argv[1]); if (ret || !sys_name) goto parse_error; - if (!is_good_name(sys_event) || !is_good_name(sys_name)) - goto parse_error; mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); -- cgit v1.2.3 From 95c104c378dc7d4cb3fb9f289dc5354bfc285fe0 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:07 +0800 Subject: tracing: Auto generate event name when creating a group of events Currently when creating a specific group of trace events, take kprobe event as example, the user must use the following format: p:GRP/EVENT [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS], which means user must enter EVENT name, one example is: echo 'p:usb_gadget/config_usb_cfg_link config_usb_cfg_link $arg1' >> kprobe_events It is not simple if there are too many entries because the event name is the same as symbol name. This change allows user to specify no EVENT name, format changed as: p:GRP/ [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] It will generate event name automatically and one example is: echo 'p:usb_gadget/ config_usb_cfg_link $arg1' >> kprobe_events. Link: https://lore.kernel.org/all/1656296348-16111-4-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/kprobetrace.rst | 8 ++++---- Documentation/trace/uprobetracer.rst | 8 ++++---- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_eprobe.c | 25 +++++++++++++------------ kernel/trace/trace_kprobe.c | 16 ++++++++++------ kernel/trace/trace_probe.c | 4 ++++ kernel/trace/trace_uprobe.c | 12 ++++++++---- 8 files changed, 48 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index b175d88f31eb..4274cc6a2f94 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -28,10 +28,10 @@ Synopsis of kprobe_events ------------------------- :: - p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe - p:[GRP/]EVENT] [MOD:]SYM[+0]%return [FETCHARGS] : Set a return probe - -:[GRP/]EVENT : Clear a probe + p[:[GRP/][EVENT]] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe + p[:[GRP/][EVENT]] [MOD:]SYM[+0]%return [FETCHARGS] : Set a return probe + -:[GRP/][EVENT] : Clear a probe GRP : Group name. If omitted, use "kprobes" for it. EVENT : Event name. If omitted, the event name is generated diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index a8e5938f609e..3a1797d707f4 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -26,10 +26,10 @@ Synopsis of uprobe_tracer ------------------------- :: - p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a uprobe - r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe) - p[:[GRP/]EVENT] PATH:OFFSET%return [FETCHARGS] : Set a return uprobe (uretprobe) - -:[GRP/]EVENT : Clear uprobe or uretprobe event + p[:[GRP/][EVENT]] PATH:OFFSET [FETCHARGS] : Set a uprobe + r[:[GRP/][EVENT]] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe) + p[:[GRP/][EVENT]] PATH:OFFSET%return [FETCHARGS] : Set a return uprobe (uretprobe) + -:[GRP/][EVENT] : Clear uprobe or uretprobe event GRP : Group name. If omitted, "uprobes" is the default value. EVENT : Event name. If omitted, the event name is generated based diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8dd54627075..7eb5bce62500 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5569,13 +5569,13 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p[:[/]] []\n" - "\t r[maxactive][:[/]] []\n" + "\t Format: p[:[/][]] []\n" + "\t r[maxactive][:[/][]] []\n" #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif - "\t e[:[/]] . []\n" - "\t -:[/]\n" + "\t e[:[/][]] . []\n" + "\t -:[/][]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" "place (kretprobe): [:][+]%return|\n" diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 076b447a1b88..154996684fb5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -101,7 +101,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') { + if (!system && event[0] == '\0') { ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 8979cb9ec37a..a30f21499e81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -125,6 +125,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, * We match the following: * event only - match all eprobes with event name * system and event only - match all system/event probes + * system only - match all system probes * * The below has the above satisfied with more arguments: * @@ -143,7 +144,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, return false; /* Must match the event name */ - if (strcmp(trace_probe_name(&ep->tp), event) != 0) + if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0) return false; /* No arguments match all */ @@ -848,7 +849,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] * Fetch args: * =$[:TYPE] */ @@ -858,6 +859,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) struct trace_eprobe *ep = NULL; char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0; int i; @@ -869,25 +871,24 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { event++; - ret = traceprobe_parse_event_name(&event, &group, buf1, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); - event = buf1; } - if (!is_good_name(event) || !is_good_name(group)) - goto parse_error; trace_probe_log_set_index(1); sys_event = argv[1]; - ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, - sys_event - argv[1]); - if (ret || !sys_name) + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); + if (!sys_event || !sys_name) goto parse_error; + if (!event) { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a245ea673715..23f7f0ec4f4c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -163,7 +163,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_probe_name(&tk->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tk->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && trace_kprobe_match_command_head(tk, argc, argv); } @@ -708,11 +709,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* * Argument syntax: * - Add kprobe: - * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: - * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS] * Or - * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -739,6 +740,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; switch (argv[0][0]) { @@ -833,11 +835,13 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { + } + + if (!event) { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 80863c6508e5..850a88abd33b 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -257,6 +257,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } len = strlen(event); if (len == 0) { + if (slash) { + *pevent = NULL; + return 0; + } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c3dc4f859a6b..a3fec28961d6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -312,7 +312,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_probe_name(&tu->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tu->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && trace_uprobe_match_command_head(tu, argc, argv); } @@ -532,7 +533,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { @@ -540,6 +541,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; @@ -644,11 +646,13 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto fail_address_parse; - } else { + } + + if (!event) { char *tail; char *ptr; -- cgit v1.2.3 From 102227b970a15256f5ffd12a6a276ddf978e6caf Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:40 +0200 Subject: rv: Add Runtime Verification (RV) interface RV is a lightweight (yet rigorous) method that complements classical exhaustive verification techniques (such as model checking and theorem proving) with a more practical approach to complex systems. RV works by analyzing the trace of the system's actual execution, comparing it against a formal specification of the system behavior. RV can give precise information on the runtime behavior of the monitored system while enabling the reaction for unexpected events, avoiding, for example, the propagation of a failure on safety-critical systems. The development of this interface roots in the development of the paper: De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo Silva. Efficient formal verification for the Linux kernel. In: International Conference on Software Engineering and Formal Methods. Springer, Cham, 2019. p. 315-332. And: De Oliveira, Daniel Bristot. Automata-based formal analysis and verification of the real-time Linux kernel. PhD Thesis, 2020. The RV interface resembles the tracing/ interface on purpose. The current path for the RV interface is /sys/kernel/tracing/rv/. It presents these files: "available_monitors" - List the available monitors, one per line. For example: # cat available_monitors wip wwnr "enabled_monitors" - Lists the enabled monitors, one per line; - Writing to it enables a given monitor; - Writing a monitor name with a '!' prefix disables it; - Truncating the file disables all enabled monitors. For example: # cat enabled_monitors # echo wip > enabled_monitors # echo wwnr >> enabled_monitors # cat enabled_monitors wip wwnr # echo '!wip' >> enabled_monitors # cat enabled_monitors wwnr # echo > enabled_monitors # cat enabled_monitors # Note that more than one monitor can be enabled concurrently. "monitoring_on" - It is an on/off general switcher for monitoring. Note that it does not disable enabled monitors or detach events, but stop the per-entity monitors of monitoring the events received from the system. It resembles the "tracing_on" switcher. "monitors/" Each monitor will have its one directory inside "monitors/". There the monitor specific files will be presented. The "monitors/" directory resembles the "events" directory on tracefs. For example: # cd monitors/wip/ # ls desc enable # cat desc wakeup in preemptive per-cpu testing monitor. # cat enable 0 For further information, see the comments in the header of kernel/trace/rv/rv.c from this patch. Link: https://lkml.kernel.org/r/a4bfe038f50cb047bfb343ad0e12b0e646ab308b.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- include/linux/rv.h | 43 +++ include/linux/sched.h | 11 + kernel/trace/Kconfig | 2 + kernel/trace/Makefile | 1 + kernel/trace/rv/Kconfig | 12 + kernel/trace/rv/Makefile | 3 + kernel/trace/rv/rv.c | 782 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/rv/rv.h | 33 ++ kernel/trace/trace.c | 2 + kernel/trace/trace.h | 9 + 10 files changed, 898 insertions(+) create mode 100644 include/linux/rv.h create mode 100644 kernel/trace/rv/Kconfig create mode 100644 kernel/trace/rv/Makefile create mode 100644 kernel/trace/rv/rv.c create mode 100644 kernel/trace/rv/rv.h (limited to 'kernel/trace') diff --git a/include/linux/rv.h b/include/linux/rv.h new file mode 100644 index 000000000000..d8fa9e8be94a --- /dev/null +++ b/include/linux/rv.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Runtime Verification. + * + * For futher information, see: kernel/trace/rv/rv.c. + */ +#ifndef _LINUX_RV_H +#define _LINUX_RV_H + +#ifdef CONFIG_RV + +/* + * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS. + * If we find justification for more monitors, we can think about + * adding more or developing a dynamic method. So far, none of + * these are justified. + */ +#define RV_PER_TASK_MONITORS 1 +#define RV_PER_TASK_MONITOR_INIT (RV_PER_TASK_MONITORS) + +/* + * Futher monitor types are expected, so make this a union. + */ +union rv_task_monitor { +}; + +struct rv_monitor { + const char *name; + const char *description; + bool enabled; + int (*enable)(void); + void (*disable)(void); + void (*reset)(void); +}; + +bool rv_monitoring_on(void); +int rv_unregister_monitor(struct rv_monitor *monitor); +int rv_register_monitor(struct rv_monitor *monitor); +int rv_get_task_monitor_slot(void); +void rv_put_task_monitor_slot(int slot); + +#endif /* CONFIG_RV */ +#endif /* _LINUX_RV_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c46f3a63b758..b5da3e7c3a04 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ @@ -1500,6 +1501,16 @@ struct task_struct { struct callback_head l1d_flush_kill; #endif +#ifdef CONFIG_RV + /* + * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. + * If we find justification for more monitors, we can think + * about adding more or developing a dynamic method. So far, + * none of these are justified. + */ + union rv_task_monitor rv[RV_PER_TASK_MONITORS]; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ccd6a5ade3e9..1052126bdca2 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1106,4 +1106,6 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. +source "kernel/trace/rv/Kconfig" + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0d261774d6f3..c6651e16b557 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -106,5 +106,6 @@ obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_RV) += rv/ libftrace-y := ftrace.o diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig new file mode 100644 index 000000000000..6d127cdb00dd --- /dev/null +++ b/kernel/trace/rv/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +menuconfig RV + bool "Runtime Verification" + depends on TRACING + help + Enable the kernel runtime verification infrastructure. RV is a + lightweight (yet rigorous) method that complements classical + exhaustive verification techniques (such as model checking and + theorem proving). RV works by analyzing the trace of the system's + actual execution, comparing it against a formal specification of + the system behavior. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile new file mode 100644 index 000000000000..fd995379df67 --- /dev/null +++ b/kernel/trace/rv/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_RV) += rv.o diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c new file mode 100644 index 000000000000..731cc961cc70 --- /dev/null +++ b/kernel/trace/rv/rv.c @@ -0,0 +1,782 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * This is the online Runtime Verification (RV) interface. + * + * RV is a lightweight (yet rigorous) method that complements classical + * exhaustive verification techniques (such as model checking and + * theorem proving) with a more practical approach to complex systems. + * + * RV works by analyzing the trace of the system's actual execution, + * comparing it against a formal specification of the system behavior. + * RV can give precise information on the runtime behavior of the + * monitored system while enabling the reaction for unexpected + * events, avoiding, for example, the propagation of a failure on + * safety-critical systems. + * + * The development of this interface roots in the development of the + * paper: + * + * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo + * Silva. Efficient formal verification for the Linux kernel. In: + * International Conference on Software Engineering and Formal Methods. + * Springer, Cham, 2019. p. 315-332. + * + * And: + * + * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis + * and verification of the real-time Linux kernel. PhD Thesis, 2020. + * + * == Runtime monitor interface == + * + * A monitor is the central part of the runtime verification of a system. + * + * The monitor stands in between the formal specification of the desired + * (or undesired) behavior, and the trace of the actual system. + * + * In Linux terms, the runtime verification monitors are encapsulated + * inside the "RV monitor" abstraction. A RV monitor includes a reference + * model of the system, a set of instances of the monitor (per-cpu monitor, + * per-task monitor, and so on), and the helper functions that glue the + * monitor to the system via trace. Generally, a monitor includes some form + * of trace output as a reaction for event parsing and exceptions, + * as depicted bellow: + * + * Linux +----- RV Monitor ----------------------------------+ Formal + * Realm | | Realm + * +-------------------+ +----------------+ +-----------------+ + * | Linux kernel | | Monitor | | Reference | + * | Tracing | -> | Instance(s) | <- | Model | + * | (instrumentation) | | (verification) | | (specification) | + * +-------------------+ +----------------+ +-----------------+ + * | | | + * | V | + * | +----------+ | + * | | Reaction | | + * | +--+--+--+-+ | + * | | | | | + * | | | +-> trace output ? | + * +------------------------|--|----------------------+ + * | +----> panic ? + * +-------> + * + * This file implements the interface for loading RV monitors, and + * to control the verification session. + * + * == Registering monitors == + * + * The struct rv_monitor defines a set of callback functions to control + * a verification session. For instance, when a given monitor is enabled, + * the "enable" callback function is called to hook the instrumentation + * functions to the kernel trace events. The "disable" function is called + * when disabling the verification session. + * + * A RV monitor is registered via: + * int rv_register_monitor(struct rv_monitor *monitor); + * And unregistered via: + * int rv_unregister_monitor(struct rv_monitor *monitor); + * + * == User interface == + * + * The user interface resembles kernel tracing interface. It presents + * these files: + * + * "available_monitors" + * - List the available monitors, one per line. + * + * For example: + * # cat available_monitors + * wip + * wwnr + * + * "enabled_monitors" + * - Lists the enabled monitors, one per line; + * - Writing to it enables a given monitor; + * - Writing a monitor name with a '!' prefix disables it; + * - Truncating the file disables all enabled monitors. + * + * For example: + * # cat enabled_monitors + * # echo wip > enabled_monitors + * # echo wwnr >> enabled_monitors + * # cat enabled_monitors + * wip + * wwnr + * # echo '!wip' >> enabled_monitors + * # cat enabled_monitors + * wwnr + * # echo > enabled_monitors + * # cat enabled_monitors + * # + * + * Note that more than one monitor can be enabled concurrently. + * + * "monitoring_on" + * - It is an on/off general switcher for monitoring. Note + * that it does not disable enabled monitors or detach events, + * but stops the per-entity monitors from monitoring the events + * received from the instrumentation. It resembles the "tracing_on" + * switcher. + * + * "monitors/" + * Each monitor will have its own directory inside "monitors/". There + * the monitor specific files will be presented. + * The "monitors/" directory resembles the "events" directory on + * tracefs. + * + * For example: + * # cd monitors/wip/ + * # ls + * desc enable + * # cat desc + * auto-generated wakeup in preemptive monitor. + * # cat enable + * 0 + */ + +#include +#include +#include +#include + +#include "rv.h" + +DEFINE_MUTEX(rv_interface_lock); + +static struct rv_interface rv_root; + +struct dentry *get_monitors_root(void) +{ + return rv_root.monitors_dir; +} + +/* + * Interface for the monitor register. + */ +static LIST_HEAD(rv_monitors_list); + +static int task_monitor_count; +static bool task_monitor_slots[RV_PER_TASK_MONITORS]; + +int rv_get_task_monitor_slot(void) +{ + int i; + + lockdep_assert_held(&rv_interface_lock); + + if (task_monitor_count == RV_PER_TASK_MONITORS) + return -EBUSY; + + task_monitor_count++; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + if (task_monitor_slots[i] == false) { + task_monitor_slots[i] = true; + return i; + } + } + + WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n"); + + return -EINVAL; +} + +void rv_put_task_monitor_slot(int slot) +{ + lockdep_assert_held(&rv_interface_lock); + + if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); + return; + } + + WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n", + slot); + + task_monitor_count--; + task_monitor_slots[slot] = false; +} + +/* + * This section collects the monitor/ files and folders. + */ +static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + const char *buff; + + buff = mdef->monitor->enabled ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +/* + * __rv_disable_monitor - disabled an enabled monitor + */ +static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +{ + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) { + mdef->monitor->enabled = 0; + mdef->monitor->disable(); + + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + if (sync) + tracepoint_synchronize_unregister(); + return 1; + } + return 0; +} + +/** + * rv_disable_monitor - disable a given runtime monitor + * + * Returns 0 on success. + */ +int rv_disable_monitor(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); + return 0; +} + +/** + * rv_enable_monitor - enable a given runtime monitor + * + * Returns 0 on success, error otherwise. + */ +int rv_enable_monitor(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +/* + * interface for enabling/disabling a monitor. + */ +static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + retval = count; + + mutex_lock(&rv_interface_lock); + + if (val) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + mutex_unlock(&rv_interface_lock); + + return retval ? : count; +} + +static const struct file_operations interface_enable_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitor_enable_write_data, + .read = monitor_enable_read_data, +}; + +/* + * Interface to read monitors description. + */ +static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + char buff[256]; + + memset(buff, 0, sizeof(buff)); + + snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static const struct file_operations interface_desc_fops = { + .open = simple_open, + .llseek = no_llseek, + .read = monitor_desc_read_data, +}; + +/* + * During the registration of a monitor, this function creates + * the monitor dir, where the specific options of the monitor + * are exposed. + */ +static int create_monitor_dir(struct rv_monitor_def *mdef) +{ + struct dentry *root = get_monitors_root(); + const char *name = mdef->monitor->name; + struct dentry *tmp; + int retval; + + mdef->root_d = rv_create_dir(name, root); + if (!mdef->root_d) + return -ENOMEM; + + tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + return 0; + +out_remove_root: + rv_remove(mdef->root_d); + return retval; +} + +/* + * Available/Enable monitor shared seq functions. + */ +static int monitors_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mon_def = p; + + seq_printf(m, "%s\n", mon_def->monitor->name); + return 0; +} + +/* + * Used by the seq file operations at the end of a read + * operation. + */ +static void monitors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +/* + * Available monitor seq functions. + */ +static void *available_monitors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_monitors_list, *pos); +} + +static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_monitors_list, pos); +} + +/* + * Enable monitor seq functions. + */ +static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct rv_monitor_def *m_def = p; + + (*pos)++; + + list_for_each_entry_continue(m_def, &rv_monitors_list, list) { + if (m_def->monitor->enabled) + return m_def; + } + + return NULL; +} + +static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) +{ + struct rv_monitor_def *m_def; + loff_t l; + + mutex_lock(&rv_interface_lock); + + if (list_empty(&rv_monitors_list)) + return NULL; + + m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + + for (l = 0; l <= *pos; ) { + m_def = enabled_monitors_next(m, m_def, &l); + if (!m_def) + break; + } + + return m_def; +} + +/* + * available/enabled monitors seq definition. + */ +static const struct seq_operations available_monitors_seq_ops = { + .start = available_monitors_start, + .next = available_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +static const struct seq_operations enabled_monitors_seq_ops = { + .start = enabled_monitors_start, + .next = enabled_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +/* + * available_monitors interface. + */ +static int available_monitors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_monitors_seq_ops); +}; + +static const struct file_operations available_monitors_ops = { + .open = available_monitors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * enabled_monitors interface. + */ +static void disable_all_monitors(void) +{ + struct rv_monitor_def *mdef; + int enabled = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(mdef, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mdef, false); + + if (enabled) { + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + tracepoint_synchronize_unregister(); + } + + mutex_unlock(&rv_interface_lock); +} + +static int enabled_monitors_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + disable_all_monitors(); + + return seq_open(file, &enabled_monitors_seq_ops); +}; + +static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + int retval = -EINVAL; + bool enable = true; + char *ptr = buff; + int len; + + if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + if (ptr[0] == '!') { + enable = false; + ptr++; + } + + len = strlen(ptr); + if (!len) + return count; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (strcmp(ptr, mdef->monitor->name) != 0) + continue; + + /* + * Monitor found! + */ + if (enable) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + if (!retval) + retval = count; + + break; + } + + mutex_unlock(&rv_interface_lock); + return retval; +} + +static const struct file_operations enabled_monitors_ops = { + .open = enabled_monitors_open, + .read = seq_read, + .write = enabled_monitors_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Monitoring on global switcher! + */ +static bool __read_mostly monitoring_on; + +/** + * rv_monitoring_on - checks if monitoring is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_monitoring_on(void) +{ + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_rmb(); + return READ_ONCE(monitoring_on); +} + +/* + * monitoring_on general switcher. + */ +static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, + size_t count, loff_t *ppos) +{ + const char *buff; + + buff = rv_monitoring_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static void turn_monitoring_off(void) +{ + WRITE_ONCE(monitoring_on, false); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void reset_all_monitors(void) +{ + struct rv_monitor_def *mdef; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (mdef->monitor->enabled) + mdef->monitor->reset(); + } +} + +static void turn_monitoring_on(void) +{ + WRITE_ONCE(monitoring_on, true); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void turn_monitoring_on_with_reset(void) +{ + lockdep_assert_held(&rv_interface_lock); + + if (rv_monitoring_on()) + return; + + /* + * Monitors might be out of sync with the system if events were not + * processed because of !rv_monitoring_on(). + * + * Reset all monitors, forcing a re-sync. + */ + reset_all_monitors(); + turn_monitoring_on(); +} + +static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_monitoring_on_with_reset(); + else + turn_monitoring_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations monitoring_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitoring_on_write_data, + .read = monitoring_on_read_data, +}; + +static void destroy_monitor_dir(struct rv_monitor_def *mdef) +{ + rv_remove(mdef->root_d); +} + +/** + * rv_register_monitor - register a rv monitor. + * @monitor: The rv_monitor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *r; + int retval = 0; + + if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { + pr_info("Monitor %s has a name longer than %d\n", monitor->name, + MAX_RV_MONITOR_NAME_SIZE); + return -1; + } + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(monitor->name, r->monitor->name) == 0) { + pr_info("Monitor %s is already registered\n", monitor->name); + retval = -1; + goto out_unlock; + } + } + + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); + if (!r) { + retval = -ENOMEM; + goto out_unlock; + } + + r->monitor = monitor; + + retval = create_monitor_dir(r); + if (retval) { + kfree(r); + goto out_unlock; + } + + list_add_tail(&r->list, &rv_monitors_list); + +out_unlock: + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_monitor - unregister a rv monitor. + * @monitor: The rv_monitor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { + if (strcmp(monitor->name, ptr->monitor->name) == 0) { + rv_disable_monitor(ptr); + list_del(&ptr->list); + destroy_monitor_dir(ptr); + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +int __init rv_init_interface(void) +{ + struct dentry *tmp; + + rv_root.root_dir = rv_create_dir("rv", NULL); + if (!rv_root.root_dir) + goto out_err; + + rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + if (!rv_root.monitors_dir) + goto out_err; + + tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + &available_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + &enabled_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + &monitoring_on_fops); + if (!tmp) + goto out_err; + + turn_monitoring_on(); + + return 0; + +out_err: + rv_remove(rv_root.root_dir); + printk(KERN_ERR "RV: Error while creating the RV interface\n"); + return 1; +} diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h new file mode 100644 index 000000000000..50014aa224a7 --- /dev/null +++ b/kernel/trace/rv/rv.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +struct rv_interface { + struct dentry *root_dir; + struct dentry *monitors_dir; +}; + +#include "../trace.h" +#include +#include + +#define RV_MODE_WRITE TRACE_MODE_WRITE +#define RV_MODE_READ TRACE_MODE_READ + +#define rv_create_dir tracefs_create_dir +#define rv_create_file tracefs_create_file +#define rv_remove tracefs_remove + +#define MAX_RV_MONITOR_NAME_SIZE 32 + +extern struct mutex rv_interface_lock; + +struct rv_monitor_def { + struct list_head list; + struct rv_monitor *monitor; + struct dentry *root_d; + bool task_monitor; +}; + +struct dentry *get_monitors_root(void); +int rv_disable_monitor(struct rv_monitor_def *mdef); +int rv_enable_monitor(struct rv_monitor_def *mdef); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7eb5bce62500..301305ec134b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9772,6 +9772,8 @@ static __init int tracer_init_tracefs(void) tracer_init_tracefs_work_func(NULL); } + rv_init_interface(); + return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ff816fb41e48..900e75d96c84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2005,4 +2005,13 @@ struct trace_min_max_param { extern const struct file_operations trace_min_max_fops; +#ifdef CONFIG_RV +extern int rv_init_interface(void); +#else +static inline int rv_init_interface(void) +{ + return 0; +} +#endif + #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3 From 04acadcb4453cf8011dd3d4ce8d97fecac42d325 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:41 +0200 Subject: rv: Add runtime reactors interface A runtime monitor can cause a reaction to the detection of an exception on the model's execution. By default, the monitors have tracing reactions, printing the monitor output via tracepoints. But other reactions can be added (on-demand) via this interface. The user interface resembles the kernel tracing interface and presents these files: "available_reactors" - Reading shows the available reactors, one per line. For example: # cat available_reactors nop panic printk "reacting_on" - It is an on/off general switch for reactors, disabling all reactions. "monitors/MONITOR/reactors" - List available reactors, with the select reaction for the given MONITOR inside []. The default one is the nop (no operation) reactor. - Writing the name of a reactor enables it to the given MONITOR. For example: # cat monitors/wip/reactors [nop] panic printk # echo panic > monitors/wip/reactors # cat monitors/wip/reactors nop [panic] printk Link: https://lkml.kernel.org/r/1794eb994637457bdeaa6bad0b8263d2f7eece0c.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- include/linux/rv.h | 17 ++ kernel/trace/rv/Kconfig | 11 + kernel/trace/rv/Makefile | 1 + kernel/trace/rv/rv.c | 9 + kernel/trace/rv/rv.h | 35 +++ kernel/trace/rv/rv_reactors.c | 508 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 581 insertions(+) create mode 100644 kernel/trace/rv/rv_reactors.c (limited to 'kernel/trace') diff --git a/include/linux/rv.h b/include/linux/rv.h index d8fa9e8be94a..dcce56987cf7 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -24,6 +24,14 @@ union rv_task_monitor { }; +#ifdef CONFIG_RV_REACTORS +struct rv_reactor { + const char *name; + const char *description; + void (*react)(char *msg); +}; +#endif + struct rv_monitor { const char *name; const char *description; @@ -31,6 +39,9 @@ struct rv_monitor { int (*enable)(void); void (*disable)(void); void (*reset)(void); +#ifdef CONFIG_RV_REACTORS + void (*react)(char *msg); +#endif }; bool rv_monitoring_on(void); @@ -39,5 +50,11 @@ int rv_register_monitor(struct rv_monitor *monitor); int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); +#ifdef CONFIG_RV_REACTORS +bool rv_reacting_on(void); +int rv_unregister_reactor(struct rv_reactor *reactor); +int rv_register_reactor(struct rv_reactor *reactor); +#endif /* CONFIG_RV_REACTORS */ + #endif /* CONFIG_RV */ #endif /* _LINUX_RV_H */ diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 6d127cdb00dd..3eb5d48ab4f6 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -10,3 +10,14 @@ menuconfig RV theorem proving). RV works by analyzing the trace of the system's actual execution, comparing it against a formal specification of the system behavior. + +config RV_REACTORS + bool "Runtime verification reactors" + default y + depends on RV + help + Enables the online runtime verification reactors. A runtime + monitor can cause a reaction to the detection of an exception + on the model's execution. By default, the monitors have + tracing reactions, printing the monitor output via tracepoints, + but other reactions can be added (on-demand) via this interface. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index fd995379df67..8944274d9b41 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RV) += rv.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 731cc961cc70..45cf64eb2600 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -353,6 +353,10 @@ static int create_monitor_dir(struct rv_monitor_def *mdef) goto out_remove_root; } + retval = reactor_populate_monitor(mdef); + if (retval) + goto out_remove_root; + return 0; out_remove_root: @@ -669,6 +673,7 @@ static const struct file_operations monitoring_on_fops = { static void destroy_monitor_dir(struct rv_monitor_def *mdef) { + reactor_cleanup_monitor(mdef); rv_remove(mdef->root_d); } @@ -747,6 +752,7 @@ int rv_unregister_monitor(struct rv_monitor *monitor) int __init rv_init_interface(void) { struct dentry *tmp; + int retval; rv_root.root_dir = rv_create_dir("rv", NULL); if (!rv_root.root_dir) @@ -770,6 +776,9 @@ int __init rv_init_interface(void) &monitoring_on_fops); if (!tmp) goto out_err; + retval = init_rv_reactors(rv_root.root_dir); + if (retval) + goto out_err; turn_monitoring_on(); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 50014aa224a7..db6cb0913dbd 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -18,16 +18,51 @@ struct rv_interface { #define rv_remove tracefs_remove #define MAX_RV_MONITOR_NAME_SIZE 32 +#define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; +#ifdef CONFIG_RV_REACTORS +struct rv_reactor_def { + struct list_head list; + struct rv_reactor *reactor; + /* protected by the monitor interface lock */ + int counter; +}; +#endif + struct rv_monitor_def { struct list_head list; struct rv_monitor *monitor; struct dentry *root_d; +#ifdef CONFIG_RV_REACTORS + struct rv_reactor_def *rdef; + bool reacting; +#endif bool task_monitor; }; struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor_def *mdef); int rv_enable_monitor(struct rv_monitor_def *mdef); + +#ifdef CONFIG_RV_REACTORS +int reactor_populate_monitor(struct rv_monitor_def *mdef); +void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int init_rv_reactors(struct dentry *root_dir); +#else +static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + return 0; +} + +static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + return; +} + +static inline int init_rv_reactors(struct dentry *root_dir) +{ + return 0; +} +#endif diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c new file mode 100644 index 000000000000..a6522c196382 --- /dev/null +++ b/kernel/trace/rv/rv_reactors.c @@ -0,0 +1,508 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Runtime reactor interface. + * + * A runtime monitor can cause a reaction to the detection of an + * exception on the model's execution. By default, the monitors have + * tracing reactions, printing the monitor output via tracepoints. + * But other reactions can be added (on-demand) via this interface. + * + * == Registering reactors == + * + * The struct rv_reactor defines a callback function to be executed + * in case of a model exception happens. The callback function + * receives a message to be (optionally) printed before executing + * the reaction. + * + * A RV reactor is registered via: + * int rv_register_reactor(struct rv_reactor *reactor) + * And unregistered via: + * int rv_unregister_reactor(struct rv_reactor *reactor) + * + * These functions are exported to modules, enabling reactors to be + * dynamically loaded. + * + * == User interface == + * + * The user interface resembles the kernel tracing interface and + * presents these files: + * + * "available_reactors" + * - List the available reactors, one per line. + * + * For example: + * # cat available_reactors + * nop + * panic + * printk + * + * "reacting_on" + * - It is an on/off general switch for reactors, disabling + * all reactions. + * + * "monitors/MONITOR/reactors" + * - List available reactors, with the select reaction for the given + * MONITOR inside []. The default one is the nop (no operation) + * reactor. + * - Writing the name of an reactor enables it to the given + * MONITOR. + * + * For example: + * # cat monitors/wip/reactors + * [nop] + * panic + * printk + * # echo panic > monitors/wip/reactors + * # cat monitors/wip/reactors + * nop + * [panic] + * printk + */ + +#include + +#include "rv.h" + +/* + * Interface for the reactor register. + */ +static LIST_HEAD(rv_reactors_list); + +static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(name, r->reactor->name) == 0) + return r; + } + return NULL; +} + +/* + * Available reactors seq functions. + */ +static int reactors_show(struct seq_file *m, void *p) +{ + struct rv_reactor_def *rea_def = p; + + seq_printf(m, "%s\n", rea_def->reactor->name); + return 0; +} + +static void reactors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +static void *reactors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_reactors_list, *pos); +} + +static void *reactors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_reactors_list, pos); +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations available_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = reactors_show +}; + +/* + * available_reactors interface. + */ +static int available_reactors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_reactors_seq_ops); +}; + +static const struct file_operations available_reactors_ops = { + .open = available_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * Monitor's reactor file. + */ +static int monitor_reactor_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mdef = m->private; + struct rv_reactor_def *rdef = p; + + if (mdef->rdef == rdef) + seq_printf(m, "[%s]\n", rdef->reactor->name); + else + seq_printf(m, "%s\n", rdef->reactor->name); + return 0; +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations monitor_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = monitor_reactor_show +}; + +static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, + bool reacting) +{ + bool monitor_enabled; + + /* nothing to do */ + if (mdef->rdef == rdef) + return; + + monitor_enabled = mdef->monitor->enabled; + if (monitor_enabled) + rv_disable_monitor(mdef); + + /* swap reactor's usage */ + mdef->rdef->counter--; + rdef->counter++; + + mdef->rdef = rdef; + mdef->reacting = reacting; + mdef->monitor->react = rdef->reactor->react; + + if (monitor_enabled) + rv_enable_monitor(mdef); +} + +static ssize_t +monitor_reactors_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + struct rv_reactor_def *rdef; + struct seq_file *seq_f; + int retval = -EINVAL; + bool enable; + char *ptr; + int len; + + if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + len = strlen(ptr); + if (!len) + return count; + + /* + * See monitor_reactors_open() + */ + seq_f = file->private_data; + mdef = seq_f->private; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(rdef, &rv_reactors_list, list) { + if (strcmp(ptr, rdef->reactor->name) != 0) + continue; + + if (rdef == get_reactor_rdef_by_name("nop")) + enable = false; + else + enable = true; + + monitor_swap_reactors(mdef, rdef, enable); + + retval = count; + break; + } + + mutex_unlock(&rv_interface_lock); + + return retval; +} + +/* + * available_reactors interface. + */ +static int monitor_reactors_open(struct inode *inode, struct file *file) +{ + struct rv_monitor_def *mdef = inode->i_private; + struct seq_file *seq_f; + int ret; + + ret = seq_open(file, &monitor_reactors_seq_ops); + if (ret < 0) + return ret; + + /* + * seq_open stores the seq_file on the file->private data. + */ + seq_f = file->private_data; + + /* + * Copy the create file "private" data to the seq_file private data. + */ + seq_f->private = mdef; + + return 0; +}; + +static const struct file_operations monitor_reactors_ops = { + .open = monitor_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = monitor_reactors_write +}; + +static int __rv_register_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(reactor->name, r->reactor->name) == 0) { + pr_info("Reactor %s is already registered\n", reactor->name); + return -EINVAL; + } + } + + r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->reactor = reactor; + r->counter = 0; + + list_add_tail(&r->list, &rv_reactors_list); + + return 0; +} + +/** + * rv_register_reactor - register a rv reactor. + * @reactor: The rv_reactor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_reactor(struct rv_reactor *reactor) +{ + int retval = 0; + + if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { + pr_info("Reactor %s has a name longer than %d\n", + reactor->name, MAX_RV_MONITOR_NAME_SIZE); + return -EINVAL; + } + + mutex_lock(&rv_interface_lock); + retval = __rv_register_reactor(reactor); + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_reactor - unregister a rv reactor. + * @reactor: The rv_reactor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { + if (strcmp(reactor->name, ptr->reactor->name) == 0) { + + if (!ptr->counter) { + list_del(&ptr->list); + } else { + printk(KERN_WARNING + "rv: the rv_reactor %s is in use by %d monitor(s)\n", + ptr->reactor->name, ptr->counter); + printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", + ptr->reactor->name); + return -EBUSY; + } + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +/* + * reacting_on interface. + */ +static bool __read_mostly reacting_on; + +/** + * rv_reacting_on - checks if reacting is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_reacting_on(void) +{ + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_rmb(); + return READ_ONCE(reacting_on); +} + +static ssize_t reacting_on_read_data(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + char *buff; + + buff = rv_reacting_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +static void turn_reacting_off(void) +{ + WRITE_ONCE(reacting_on, false); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static void turn_reacting_on(void) +{ + WRITE_ONCE(reacting_on, true); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_reacting_on(); + else + turn_reacting_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations reacting_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = reacting_on_write_data, + .read = reacting_on_read_data, +}; + +/** + * reactor_populate_monitor - creates per monitor reactors file + * @mdef: monitor's definition. + * + * Returns 0 if successful, error otherwise. + */ +int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + struct dentry *tmp; + + tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + if (!tmp) + return -ENOMEM; + + /* + * Configure as the rv_nop reactor. + */ + mdef->rdef = get_reactor_rdef_by_name("nop"); + mdef->rdef->counter++; + mdef->reacting = false; + + return 0; +} + +/** + * reactor_cleanup_monitor - cleanup a monitor reference + * @mdef: monitor's definition. + */ +void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + lockdep_assert_held(&rv_interface_lock); + mdef->rdef->counter--; + WARN_ON_ONCE(mdef->rdef->counter < 0); +} + +/* + * Nop reactor register + */ +static void rv_nop_reaction(char *msg) +{ +} + +static struct rv_reactor rv_nop = { + .name = "nop", + .description = "no-operation reactor: do nothing.", + .react = rv_nop_reaction +}; + +int init_rv_reactors(struct dentry *root_dir) +{ + struct dentry *available, *reacting; + int retval; + + available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, + &available_reactors_ops); + if (!available) + goto out_err; + + reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); + if (!reacting) + goto rm_available; + + retval = __rv_register_reactor(&rv_nop); + if (retval) + goto rm_reacting; + + turn_reacting_on(); + + return 0; + +rm_reacting: + rv_remove(reacting); +rm_available: + rv_remove(available); +out_err: + return -ENOMEM; +} -- cgit v1.2.3 From 792575348ff70e05c6040d02fce38e949ef92c37 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:43 +0200 Subject: rv/include: Add deterministic automata monitor definition via C macros In Linux terms, the runtime verification monitors are encapsulated inside the "RV monitor" abstraction. The "RV monitor" includes a set of instances of the monitor (per-cpu monitor, per-task monitor, and so on), the helper functions that glue the monitor to the system reference model, and the trace output as a reaction for event parsing and exceptions, as depicted below: Linux +----- RV Monitor ----------------------------------+ Formal Realm | | Realm +-------------------+ +----------------+ +-----------------+ | Linux kernel | | Monitor | | Reference | | Tracing | -> | Instance(s) | <- | Model | | (instrumentation) | | (verification) | | (specification) | +-------------------+ +----------------+ +-----------------+ | | | | V | | +----------+ | | | Reaction | | | +--+--+--+-+ | | | | | | | | | +-> trace output ? | +------------------------|--|----------------------+ | +----> panic ? +-------> Add the rv/da_monitor.h, enabling automatic code generation for the *Monitor Instance(s)* using C macros, and code to support it. The benefits of the usage of macro for monitor synthesis are 3-fold as it: - Reduces the code duplication; - Facilitates the bug fix/improvement; - Avoids the case of developers changing the core of the monitor code to manipulate the model in a (let's say) non-standard way. This initial implementation presents three different types of monitor instances: - DECLARE_DA_MON_GLOBAL(name, type) - DECLARE_DA_MON_PER_CPU(name, type) - DECLARE_DA_MON_PER_TASK(name, type) The first declares the functions for a global deterministic automata monitor, the second for monitors with per-cpu instances, and the third with per-task instances. Link: https://lkml.kernel.org/r/51b0bf425a281e226dfeba7401d2115d6091f84e.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- include/linux/rv.h | 10 + include/rv/da_monitor.h | 541 ++++++++++++++++++++++++++++++++++++++++++++++ include/trace/events/rv.h | 120 ++++++++++ kernel/fork.c | 14 ++ kernel/trace/rv/Kconfig | 11 + kernel/trace/rv/rv.c | 5 + 6 files changed, 701 insertions(+) create mode 100644 include/rv/da_monitor.h create mode 100644 include/trace/events/rv.h (limited to 'kernel/trace') diff --git a/include/linux/rv.h b/include/linux/rv.h index dcce56987cf7..8883b41d88ec 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,7 +7,16 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H +#define MAX_DA_NAME_LEN 24 + #ifdef CONFIG_RV +/* + * Deterministic automaton per-object variables. + */ +struct da_monitor { + bool monitoring; + unsigned int curr_state; +}; /* * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS. @@ -22,6 +31,7 @@ * Futher monitor types are expected, so make this a union. */ union rv_task_monitor { + struct da_monitor da_mon; }; #ifdef CONFIG_RV_REACTORS diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h new file mode 100644 index 000000000000..001bc298289f --- /dev/null +++ b/include/rv/da_monitor.h @@ -0,0 +1,541 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Deterministic automata (DA) monitor functions, to be used together + * with automata models in C generated by the dot2k tool. + * + * The dot2k tool is available at tools/verification/dot2k/ + */ + +#include +#include +#include + +#ifdef CONFIG_RV_REACTORS + +#define DECLARE_RV_REACTING_HELPERS(name, type) \ +static char REACT_MSG_##name[1024]; \ + \ +static inline char *format_react_msg_##name(type curr_state, type event) \ +{ \ + snprintf(REACT_MSG_##name, 1024, \ + "rv: monitor %s does not allow event %s on state %s\n", \ + #name, \ + model_get_event_name_##name(event), \ + model_get_state_name_##name(curr_state)); \ + return REACT_MSG_##name; \ +} \ + \ +static void cond_react_##name(char *msg) \ +{ \ + if (rv_##name.react) \ + rv_##name.react(msg); \ +} \ + \ +static bool rv_reacting_on_##name(void) \ +{ \ + return rv_reacting_on(); \ +} + +#else /* CONFIG_RV_REACTOR */ + +#define DECLARE_RV_REACTING_HELPERS(name, type) \ +static inline char *format_react_msg_##name(type curr_state, type event) \ +{ \ + return NULL; \ +} \ + \ +static void cond_react_##name(char *msg) \ +{ \ + return; \ +} \ + \ +static bool rv_reacting_on_##name(void) \ +{ \ + return 0; \ +} +#endif + +/* + * Generic helpers for all types of deterministic automata monitors. + */ +#define DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ + \ +DECLARE_RV_REACTING_HELPERS(name, type) \ + \ +/* \ + * da_monitor_reset_##name - reset a monitor and setting it to init state \ + */ \ +static inline void da_monitor_reset_##name(struct da_monitor *da_mon) \ +{ \ + da_mon->monitoring = 0; \ + da_mon->curr_state = model_get_initial_state_##name(); \ +} \ + \ +/* \ + * da_monitor_curr_state_##name - return the current state \ + */ \ +static inline type da_monitor_curr_state_##name(struct da_monitor *da_mon) \ +{ \ + return da_mon->curr_state; \ +} \ + \ +/* \ + * da_monitor_set_state_##name - set the new current state \ + */ \ +static inline void \ +da_monitor_set_state_##name(struct da_monitor *da_mon, enum states_##name state) \ +{ \ + da_mon->curr_state = state; \ +} \ + \ +/* \ + * da_monitor_start_##name - start monitoring \ + * \ + * The monitor will ignore all events until monitoring is set to true. This \ + * function needs to be called to tell the monitor to start monitoring. \ + */ \ +static inline void da_monitor_start_##name(struct da_monitor *da_mon) \ +{ \ + da_mon->curr_state = model_get_initial_state_##name(); \ + da_mon->monitoring = 1; \ +} \ + \ +/* \ + * da_monitoring_##name - returns true if the monitor is processing events \ + */ \ +static inline bool da_monitoring_##name(struct da_monitor *da_mon) \ +{ \ + return da_mon->monitoring; \ +} \ + \ +/* \ + * da_monitor_enabled_##name - checks if the monitor is enabled \ + */ \ +static inline bool da_monitor_enabled_##name(void) \ +{ \ + /* global switch */ \ + if (unlikely(!rv_monitoring_on())) \ + return 0; \ + \ + /* monitor enabled */ \ + if (unlikely(!rv_##name.enabled)) \ + return 0; \ + \ + return 1; \ +} \ + \ +/* \ + * da_monitor_handling_event_##name - checks if the monitor is ready to handle events \ + */ \ +static inline bool da_monitor_handling_event_##name(struct da_monitor *da_mon) \ +{ \ + \ + if (!da_monitor_enabled_##name()) \ + return 0; \ + \ + /* monitor is actually monitoring */ \ + if (unlikely(!da_monitoring_##name(da_mon))) \ + return 0; \ + \ + return 1; \ +} + +/* + * Event handler for implicit monitors. Implicit monitor is the one which the + * handler does not need to specify which da_monitor to manipulate. Examples + * of implicit monitor are the per_cpu or the global ones. + */ +#define DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type) \ + \ +static inline bool \ +da_event_##name(struct da_monitor *da_mon, enum events_##name event) \ +{ \ + type curr_state = da_monitor_curr_state_##name(da_mon); \ + type next_state = model_get_next_state_##name(curr_state, event); \ + \ + if (next_state != INVALID_STATE) { \ + da_monitor_set_state_##name(da_mon, next_state); \ + \ + trace_event_##name(model_get_state_name_##name(curr_state), \ + model_get_event_name_##name(event), \ + model_get_state_name_##name(next_state), \ + model_is_final_state_##name(next_state)); \ + \ + return true; \ + } \ + \ + if (rv_reacting_on_##name()) \ + cond_react_##name(format_react_msg_##name(curr_state, event)); \ + \ + trace_error_##name(model_get_state_name_##name(curr_state), \ + model_get_event_name_##name(event)); \ + \ + return false; \ +} \ + +/* + * Event handler for per_task monitors. + */ +#define DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type) \ + \ +static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct *tsk, \ + enum events_##name event) \ +{ \ + type curr_state = da_monitor_curr_state_##name(da_mon); \ + type next_state = model_get_next_state_##name(curr_state, event); \ + \ + if (next_state != INVALID_STATE) { \ + da_monitor_set_state_##name(da_mon, next_state); \ + \ + trace_event_##name(tsk->pid, \ + model_get_state_name_##name(curr_state), \ + model_get_event_name_##name(event), \ + model_get_state_name_##name(next_state), \ + model_is_final_state_##name(next_state)); \ + \ + return true; \ + } \ + \ + if (rv_reacting_on_##name()) \ + cond_react_##name(format_react_msg_##name(curr_state, event)); \ + \ + trace_error_##name(tsk->pid, \ + model_get_state_name_##name(curr_state), \ + model_get_event_name_##name(event)); \ + \ + return false; \ +} + +/* + * Functions to define, init and get a global monitor. + */ +#define DECLARE_DA_MON_INIT_GLOBAL(name, type) \ + \ +/* \ + * global monitor (a single variable) \ + */ \ +static struct da_monitor da_mon_##name; \ + \ +/* \ + * da_get_monitor_##name - return the global monitor address \ + */ \ +static struct da_monitor *da_get_monitor_##name(void) \ +{ \ + return &da_mon_##name; \ +} \ + \ +/* \ + * da_monitor_reset_all_##name - reset the single monitor \ + */ \ +static void da_monitor_reset_all_##name(void) \ +{ \ + da_monitor_reset_##name(da_get_monitor_##name()); \ +} \ + \ +/* \ + * da_monitor_init_##name - initialize a monitor \ + */ \ +static inline int da_monitor_init_##name(void) \ +{ \ + da_monitor_reset_all_##name(); \ + return 0; \ +} \ + \ +/* \ + * da_monitor_destroy_##name - destroy the monitor \ + */ \ +static inline void da_monitor_destroy_##name(void) \ +{ \ + return; \ +} + +/* + * Functions to define, init and get a per-cpu monitor. + */ +#define DECLARE_DA_MON_INIT_PER_CPU(name, type) \ + \ +/* \ + * per-cpu monitor variables \ + */ \ +DEFINE_PER_CPU(struct da_monitor, da_mon_##name); \ + \ +/* \ + * da_get_monitor_##name - return current CPU monitor address \ + */ \ +static struct da_monitor *da_get_monitor_##name(void) \ +{ \ + return this_cpu_ptr(&da_mon_##name); \ +} \ + \ +/* \ + * da_monitor_reset_all_##name - reset all CPUs' monitor \ + */ \ +static void da_monitor_reset_all_##name(void) \ +{ \ + struct da_monitor *da_mon; \ + int cpu; \ + for_each_cpu(cpu, cpu_online_mask) { \ + da_mon = per_cpu_ptr(&da_mon_##name, cpu); \ + da_monitor_reset_##name(da_mon); \ + } \ +} \ + \ +/* \ + * da_monitor_init_##name - initialize all CPUs' monitor \ + */ \ +static inline int da_monitor_init_##name(void) \ +{ \ + da_monitor_reset_all_##name(); \ + return 0; \ +} \ + \ +/* \ + * da_monitor_destroy_##name - destroy the monitor \ + */ \ +static inline void da_monitor_destroy_##name(void) \ +{ \ + return; \ +} + +/* + * Functions to define, init and get a per-task monitor. + */ +#define DECLARE_DA_MON_INIT_PER_TASK(name, type) \ + \ +/* \ + * The per-task monitor is stored a vector in the task struct. This variable \ + * stores the position on the vector reserved for this monitor. \ + */ \ +static int task_mon_slot_##name = RV_PER_TASK_MONITOR_INIT; \ + \ +/* \ + * da_get_monitor_##name - return the monitor in the allocated slot for tsk \ + */ \ +static inline struct da_monitor *da_get_monitor_##name(struct task_struct *tsk) \ +{ \ + return &tsk->rv[task_mon_slot_##name].da_mon; \ +} \ + \ +static void da_monitor_reset_all_##name(void) \ +{ \ + struct task_struct *g, *p; \ + \ + read_lock(&tasklist_lock); \ + for_each_process_thread(g, p) \ + da_monitor_reset_##name(da_get_monitor_##name(p)); \ + read_unlock(&tasklist_lock); \ +} \ + \ +/* \ + * da_monitor_init_##name - initialize the per-task monitor \ + * \ + * Try to allocate a slot in the task's vector of monitors. If there \ + * is an available slot, use it and reset all task's monitor. \ + */ \ +static int da_monitor_init_##name(void) \ +{ \ + int slot; \ + \ + slot = rv_get_task_monitor_slot(); \ + if (slot < 0 || slot >= RV_PER_TASK_MONITOR_INIT) \ + return slot; \ + \ + task_mon_slot_##name = slot; \ + \ + da_monitor_reset_all_##name(); \ + return 0; \ +} \ + \ +/* \ + * da_monitor_destroy_##name - return the allocated slot \ + */ \ +static inline void da_monitor_destroy_##name(void) \ +{ \ + if (task_mon_slot_##name == RV_PER_TASK_MONITOR_INIT) { \ + WARN_ONCE(1, "Disabling a disabled monitor: " #name); \ + return; \ + } \ + rv_put_task_monitor_slot(task_mon_slot_##name); \ + task_mon_slot_##name = RV_PER_TASK_MONITOR_INIT; \ + return; \ +} + +/* + * Handle event for implicit monitor: da_get_monitor_##name() will figure out + * the monitor. + */ +#define DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type) \ + \ +static inline void __da_handle_event_##name(struct da_monitor *da_mon, \ + enum events_##name event) \ +{ \ + bool retval; \ + \ + retval = da_event_##name(da_mon, event); \ + if (!retval) \ + da_monitor_reset_##name(da_mon); \ +} \ + \ +/* \ + * da_handle_event_##name - handle an event \ + */ \ +static inline void da_handle_event_##name(enum events_##name event) \ +{ \ + struct da_monitor *da_mon = da_get_monitor_##name(); \ + bool retval; \ + \ + retval = da_monitor_handling_event_##name(da_mon); \ + if (!retval) \ + return; \ + \ + __da_handle_event_##name(da_mon, event); \ +} \ + \ +/* \ + * da_handle_start_event_##name - start monitoring or handle event \ + * \ + * This function is used to notify the monitor that the system is returning \ + * to the initial state, so the monitor can start monitoring in the next event. \ + * Thus: \ + * \ + * If the monitor already started, handle the event. \ + * If the monitor did not start yet, start the monitor but skip the event. \ + */ \ +static inline bool da_handle_start_event_##name(enum events_##name event) \ +{ \ + struct da_monitor *da_mon; \ + \ + if (!da_monitor_enabled_##name()) \ + return 0; \ + \ + da_mon = da_get_monitor_##name(); \ + \ + if (unlikely(!da_monitoring_##name(da_mon))) { \ + da_monitor_start_##name(da_mon); \ + return 0; \ + } \ + \ + __da_handle_event_##name(da_mon, event); \ + \ + return 1; \ +} \ + \ +/* \ + * da_handle_start_run_event_##name - start monitoring and handle event \ + * \ + * This function is used to notify the monitor that the system is in the \ + * initial state, so the monitor can start monitoring and handling event. \ + */ \ +static inline bool da_handle_start_run_event_##name(enum events_##name event) \ +{ \ + struct da_monitor *da_mon; \ + \ + if (!da_monitor_enabled_##name()) \ + return 0; \ + \ + da_mon = da_get_monitor_##name(); \ + \ + if (unlikely(!da_monitoring_##name(da_mon))) \ + da_monitor_start_##name(da_mon); \ + \ + __da_handle_event_##name(da_mon, event); \ + \ + return 1; \ +} + +/* + * Handle event for per task. + */ +#define DECLARE_DA_MON_MONITOR_HANDLER_PER_TASK(name, type) \ + \ +static inline void \ +__da_handle_event_##name(struct da_monitor *da_mon, struct task_struct *tsk, \ + enum events_##name event) \ +{ \ + bool retval; \ + \ + retval = da_event_##name(da_mon, tsk, event); \ + if (!retval) \ + da_monitor_reset_##name(da_mon); \ +} \ + \ +/* \ + * da_handle_event_##name - handle an event \ + */ \ +static inline void \ +da_handle_event_##name(struct task_struct *tsk, enum events_##name event) \ +{ \ + struct da_monitor *da_mon = da_get_monitor_##name(tsk); \ + bool retval; \ + \ + retval = da_monitor_handling_event_##name(da_mon); \ + if (!retval) \ + return; \ + \ + __da_handle_event_##name(da_mon, tsk, event); \ +} \ + \ +/* \ + * da_handle_start_event_##name - start monitoring or handle event \ + * \ + * This function is used to notify the monitor that the system is returning \ + * to the initial state, so the monitor can start monitoring in the next event. \ + * Thus: \ + * \ + * If the monitor already started, handle the event. \ + * If the monitor did not start yet, start the monitor but skip the event. \ + */ \ +static inline bool \ +da_handle_start_event_##name(struct task_struct *tsk, enum events_##name event) \ +{ \ + struct da_monitor *da_mon; \ + \ + if (!da_monitor_enabled_##name()) \ + return 0; \ + \ + da_mon = da_get_monitor_##name(tsk); \ + \ + if (unlikely(!da_monitoring_##name(da_mon))) { \ + da_monitor_start_##name(da_mon); \ + return 0; \ + } \ + \ + __da_handle_event_##name(da_mon, tsk, event); \ + \ + return 1; \ +} + +/* + * Entry point for the global monitor. + */ +#define DECLARE_DA_MON_GLOBAL(name, type) \ + \ +DECLARE_AUTOMATA_HELPERS(name, type) \ +DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ +DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type) \ +DECLARE_DA_MON_INIT_GLOBAL(name, type) \ +DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type) + +/* + * Entry point for the per-cpu monitor. + */ +#define DECLARE_DA_MON_PER_CPU(name, type) \ + \ +DECLARE_AUTOMATA_HELPERS(name, type) \ +DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ +DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type) \ +DECLARE_DA_MON_INIT_PER_CPU(name, type) \ +DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type) + +/* + * Entry point for the per-task monitor. + */ +#define DECLARE_DA_MON_PER_TASK(name, type) \ + \ +DECLARE_AUTOMATA_HELPERS(name, type) \ +DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ +DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type) \ +DECLARE_DA_MON_INIT_PER_TASK(name, type) \ +DECLARE_DA_MON_MONITOR_HANDLER_PER_TASK(name, type) diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h new file mode 100644 index 000000000000..20a2e09c6416 --- /dev/null +++ b/include/trace/events/rv.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rv + +#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RV_H + +#include +#include + +#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT +DECLARE_EVENT_CLASS(event_da_monitor, + + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(state, event, next_state, final_state), + + TP_STRUCT__entry( + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + __array( char, next_state, MAX_DA_NAME_LEN ) + __field( bool, final_state ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); + __entry->final_state = final_state; + ), + + TP_printk("%s x %s -> %s %s", + __entry->state, + __entry->event, + __entry->next_state, + __entry->final_state ? "(final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor, + + TP_PROTO(char *state, char *event), + + TP_ARGS(state, event), + + TP_STRUCT__entry( + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + ), + + TP_printk("event %s not expected in the state %s", + __entry->event, + __entry->state) +); +#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ + +#ifdef CONFIG_DA_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(id, state, event, next_state, final_state), + + TP_STRUCT__entry( + __field( int, id ) + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + __array( char, next_state, MAX_DA_NAME_LEN ) + __field( bool, final_state ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); + __entry->id = id; + __entry->final_state = final_state; + ), + + TP_printk("%d: %s x %s -> %s %s", + __entry->id, + __entry->state, + __entry->event, + __entry->next_state, + __entry->final_state ? "(final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor_id, + + TP_PROTO(int id, char *state, char *event), + + TP_ARGS(id, state, event), + + TP_STRUCT__entry( + __field( int, id ) + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s", + __entry->id, + __entry->event, + __entry->state) +); +#endif /* CONFIG_DA_MON_EVENTS_ID */ +#endif /* _TRACE_RV_H */ + +/* This part ust be outside protection */ +#undef TRACE_INCLUDE_PATH +#include diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c69..6f1f82ccd5f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1964,6 +1964,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) mutex_unlock(&oom_adj_mutex); } +#ifdef CONFIG_RV +static void rv_task_fork(struct task_struct *p) +{ + int i; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) + p->rv[i].da_mon.monitoring = false; +} +#else +#define rv_task_fork(p) do {} while (0) +#endif + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2399,6 +2411,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rv_task_fork(p); + rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 3eb5d48ab4f6..8714800e22ad 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,5 +1,16 @@ # SPDX-License-Identifier: GPL-2.0-only # +config DA_MON_EVENTS + bool + +config DA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS + bool + +config DA_MON_EVENTS_ID + select DA_MON_EVENTS + bool + menuconfig RV bool "Runtime Verification" depends on TRACING diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 45cf64eb2600..df6678c86334 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -140,6 +140,11 @@ #include #include +#ifdef CONFIG_DA_MON_EVENTS +#define CREATE_TRACE_POINTS +#include +#endif + #include "rv.h" DEFINE_MUTEX(rv_interface_lock); -- cgit v1.2.3 From ff0aaf671230d409a68fd7400f41e9eb3ac61dd8 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:45 +0200 Subject: Documentation/rv: Add a basic documentation Add the runtime-verification.rst document, explaining the basics of RV and how to use the interface. Link: https://lkml.kernel.org/r/4be7d1a88ab1e2eb0767521e1ab52a149a154bc4.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/index.rst | 1 + Documentation/trace/rv/index.rst | 9 + Documentation/trace/rv/runtime-verification.rst | 231 ++++++++++++++++++++++++ kernel/trace/rv/Kconfig | 3 + kernel/trace/rv/rv.c | 3 + 5 files changed, 247 insertions(+) create mode 100644 Documentation/trace/rv/index.rst create mode 100644 Documentation/trace/rv/runtime-verification.rst (limited to 'kernel/trace') diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index f9b7bcb5a630..2d73e8697523 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -32,3 +32,4 @@ Linux Tracing Technologies sys-t coresight/index user_events + rv/index diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst new file mode 100644 index 000000000000..b54e49b1d0de --- /dev/null +++ b/Documentation/trace/rv/index.rst @@ -0,0 +1,9 @@ +==================== +Runtime Verification +==================== + +.. toctree:: + :maxdepth: 2 + :glob: + + runtime-verification.rst diff --git a/Documentation/trace/rv/runtime-verification.rst b/Documentation/trace/rv/runtime-verification.rst new file mode 100644 index 000000000000..c46b6149470e --- /dev/null +++ b/Documentation/trace/rv/runtime-verification.rst @@ -0,0 +1,231 @@ +==================== +Runtime Verification +==================== + +Runtime Verification (RV) is a lightweight (yet rigorous) method that +complements classical exhaustive verification techniques (such as *model +checking* and *theorem proving*) with a more practical approach for complex +systems. + +Instead of relying on a fine-grained model of a system (e.g., a +re-implementation a instruction level), RV works by analyzing the trace of the +system's actual execution, comparing it against a formal specification of +the system behavior. + +The main advantage is that RV can give precise information on the runtime +behavior of the monitored system, without the pitfalls of developing models +that require a re-implementation of the entire system in a modeling language. +Moreover, given an efficient monitoring method, it is possible execute an +*online* verification of a system, enabling the *reaction* for unexpected +events, avoiding, for example, the propagation of a failure on safety-critical +systems. + +Runtime Monitors and Reactors +============================= + +A monitor is the central part of the runtime verification of a system. The +monitor stands in between the formal specification of the desired (or +undesired) behavior, and the trace of the actual system. + +In Linux terms, the runtime verification monitors are encapsulated inside the +*RV monitor* abstraction. A *RV monitor* includes a reference model of the +system, a set of instances of the monitor (per-cpu monitor, per-task monitor, +and so on), and the helper functions that glue the monitor to the system via +trace, as depicted bellow:: + + Linux +---- RV Monitor ----------------------------------+ Formal + Realm | | Realm + +-------------------+ +----------------+ +-----------------+ + | Linux kernel | | Monitor | | Reference | + | Tracing | -> | Instance(s) | <- | Model | + | (instrumentation) | | (verification) | | (specification) | + +-------------------+ +----------------+ +-----------------+ + | | | + | V | + | +----------+ | + | | Reaction | | + | +--+--+--+-+ | + | | | | | + | | | +-> trace output ? | + +------------------------|--|----------------------+ + | +----> panic ? + +-------> + +In addition to the verification and monitoring of the system, a monitor can +react to an unexpected event. The forms of reaction can vary from logging the +event occurrence to the enforcement of the correct behavior to the extreme +action of taking a system down to avoid the propagation of a failure. + +In Linux terms, a *reactor* is an reaction method available for *RV monitors*. +By default, all monitors should provide a trace output of their actions, +which is already a reaction. In addition, other reactions will be available +so the user can enable them as needed. + +For further information about the principles of runtime verification and +RV applied to Linux: + + Bartocci, Ezio, et al. *Introduction to runtime verification.* In: Lectures on + Runtime Verification. Springer, Cham, 2018. p. 1-33. + + Falcone, Ylies, et al. *A taxonomy for classifying runtime verification tools.* + In: International Conference on Runtime Verification. Springer, Cham, 2018. p. + 241-262. + + De Oliveira, Daniel Bristot. *Automata-based formal analysis and + verification of the real-time Linux kernel.* Ph.D. Thesis, 2020. + +Online RV monitors +================== + +Monitors can be classified as *offline* and *online* monitors. *Offline* +monitor process the traces generated by a system after the events, generally by +reading the trace execution from a permanent storage system. *Online* monitors +process the trace during the execution of the system. Online monitors are said +to be *synchronous* if the processing of an event is attached to the system +execution, blocking the system during the event monitoring. On the other hand, +an *asynchronous* monitor has its execution detached from the system. Each type +of monitor has a set of advantages. For example, *offline* monitors can be +executed on different machines but require operations to save the log to a +file. In contrast, *synchronous online* method can react at the exact moment +a violation occurs. + +Another important aspect regarding monitors is the overhead associated with the +event analysis. If the system generates events at a frequency higher than the +monitor's ability to process them in the same system, only the *offline* +methods are viable. On the other hand, if the tracing of the events incurs +on higher overhead than the simple handling of an event by a monitor, then a +*synchronous online* monitors will incur on lower overhead. + +Indeed, the research presented in: + + De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo Silva. + *Efficient formal verification for the Linux kernel.* In: International + Conference on Software Engineering and Formal Methods. Springer, Cham, 2019. + p. 315-332. + +Shows that for Deterministic Automata models, the synchronous processing of +events in-kernel causes lower overhead than saving the same events to the trace +buffer, not even considering collecting the trace for user-space analysis. +This motivated the development of an in-kernel interface for online monitors. + +For further information about modeling of Linux kernel behavior using automata, +see: + + De Oliveira, Daniel B.; De Oliveira, Romulo S.; Cucinotta, Tommaso. *A thread + synchronization model for the PREEMPT_RT Linux kernel.* Journal of Systems + Architecture, 2020, 107: 101729. + +The user interface +================== + +The user interface resembles the tracing interface (on purpose). It is +currently at "/sys/kernel/tracing/rv/". + +The following files/folders are currently available: + +**available_monitors** + +- Reading list the available monitors, one per line + +For example:: + + # cat available_monitors + wip + wwnr + +**available_reactors** + +- Reading shows the available reactors, one per line. + +For example:: + + # cat available_reactors + nop + panic + printk + +**enabled_monitors**: + +- Reading lists the enabled monitors, one per line +- Writing to it enables a given monitor +- Writing a monitor name with a '!' prefix disables it +- Truncating the file disables all enabled monitors + +For example:: + + # cat enabled_monitors + # echo wip > enabled_monitors + # echo wwnr >> enabled_monitors + # cat enabled_monitors + wip + wwnr + # echo '!wip' >> enabled_monitors + # cat enabled_monitors + wwnr + # echo > enabled_monitors + # cat enabled_monitors + # + +Note that it is possible to enable more than one monitor concurrently. + +**monitoring_on** + +This is an on/off general switcher for monitoring. It resembles the +"tracing_on" switcher in the trace interface. + +- Writing "0" stops the monitoring +- Writing "1" continues the monitoring +- Reading returns the current status of the monitoring + +Note that it does not disable enabled monitors but stop the per-entity +monitors monitoring the events received from the system. + +**reacting_on** + +- Writing "0" prevents reactions for happening +- Writing "1" enable reactions +- Reading returns the current status of the reaction + +**monitors/** + +Each monitor will have its own directory inside "monitors/". There the +monitor-specific files will be presented. The "monitors/" directory resembles +the "events" directory on tracefs. + +For example:: + + # cd monitors/wip/ + # ls + desc enable + # cat desc + wakeup in preemptive per-cpu testing monitor. + # cat enable + 0 + +**monitors/MONITOR/desc** + +- Reading shows a description of the monitor *MONITOR* + +**monitors/MONITOR/enable** + +- Writing "0" disables the *MONITOR* +- Writing "1" enables the *MONITOR* +- Reading return the current status of the *MONITOR* + +**monitors/MONITOR/reactors** + +- List available reactors, with the select reaction for the given *MONITOR* + inside "[]". The default one is the nop (no operation) reactor. +- Writing the name of a reactor enables it to the given MONITOR. + +For example:: + + # cat monitors/wip/reactors + [nop] + panic + printk + # echo panic > monitors/wip/reactors + # cat monitors/wip/reactors + nop + [panic] + printk diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 8714800e22ad..0d9552b406c6 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -22,6 +22,9 @@ menuconfig RV actual execution, comparing it against a formal specification of the system behavior. + For further information, see: + Documentation/trace/rv/runtime-verification.rst + config RV_REACTORS bool "Runtime verification reactors" default y diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index df6678c86334..6c97cc2d754a 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -133,6 +133,9 @@ * auto-generated wakeup in preemptive monitor. * # cat enable * 0 + * + * For further information, see: + * Documentation/trace/rv/runtime-verification.rst */ #include -- cgit v1.2.3 From 8812d21219b9c649dd25eb93915e00939944aeb7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:51 +0200 Subject: rv/monitor: Add the wip monitor skeleton created by dot2k THIS CODE IS NOT LINKED TO THE MAKEFILE. This model does not compile because it lacks the instrumentation part, which will be added next. In the typical case, there will be only one patch, but it was split into two patches for educational purposes. This is the direct output this command line: $ dot2k -d tools/verification/models/wip.dot -t per_cpu Link: https://lkml.kernel.org/r/5eb7a9118917e8a814c5e49853a72fc62be0a101.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.c | 109 +++++++++++++++++++++++++++++++++++++ kernel/trace/rv/monitors/wip/wip.h | 46 ++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 kernel/trace/rv/monitors/wip/wip.c create mode 100644 kernel/trace/rv/monitors/wip/wip.h (limited to 'kernel/trace') diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c new file mode 100644 index 000000000000..79a054ca0cde --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "wip" + +/* + * XXX: include required tracepoint headers, e.g., + * #include + */ +#include + +/* + * This is the self-generated part of the monitor. Generally, there is no need + * to touch this section. + */ +#include "wip.h" + +/* + * Declare the deterministic automata monitor. + * + * The rv monitor reference is needed for the monitor declaration. + */ +struct rv_monitor rv_wip; +DECLARE_DA_MON_PER_CPU(wip, unsigned char); + +/* + * This is the instrumentation part of the monitor. + * + * This is the section where manual work is required. Here the kernel events + * are translated into model's event. + * + */ +static void handle_preempt_disable(void *data, /* XXX: fill header */) +{ + da_handle_event_wip(preempt_disable_wip); +} + +static void handle_preempt_enable(void *data, /* XXX: fill header */) +{ + da_handle_event_wip(preempt_enable_wip); +} + +static void handle_sched_waking(void *data, /* XXX: fill header */) +{ + da_handle_event_wip(sched_waking_wip); +} + +static int enable_wip(void) +{ + int retval; + + retval = da_monitor_init_wip(); + if (retval) + return retval; + + rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); + rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); + rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + + return 0; +} + +static void disable_wip(void) +{ + rv_wip.enabled = 0; + + rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); + rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); + rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + + da_monitor_destroy_wip(); +} + +/* + * This is the monitor register section. + */ +struct rv_monitor rv_wip = { + .name = "wip", + .description = "auto-generated wip", + .enable = enable_wip, + .disable = disable_wip, + .reset = da_monitor_reset_all_wip, + .enabled = 0, +}; + +static int register_wip(void) +{ + rv_register_monitor(&rv_wip); + return 0; +} + +static void unregister_wip(void) +{ + rv_unregister_monitor(&rv_wip); +} + +module_init(register_wip); +module_exit(unregister_wip); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("dot2k: auto-generated"); +MODULE_DESCRIPTION("wip"); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h new file mode 100644 index 000000000000..c1c47e2305ef --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wip automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wip { + preemptive_wip = 0, + non_preemptive_wip, + state_max_wip +}; + +#define INVALID_STATE state_max_wip + +enum events_wip { + preempt_disable_wip = 0, + preempt_enable_wip, + sched_waking_wip, + event_max_wip +}; + +struct automaton_wip { + char *state_names[state_max_wip]; + char *event_names[event_max_wip]; + unsigned char function[state_max_wip][event_max_wip]; + unsigned char initial_state; + bool final_states[state_max_wip]; +}; + +struct automaton_wip automaton_wip = { + .state_names = { + "preemptive", + "non_preemptive" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "sched_waking" + }, + .function = { + { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, preemptive_wip, non_preemptive_wip }, + }, + .initial_state = preemptive_wip, + .final_states = { 1, 0 }, +}; -- cgit v1.2.3 From 10bde81c74863472047f31304064018c40f488ee Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:52 +0200 Subject: rv/monitor: Add the wip monitor The wakeup in preemptive (wip) monitor verifies if the wakeup events always take place with preemption disabled: | | v #==================# H preemptive H <+ #==================# | | | | preempt_disable | preempt_enable v | sched_waking +------------------+ | +--------------- | | | | | non_preemptive | | +--------------> | | -+ +------------------+ The wakeup event always takes place with preemption disabled because of the scheduler synchronization. However, because the preempt_count and its trace event are not atomic with regard to interrupts, some inconsistencies might happen. The documentation illustrates one of these cases. Link: https://lkml.kernel.org/r/c98ca678df81115fddc04921b3c79720c836b18f.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/rv/index.rst | 1 + Documentation/trace/rv/monitor_wip.rst | 55 ++++++++++++++++++++++++++++++++++ include/trace/events/rv.h | 10 +++++++ kernel/trace/rv/Kconfig | 13 ++++++++ kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/wip/wip.c | 51 ++++++++++--------------------- tools/verification/models/wip.dot | 16 ++++++++++ 7 files changed, 111 insertions(+), 36 deletions(-) create mode 100644 Documentation/trace/rv/monitor_wip.rst create mode 100644 tools/verification/models/wip.dot (limited to 'kernel/trace') diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst index db2ae3f90b90..4cb71ed628b8 100644 --- a/Documentation/trace/rv/index.rst +++ b/Documentation/trace/rv/index.rst @@ -10,3 +10,4 @@ Runtime Verification deterministic_automata.rst da_monitor_synthesis.rst da_monitor_instrumentation.rst + monitor_wip.rst diff --git a/Documentation/trace/rv/monitor_wip.rst b/Documentation/trace/rv/monitor_wip.rst new file mode 100644 index 000000000000..a95763438c48 --- /dev/null +++ b/Documentation/trace/rv/monitor_wip.rst @@ -0,0 +1,55 @@ +Monitor wip +=========== + +- Name: wip - wakeup in preemptive +- Type: per-cpu deterministic automaton +- Author: Daniel Bristot de Oliveira + +Description +----------- + +The wakeup in preemptive (wip) monitor is a sample per-cpu monitor +that verifies if the wakeup events always take place with +preemption disabled:: + + | + | + v + #==================# + H preemptive H <+ + #==================# | + | | + | preempt_disable | preempt_enable + v | + sched_waking +------------------+ | + +--------------- | | | + | | non_preemptive | | + +--------------> | | -+ + +------------------+ + +The wakeup event always takes place with preemption disabled because +of the scheduler synchronization. However, because the preempt_count +and its trace event are not atomic with regard to interrupts, some +inconsistencies might happen. For example:: + + preempt_disable() { + __preempt_count_add(1) + -------> smp_apic_timer_interrupt() { + preempt_disable() + do not trace (preempt count >= 1) + + wake up a thread + + preempt_enable() + do not trace (preempt count >= 1) + } + <------ + trace_preempt_disable(); + } + +This problem was reported and discussed here: + https://lore.kernel.org/r/cover.1559051152.git.bristot@redhat.com/ + +Specification +------------- +Grapviz Dot file in tools/verification/models/wip.dot diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h index 20a2e09c6416..e972f27d8df3 100644 --- a/include/trace/events/rv.h +++ b/include/trace/events/rv.h @@ -56,6 +56,16 @@ DECLARE_EVENT_CLASS(error_da_monitor, __entry->event, __entry->state) ); + +#ifdef CONFIG_RV_MON_WIP +DEFINE_EVENT(event_da_monitor, event_wip, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_wip, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_WIP */ #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ #ifdef CONFIG_DA_MON_EVENTS_ID diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 0d9552b406c6..e50f3346164a 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -25,6 +25,19 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst + config RV_REACTORS bool "Runtime verification reactors" default y diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 8944274d9b41..b41109d2750a 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 79a054ca0cde..83cace53b9fa 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -10,44 +10,26 @@ #define MODULE_NAME "wip" -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ #include +#include +#include -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ #include "wip.h" -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ struct rv_monitor rv_wip; DECLARE_DA_MON_PER_CPU(wip, unsigned char); -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. - * - */ -static void handle_preempt_disable(void *data, /* XXX: fill header */) +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { da_handle_event_wip(preempt_disable_wip); } -static void handle_preempt_enable(void *data, /* XXX: fill header */) +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_wip(preempt_enable_wip); + da_handle_start_event_wip(preempt_enable_wip); } -static void handle_sched_waking(void *data, /* XXX: fill header */) +static void handle_sched_waking(void *data, struct task_struct *task) { da_handle_event_wip(sched_waking_wip); } @@ -60,9 +42,9 @@ static int enable_wip(void) if (retval) return retval; - rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); - rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); - rv_attach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("wip", sched_waking, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable); return 0; } @@ -71,19 +53,16 @@ static void disable_wip(void) { rv_wip.enabled = 0; - rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_disable); - rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_preempt_enable); - rv_detach_trace_probe("wip", /* XXX: tracepoint */, handle_sched_waking); + rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); da_monitor_destroy_wip(); } -/* - * This is the monitor register section. - */ struct rv_monitor rv_wip = { .name = "wip", - .description = "auto-generated wip", + .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, .disable = disable_wip, .reset = da_monitor_reset_all_wip, @@ -105,5 +84,5 @@ module_init(register_wip); module_exit(unregister_wip); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("wip"); +MODULE_AUTHOR("Daniel Bristot de Oliveira "); +MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor."); diff --git a/tools/verification/models/wip.dot b/tools/verification/models/wip.dot new file mode 100644 index 000000000000..2a53a9700a89 --- /dev/null +++ b/tools/verification/models/wip.dot @@ -0,0 +1,16 @@ +digraph state_automaton { + {node [shape = circle] "non_preemptive"}; + {node [shape = plaintext, style=invis, label=""] "__init_preemptive"}; + {node [shape = doublecircle] "preemptive"}; + {node [shape = circle] "preemptive"}; + "__init_preemptive" -> "preemptive"; + "non_preemptive" [label = "non_preemptive"]; + "non_preemptive" -> "non_preemptive" [ label = "sched_waking" ]; + "non_preemptive" -> "preemptive" [ label = "preempt_enable" ]; + "preemptive" [label = "preemptive"]; + "preemptive" -> "non_preemptive" [ label = "preempt_disable" ]; + { rank = min ; + "__init_preemptive"; + "preemptive"; + } +} -- cgit v1.2.3 From ccc319dcb450d57b7befe924453d06804d83ba73 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:53 +0200 Subject: rv/monitor: Add the wwnr monitor Per task wakeup while not running (wwnr) monitor. This model is broken, the reason is that a task can be running in the processor without being set as RUNNABLE. Think about a task about to sleep: 1: set_current_state(TASK_UNINTERRUPTIBLE); 2: schedule(); And then imagine an IRQ happening in between the lines one and two, waking the task up. BOOM, the wakeup will happen while the task is running. Q: Why do we need this model, so? A: To test the reactors. Link: https://lkml.kernel.org/r/473c0fc39967250fdebcff8b620311c11dccad30.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/rv/index.rst | 1 + Documentation/trace/rv/monitor_wwnr.rst | 45 +++++++++++++++++ include/trace/events/rv.h | 12 +++++ kernel/trace/rv/Kconfig | 12 +++++ kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/wwnr/wwnr.c | 87 +++++++++++++++++++++++++++++++++ kernel/trace/rv/monitors/wwnr/wwnr.h | 46 +++++++++++++++++ tools/verification/models/wwnr.dot | 16 ++++++ 8 files changed, 220 insertions(+) create mode 100644 Documentation/trace/rv/monitor_wwnr.rst create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr.c create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr.h create mode 100644 tools/verification/models/wwnr.dot (limited to 'kernel/trace') diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst index 4cb71ed628b8..15fa966102c0 100644 --- a/Documentation/trace/rv/index.rst +++ b/Documentation/trace/rv/index.rst @@ -11,3 +11,4 @@ Runtime Verification da_monitor_synthesis.rst da_monitor_instrumentation.rst monitor_wip.rst + monitor_wwnr.rst diff --git a/Documentation/trace/rv/monitor_wwnr.rst b/Documentation/trace/rv/monitor_wwnr.rst new file mode 100644 index 000000000000..80f1777b85aa --- /dev/null +++ b/Documentation/trace/rv/monitor_wwnr.rst @@ -0,0 +1,45 @@ +Monitor wwnr +============ + +- Name: wwrn - wakeup while not running +- Type: per-task deterministic automaton +- Author: Daniel Bristot de Oliveira + +Description +----------- + +This is a per-task sample monitor, with the following +definition:: + + | + | + v + wakeup +-------------+ + +--------- | | + | | not_running | + +--------> | | <+ + +-------------+ | + | | + | switch_in | switch_out + v | + +-------------+ | + | running | -+ + +-------------+ + +This model is borken, the reason is that a task can be running +in the processor without being set as RUNNABLE. Think about a +task about to sleep:: + + 1: set_current_state(TASK_UNINTERRUPTIBLE); + 2: schedule(); + +And then imagine an IRQ happening in between the lines one and two, +waking the task up. BOOM, the wakeup will happen while the task is +running. + +- Why do we need this model, so? +- To test the reactors. + +Specification +------------- +Grapviz Dot file in tools/verification/models/wwnr.dot diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h index e972f27d8df3..56592da9301c 100644 --- a/include/trace/events/rv.h +++ b/include/trace/events/rv.h @@ -122,6 +122,18 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, __entry->event, __entry->state) ); + +#ifdef CONFIG_RV_MON_WWNR +/* id is the pid of the task */ +DEFINE_EVENT(event_da_monitor_id, event_wwnr, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_wwnr, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_WWNR */ + #endif /* CONFIG_DA_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index e50f3346164a..b259d6e8dc7c 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -38,6 +38,18 @@ config RV_MON_WIP For further information, see: Documentation/trace/rv/monitor_wip.rst +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst + config RV_REACTORS bool "Runtime verification reactors" default y diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index b41109d2750a..af0ff9a46418 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o +obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c new file mode 100644 index 000000000000..599225d9cf38 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "wwnr" + +#include +#include + +#include "wwnr.h" + +struct rv_monitor rv_wwnr; +DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); + +static void handle_switch(void *data, bool preempt, struct task_struct *p, + struct task_struct *n, unsigned int prev_state) +{ + /* start monitoring only after the first suspension */ + if (prev_state == TASK_INTERRUPTIBLE) + da_handle_start_event_wwnr(p, switch_out_wwnr); + else + da_handle_event_wwnr(p, switch_out_wwnr); + + da_handle_event_wwnr(n, switch_in_wwnr); +} + +static void handle_wakeup(void *data, struct task_struct *p) +{ + da_handle_event_wwnr(p, wakeup_wwnr); +} + +static int enable_wwnr(void) +{ + int retval; + + retval = da_monitor_init_wwnr(); + if (retval) + return retval; + + rv_attach_trace_probe("wwnr", sched_switch, handle_switch); + rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + return 0; +} + +static void disable_wwnr(void) +{ + rv_wwnr.enabled = 0; + + rv_detach_trace_probe("wwnr", sched_switch, handle_switch); + rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + da_monitor_destroy_wwnr(); +} + +struct rv_monitor rv_wwnr = { + .name = "wwnr", + .description = "wakeup while not running per-task testing model.", + .enable = enable_wwnr, + .disable = disable_wwnr, + .reset = da_monitor_reset_all_wwnr, + .enabled = 0, +}; + +static int register_wwnr(void) +{ + rv_register_monitor(&rv_wwnr); + return 0; +} + +static void unregister_wwnr(void) +{ + rv_unregister_monitor(&rv_wwnr); +} + +module_init(register_wwnr); +module_exit(unregister_wwnr); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira "); +MODULE_DESCRIPTION("wwnr: wakeup while not running monitor"); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h new file mode 100644 index 000000000000..d1afe55cdd4c --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wwnr automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wwnr { + not_running_wwnr = 0, + running_wwnr, + state_max_wwnr +}; + +#define INVALID_STATE state_max_wwnr + +enum events_wwnr { + switch_in_wwnr = 0, + switch_out_wwnr, + wakeup_wwnr, + event_max_wwnr +}; + +struct automaton_wwnr { + char *state_names[state_max_wwnr]; + char *event_names[event_max_wwnr]; + unsigned char function[state_max_wwnr][event_max_wwnr]; + unsigned char initial_state; + bool final_states[state_max_wwnr]; +}; + +struct automaton_wwnr automaton_wwnr = { + .state_names = { + "not_running", + "running" + }, + .event_names = { + "switch_in", + "switch_out", + "wakeup" + }, + .function = { + { running_wwnr, INVALID_STATE, not_running_wwnr }, + { INVALID_STATE, not_running_wwnr, INVALID_STATE }, + }, + .initial_state = not_running_wwnr, + .final_states = { 1, 0 }, +}; diff --git a/tools/verification/models/wwnr.dot b/tools/verification/models/wwnr.dot new file mode 100644 index 000000000000..1b206e83129c --- /dev/null +++ b/tools/verification/models/wwnr.dot @@ -0,0 +1,16 @@ +digraph state_automaton { + {node [shape = plaintext, style=invis, label=""] "__init_not_running"}; + {node [shape = ellipse] "not_running"}; + {node [shape = plaintext] "not_running"}; + {node [shape = plaintext] "running"}; + "__init_not_running" -> "not_running"; + "not_running" [label = "not_running", color = green3]; + "not_running" -> "not_running" [ label = "wakeup" ]; + "not_running" -> "running" [ label = "switch_in" ]; + "running" [label = "running"]; + "running" -> "not_running" [ label = "switch_out" ]; + { rank = min ; + "__init_not_running"; + "not_running"; + } +} -- cgit v1.2.3 From 135b881ea88566f27dd4acc5d2ed83ad418a3a69 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:54 +0200 Subject: rv/reactor: Add the printk reactor A reactor that printks the reaction message. Link: https://lkml.kernel.org/r/b65f18a7fd6dc6659a3008fd7b7392de3465d47b.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 8 ++++++++ kernel/trace/rv/Makefile | 3 ++- kernel/trace/rv/reactor_printk.c | 42 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/rv/reactor_printk.c (limited to 'kernel/trace') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b259d6e8dc7c..e82d5015e6ab 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -60,3 +60,11 @@ config RV_REACTORS on the model's execution. By default, the monitors have tracing reactions, printing the monitor output via tracepoints, but other reactions can be added (on-demand) via this interface. + +config RV_REACT_PRINTK + bool "Printk reactor" + depends on RV_REACTORS + default y + help + Enables the printk reactor. The printk reactor emits a printk() + message if an exception is found. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index af0ff9a46418..a13c750a35c1 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_RV) += rv.o -obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c new file mode 100644 index 000000000000..31899f953af4 --- /dev/null +++ b/kernel/trace/rv/reactor_printk.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Printk RV reactor: + * Prints the exception msg to the kernel message log. + */ +#include +#include +#include +#include +#include +#include + +static void rv_printk_reaction(char *msg) +{ + printk_deferred(msg); +} + +static struct rv_reactor rv_printk = { + .name = "printk", + .description = "prints the exception msg to the kernel message log.", + .react = rv_printk_reaction +}; + +static int register_react_printk(void) +{ + rv_register_reactor(&rv_printk); + return 0; +} + +static void unregister_react_printk(void) +{ + rv_unregister_reactor(&rv_printk); +} + +module_init(register_react_printk); +module_exit(unregister_react_printk); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit."); -- cgit v1.2.3 From e88043c0ac16f19960048372dcffc6df7c05c5b8 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 29 Jul 2022 11:38:55 +0200 Subject: rv/reactor: Add the panic reactor Sample reactor that panics the system when an exception is found. This is useful both to capture a vmcore, or to fail-safe a critical system. Link: https://lkml.kernel.org/r/729aae3aba95f35738b8f8180e626d747d1d9da2.1659052063.git.bristot@kernel.org Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Cc: Marco Elver Cc: Dmitry Vyukov Cc: "Paul E. McKenney" Cc: Shuah Khan Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Clark Williams Cc: Tao Zhou Cc: Randy Dunlap Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 8 ++++++++ kernel/trace/rv/Makefile | 1 + kernel/trace/rv/reactor_panic.c | 43 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 kernel/trace/rv/reactor_panic.c (limited to 'kernel/trace') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index e82d5015e6ab..831779607e84 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -68,3 +68,11 @@ config RV_REACT_PRINTK help Enables the printk reactor. The printk reactor emits a printk() message if an exception is found. + +config RV_REACT_PANIC + bool "Panic reactor" + depends on RV_REACTORS + default y + help + Enables the panic reactor. The panic reactor emits a printk() + message if an exception is found and panic()s the system. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index a13c750a35c1..963d14875b45 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o +obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c new file mode 100644 index 000000000000..b698d05dd069 --- /dev/null +++ b/kernel/trace/rv/reactor_panic.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira + * + * Panic RV reactor: + * Prints the exception msg to the kernel message log and panic(). + */ + +#include +#include +#include +#include +#include +#include + +static void rv_panic_reaction(char *msg) +{ + panic(msg); +} + +static struct rv_reactor rv_panic = { + .name = "panic", + .description = "panic the system if an exception is found.", + .react = rv_panic_reaction +}; + +static int register_react_panic(void) +{ + rv_register_reactor(&rv_panic); + return 0; +} + +static void unregister_react_panic(void) +{ + rv_unregister_reactor(&rv_panic); +} + +module_init(register_react_panic); +module_exit(unregister_react_panic); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found."); -- cgit v1.2.3 From 59927cbe3f30c5ed7fc5d33487ef06611394a548 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Wed, 20 Jul 2022 10:46:48 +0800 Subject: tracing: Use free_trace_buffer() in allocate_trace_buffers() In allocate_trace_buffers(), if allocating tr->max_buffer fails, we can directly call free_trace_buffer to free tr->array_buffer. Link: https://lkml.kernel.org/r/65f0702d-07f6-08de-2a07-4c50af56a67b@huawei.com Signed-off-by: Zhiqiang Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 301305ec134b..27febd4ee33e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9101,6 +9101,16 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size return 0; } +static void free_trace_buffer(struct array_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; @@ -9113,10 +9123,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { - ring_buffer_free(tr->array_buffer.buffer); - tr->array_buffer.buffer = NULL; - free_percpu(tr->array_buffer.data); - tr->array_buffer.data = NULL; + free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; @@ -9131,16 +9138,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } -static void free_trace_buffer(struct array_buffer *buf) -{ - if (buf->buffer) { - ring_buffer_free(buf->buffer); - buf->buffer = NULL; - free_percpu(buf->data); - buf->data = NULL; - } -} - static void free_trace_buffers(struct trace_array *tr) { if (!tr) -- cgit v1.2.3 From 2f63e5d2e391837c70741311b5b70d2fbd15d138 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 1 Aug 2022 11:32:15 +0900 Subject: tracing/eprobe: Show syntax error logs in error_log file Show the syntax errors for event probes in error_log file as same as other dynamic events, so that user can understand what is the problem. Link: https://lkml.kernel.org/r/165932113556.2850673.3483079297896607612.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 11 +++++++++-- kernel/trace/trace_probe.h | 5 ++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a30f21499e81..4a0e9d927443 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -839,8 +839,11 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { ret = trace_eprobe_tp_arg_update(ep, i); + if (ret) + trace_probe_log_err(0, BAD_ATTACH_ARG); + } return ret; } @@ -880,8 +883,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); - if (!sys_event || !sys_name) + if (!sys_event || !sys_name) { + trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; + } if (!event) { strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); @@ -896,6 +901,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (IS_ERR(ep)) { ret = PTR_ERR(ep); + if (ret == -ENODEV) + trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); ep = NULL; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 92cc149af0fd..3b3869ae8cfd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -442,7 +442,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(FAIL_REG_PROBE, "Failed to register probe event"),\ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ - C(SAME_PROBE, "There is already the exact same probe event"), + C(SAME_PROBE, "There is already the exact same probe event"),\ + C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ + C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ + C(BAD_ATTACH_ARG, "Attached event does not have this field"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From f1a15b977ff864513133ecb611eb28603d32c1b4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Aug 2022 17:33:48 +0300 Subject: rv: Unlock on error path in rv_unregister_reactor() Unlock the "rv_interface_lock" mutex before returning. Link: https://lkml.kernel.org/r/YuvYzNfGMgV+PIhd@kili Fixes: 04acadcb4453 ("rv: Add runtime reactors interface") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_reactors.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index a6522c196382..6aae106695b6 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -329,6 +329,7 @@ int rv_register_reactor(struct rv_reactor *reactor) int rv_unregister_reactor(struct rv_reactor *reactor) { struct rv_reactor_def *ptr, *next; + int ret = 0; mutex_lock(&rv_interface_lock); @@ -343,13 +344,14 @@ int rv_unregister_reactor(struct rv_reactor *reactor) ptr->reactor->name, ptr->counter); printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", ptr->reactor->name); - return -EBUSY; + ret = -EBUSY; + break; } } } mutex_unlock(&rv_interface_lock); - return 0; + return ret; } /* -- cgit v1.2.3 From d8a64313c171464aedd6289378a51c8f0f524acb Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 11 Aug 2022 09:17:34 +0200 Subject: tracing: React to error return from traceprobe_parse_event_name() The function traceprobe_parse_event_name() may set the first two function arguments to a non-null value and still return -EINVAL to indicate an unsuccessful completion of the function. Hence, it is not sufficient to just check the result of the two function arguments for being not null, but the return value also needs to be checked. Commit 95c104c378dc ("tracing: Auto generate event name when creating a group of events") changed the error-return-value checking of the second traceprobe_parse_event_name() invocation in __trace_eprobe_create() and removed checking the return value to jump to the error handling case. Reinstate using the return value in the error-return-value checking. Link: https://lkml.kernel.org/r/20220811071734.20700-1-lukas.bulwahn@gmail.com Fixes: 95c104c378dc ("tracing: Auto generate event name when creating a group of events") Acked-by: Linyu Yuan Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 4a0e9d927443..550671985fd1 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -883,7 +883,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); - if (!sys_event || !sys_name) { + if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; } -- cgit v1.2.3 From 7249921d94ff64f67b733eca0b68853a62032b3d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 16 Aug 2022 19:28:17 -0400 Subject: tracing/perf: Fix double put of trace event when init fails If in perf_trace_event_init(), the perf_trace_event_open() fails, then it will call perf_trace_event_unreg() which will not only unregister the perf trace event, but will also call the put() function of the tp_event. The problem here is that the trace_event_try_get_ref() is called by the caller of perf_trace_event_init() and if perf_trace_event_init() returns a failure, it will then call trace_event_put(). But since the perf_trace_event_unreg() already called the trace_event_put() function, it triggers a WARN_ON(). WARNING: CPU: 1 PID: 30309 at kernel/trace/trace_dynevent.c:46 trace_event_dyn_put_ref+0x15/0x20 If perf_trace_event_reg() does not call the trace_event_try_get_ref() then the perf_trace_event_unreg() should not be calling trace_event_put(). This breaks symmetry and causes bugs like these. Pull out the trace_event_put() from perf_trace_event_unreg() and call it in the locations that perf_trace_event_unreg() is called. This not only fixes this bug, but also brings back the proper symmetry of the reg/unreg vs get/put logic. Link: https://lore.kernel.org/all/cover.1660347763.git.kjlx@templeofstupid.com/ Link: https://lkml.kernel.org/r/20220816192817.43d5e17f@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 1d18538e6a092 ("tracing: Have dynamic events have a ref counter") Reported-by: Krister Johansen Reviewed-by: Krister Johansen Tested-by: Krister Johansen Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_event_perf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a114549720d6..61e3a2620fa3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } @@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); @@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } -- cgit v1.2.3 From c3b0f72e805f0801f05fa2aa52011c4bfc694c44 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Thu, 18 Aug 2022 11:26:59 +0800 Subject: ftrace: Fix NULL pointer dereference in is_ftrace_trampoline when ftrace is dead ftrace_startup does not remove ops from ftrace_ops_list when ftrace_startup_enable fails: register_ftrace_function ftrace_startup __register_ftrace_function ... add_ftrace_ops(&ftrace_ops_list, ops) ... ... ftrace_startup_enable // if ftrace failed to modify, ftrace_disabled is set to 1 ... return 0 // ops is in the ftrace_ops_list. When ftrace_disabled = 1, unregister_ftrace_function simply returns without doing anything: unregister_ftrace_function ftrace_shutdown if (unlikely(ftrace_disabled)) return -ENODEV; // return here, __unregister_ftrace_function is not executed, // as a result, ops is still in the ftrace_ops_list __unregister_ftrace_function ... If ops is dynamically allocated, it will be free later, in this case, is_ftrace_trampoline accesses NULL pointer: is_ftrace_trampoline ftrace_ops_trampoline do_for_each_ftrace_op(op, ftrace_ops_list) // OOPS! op may be NULL! Syzkaller reports as follows: [ 1203.506103] BUG: kernel NULL pointer dereference, address: 000000000000010b [ 1203.508039] #PF: supervisor read access in kernel mode [ 1203.508798] #PF: error_code(0x0000) - not-present page [ 1203.509558] PGD 800000011660b067 P4D 800000011660b067 PUD 130fb8067 PMD 0 [ 1203.510560] Oops: 0000 [#1] SMP KASAN PTI [ 1203.511189] CPU: 6 PID: 29532 Comm: syz-executor.2 Tainted: G B W 5.10.0 #8 [ 1203.512324] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1203.513895] RIP: 0010:is_ftrace_trampoline+0x26/0xb0 [ 1203.514644] Code: ff eb d3 90 41 55 41 54 49 89 fc 55 53 e8 f2 00 fd ff 48 8b 1d 3b 35 5d 03 e8 e6 00 fd ff 48 8d bb 90 00 00 00 e8 2a 81 26 00 <48> 8b ab 90 00 00 00 48 85 ed 74 1d e8 c9 00 fd ff 48 8d bb 98 00 [ 1203.518838] RSP: 0018:ffffc900012cf960 EFLAGS: 00010246 [ 1203.520092] RAX: 0000000000000000 RBX: 000000000000007b RCX: ffffffff8a331866 [ 1203.521469] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000000010b [ 1203.522583] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffff8df18b07 [ 1203.523550] R10: fffffbfff1be3160 R11: 0000000000000001 R12: 0000000000478399 [ 1203.524596] R13: 0000000000000000 R14: ffff888145088000 R15: 0000000000000008 [ 1203.525634] FS: 00007f429f5f4700(0000) GS:ffff8881daf00000(0000) knlGS:0000000000000000 [ 1203.526801] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1203.527626] CR2: 000000000000010b CR3: 0000000170e1e001 CR4: 00000000003706e0 [ 1203.528611] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1203.529605] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Therefore, when ftrace_startup_enable fails, we need to rollback registration process and remove ops from ftrace_ops_list. Link: https://lkml.kernel.org/r/20220818032659.56209-1-yangjihong1@huawei.com Suggested-by: Steven Rostedt Signed-off-by: Yang Jihong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 601ccf1b2f09..4baa99363b16 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2937,6 +2937,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_startup_enable(command); + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + ops->flags &= ~FTRACE_OPS_FL_ADDING; return 0; -- cgit v1.2.3 From 2673c60ee67e71f2ebe34386e62d348f71edee47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:17 -0400 Subject: tracing/eprobes: Do not allow eprobes to use $stack, or % for regs While playing with event probes (eprobes), I tried to see what would happen if I attempted to retrieve the instruction pointer (%rip) knowing that event probes do not use pt_regs. The result was: BUG: kernel NULL pointer dereference, address: 0000000000000024 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1847 Comm: trace-cmd Not tainted 5.19.0-rc5-test+ #309 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:get_event_field.isra.0+0x0/0x50 Code: ff 48 c7 c7 c0 8f 74 a1 e8 3d 8b f5 ff e8 88 09 f6 ff 4c 89 e7 e8 50 6a 13 00 48 89 ef 5b 5d 41 5c 41 5d e9 42 6a 13 00 66 90 <48> 63 47 24 8b 57 2c 48 01 c6 8b 47 28 83 f8 02 74 0e 83 f8 04 74 RSP: 0018:ffff916c394bbaf0 EFLAGS: 00010086 RAX: ffff916c854041d8 RBX: ffff916c8d9fbf50 RCX: ffff916c255d2000 RDX: 0000000000000000 RSI: ffff916c255d2008 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff916c3a2a0c08 R09: ffff916c394bbda8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff916c854041d8 R13: ffff916c854041b0 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff916c9ea40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000024 CR3: 000000011b60a002 CR4: 00000000001706e0 Call Trace: get_eprobe_size+0xb4/0x640 ? __mod_node_page_state+0x72/0xc0 __eprobe_trace_func+0x59/0x1a0 ? __mod_lruvec_page_state+0xaa/0x1b0 ? page_remove_file_rmap+0x14/0x230 ? page_remove_rmap+0xda/0x170 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x18f/0x240 trace_event_raw_event_sched_wakeup_template+0x7a/0xb0 try_to_wake_up+0x260/0x4c0 __wake_up_common+0x80/0x180 __wake_up_common_lock+0x7c/0xc0 do_notify_parent+0x1c9/0x2a0 exit_notify+0x1a9/0x220 do_exit+0x2ba/0x450 do_group_exit+0x2d/0x90 __x64_sys_exit_group+0x14/0x20 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Obviously this is not the desired result. Move the testing for TPARG_FL_TPOINT which is only used for event probes to the top of the "$" variable check, as all the other variables are not used for event probes. Also add a check in the register parsing "%" to fail if an event probe is used. Link: https://lkml.kernel.org/r/20220820134400.564426983@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 850a88abd33b..dec657af363c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -283,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (strcmp(arg, "retval") == 0) { + if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; + } else if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; } else { @@ -323,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif - } else if (flags & TPARG_FL_TPOINT) { - if (code->data) - return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -384,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ + if (flags & TPARG_FL_TPOINT) { + /* eprobes do not handle registers */ + trace_probe_log_err(offs, BAD_VAR); + break; + } ret = regs_query_register_offset(arg + 1); if (ret >= 0) { code->op = FETCH_OP_REG; -- cgit v1.2.3 From 02333de90e5945e2fe7fc75b15b4eb9aee187f0a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:18 -0400 Subject: tracing/eprobes: Do not hardcode $comm as a string The variable $comm is hard coded as a string, which is true for both kprobes and uprobes, but for event probes (eprobes) it is a field name. In most cases the "comm" field would be a string, but there's no guarantee of that fact. Do not assume that comm is a string. Not to mention, it currently forces comm fields to fault, as string processing for event probes is currently broken. Link: https://lkml.kernel.org/r/20220820134400.756152112@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dec657af363c..4daabbb8b772 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -622,9 +622,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* * Since $comm and immediate string can not be dereferenced, - * we can find those by strcmp. + * we can find those by strcmp. But ignore for eprobes. */ - if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { + if (!(flags & TPARG_FL_TPOINT) && + (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; -- cgit v1.2.3 From f04dec93466a0481763f3b56cdadf8076e28bfbf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:19 -0400 Subject: tracing/eprobes: Fix reading of string fields Currently when an event probe (eprobe) hooks to a string field, it does not display it as a string, but instead as a number. This makes the field rather useless. Handle the different kinds of strings, dynamic, static, relational/dynamic etc. Now when a string field is used, the ":string" type can be used to display it: echo "e:sw sched/sched_switch comm=$next_comm:string" > dynamic_events Link: https://lkml.kernel.org/r/20220820134400.959640191@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 550671985fd1..a1d3423ab74f 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -311,6 +311,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) addr = rec + field->offset; + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + switch (field->size) { case 1: if (field->is_signed) -- cgit v1.2.3 From 6a832ec3d680b3a4f4fad5752672827d71bae501 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:20 -0400 Subject: tracing/eprobes: Have event probes be consistent with kprobes and uprobes Currently, if a symbol "@" is attempted to be used with an event probe (eprobes), it will cause a NULL pointer dereference crash. Both kprobes and uprobes can reference data other than the main registers. Such as immediate address, symbols and the current task name. Have eprobes do the same thing. For "comm", if "comm" is used and the event being attached to does not have the "comm" field, then make it the "$comm" that kprobes has. This is consistent to the way histograms and filters work. Link: https://lkml.kernel.org/r/20220820134401.136924220@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 70 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a1d3423ab74f..1783e3478912 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,6 +227,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) struct probe_arg *parg = &ep->tp.args[i]; struct ftrace_event_field *field; struct list_head *head; + int ret = -ENOENT; head = trace_get_fields(ep->event); list_for_each_entry(field, head, link) { @@ -236,9 +237,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) return 0; } } + + /* + * Argument not found on event. But allow for comm and COMM + * to be used to get the current->comm. + */ + if (strcmp(parg->code->data, "COMM") == 0 || + strcmp(parg->code->data, "comm") == 0) { + parg->code->op = FETCH_OP_COMM; + ret = 0; + } + kfree(parg->code->data); parg->code->data = NULL; - return -ENOENT; + return ret; } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -363,16 +375,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) static int get_eprobe_size(struct trace_probe *tp, void *rec) { + struct fetch_insn *code; struct probe_arg *arg; int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { + if (arg->dynamic) { unsigned long val; - val = get_event_field(arg->code, rec); - len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); if (len > 0) ret += len; } @@ -390,8 +424,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { unsigned long val; - val = get_event_field(code, rec); - return process_fetch_insn_bottom(code + 1, val, dest, base); + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) @@ -866,6 +920,10 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ trace_probe_log_err(0, BAD_ATTACH_ARG); } + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); + return ret; } -- cgit v1.2.3 From ab8384442ee512fc0fc72deeb036110843d0e7ff Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:21 -0400 Subject: tracing/probes: Have kprobes and uprobes use $COMM too Both $comm and $COMM can be used to get current->comm in eprobes and the filtering and histogram logic. Make kprobes and uprobes consistent in this regard and allow both $comm and $COMM as well. Currently kprobes and uprobes only handle $comm, which is inconsistent with the other utilities, and can be confusing to users. Link: https://lkml.kernel.org/r/20220820134401.317014913@goodmis.org Link: https://lore.kernel.org/all/20220820220442.776e1ddaf8836e82edb34d01@kernel.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4daabbb8b772..36dff277de46 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -314,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else goto inval_var; - } else if (strcmp(arg, "comm") == 0) { + } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == @@ -625,7 +625,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * we can find those by strcmp. But ignore for eprobes. */ if (!(flags & TPARG_FL_TPOINT) && - (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0)) { + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; -- cgit v1.2.3 From b2380577d4fe1c0ef3fa50417f1e441c016e4cbe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 20 Aug 2022 09:43:22 -0400 Subject: tracing: Have filter accept "common_cpu" to be consistent Make filtering consistent with histograms. As "cpu" can be a field of an event, allow for "common_cpu" to keep it from being confused with the "cpu" field of the event. Link: https://lkml.kernel.org/r/20220820134401.513062765@goodmis.org Link: https://lore.kernel.org/all/20220820220920.e42fa32b70505b1904f0a0ad@kernel.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Fixes: 1e3bac71c5053 ("tracing/histogram: Rename "cpu" to "common_cpu"") Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 181f08186d32..0356cae0cf74 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -176,6 +176,7 @@ static int trace_define_generic_fields(void) __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); -- cgit v1.2.3 From 123d6455771ec577ce65f8d1bda548fb0eb7ef21 Mon Sep 17 00:00:00 2001 From: Wang Jingjin Date: Mon, 1 Aug 2022 16:47:45 +0800 Subject: ftrace: Fix build warning for ops_references_rec() not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The change that made IPMODIFY and DIRECT ops work together needed access to the ops_references_ip() function, which it pulled out of the module only code. But now if both CONFIG_MODULES and CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is not set, we get the below warning: ‘ops_references_rec’ defined but not used. Since ops_references_rec() only calls ops_references_ip() replace the usage of ops_references_rec() with ops_references_ip() and encompass the function with an #ifdef of DIRECT_CALLS || MODULES being defined. Link: https://lkml.kernel.org/r/20220801084745.1187987-1-wangjingjin1@huawei.com Fixes: 53cd885bc5c3 ("ftrace: Allow IPMODIFY and DIRECT ops on the same function") Signed-off-by: Wang Jingjin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 79 +++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 46 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 126c769d36c3..439e2ab6905e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,8 +1861,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } -static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); - /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -3118,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the given ip. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_ip(struct ftrace_ops *ops, unsigned long ip) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) - return false; - - return true; -} - -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - return ops_references_ip(ops, rec->ip); -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -6822,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6834,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) -- cgit v1.2.3 From baf2c002402702accc30dcc8f19199f303638306 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 23 Aug 2022 17:20:29 +0200 Subject: rv/monitors: Make monitor's automata definition static Monitor's automata definition is only used locally, so make them static for all existing monitors. Link: https://lore.kernel.org/all/202208210332.gtHXje45-lkp@intel.com Link: https://lore.kernel.org/all/202208210358.6HH3OrVs-lkp@intel.com Link: https://lkml.kernel.org/r/a50e27c3738d6ef809f4201857229fed64799234.1661266564.git.bristot@kernel.org Fixes: ccc319dcb450 ("rv/monitor: Add the wwnr monitor") Fixes: 8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.h | 2 +- kernel/trace/rv/monitors/wwnr/wwnr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c1c47e2305ef..dacc37b62a2c 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -struct automaton_wip automaton_wip = { +static struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d1afe55cdd4c..118e576b91b4 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -struct automaton_wwnr automaton_wwnr = { +static struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" -- cgit v1.2.3 From 54c3931957f6a6194d5972eccc36d052964b2abe Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Thu, 1 Sep 2022 18:45:14 +0800 Subject: tracing: hold caller_addr to hardirq_{enable,disable}_ip Currently, The arguments passing to lockdep_hardirqs_{on,off} was fixed in CALLER_ADDR0. The function trace_hardirqs_on_caller should have been intended to use caller_addr to represent the address that caller wants to be traced. For example, lockdep log in riscv showing the last {enabled,disabled} at __trace_hardirqs_{on,off} all the time(if called by): [ 57.853175] hardirqs last enabled at (2519): __trace_hardirqs_on+0xc/0x14 [ 57.853848] hardirqs last disabled at (2520): __trace_hardirqs_off+0xc/0x14 After use trace_hardirqs_xx_caller, we can get more effective information: [ 53.781428] hardirqs last enabled at (2595): restore_all+0xe/0x66 [ 53.782185] hardirqs last disabled at (2596): ret_from_exception+0xa/0x10 Link: https://lkml.kernel.org/r/20220901104515.135162-2-zouyipeng@huawei.com Cc: stable@vger.kernel.org Fixes: c3bc8fd637a96 ("tracing: Centralize preemptirq tracepoints and unify their usage") Signed-off-by: Yipeng Zou Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_preemptirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 95b58bd757ce..1e130da1b742 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) } lockdep_hardirqs_on_prepare(); - lockdep_hardirqs_on(CALLER_ADDR0); + lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); -- cgit v1.2.3 From cecf8e128ec69149fe53c9a7bafa505a4bee25d9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 4 Sep 2022 13:12:29 +0900 Subject: tracing: Fix to check event_mutex is held while accessing trigger list Since the check_user_trigger() is called outside of RCU read lock, this list_for_each_entry_rcu() caused a suspicious RCU usage warning. # echo hist:keys=pid > events/sched/sched_stat_runtime/trigger # cat events/sched/sched_stat_runtime/trigger [ 43.167032] [ 43.167418] ============================= [ 43.167992] WARNING: suspicious RCU usage [ 43.168567] 5.19.0-rc5-00029-g19ebe4651abf #59 Not tainted [ 43.169283] ----------------------------- [ 43.169863] kernel/trace/trace_events_trigger.c:145 RCU-list traversed in non-reader section!! ... However, this file->triggers list is safe when it is accessed under event_mutex is held. To fix this warning, adds a lockdep_is_held check to the list_for_each_entry_rcu(). Link: https://lkml.kernel.org/r/166226474977.223837.1992182913048377113.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cb866c3141af..918730d74932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { if (data->flags & EVENT_TRIGGER_FL_PROBE) continue; return true; -- cgit v1.2.3 From 93d71986a60c37395e0a1d2f7fe170ef1e0a8ba4 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 22:12:10 +0800 Subject: rv/reactor: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Link: https://lkml.kernel.org/r/20220906141210.132607-1-xiujianfeng@huawei.com Fixes: 135b881ea885 ("rv/reactor: Add the printk reactor") Fixes: e88043c0ac16 ("rv/reactor: Add the panic reactor") Signed-off-by: Xiu Jianfeng Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/reactor_panic.c | 4 ++-- kernel/trace/rv/reactor_printk.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index b698d05dd069..d65f6c25a87c 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -24,13 +24,13 @@ static struct rv_reactor rv_panic = { .react = rv_panic_reaction }; -static int register_react_panic(void) +static int __init register_react_panic(void) { rv_register_reactor(&rv_panic); return 0; } -static void unregister_react_panic(void) +static void __exit unregister_react_panic(void) { rv_unregister_reactor(&rv_panic); } diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 31899f953af4..4b6b7106a477 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -23,13 +23,13 @@ static struct rv_reactor rv_printk = { .react = rv_printk_reaction }; -static int register_react_printk(void) +static int __init register_react_printk(void) { rv_register_reactor(&rv_printk); return 0; } -static void unregister_react_printk(void) +static void __exit unregister_react_printk(void) { rv_unregister_reactor(&rv_printk); } -- cgit v1.2.3