summaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
authorNick Terrell <terrelln@fb.com>2022-12-13 16:21:55 -0800
committerNick Terrell <terrelln@fb.com>2022-12-13 16:21:55 -0800
commit4f2c0a4acffbec01079c28f839422e64ddeff004 (patch)
tree06ada4a8a6d94a94c93944806041b8c994cebfc5 /kernel/trace
parent88a309465b3f05a100c3b81966982c0f9f5d23a6 (diff)
parent830b3c68c1fb1e9176028d02ef86f3cf76aa2476 (diff)
downloadlinux-4f2c0a4acffbec01079c28f839422e64ddeff004.tar.gz
linux-4f2c0a4acffbec01079c28f839422e64ddeff004.zip
Merge branch 'main' into zstd-linus
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig54
-rw-r--r--kernel/trace/Makefile8
-rw-r--r--kernel/trace/blktrace.c208
-rw-r--r--kernel/trace/bpf_trace.c647
-rw-r--r--kernel/trace/fgraph.c33
-rw-r--r--kernel/trace/fprobe.c327
-rw-r--r--kernel/trace/ftrace.c935
-rw-r--r--kernel/trace/kprobe_event_gen_test.c113
-rw-r--r--kernel/trace/pid_list.c4
-rw-r--r--kernel/trace/rethook.c328
-rw-r--r--kernel/trace/ring_buffer.c256
-rw-r--r--kernel/trace/rv/Kconfig78
-rw-r--r--kernel/trace/rv/Makefile8
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c88
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h46
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c87
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h46
-rw-r--r--kernel/trace/rv/reactor_panic.c43
-rw-r--r--kernel/trace/rv/reactor_printk.c42
-rw-r--r--kernel/trace/rv/rv.c799
-rw-r--r--kernel/trace/rv/rv.h68
-rw-r--r--kernel/trace/rv/rv_reactors.c510
-rw-r--r--kernel/trace/synth_event_gen_test.c16
-rw-r--r--kernel/trace/trace.c337
-rw-r--r--kernel/trace/trace.h52
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_benchmark.h8
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_dynevent.c13
-rw-r--r--kernel/trace/trace_eprobe.c333
-rw-r--r--kernel/trace/trace_event_perf.c7
-rw-r--r--kernel/trace/trace_events.c175
-rw-r--r--kernel/trace/trace_events_filter.c241
-rw-r--r--kernel/trace/trace_events_hist.c490
-rw-r--r--kernel/trace/trace_events_synth.c44
-rw-r--r--kernel/trace/trace_events_trigger.c400
-rw-r--r--kernel/trace/trace_events_user.c1914
-rw-r--r--kernel/trace/trace_kprobe.c108
-rw-r--r--kernel/trace/trace_osnoise.c121
-rw-r--r--kernel/trace/trace_output.c25
-rw-r--r--kernel/trace/trace_preemptirq.c8
-rw-r--r--kernel/trace/trace_probe.c43
-rw-r--r--kernel/trace/trace_probe.h7
-rw-r--r--kernel/trace/trace_probe_kernel.h115
-rw-r--r--kernel/trace/trace_recursion_record.c7
-rw-r--r--kernel/trace/trace_sched_switch.c5
-rw-r--r--kernel/trace/trace_sched_wakeup.c3
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/trace/trace_syscalls.c37
-rw-r--r--kernel/trace/trace_uprobe.c20
-rw-r--r--kernel/trace/tracing_map.c8
51 files changed, 7896 insertions, 1382 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5eb5e7fd624..e9e95c790b8e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_RETHOOK
+ bool
+
+config RETHOOK
+ bool
+ depends on HAVE_RETHOOK
+ help
+ Enable generic return hooking feature. This is an internal
+ API, which will be used by other function-entry hooking
+ features like fprobe and kprobes.
+
config HAVE_FUNCTION_TRACER
bool
help
@@ -40,6 +51,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
This allows for use of regs_get_kernel_argument() and
kernel_stack_pointer().
+config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ bool
+ help
+ If the architecture generates __patchable_function_entries sections
+ but does not want them included in the ftrace locations.
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -133,6 +150,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
+ select TASKS_RCU if PREEMPTION
config GENERIC_TRACER
bool
@@ -182,7 +200,8 @@ config FUNCTION_TRACER
sequence is then dynamically patched into a tracer call when
tracing is enabled by the administrator. If it's runtime disabled
(the bootup default), then the overhead of the instructions is very
- small and not measurable even in micro-benchmarks.
+ small and not measurable even in micro-benchmarks (at least on
+ x86, but may have impact on other architectures).
config FUNCTION_GRAPH_TRACER
bool "Kernel Function Graph Tracer"
@@ -236,6 +255,21 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config FPROBE
+ bool "Kernel Function Probe (fprobe)"
+ depends on FUNCTION_TRACER
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on HAVE_RETHOOK
+ select RETHOOK
+ default n
+ help
+ This option enables kernel function probe (fprobe) based on ftrace.
+ The fprobe is similar to kprobes, but probes only for kernel function
+ entries and exits. This also can probe multiple functions by one
+ fprobe.
+
+ If unsure, say N.
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -702,6 +736,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
depends on FTRACE_MCOUNT_RECORD
+ select OBJTOOL
config FTRACE_MCOUNT_USE_RECORDMCOUNT
def_bool y
@@ -737,6 +772,21 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ depends on BROKEN || COMPILE_TEST # API needs to be straighten out
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by memory mapping a tracefs file and checking for
+ an associated byte being non-zero.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -1062,4 +1112,6 @@ config HIST_TRIGGERS_DEBUG
If unsure, say N.
+source "kernel/trace/rv/Kconfig"
+
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..c6651e16b557 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
GCOV_PROFILE := y
endif
+# Functions in this file could be invoked from early interrupt
+# code and produce random code coverage.
+KCOV_INSTRUMENT_trace_preemptirq.o := n
+
CFLAGS_bpf_trace.o := -I$(src)
CFLAGS_trace_benchmark.o := -I$(src)
@@ -82,6 +86,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
@@ -97,7 +102,10 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+obj-$(CONFIG_RV) += rv/
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index af68a67179b4..a995ea1ef849 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt)
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
- const char *fmt, ...)
+void __blk_trace_note_message(struct blk_trace *bt,
+ struct cgroup_subsys_state *css, const char *fmt, ...)
{
int n;
va_list args;
unsigned long flags;
char *buf;
+ u64 cgid = 0;
if (unlikely(bt->trace_state != Blktrace_running &&
!blk_tracer_enabled))
@@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
- blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
- trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
- blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
-#else
- trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
+ if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ cgid = cgroup_id(css->cgroup);
+ else
+ cgid = 1;
#endif
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(__trace_note_message);
+EXPORT_SYMBOL_GPL(__blk_trace_note_message);
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
pid_t pid)
@@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \
(ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/*
@@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data, u64 cgid)
+ const blk_opf_t opf, u32 what, int error,
+ int pdu_len, void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -227,16 +227,17 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+ const enum req_op op = opf & REQ_OP_MASK;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
what |= ddir_act[op_is_write(op) ? WRITE : READ];
- what |= MASK_TC_BIT(op_flags, SYNC);
- what |= MASK_TC_BIT(op_flags, RAHEAD);
- what |= MASK_TC_BIT(op_flags, META);
- what |= MASK_TC_BIT(op_flags, PREFLUSH);
- what |= MASK_TC_BIT(op_flags, FUA);
+ what |= MASK_TC_BIT(opf, SYNC);
+ what |= MASK_TC_BIT(opf, RAHEAD);
+ what |= MASK_TC_BIT(opf, META);
+ what |= MASK_TC_BIT(opf, PREFLUSH);
+ what |= MASK_TC_BIT(opf, FUA);
if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
@@ -310,10 +311,20 @@ record_it:
local_irq_restore(flags);
}
-static void blk_trace_free(struct blk_trace *bt)
+static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
{
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
+
+ /*
+ * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
+ * under 'q->debugfs_dir', thus lookup and remove them.
+ */
+ if (!bt->dir) {
+ debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
+ debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ } else {
+ debugfs_remove(bt->dir);
+ }
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -335,10 +346,42 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
-static void blk_trace_cleanup(struct blk_trace *bt)
+static int blk_trace_start(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_setup &&
+ bt->trace_state != Blktrace_stopped)
+ return -EINVAL;
+
+ blktrace_seq++;
+ smp_mb();
+ bt->trace_state = Blktrace_running;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ trace_note_time(bt);
+
+ return 0;
+}
+
+static int blk_trace_stop(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_running)
+ return -EINVAL;
+
+ bt->trace_state = Blktrace_stopped;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+
+ return 0;
+}
+
+static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
+ blk_trace_stop(bt);
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
put_probe_ref();
}
@@ -351,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q)
if (!bt)
return -EINVAL;
- if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -401,7 +443,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, NULL, "%s", msg);
+ __blk_trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -572,7 +614,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = 0;
err:
if (ret)
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -647,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
- int ret;
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
@@ -655,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
if (bt == NULL)
return -EINVAL;
- /*
- * For starting a trace, we can transition from a setup or stopped
- * trace. For stopping a trace, the state must be running
- */
- ret = -EINVAL;
- if (start) {
- if (bt->trace_state == Blktrace_setup ||
- bt->trace_state == Blktrace_stopped) {
- blktrace_seq++;
- smp_mb();
- bt->trace_state = Blktrace_running;
- raw_spin_lock_irq(&running_trace_lock);
- list_add(&bt->running_list, &running_trace_list);
- raw_spin_unlock_irq(&running_trace_lock);
-
- trace_note_time(bt);
- ret = 0;
- }
- } else {
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- raw_spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- raw_spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- ret = 0;
- }
- }
-
- return ret;
+ if (start)
+ return blk_trace_start(bt);
+ else
+ return blk_trace_stop(bt);
}
int blk_trace_startstop(struct request_queue *q, int start)
@@ -726,12 +741,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
switch (cmd) {
case BLKTRACESETUP:
- bdevname(bdev, b);
+ snprintf(b, sizeof(b), "%pg", bdev);
ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
- bdevname(bdev, b);
+ snprintf(b, sizeof(b), "%pg", bdev);
ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#endif
@@ -760,19 +775,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
- mutex_lock(&q->debugfs_mutex);
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->debugfs_mutex))) {
- __blk_trace_startstop(q, 0);
+ lockdep_is_held(&q->debugfs_mutex)))
__blk_trace_remove(q);
- }
-
- mutex_unlock(&q->debugfs_mutex);
}
#ifdef CONFIG_BLK_CGROUP
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
+ struct cgroup_subsys_state *blkcg_css;
struct blk_trace *bt;
/* We don't use the 'bt' value here except as an optimization... */
@@ -780,9 +791,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return 0;
- if (!bio->bi_blkg)
+ blkcg_css = bio_blkcg_css(bio);
+ if (!blkcg_css)
return 0;
- return cgroup_id(bio_blkcg(bio)->css.cgroup);
+ return cgroup_id(blkcg_css->cgroup);
}
#else
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
@@ -833,9 +845,8 @@ static void blk_add_trace_rq(struct request *rq, blk_status_t error,
else
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, blk_status_to_errno(error), 0,
- NULL, cgid);
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags,
+ what, blk_status_to_errno(error), 0, NULL, cgid);
rcu_read_unlock();
}
@@ -894,7 +905,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
}
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+ bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
@@ -940,7 +951,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
rcu_read_unlock();
}
@@ -960,7 +971,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
}
rcu_read_unlock();
}
@@ -976,8 +987,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT,
+ bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT,
blk_status_to_errno(bio->bi_status),
sizeof(rpdu), &rpdu,
blk_trace_bio_get_cgid(q, bio));
@@ -1013,7 +1023,7 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ bio->bi_opf, BLK_TA_REMAP,
blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
@@ -1049,7 +1059,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
+ rq->cmd_flags, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
@@ -1075,7 +1085,7 @@ void blk_add_driver_data(struct request *rq, void *data, size_t len)
return;
}
- __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(rq));
rcu_read_unlock();
@@ -1606,17 +1616,11 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- raw_spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- raw_spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- }
+ blk_trace_stop(bt);
put_probe_ref();
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return 0;
}
@@ -1647,7 +1651,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
return 0;
free_bt:
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1858,17 +1862,6 @@ out_unlock_bdev:
out:
return ret ? ret : count;
}
-
-int blk_trace_init_sysfs(struct device *dev)
-{
- return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
-}
-
-void blk_trace_remove_sysfs(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
-}
-
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
@@ -1876,23 +1869,22 @@ void blk_trace_remove_sysfs(struct device *dev)
/**
* blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
* @rwbs: buffer to be filled
- * @op: REQ_OP_XXX for the tracepoint
+ * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint
*
* Description:
- * Maps the REQ_OP_XXX to character and fills the buffer provided by the
- * caller with resulting string.
+ * Maps each request operation and flag to a single character and fills the
+ * buffer provided by the caller with resulting string.
*
**/
-void blk_fill_rwbs(char *rwbs, unsigned int op)
+void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
{
int i = 0;
- if (op & REQ_PREFLUSH)
+ if (opf & REQ_PREFLUSH)
rwbs[i++] = 'F';
- switch (op & REQ_OP_MASK) {
+ switch (opf & REQ_OP_MASK) {
case REQ_OP_WRITE:
- case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
break;
case REQ_OP_DISCARD:
@@ -1912,13 +1904,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
rwbs[i++] = 'N';
}
- if (op & REQ_FUA)
+ if (opf & REQ_FUA)
rwbs[i++] = 'F';
- if (op & REQ_RAHEAD)
+ if (opf & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (op & REQ_SYNC)
+ if (opf & REQ_SYNC)
rwbs[i++] = 'S';
- if (op & REQ_META)
+ if (opf & REQ_META)
rwbs[i++] = 'M';
rwbs[i] = '\0';
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..1ed08967fb97 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,11 @@
#include <linux/error-injection.h>
#include <linux/btf_ids.h>
#include <linux/bpf_lsm.h>
+#include <linux/fprobe.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <linux/key.h>
+#include <linux/verification.h>
#include <net/bpf_sk_storage.h>
@@ -77,6 +82,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
u64 flags, const struct btf **btf,
s32 *btf_id);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
/**
* trace_call_bpf - invoke BPF program
@@ -124,7 +131,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
- ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
+ rcu_read_lock();
+ ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
+ ctx, bpf_prog_run);
+ rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -332,8 +342,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -679,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
+ sd->sample_flags |= PERF_SAMPLE_RAW;
err = __bpf_perf_event_output(regs, map, flags, sd);
@@ -737,6 +746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
+ sd->sample_flags |= PERF_SAMPLE_RAW;
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
@@ -835,8 +845,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -1022,11 +1030,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
+#ifdef CONFIG_X86_KERNEL_IBT
+static unsigned long get_entry_ip(unsigned long fentry_ip)
+{
+ u32 instr;
+
+ /* Being extra safe in here in case entry ip is on the page-edge. */
+ if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
+ return fentry_ip;
+ if (is_endbr(instr))
+ fentry_ip -= ENDBR_INSN_SIZE;
+ return fentry_ip;
+}
+#else
+#define get_entry_ip(fentry_ip) fentry_ip
+#endif
+
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
struct kprobe *kp = kprobe_running();
- return kp ? (uintptr_t)kp->addr : 0;
+ if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
+ return 0;
+
+ return get_entry_ip((uintptr_t)kp->addr);
}
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
@@ -1036,6 +1063,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
+ .func = bpf_get_func_ip_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
+ .func = bpf_get_attach_cookie_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
@@ -1063,6 +1114,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
+ .func = bpf_get_attach_cookie_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
#ifndef CONFIG_X86
@@ -1138,6 +1204,184 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+#ifdef CONFIG_KEYS
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_ptr: data to verify
+ * @sig_ptr: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+ struct bpf_dynptr_kern *sig_ptr,
+ struct bpf_key *trusted_keyring)
+{
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ return verify_pkcs7_signature(data_ptr->data,
+ bpf_dynptr_get_size(data_ptr),
+ sig_ptr->data,
+ bpf_dynptr_get_size(sig_ptr),
+ trusted_keyring->key,
+ VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
+ NULL);
+}
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
+__diag_pop();
+
+BTF_SET8_START(key_sig_kfunc_set)
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+BTF_SET8_END(key_sig_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &key_sig_kfunc_set,
+};
+
+static int __init bpf_key_sig_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_key_sig_kfunc_set);
+}
+
+late_initcall(bpf_key_sig_kfuncs_init);
+#endif /* CONFIG_KEYS */
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1154,6 +1398,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
@@ -1235,6 +1481,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ case BPF_FUNC_copy_from_user_task:
+ return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -1277,9 +1525,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- return &bpf_get_func_ip_proto_kprobe;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_func_ip_proto_kprobe_multi :
+ &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- return &bpf_get_attach_cookie_proto_trace;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_attach_cookie_proto_kmulti :
+ &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1456,6 +1708,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
return -EINVAL;
+ if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK)))
+ return -ENOENT;
+
if (unlikely(!br_stack))
return -ENOENT;
@@ -1562,6 +1817,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
@@ -1653,6 +1909,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skc_to_udp6_sock_proto;
case BPF_FUNC_skc_to_unix_sock:
return &bpf_skc_to_unix_sock_proto;
+ case BPF_FUNC_skc_to_mptcp_sock:
+ return &bpf_skc_to_mptcp_sock_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_tracing_proto;
case BPF_FUNC_sk_storage_delete:
@@ -1661,6 +1919,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_from_file_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_trace_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
@@ -1682,6 +1942,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
@@ -1878,7 +2140,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
event->prog = prog;
event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
unlock:
mutex_unlock(&bpf_event_mutex);
@@ -1904,7 +2166,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
rcu_assign_pointer(event->tp_event->prog_array, new_array);
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
}
bpf_prog_put(event->prog);
@@ -1984,9 +2246,15 @@ static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ goto out;
+ }
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+out:
+ this_cpu_dec(*(prog->active));
}
#define UNPACK(...) __VA_ARGS__
@@ -2176,3 +2444,362 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FPROBE
+struct bpf_kprobe_multi_link {
+ struct bpf_link link;
+ struct fprobe fp;
+ unsigned long *addrs;
+ u64 *cookies;
+ u32 cnt;
+};
+
+struct bpf_kprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ unsigned long entry_ip;
+};
+
+struct user_syms {
+ const char **syms;
+ char *buf;
+};
+
+static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
+{
+ unsigned long __user usymbol;
+ const char **syms = NULL;
+ char *buf = NULL, *p;
+ int err = -ENOMEM;
+ unsigned int i;
+
+ syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL);
+ if (!syms)
+ goto error;
+
+ buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL);
+ if (!buf)
+ goto error;
+
+ for (p = buf, i = 0; i < cnt; i++) {
+ if (__get_user(usymbol, usyms + i)) {
+ err = -EFAULT;
+ goto error;
+ }
+ err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN);
+ if (err == KSYM_NAME_LEN)
+ err = -E2BIG;
+ if (err < 0)
+ goto error;
+ syms[i] = p;
+ p += err + 1;
+ }
+
+ us->syms = syms;
+ us->buf = buf;
+ return 0;
+
+error:
+ if (err) {
+ kvfree(syms);
+ kvfree(buf);
+ }
+ return err;
+}
+
+static void free_user_syms(struct user_syms *us)
+{
+ kvfree(us->syms);
+ kvfree(us->buf);
+}
+
+static void bpf_kprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ unregister_fprobe(&kmulti_link->fp);
+}
+
+static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ kvfree(kmulti_link->addrs);
+ kvfree(kmulti_link->cookies);
+ kfree(kmulti_link);
+}
+
+static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+ .release = bpf_kprobe_multi_link_release,
+ .dealloc = bpf_kprobe_multi_link_dealloc,
+};
+
+static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
+{
+ const struct bpf_kprobe_multi_link *link = priv;
+ unsigned long *addr_a = a, *addr_b = b;
+ u64 *cookie_a, *cookie_b;
+
+ cookie_a = link->cookies + (addr_a - link->addrs);
+ cookie_b = link->cookies + (addr_b - link->addrs);
+
+ /* swap addr_a/addr_b and cookie_a/cookie_b values */
+ swap(*addr_a, *addr_b);
+ swap(*cookie_a, *cookie_b);
+}
+
+static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+{
+ const unsigned long *addr_a = a, *addr_b = b;
+
+ if (*addr_a == *addr_b)
+ return 0;
+ return *addr_a < *addr_b ? -1 : 1;
+}
+
+static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
+{
+ return __bpf_kprobe_multi_cookie_cmp(a, b);
+}
+
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ u64 *cookie, entry_ip;
+ unsigned long *addr;
+
+ if (WARN_ON_ONCE(!ctx))
+ return 0;
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ link = run_ctx->link;
+ if (!link->cookies)
+ return 0;
+ entry_ip = run_ctx->entry_ip;
+ addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
+ __bpf_kprobe_multi_cookie_cmp);
+ if (!addr)
+ return 0;
+ cookie = link->cookies + (addr - link->addrs);
+ return *cookie;
+}
+
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+ unsigned long entry_ip, struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .link = link,
+ .entry_ip = entry_ip,
+ };
+ struct bpf_run_ctx *old_run_ctx;
+ int err;
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ err = 0;
+ goto out;
+ }
+
+ migrate_disable();
+ rcu_read_lock();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ return err;
+}
+
+static void
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+}
+
+static int symbols_cmp_r(const void *a, const void *b, const void *priv)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+struct multi_symbols_sort {
+ const char **funcs;
+ u64 *cookies;
+};
+
+static void symbols_swap_r(void *a, void *b, int size, const void *priv)
+{
+ const struct multi_symbols_sort *data = priv;
+ const char **name_a = a, **name_b = b;
+
+ swap(*name_a, *name_b);
+
+ /* If defined, swap also related cookies. */
+ if (data->cookies) {
+ u64 *cookie_a, *cookie_b;
+
+ cookie_a = data->cookies + (name_a - data->funcs);
+ cookie_b = data->cookies + (name_b - data->funcs);
+ swap(*cookie_a, *cookie_b);
+ }
+}
+
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_kprobe_multi_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ void __user *ucookies;
+ unsigned long *addrs;
+ u32 flags, cnt, size;
+ void __user *uaddrs;
+ u64 *cookies = NULL;
+ void __user *usyms;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.kprobe_multi.flags;
+ if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
+ usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+ if (!!uaddrs == !!usyms)
+ return -EINVAL;
+
+ cnt = attr->link_create.kprobe_multi.cnt;
+ if (!cnt)
+ return -EINVAL;
+
+ size = cnt * sizeof(*addrs);
+ addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ if (uaddrs) {
+ if (copy_from_user(addrs, uaddrs, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ struct multi_symbols_sort data = {
+ .cookies = cookies,
+ };
+ struct user_syms us;
+
+ err = copy_user_syms(&us, usyms, cnt);
+ if (err)
+ goto error;
+
+ if (cookies)
+ data.funcs = us.syms;
+
+ sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
+ symbols_swap_r, &data);
+
+ err = ftrace_lookup_symbols(us.syms, cnt, addrs);
+ free_user_syms(&us);
+ if (err)
+ goto error;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
+ &bpf_kprobe_multi_link_lops, prog);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ if (flags & BPF_F_KPROBE_MULTI_RETURN)
+ link->fp.exit_handler = kprobe_multi_link_handler;
+ else
+ link->fp.entry_handler = kprobe_multi_link_handler;
+
+ link->addrs = addrs;
+ link->cookies = cookies;
+ link->cnt = cnt;
+
+ if (cookies) {
+ /*
+ * Sorting addresses will trigger sorting cookies as well
+ * (check bpf_kprobe_multi_cookie_swap). This way we can
+ * find cookie based on the address in bpf_get_attach_cookie
+ * helper.
+ */
+ sort_r(addrs, cnt, sizeof(*addrs),
+ bpf_kprobe_multi_cookie_cmp,
+ bpf_kprobe_multi_cookie_swap,
+ link);
+ }
+
+ err = register_fprobe_ips(&link->fp, addrs, cnt);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+error:
+ kfree(link);
+ kvfree(addrs);
+ kvfree(cookies);
+ return err;
+}
+#else /* !CONFIG_FPROBE */
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..218cd95bf8e4 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -23,23 +24,31 @@
#define ASSIGN_OPS_HASH(opsname, val)
#endif
-static bool kill_ftrace_graph;
+DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
/* Both enabled by default (can be cleared by function_graph tracer flags */
static bool fgraph_sleep_time = true;
-/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * archs can override this function if they must do something
+ * to enable hook for graph tracer.
+ */
+int __weak ftrace_enable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * to disable hook for graph tracer.
*/
-bool ftrace_graph_is_dead(void)
+int __weak ftrace_disable_ftrace_graph_caller(void)
{
- return kill_ftrace_graph;
+ return 0;
}
+#endif
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
@@ -51,7 +60,7 @@ bool ftrace_graph_is_dead(void)
*/
void ftrace_graph_stop(void)
{
- kill_ftrace_graph = true;
+ static_branch_enable(&kill_ftrace_graph);
}
/* Add a function return address to the trace stack on thread info.*/
@@ -415,7 +424,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..e8143e368074
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/err.h>
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "trace.h"
+
+struct fprobe_rethook_node {
+ struct rethook_node node;
+ unsigned long entry_ip;
+};
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_rethook_node *fpr;
+ struct rethook_node *rh;
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ if (fp->entry_handler)
+ fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+
+ if (fp->exit_handler) {
+ rh = rethook_try_get(fp->rethook);
+ if (!rh) {
+ fp->nmissed++;
+ goto out;
+ }
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+ fpr->entry_ip = ip;
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
+
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp = container_of(ops, struct fprobe, ops);
+
+ if (unlikely(kprobe_running())) {
+ fp->nmissed++;
+ return;
+ }
+ kprobe_busy_begin();
+ fprobe_handler(ip, parent_ip, ops, fregs);
+ kprobe_busy_end();
+}
+
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
+{
+ struct fprobe *fp = (struct fprobe *)data;
+ struct fprobe_rethook_node *fpr;
+
+ if (!fp || fprobe_disabled(fp))
+ return;
+
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+ fp->exit_handler(fp, fpr->entry_ip, regs);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+static int symbols_cmp(const void *a, const void *b)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+/* Convert ftrace location address from symbols */
+static unsigned long *get_ftrace_locations(const char **syms, int num)
+{
+ unsigned long *addrs;
+
+ /* Convert symbols to symbol address */
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ /* ftrace_lookup_symbols expects sorted symbols */
+ sort(syms, num, sizeof(*syms), symbols_cmp, NULL);
+
+ if (!ftrace_lookup_symbols(syms, num, addrs))
+ return addrs;
+
+ kfree(addrs);
+ return ERR_PTR(-ENOENT);
+}
+
+static void fprobe_init(struct fprobe *fp)
+{
+ fp->nmissed = 0;
+ if (fprobe_shared_with_kprobes(fp))
+ fp->ops.func = fprobe_kprobe_handler;
+ else
+ fp->ops.func = fprobe_handler;
+ fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+}
+
+static int fprobe_init_rethook(struct fprobe *fp, int num)
+{
+ int i, size;
+
+ if (num < 0)
+ return -EINVAL;
+
+ if (!fp->exit_handler) {
+ fp->rethook = NULL;
+ return 0;
+ }
+
+ /* Initialize rethook if needed */
+ size = num * num_possible_cpus() * 2;
+ if (size < 0)
+ return -E2BIG;
+
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ if (!fp->rethook)
+ return -ENOMEM;
+ for (i = 0; i < size; i++) {
+ struct fprobe_rethook_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node) {
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(fp->rethook, &node->node);
+ }
+ return 0;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+ if (fp->rethook) {
+ /* Don't need to cleanup rethook->handler because this is not used. */
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ }
+ ftrace_free_filter(&fp->ops);
+}
+
+/**
+ * register_fprobe() - Register fprobe to ftrace by pattern.
+ * @fp: A fprobe data structure to be registered.
+ * @filter: A wildcard pattern of probed symbols.
+ * @notfilter: A wildcard pattern of NOT probed symbols.
+ *
+ * Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
+ * If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
+{
+ struct ftrace_hash *hash;
+ unsigned char *str;
+ int ret, len;
+
+ if (!fp || !filter)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ len = strlen(filter);
+ str = kstrdup(filter, GFP_KERNEL);
+ ret = ftrace_set_filter(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ return ret;
+
+ if (notfilter) {
+ len = strlen(notfilter);
+ str = kstrdup(notfilter, GFP_KERNEL);
+ ret = ftrace_set_notrace(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ goto out;
+ }
+
+ /* TODO:
+ * correctly calculate the total number of filtered symbols
+ * from both filter and notfilter.
+ */
+ hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
+ if (WARN_ON_ONCE(!hash))
+ goto out;
+
+ ret = fprobe_init_rethook(fp, (int)hash->count);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+out:
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * register_fprobe_ips() - Register fprobe to ftrace by address.
+ * @fp: A fprobe data structure to be registered.
+ * @addrs: An array of target ftrace location addresses.
+ * @num: The number of entries of @addrs.
+ *
+ * Register @fp to ftrace for enabling the probe on the address given by @addrs.
+ * The @addrs must be the addresses of ftrace location address, which may be
+ * the symbol address + arch-dependent offset.
+ * If you unsure what this mean, please use other registration functions.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ int ret;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ ret = fprobe_init_rethook(fp, num);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_ips);
+
+/**
+ * register_fprobe_syms() - Register fprobe to ftrace by symbols.
+ * @fp: A fprobe data structure to be registered.
+ * @syms: An array of target symbols.
+ * @num: The number of entries of @syms.
+ *
+ * Register @fp to the symbols given by @syms array. This will be useful if
+ * you are sure the symbols exist in the kernel.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
+{
+ unsigned long *addrs;
+ int ret;
+
+ if (!fp || !syms || num <= 0)
+ return -EINVAL;
+
+ addrs = get_ftrace_locations(syms, num);
+ if (IS_ERR(addrs))
+ return PTR_ERR(addrs);
+
+ ret = register_fprobe_ips(fp, addrs, num);
+
+ kfree(addrs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_syms);
+
+/**
+ * unregister_fprobe() - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fp || (fp->ops.saved_func != fprobe_handler &&
+ fp->ops.saved_func != fprobe_kprobe_handler))
+ return -EINVAL;
+
+ /*
+ * rethook_free() starts disabling the rethook, but the rethook handlers
+ * may be running on other processors at this point. To make sure that all
+ * current running handlers are finished, call unregister_ftrace_function()
+ * after this.
+ */
+ if (fp->rethook)
+ rethook_free(fp->rethook);
+
+ ret = unregister_ftrace_function(&fp->ops);
+ if (ret < 0)
+ return ret;
+
+ ftrace_free_filter(&fp->ops);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9feb197b2da..33236241f236 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,8 @@
#include "trace_output.h"
#include "trace_stat.h"
+#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__"
+
#define FTRACE_WARN_ON(cond) \
({ \
int ___r = cond; \
@@ -86,7 +88,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = {
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
-static int last_ftrace_enabled;
+static int __maybe_unused last_ftrace_enabled;
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
@@ -119,7 +121,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
-/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */
+/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */
void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
@@ -952,7 +954,6 @@ static struct tracer_stat function_stats __initdata = {
static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
- struct dentry *entry;
char *name;
int ret;
int cpu;
@@ -983,11 +984,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
}
- entry = tracefs_create_file("function_profile_enabled",
- TRACE_MODE_WRITE, d_tracer, NULL,
- &ftrace_profile_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
+ trace_create_file("function_profile_enabled",
+ TRACE_MODE_WRITE, d_tracer, NULL,
+ &ftrace_profile_fops);
}
#else /* CONFIG_FUNCTION_PROFILER */
@@ -1290,6 +1289,7 @@ static int ftrace_add_mod(struct trace_array *tr,
if (!ftrace_mod)
return -ENOMEM;
+ INIT_LIST_HEAD(&ftrace_mod->list);
ftrace_mod->func = kstrdup(func, GFP_KERNEL);
ftrace_mod->module = kstrdup(module, GFP_KERNEL);
ftrace_mod->enable = enable;
@@ -1568,17 +1568,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
}
/**
- * ftrace_location - return true if the ip giving is a traced location
+ * ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * Returns rec->ip if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
+ * If @ip matches the ftrace location, return @ip.
+ * If @ip matches sym+0, return sym's ftrace location.
+ * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
- return ftrace_location_range(ip, ip);
+ struct dyn_ftrace *rec;
+ unsigned long offset;
+ unsigned long size;
+
+ rec = lookup_rec(ip, ip);
+ if (!rec) {
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset))
+ goto out;
+
+ /* map sym+0 to __fentry__ */
+ if (!offset)
+ rec = lookup_rec(ip, ip + size - 1);
+ }
+
+ if (rec)
+ return rec->ip;
+
+out:
+ return 0;
}
/**
@@ -1628,6 +1645,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex
static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+static bool skip_record(struct dyn_ftrace *rec)
+{
+ /*
+ * At boot up, weak functions are set to disable. Function tracing
+ * can be enabled before they are, and they still need to be disabled now.
+ * If the record is disabled, still continue if it is marked as already
+ * enabled (this is needed to keep the accounting working).
+ */
+ return rec->flags & FTRACE_FL_DISABLED &&
+ !(rec->flags & FTRACE_FL_ENABLED);
+}
+
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1677,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
continue;
if (all) {
@@ -1853,6 +1882,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
* - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
* - If the hash is EMPTY_HASH, it hits nothing
* - Anything else hits the recs which match the hash entries.
+ *
+ * DIRECT ops does not have IPMODIFY flag, but we still need to check it
+ * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call
+ * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with
+ * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate
+ * the return value to the caller and eventually to the owner of the DIRECT
+ * ops.
*/
static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_hash *old_hash,
@@ -1861,17 +1897,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_page *pg;
struct dyn_ftrace *rec, *end = NULL;
int in_old, in_new;
+ bool is_ipmodify, is_direct;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
+ is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
+
+ /* neither IPMODIFY nor DIRECT, skip */
+ if (!is_ipmodify && !is_direct)
+ return 0;
+
+ if (WARN_ON_ONCE(is_ipmodify && is_direct))
return 0;
/*
- * Since the IPMODIFY is a very address sensitive action, we do not
- * allow ftrace_ops to set all functions to new hash.
+ * Since the IPMODIFY and DIRECT are very address sensitive
+ * actions, we do not allow ftrace_ops to set all functions to new
+ * hash.
*/
if (!new_hash || !old_hash)
return -EINVAL;
@@ -1889,12 +1934,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
continue;
if (in_new) {
- /* New entries must ensure no others are using it */
- if (rec->flags & FTRACE_FL_IPMODIFY)
- goto rollback;
- rec->flags |= FTRACE_FL_IPMODIFY;
- } else /* Removed entry */
+ if (rec->flags & FTRACE_FL_IPMODIFY) {
+ int ret;
+
+ /* Cannot have two ipmodify on same rec */
+ if (is_ipmodify)
+ goto rollback;
+
+ FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+
+ /*
+ * Another ops with IPMODIFY is already
+ * attached. We are now attaching a direct
+ * ops. Run SHARE_IPMODIFY_SELF, to check
+ * whether sharing is supported.
+ */
+ if (!ops->ops_func)
+ return -EBUSY;
+ ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ if (ret)
+ return ret;
+ } else if (is_ipmodify) {
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ }
+ } else if (is_ipmodify) {
rec->flags &= ~FTRACE_FL_IPMODIFY;
+ }
} while_for_each_ftrace_rec();
return 0;
@@ -1964,7 +2029,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
char ins[MCOUNT_INSN_SIZE];
- int i;
if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
@@ -1972,9 +2036,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p)
}
printk(KERN_CONT "%s", fmt);
-
- for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
+ pr_cont("%*phC", MCOUNT_INSN_SIZE, ins);
}
enum ftrace_bug_type ftrace_bug_type;
@@ -2074,7 +2136,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
ftrace_bug_type = FTRACE_BUG_UNKNOWN;
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
return FTRACE_UPDATE_IGNORE;
/*
@@ -2189,7 +2251,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
- rec->flags = 0;
+ rec->flags &= FTRACE_FL_DISABLED;
else
/*
* Just disable the record, but keep the ops TRAMP
@@ -2438,8 +2500,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops direct_ops = {
.func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_IPMODIFY
- | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
+ .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
/*
* By declaring the main trampoline as this trampoline
@@ -2583,7 +2644,7 @@ void __weak ftrace_replace_code(int mod_flags)
do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_DISABLED)
+ if (skip_record(rec))
continue;
failed = __ftrace_replace_code(rec, enable);
@@ -2690,18 +2751,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec)
* archs can override this function if they must do something
* before the modifying code is performed.
*/
-int __weak ftrace_arch_code_modify_prepare(void)
+void __weak ftrace_arch_code_modify_prepare(void)
{
- return 0;
}
/*
* archs can override this function if they must do something
* after the modifying code is performed.
*/
-int __weak ftrace_arch_code_modify_post_process(void)
+void __weak ftrace_arch_code_modify_post_process(void)
{
- return 0;
}
void ftrace_modify_all_code(int command)
@@ -2787,12 +2846,7 @@ void __weak arch_ftrace_update_code(int command)
static void ftrace_run_update_code(int command)
{
- int ret;
-
- ret = ftrace_arch_code_modify_prepare();
- FTRACE_WARN_ON(ret);
- if (ret)
- return;
+ ftrace_arch_code_modify_prepare();
/*
* By default we use stop_machine() to modify the code.
@@ -2802,8 +2856,7 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- ret = ftrace_arch_code_modify_post_process();
- FTRACE_WARN_ON(ret);
+ ftrace_arch_code_modify_post_process();
}
static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
@@ -2929,6 +2982,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_startup_enable(command);
+ /*
+ * If ftrace is in an undefined state, we just remove ops from list
+ * to prevent the NULL pointer, instead of totally rolling it back and
+ * free trampoline, because those actions could cause further damage.
+ */
+ if (unlikely(ftrace_disabled)) {
+ __unregister_ftrace_function(ops);
+ return -ENODEV;
+ }
+
ops->flags &= ~FTRACE_OPS_FL_ADDING;
return 0;
@@ -2966,18 +3029,8 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
command |= FTRACE_UPDATE_TRACE_FUNC;
}
- if (!command || !ftrace_enabled) {
- /*
- * If these are dynamic or per_cpu ops, they still
- * need their data freed. Since, function tracing is
- * not currently active, we can just free them
- * without synchronizing all CPUs.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- goto free_ops;
-
- return 0;
- }
+ if (!command || !ftrace_enabled)
+ goto out;
/*
* If the ops uses a trampoline, then it needs to be
@@ -3014,6 +3067,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
removed_ops = NULL;
ops->flags &= ~FTRACE_OPS_FL_REMOVING;
+out:
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
@@ -3041,47 +3095,12 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
- free_ops:
ftrace_trampoline_free(ops);
}
return 0;
}
-static void ftrace_startup_sysctl(void)
-{
- int command;
-
- if (unlikely(ftrace_disabled))
- return;
-
- /* Force update next time */
- saved_ftrace_func = NULL;
- /* ftrace_start_up is true if we want ftrace running */
- if (ftrace_start_up) {
- command = FTRACE_UPDATE_CALLS;
- if (ftrace_graph_active)
- command |= FTRACE_START_FUNC_RET;
- ftrace_startup_enable(command);
- }
-}
-
-static void ftrace_shutdown_sysctl(void)
-{
- int command;
-
- if (unlikely(ftrace_disabled))
- return;
-
- /* ftrace_start_up is true if ftrace is running */
- if (ftrace_start_up) {
- command = FTRACE_DISABLE_CALLS;
- if (ftrace_graph_active)
- command |= FTRACE_STOP_FUNC_RET;
- ftrace_run_update_code(command);
- }
-}
-
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
@@ -3097,36 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
ftrace_hash_empty(ops->func_hash->notrace_hash);
}
-/*
- * Check if the current ops references the record.
- *
- * If the ops traces all functions, then it was already accounted for.
- * If the ops does not trace the current record function, skip it.
- * If the ops ignores the function via notrace filter, skip it.
- */
-static inline bool
-ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
-{
- /* If ops isn't enabled, ignore it */
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return false;
-
- /* If ops traces all then it includes this function */
- if (ops_traces_mod(ops))
- return true;
-
- /* The function must be in the filter */
- if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
- return false;
-
- /* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
- return false;
-
- return true;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
bool init_nop = ftrace_need_init_nop();
@@ -3202,7 +3191,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
/* if we can't allocate this size, try something smaller */
if (!order)
return -ENOMEM;
- order >>= 1;
+ order--;
goto again;
}
@@ -3648,6 +3637,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
seq_printf(m, " ->%pS", ptr);
}
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+/*
+ * Weak functions can still have an mcount/fentry that is saved in
+ * the __mcount_loc section. These can be detected by having a
+ * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the
+ * symbol found by kallsyms is not the function that the mcount/fentry
+ * is part of. The offset is much greater in these cases.
+ *
+ * Test the record to make sure that the ip points to a valid kallsyms
+ * and if not, mark it disabled.
+ */
+static int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+ char str[KSYM_SYMBOL_LEN];
+ unsigned long offset;
+ const char *ret;
+
+ ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str);
+
+ /* Weak functions can cause invalid addresses */
+ if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+ rec->flags |= FTRACE_FL_DISABLED;
+ return 0;
+ }
+ return 1;
+}
+
+static struct workqueue_struct *ftrace_check_wq __initdata;
+static struct work_struct ftrace_check_work __initdata;
+
+/*
+ * Scan all the mcount/fentry entries to make sure they are valid.
+ */
+static __init void ftrace_check_work_func(struct work_struct *work)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+ test_for_valid_rec(rec);
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+static int __init ftrace_check_for_weak_functions(void)
+{
+ INIT_WORK(&ftrace_check_work, ftrace_check_work_func);
+
+ ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0);
+
+ queue_work(ftrace_check_wq, &ftrace_check_work);
+ return 0;
+}
+
+static int __init ftrace_check_sync(void)
+{
+ /* Make sure the ftrace_check updates are finished */
+ if (ftrace_check_wq)
+ destroy_workqueue(ftrace_check_wq);
+ return 0;
+}
+
+late_initcall_sync(ftrace_check_sync);
+subsys_initcall(ftrace_check_for_weak_functions);
+
+static int print_rec(struct seq_file *m, unsigned long ip)
+{
+ unsigned long offset;
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+ const char *ret;
+
+ ret = kallsyms_lookup(ip, NULL, &offset, &modname, str);
+ /* Weak functions can cause invalid addresses */
+ if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+ snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld",
+ FTRACE_INVALID_FUNCTION, offset);
+ ret = NULL;
+ }
+
+ seq_puts(m, str);
+ if (modname)
+ seq_printf(m, " [%s]", modname);
+ return ret == NULL ? -1 : 0;
+}
+#else
+static inline int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+ return 1;
+}
+
+static inline int print_rec(struct seq_file *m, unsigned long ip)
+{
+ seq_printf(m, "%ps", (void *)ip);
+ return 0;
+}
+#endif
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -3672,7 +3760,13 @@ static int t_show(struct seq_file *m, void *v)
if (!rec)
return 0;
- seq_printf(m, "%ps", (void *)rec->ip);
+ if (print_rec(m, rec->ip)) {
+ /* This should only happen when a rec is disabled */
+ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
+ seq_putc(m, '\n');
+ return 0;
+ }
+
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;
@@ -3990,6 +4084,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
return 0;
}
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+ unsigned long offset;
+
+ kallsyms_lookup(ip, NULL, &offset, modname, str);
+ if (offset > FTRACE_MCOUNT_MAX_OFFSET)
+ return -1;
+ return 0;
+}
+#else
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+ kallsyms_lookup(ip, NULL, NULL, modname, str);
+ return 0;
+}
+#endif
+
static int
ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
struct ftrace_glob *mod_g, int exclude_mod)
@@ -3997,7 +4109,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
char str[KSYM_SYMBOL_LEN];
char *modname;
- kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+ if (lookup_ip(rec->ip, &modname, str)) {
+ /* This should only happen when a rec is disabled */
+ WARN_ON_ONCE(system_state == SYSTEM_RUNNING &&
+ !(rec->flags & FTRACE_FL_DISABLED));
+ return 0;
+ }
if (mod_g) {
int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0;
@@ -4448,7 +4565,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to remove the data from
*
* Returns the data if it is found, otherwise NULL.
- * Note, if the data pointer is used as the data itself, (see
+ * Note, if the data pointer is used as the data itself, (see
* ftrace_func_mapper_find_ip(), then the return value may be meaningless,
* if the data pointer was set to zero.
*/
@@ -4543,8 +4660,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops,
void *data)
{
+ struct ftrace_func_probe *probe = NULL, *iter;
struct ftrace_func_entry *entry;
- struct ftrace_func_probe *probe;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
@@ -4563,11 +4680,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
mutex_lock(&ftrace_lock);
/* Check if the probe_ops is already registered */
- list_for_each_entry(probe, &tr->func_probes, list) {
- if (probe->probe_ops == probe_ops)
+ list_for_each_entry(iter, &tr->func_probes, list) {
+ if (iter->probe_ops == probe_ops) {
+ probe = iter;
break;
+ }
}
- if (&probe->list == &tr->func_probes) {
+ if (!probe) {
probe = kzalloc(sizeof(*probe), GFP_KERNEL);
if (!probe) {
mutex_unlock(&ftrace_lock);
@@ -4685,9 +4804,9 @@ int
unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops)
{
+ struct ftrace_func_probe *probe = NULL, *iter;
struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_entry *entry;
- struct ftrace_func_probe *probe;
struct ftrace_glob func_g;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
@@ -4715,11 +4834,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
mutex_lock(&ftrace_lock);
/* Check if the probe_ops is already registered */
- list_for_each_entry(probe, &tr->func_probes, list) {
- if (probe->probe_ops == probe_ops)
+ list_for_each_entry(iter, &tr->func_probes, list) {
+ if (iter->probe_ops == probe_ops) {
+ probe = iter;
break;
+ }
}
- if (&probe->list == &tr->func_probes)
+ if (!probe)
goto err_unlock_ftrace;
ret = -EINVAL;
@@ -4958,11 +5079,12 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
{
struct ftrace_func_entry *entry;
- if (!ftrace_location(ip))
+ ip = ftrace_location(ip);
+ if (!ip)
return -EINVAL;
if (remove) {
@@ -4977,8 +5099,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
}
static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+ unsigned int cnt, int remove)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < cnt; i++) {
+ err = __ftrace_match_addr(hash, ips[i], remove);
+ if (err) {
+ /*
+ * This expects the @hash is a temporary hash and if this
+ * fails the caller must free the @hash.
+ */
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
- unsigned long ip, int remove, int reset, int enable)
+ unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5008,8 +5151,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
- if (ip) {
- ret = ftrace_match_addr(hash, ip, remove);
+ if (ips) {
+ ret = ftrace_match_addr(hash, ips, cnt, remove);
if (ret < 0)
goto out_regex_unlock;
}
@@ -5026,10 +5169,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
- int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5087,6 +5230,8 @@ static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
return direct;
}
+static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* @ip: The address of the nop at the beginning of a function
@@ -5110,11 +5255,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
struct ftrace_func_entry *entry;
struct ftrace_hash *free_hash = NULL;
struct dyn_ftrace *rec;
- int ret = -EBUSY;
+ int ret = -ENODEV;
mutex_lock(&direct_mutex);
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
/* See if there's a direct function at @ip already */
+ ret = -EBUSY;
if (ftrace_find_rec_direct(ip))
goto out_unlock;
@@ -5151,16 +5301,15 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
goto out_unlock;
ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
- if (ret)
- remove_hash_entry(direct_functions, entry);
if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
- ret = register_ftrace_function(&direct_ops);
+ ret = register_ftrace_function_nolock(&direct_ops);
if (ret)
ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
}
if (ret) {
+ remove_hash_entry(direct_functions, entry);
kfree(entry);
if (!direct->count) {
list_del_rcu(&direct->next);
@@ -5222,6 +5371,10 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
mutex_lock(&direct_mutex);
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
entry = find_direct_entry(&ip, NULL);
if (!entry)
goto out_unlock;
@@ -5274,6 +5427,8 @@ static struct ftrace_ops stub_ops = {
* it is safe to modify the ftrace record, where it should be
* currently calling @old_addr directly, to call @new_addr.
*
+ * This is called with direct_mutex locked.
+ *
* Safety checks should be made to make sure that the code at
* @rec->ip is currently calling @old_addr. And this must
* also update entry->direct to @new_addr.
@@ -5286,6 +5441,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
unsigned long ip = rec->ip;
int ret;
+ lockdep_assert_held(&direct_mutex);
+
/*
* The ftrace_lock was used to determine if the record
* had more than one registered user to it. If it did,
@@ -5308,7 +5465,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
if (ret)
goto out_lock;
- ret = register_ftrace_function(&stub_ops);
+ ret = register_ftrace_function_nolock(&stub_ops);
if (ret) {
ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
goto out_lock;
@@ -5354,6 +5511,11 @@ int modify_ftrace_direct(unsigned long ip,
mutex_lock(&direct_mutex);
mutex_lock(&ftrace_lock);
+
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
entry = find_direct_entry(&ip, &rec);
if (!entry)
goto out_unlock;
@@ -5404,8 +5566,7 @@ int modify_ftrace_direct(unsigned long ip,
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \
- FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
static int check_direct_multi(struct ftrace_ops *ops)
{
@@ -5498,7 +5659,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
- err = register_ftrace_function(ops);
+ err = register_ftrace_function_nolock(ops);
out_remove:
if (err)
@@ -5550,22 +5711,8 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
-/**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
- * to call something else
- * @ops: The address of the struct ftrace_ops object
- * @addr: The address of the new trampoline to call at @ops functions
- *
- * This is used to unregister currently registered direct caller and
- * register new one @addr on functions registered in @ops object.
- *
- * Note there's window between ftrace_shutdown and ftrace_startup calls
- * where there will be no callbacks called.
- *
- * Returns: zero on success. Non zero on error, which includes:
- * -EINVAL - The @ops object was not properly registered.
- */
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+static int
+__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash;
struct ftrace_func_entry *entry, *iter;
@@ -5576,20 +5723,15 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
int i, size;
int err;
- if (check_direct_multi(ops))
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return -EINVAL;
-
- mutex_lock(&direct_mutex);
+ lockdep_assert_held_once(&direct_mutex);
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
- err = register_ftrace_function(&tmp_ops);
+ err = register_ftrace_function_nolock(&tmp_ops);
if (err)
- goto out_direct;
+ return err;
/*
* Now the ftrace_ops_list_func() is called to do the direct callers.
@@ -5613,7 +5755,64 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* Removing the tmp_ops will add the updated direct callers to the functions */
unregister_ftrace_function(&tmp_ops);
- out_direct:
+ return err;
+}
+
+/**
+ * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Caller should already have direct_mutex locked, so we don't lock
+ * direct_mutex here.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+{
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ return __modify_ftrace_direct_multi(ops, addr);
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+
+/**
+ * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ int err;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+ err = __modify_ftrace_direct_multi(ops, addr);
mutex_unlock(&direct_mutex);
return err;
}
@@ -5634,11 +5833,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
ftrace_ops_init(ops);
- return ftrace_set_addr(ops, ip, remove, reset, 1);
+ return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ips array or any ip specified within is NULL , it fails to update filter.
+ */
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
+/**
* ftrace_ops_set_global_filter - setup ops to use global filters
* @ops - the ops which will use the global filters
*
@@ -5659,7 +5877,7 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+ return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
}
/**
@@ -5867,8 +6085,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- if (iter->tr && !list_empty(&iter->tr->mod_trace))
- iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ if (iter->tr) {
+ if (list_empty(&iter->tr->mod_trace))
+ iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
+ else
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ }
} else
orig_hash = &iter->ops->func_hash->notrace_hash;
@@ -6563,6 +6785,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum,
return -ERANGE;
}
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES)
+/*
+ * Check if the current ops references the given ip.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static bool
+ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return false;
+
+ /* If ops traces all then it includes this function */
+ if (ops_traces_mod(ops))
+ return true;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
+ return false;
+
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
+ return false;
+
+ return true;
+}
+#endif
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6575,7 +6829,7 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec)) {
+ if (ops_references_ip(ops, rec->ip)) {
if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
continue;
if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
@@ -6755,6 +7009,13 @@ void ftrace_module_enable(struct module *mod)
!within_module_init(rec->ip, mod))
break;
+ /* Weak functions should still be ignored */
+ if (!test_for_valid_rec(rec)) {
+ /* Clear all other flags. Should not be enabled anyway */
+ rec->flags = FTRACE_FL_DISABLED;
+ continue;
+ }
+
cnt = 0;
/*
@@ -6791,11 +7052,16 @@ void ftrace_module_enable(struct module *mod)
void ftrace_module_init(struct module *mod)
{
+ int ret;
+
if (ftrace_disabled || !mod->num_ftrace_callsites)
return;
- ftrace_process_locs(mod, mod->ftrace_callsites,
- mod->ftrace_callsites + mod->num_ftrace_callsites);
+ ret = ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
+ if (ret)
+ pr_warn("ftrace: failed to allocate entries for module '%s' functions\n",
+ mod->name);
}
static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
@@ -7096,6 +7362,8 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
@@ -7124,17 +7392,21 @@ void __init ftrace_init(void)
}
pr_info("ftrace: allocating %ld entries in %ld pages\n",
- count, count / ENTRIES_PER_PAGE + 1);
-
- last_ftrace_enabled = ftrace_enabled = 1;
+ count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
+ if (ret) {
+ pr_warn("ftrace: failed to allocate entries for functions\n");
+ goto failed;
+ }
pr_info("ftrace: allocated %ld pages with %ld groups\n",
ftrace_number_of_pages, ftrace_number_of_groups);
+ last_ftrace_enabled = ftrace_enabled = 1;
+
set_ftrace_early_filters();
return;
@@ -7191,12 +7463,8 @@ static int __init ftrace_nodyn_init(void)
core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
-static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
-# define ftrace_startup_sysctl() do { } while (0)
-# define ftrace_shutdown_sysctl() do { } while (0)
-
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
}
@@ -7347,7 +7615,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
@@ -7789,9 +8059,146 @@ int ftrace_is_dead(void)
return ftrace_disabled;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When registering ftrace_ops with IPMODIFY, it is necessary to make sure
+ * it doesn't conflict with any direct ftrace_ops. If there is existing
+ * direct ftrace_ops on a kernel function being patched, call
+ * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing.
+ *
+ * @ops: ftrace_ops being registered.
+ *
+ * Returns:
+ * 0 on success;
+ * Negative on failure.
+ */
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *op;
+ int size, i, ret;
+
+ lockdep_assert_held_once(&direct_mutex);
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ unsigned long ip = entry->ip;
+ bool found_op = false;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (ops_references_ip(op, ip)) {
+ found_op = true;
+ break;
+ }
+ } while_for_each_ftrace_op(op);
+ mutex_unlock(&ftrace_lock);
+
+ if (found_op) {
+ if (!op->ops_func)
+ return -EBUSY;
+
+ ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Similar to prepare_direct_functions_for_ipmodify, clean up after ops
+ * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT
+ * ops.
+ */
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *op;
+ int size, i;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return;
+
+ mutex_lock(&direct_mutex);
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ unsigned long ip = entry->ip;
+ bool found_op = false;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (ops_references_ip(op, ip)) {
+ found_op = true;
+ break;
+ }
+ } while_for_each_ftrace_op(op);
+ mutex_unlock(&ftrace_lock);
+
+ /* The cleanup is optional, ignore any errors */
+ if (found_op && op->ops_func)
+ op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ }
+ }
+ mutex_unlock(&direct_mutex);
+}
+
+#define lock_direct_mutex() mutex_lock(&direct_mutex)
+#define unlock_direct_mutex() mutex_unlock(&direct_mutex)
+
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+ return 0;
+}
+
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+}
+
+#define lock_direct_mutex() do { } while (0)
+#define unlock_direct_mutex() do { } while (0)
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+/*
+ * Similar to register_ftrace_function, except we don't lock direct_mutex.
+ */
+static int register_ftrace_function_nolock(struct ftrace_ops *ops)
+{
+ int ret;
+
+ ftrace_ops_init(ops);
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_startup(ops, 0);
+
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
/**
* register_ftrace_function - register a function for profiling
- * @ops - ops structure that holds the function for profiling.
+ * @ops: ops structure that holds the function for profiling.
*
* Register a function to be called by all functions in the
* kernel.
@@ -7804,21 +8211,22 @@ int register_ftrace_function(struct ftrace_ops *ops)
{
int ret;
- ftrace_ops_init(ops);
-
- mutex_lock(&ftrace_lock);
-
- ret = ftrace_startup(ops, 0);
+ lock_direct_mutex();
+ ret = prepare_direct_functions_for_ipmodify(ops);
+ if (ret < 0)
+ goto out_unlock;
- mutex_unlock(&ftrace_lock);
+ ret = register_ftrace_function_nolock(ops);
+out_unlock:
+ unlock_direct_mutex();
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_function);
/**
* unregister_ftrace_function - unregister a function for profiling.
- * @ops - ops structure that holds the function to unregister
+ * @ops: ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.
*/
@@ -7830,10 +8238,122 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
ret = ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
+ cleanup_direct_functions_after_ipmodify(ops);
return ret;
}
EXPORT_SYMBOL_GPL(unregister_ftrace_function);
+static int symbols_cmp(const void *a, const void *b)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+struct kallsyms_data {
+ unsigned long *addrs;
+ const char **syms;
+ size_t cnt;
+ size_t found;
+};
+
+static int kallsyms_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct kallsyms_data *args = data;
+ const char **sym;
+ int idx;
+
+ sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp);
+ if (!sym)
+ return 0;
+
+ idx = sym - args->syms;
+ if (args->addrs[idx])
+ return 0;
+
+ if (!ftrace_location(addr))
+ return 0;
+
+ args->addrs[idx] = addr;
+ args->found++;
+ return args->found == args->cnt ? 1 : 0;
+}
+
+/**
+ * ftrace_lookup_symbols - Lookup addresses for array of symbols
+ *
+ * @sorted_syms: array of symbols pointers symbols to resolve,
+ * must be alphabetically sorted
+ * @cnt: number of symbols/addresses in @syms/@addrs arrays
+ * @addrs: array for storing resulting addresses
+ *
+ * This function looks up addresses for array of symbols provided in
+ * @syms array (must be alphabetically sorted) and stores them in
+ * @addrs array, which needs to be big enough to store at least @cnt
+ * addresses.
+ *
+ * This function returns 0 if all provided symbols are found,
+ * -ESRCH otherwise.
+ */
+int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
+{
+ struct kallsyms_data args;
+ int err;
+
+ memset(addrs, 0, sizeof(*addrs) * cnt);
+ args.addrs = addrs;
+ args.syms = sorted_syms;
+ args.cnt = cnt;
+ args.found = 0;
+ err = kallsyms_on_each_symbol(kallsyms_callback, &args);
+ if (err < 0)
+ return err;
+ return args.found == args.cnt ? 0 : -ESRCH;
+}
+
+#ifdef CONFIG_SYSCTL
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void ftrace_startup_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* Force update next time */
+ saved_ftrace_func = NULL;
+ /* ftrace_start_up is true if we want ftrace running */
+ if (ftrace_start_up) {
+ command = FTRACE_UPDATE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_START_FUNC_RET;
+ ftrace_startup_enable(command);
+ }
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* ftrace_start_up is true if ftrace is running */
+ if (ftrace_start_up) {
+ command = FTRACE_DISABLE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_STOP_FUNC_RET;
+ ftrace_run_update_code(command);
+ }
+}
+#else
+# define ftrace_startup_sysctl() do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
static bool is_permanent_ops_registered(void)
{
struct ftrace_ops *op;
@@ -7846,7 +8366,7 @@ static bool is_permanent_ops_registered(void)
return false;
}
-int
+static int
ftrace_enable_sysctl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -7889,3 +8409,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_unlock(&ftrace_lock);
return ret;
}
+
+static struct ctl_table ftrace_sysctls[] = {
+ {
+ .procname = "ftrace_enabled",
+ .data = &ftrace_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ftrace_enable_sysctl,
+ },
+ {}
+};
+
+static int __init ftrace_sysctl_init(void)
+{
+ register_sysctl_init("kernel", ftrace_sysctls);
+ return 0;
+}
+late_initcall(ftrace_sysctl_init);
+#endif
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index 18b0f1cbb947..c736487fc0e4 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -35,6 +35,49 @@
static struct trace_event_file *gen_kprobe_test;
static struct trace_event_file *gen_kretprobe_test;
+#define KPROBE_GEN_TEST_FUNC "do_sys_open"
+
+/* X86 */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%ax"
+#define KPROBE_GEN_TEST_ARG1 "filename=%dx"
+#define KPROBE_GEN_TEST_ARG2 "flags=%cx"
+#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)"
+
+/* ARM64 */
+#elif defined(CONFIG_ARM64)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%x0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%x1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%x2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%x3"
+
+/* ARM */
+#elif defined(CONFIG_ARM)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%r0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%r1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%r2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%r3"
+
+/* RISCV */
+#elif defined(CONFIG_RISCV)
+#define KPROBE_GEN_TEST_ARG0 "dfd=%a0"
+#define KPROBE_GEN_TEST_ARG1 "filename=%a1"
+#define KPROBE_GEN_TEST_ARG2 "flags=%a2"
+#define KPROBE_GEN_TEST_ARG3 "mode=%a3"
+
+/* others */
+#else
+#define KPROBE_GEN_TEST_ARG0 NULL
+#define KPROBE_GEN_TEST_ARG1 NULL
+#define KPROBE_GEN_TEST_ARG2 NULL
+#define KPROBE_GEN_TEST_ARG3 NULL
+#endif
+
+static bool trace_event_file_is_valid(struct trace_event_file *input)
+{
+ return input && !IS_ERR(input);
+}
+
/*
* Test to make sure we can create a kprobe event, then add more
* fields.
@@ -58,23 +101,23 @@ static int __init test_gen_kprobe_cmd(void)
* fields.
*/
ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
- "do_sys_open",
- "dfd=%ax", "filename=%dx");
+ KPROBE_GEN_TEST_FUNC,
+ KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
if (ret)
- goto free;
+ goto out;
/* Use kprobe_event_add_fields to add the rest of the fields */
- ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+ ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kprobe_test event file. We need to prevent
@@ -97,13 +140,13 @@ static int __init test_gen_kprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
+ if (trace_event_file_is_valid(gen_kprobe_test))
+ gen_kprobe_test = NULL;
/* We got an error after creating the event, delete it */
ret = kprobe_event_delete("gen_kprobe_test");
- free:
- kfree(buf);
-
goto out;
}
@@ -128,17 +171,17 @@ static int __init test_gen_kretprobe_cmd(void)
* Define the kretprobe event.
*/
ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
- "do_sys_open",
+ KPROBE_GEN_TEST_FUNC,
"$retval");
if (ret)
- goto free;
+ goto out;
/*
* This actually creates the event.
*/
ret = kretprobe_event_gen_cmd_end(&cmd);
if (ret)
- goto free;
+ goto out;
/*
* Now get the gen_kretprobe_test event file. We need to
@@ -162,13 +205,13 @@ static int __init test_gen_kretprobe_cmd(void)
goto delete;
}
out:
+ kfree(buf);
return ret;
delete:
+ if (trace_event_file_is_valid(gen_kretprobe_test))
+ gen_kretprobe_test = NULL;
/* We got an error after creating the event, delete it */
ret = kprobe_event_delete("gen_kretprobe_test");
- free:
- kfree(buf);
-
goto out;
}
@@ -182,10 +225,12 @@ static int __init kprobe_event_gen_test_init(void)
ret = test_gen_kretprobe_cmd();
if (ret) {
- WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
- "kprobes",
- "gen_kretprobe_test", false));
- trace_put_event_file(gen_kretprobe_test);
+ if (trace_event_file_is_valid(gen_kretprobe_test)) {
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+ trace_put_event_file(gen_kretprobe_test);
+ }
WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
}
@@ -194,24 +239,30 @@ static int __init kprobe_event_gen_test_init(void)
static void __exit kprobe_event_gen_test_exit(void)
{
- /* Disable the event or you can't remove it */
- WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
- "kprobes",
- "gen_kprobe_test", false));
+ if (trace_event_file_is_valid(gen_kprobe_test)) {
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+ "kprobes",
+ "gen_kprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kprobe_test);
+ }
- /* Now give the file and instance back */
- trace_put_event_file(gen_kprobe_test);
/* Now unregister and free the event */
WARN_ON(kprobe_event_delete("gen_kprobe_test"));
- /* Disable the event or you can't remove it */
- WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
- "kprobes",
- "gen_kretprobe_test", false));
+ if (trace_event_file_is_valid(gen_kretprobe_test)) {
+ /* Disable the event or you can't remove it */
+ WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
+ "kprobes",
+ "gen_kretprobe_test", false));
+
+ /* Now give the file and instance back */
+ trace_put_event_file(gen_kretprobe_test);
+ }
- /* Now give the file and instance back */
- trace_put_event_file(gen_kretprobe_test);
/* Now unregister and free the event */
WARN_ON(kprobe_event_delete("gen_kretprobe_test"));
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index a2ef1d18126a..95106d02b32d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1,
/**
* trace_pid_list_is_set - test if the pid is set in the list
* @pid_list: The pid list to test
- * @pid: The pid to to see if set in the list.
+ * @pid: The pid to see if set in the list.
*
- * Tests if @pid is is set in the @pid_list. This is usually called
+ * Tests if @pid is set in the @pid_list. This is usually called
* from the scheduler when a task is scheduled. Its pid is checked
* if it should be traced or not.
*
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..32c3dfdb4d6a
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+/* Return hook list (shadow stack by list) */
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void rethook_flush_task(struct task_struct *tk)
+{
+ struct rethook_node *rhn;
+ struct llist_node *node;
+
+ node = __llist_del_all(&tk->rethooks);
+ while (node) {
+ rhn = container_of(node, struct rethook_node, llist);
+ node = node->next;
+ preempt_disable();
+ rethook_recycle(rhn);
+ preempt_enable();
+ }
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+ struct rethook *rh = container_of(head, struct rethook, rcu);
+ struct rethook_node *rhn;
+ struct freelist_node *node;
+ int count = 1;
+
+ node = rh->pool.head;
+ while (node) {
+ rhn = container_of(node, struct rethook_node, freelist);
+ node = node->next;
+ kfree(rhn);
+ count++;
+ }
+
+ /* The rh->ref is the number of pooled node + 1 */
+ if (refcount_sub_and_test(count, &rh->ref))
+ kfree(rh);
+}
+
+/**
+ * rethook_free() - Free struct rethook.
+ * @rh: the struct rethook to be freed.
+ *
+ * Free the rethook. Before calling this function, user must ensure the
+ * @rh::data is cleaned if needed (or, the handler can access it after
+ * calling this function.) This function will set the @rh to be freed
+ * after all rethook_node are freed (not soon). And the caller must
+ * not touch @rh after calling this.
+ */
+void rethook_free(struct rethook *rh)
+{
+ WRITE_ONCE(rh->handler, NULL);
+
+ call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+/**
+ * rethook_alloc() - Allocate struct rethook.
+ * @data: a data to pass the @handler when hooking the return.
+ * @handler: the return hook callback function.
+ *
+ * Allocate and initialize a new rethook with @data and @handler.
+ * Return NULL if memory allocation fails or @handler is NULL.
+ * Note that @handler == NULL means this rethook is going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+{
+ struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+
+ if (!rh || !handler) {
+ kfree(rh);
+ return NULL;
+ }
+
+ rh->data = data;
+ rh->handler = handler;
+ rh->pool.head = NULL;
+ refcount_set(&rh->ref, 1);
+
+ return rh;
+}
+
+/**
+ * rethook_add_node() - Add a new node to the rethook.
+ * @rh: the struct rethook.
+ * @node: the struct rethook_node to be added.
+ *
+ * Add @node to @rh. User must allocate @node (as a part of user's
+ * data structure.) The @node fields are initialized in this function.
+ */
+void rethook_add_node(struct rethook *rh, struct rethook_node *node)
+{
+ node->rethook = rh;
+ freelist_add(&node->freelist, &rh->pool);
+ refcount_inc(&rh->ref);
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+ struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+
+ if (refcount_dec_and_test(&node->rethook->ref))
+ kfree(node->rethook);
+ kfree(node);
+}
+
+/**
+ * rethook_recycle() - return the node to rethook.
+ * @node: The struct rethook_node to be returned.
+ *
+ * Return back the @node to @node::rethook. If the @node::rethook is already
+ * marked as freed, this will free the @node.
+ */
+void rethook_recycle(struct rethook_node *node)
+{
+ lockdep_assert_preemption_disabled();
+
+ if (likely(READ_ONCE(node->rethook->handler)))
+ freelist_add(&node->freelist, &node->rethook->pool);
+ else
+ call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+NOKPROBE_SYMBOL(rethook_recycle);
+
+/**
+ * rethook_try_get() - get an unused rethook node.
+ * @rh: The struct rethook which pools the nodes.
+ *
+ * Get an unused rethook node from @rh. If the node pool is empty, this
+ * will return NULL. Caller must disable preemption.
+ */
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+ rethook_handler_t handler = READ_ONCE(rh->handler);
+ struct freelist_node *fn;
+
+ lockdep_assert_preemption_disabled();
+
+ /* Check whether @rh is going to be freed. */
+ if (unlikely(!handler))
+ return NULL;
+
+ /*
+ * This expects the caller will set up a rethook on a function entry.
+ * When the function returns, the rethook will eventually be reclaimed
+ * or released in the rethook_recycle() with call_rcu().
+ * This means the caller must be run in the RCU-availabe context.
+ */
+ if (unlikely(!rcu_is_watching()))
+ return NULL;
+
+ fn = freelist_try_get(&rh->pool);
+ if (!fn)
+ return NULL;
+
+ return container_of(fn, struct rethook_node, freelist);
+}
+NOKPROBE_SYMBOL(rethook_try_get);
+
+/**
+ * rethook_hook() - Hook the current function return.
+ * @node: The struct rethook node to hook the function return.
+ * @regs: The struct pt_regs for the function entry.
+ * @mcount: True if this is called from mcount(ftrace) context.
+ *
+ * Hook the current running function return. This must be called when the
+ * function entry (or at least @regs must be the registers of the function
+ * entry.) @mcount is used for identifying the context. If this is called
+ * from ftrace (mcount) callback, @mcount must be set true. If this is called
+ * from the real function entry (e.g. kprobes) @mcount must be set false.
+ * This is because the way to hook the function return depends on the context.
+ */
+void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
+{
+ arch_rethook_prepare(node, regs, mcount);
+ __llist_add(&node->llist, &current->rethooks);
+}
+NOKPROBE_SYMBOL(rethook_hook);
+
+/* This assumes the 'tsk' is the current task or is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct rethook_node *rh = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->rethooks.first;
+ else
+ node = node->next;
+
+ while (node) {
+ rh = container_of(node, struct rethook_node, llist);
+ if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+ *cur = node;
+ return rh->ret_addr;
+ }
+ node = node->next;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk in unsigned
+ * long type.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the currect return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ *
+ * Returns found address value or zero if not found.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur)
+{
+ struct rethook_node *rhn = NULL;
+ unsigned long ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ return 0;
+
+ do {
+ ret = __rethook_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ rhn = container_of(*cur, struct rethook_node, llist);
+ } while (rhn->frame != frame);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /*
+ * Do nothing by default. If the architecture which uses a
+ * frame pointer to record real return address on the stack,
+ * it should fill this function to fixup the return address
+ * so that stacktrace works from the rethook handler.
+ */
+}
+
+/* This function will be called from each arch-defined trampoline. */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame)
+{
+ struct llist_node *first, *node = NULL;
+ unsigned long correct_ret_addr;
+ rethook_handler_t handler;
+ struct rethook_node *rhn;
+
+ correct_ret_addr = __rethook_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+ BUG_ON(1);
+ }
+
+ instruction_pointer_set(regs, correct_ret_addr);
+
+ /*
+ * These loops must be protected from rethook_free_rcu() because those
+ * are accessing 'rhn->rethook'.
+ */
+ preempt_disable();
+
+ /*
+ * Run the handler on the shadow stack. Do not unlink the list here because
+ * stackdump inside the handlers needs to decode it.
+ */
+ first = current->rethooks.first;
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ if (WARN_ON_ONCE(rhn->frame != frame))
+ break;
+ handler = READ_ONCE(rhn->rethook->handler);
+ if (handler)
+ handler(rhn, rhn->rethook->data, regs);
+
+ if (first == node)
+ break;
+ first = first->next;
+ }
+
+ /* Fixup registers for returning to correct address. */
+ arch_rethook_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink used shadow stack */
+ first = current->rethooks.first;
+ current->rethooks.first = node->next;
+ node->next = NULL;
+
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ first = first->next;
+ rethook_recycle(rhn);
+ }
+ preempt_enable();
+
+ return correct_ret_addr;
+}
+NOKPROBE_SYMBOL(rethook_trampoline_handler);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05dfc7a12d3d..b21bf14bae9b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -29,6 +29,14 @@
#include <asm/local.h>
+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB (0xf8ULL << 56)
+#define ABS_TS_MASK (~TS_MSB)
+
static void update_pages_handler(struct work_struct *work);
/*
@@ -405,6 +413,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -468,6 +477,7 @@ struct rb_time_struct {
local_t cnt;
local_t top;
local_t bottom;
+ local_t msb;
};
#else
#include <asm/local64.h>
@@ -509,6 +519,7 @@ struct ring_buffer_per_cpu {
local_t committing;
local_t commits;
local_t pages_touched;
+ local_t pages_lost;
local_t pages_read;
long last_pages_touch;
size_t shortest_full;
@@ -569,7 +580,6 @@ struct ring_buffer_iter {
* For the ring buffer, 64 bit required operations for the time is
* the following:
*
- * - Only need 59 bits (uses 60 to make it even).
* - Reads may fail if it interrupted a modification of the time stamp.
* It will succeed if it did not interrupt another write even if
* the read itself is interrupted by a write.
@@ -594,6 +604,7 @@ struct ring_buffer_iter {
*/
#define RB_TIME_SHIFT 30
#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+#define RB_TIME_MSB_SHIFT 60
static inline int rb_time_cnt(unsigned long val)
{
@@ -613,7 +624,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
{
- unsigned long top, bottom;
+ unsigned long top, bottom, msb;
unsigned long c;
/*
@@ -625,6 +636,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
c = local_read(&t->cnt);
top = local_read(&t->top);
bottom = local_read(&t->bottom);
+ msb = local_read(&t->msb);
} while (c != local_read(&t->cnt));
*cnt = rb_time_cnt(top);
@@ -633,7 +645,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
if (*cnt != rb_time_cnt(bottom))
return false;
- *ret = rb_time_val(top, bottom);
+ /* The shift to msb will lose its cnt bits */
+ *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
return true;
}
@@ -649,10 +662,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt
return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
}
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
+ unsigned long *msb)
{
*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+ *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
}
static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
@@ -663,15 +678,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long
static void rb_time_set(rb_time_t *t, u64 val)
{
- unsigned long cnt, top, bottom;
+ unsigned long cnt, top, bottom, msb;
- rb_time_split(val, &top, &bottom);
+ rb_time_split(val, &top, &bottom, &msb);
/* Writes always succeed with a valid number even if it gets interrupted. */
do {
cnt = local_inc_return(&t->cnt);
rb_time_val_set(&t->top, top, cnt);
rb_time_val_set(&t->bottom, bottom, cnt);
+ rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
} while (cnt != local_read(&t->cnt));
}
@@ -686,8 +702,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
- unsigned long cnt, top, bottom;
- unsigned long cnt2, top2, bottom2;
+ unsigned long cnt, top, bottom, msb;
+ unsigned long cnt2, top2, bottom2, msb2;
u64 val;
/* The cmpxchg always fails if it interrupted an update */
@@ -703,16 +719,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
cnt2 = cnt + 1;
- rb_time_split(val, &top, &bottom);
+ rb_time_split(val, &top, &bottom, &msb);
top = rb_time_val_cnt(top, cnt);
bottom = rb_time_val_cnt(bottom, cnt);
- rb_time_split(set, &top2, &bottom2);
+ rb_time_split(set, &top2, &bottom2, &msb2);
top2 = rb_time_val_cnt(top2, cnt2);
bottom2 = rb_time_val_cnt(bottom2, cnt2);
if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
return false;
+ if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+ return false;
if (!rb_time_read_cmpxchg(&t->top, top, top2))
return false;
if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
@@ -783,6 +801,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
}
#endif
+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The rb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+ if (save_ts & TS_MSB) {
+ abs |= save_ts & TS_MSB;
+ /* Check for overflow */
+ if (unlikely(abs < save_ts))
+ abs += 1ULL << 59;
+ }
+ return abs;
+}
static inline u64 rb_time_stamp(struct trace_buffer *buffer);
@@ -811,8 +847,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
u64 ts;
/* If the event includes an absolute time, then just use that */
- if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
- return rb_event_time_stamp(event);
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ ts = rb_event_time_stamp(event);
+ return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
+ }
nest = local_read(&cpu_buffer->committing);
verify_event(cpu_buffer, event);
@@ -848,7 +886,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
}
/**
- * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
@@ -857,10 +895,18 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
{
size_t read;
+ size_t lost;
size_t cnt;
read = local_read(&buffer->buffers[cpu]->pages_read);
+ lost = local_read(&buffer->buffers[cpu]->pages_lost);
cnt = local_read(&buffer->buffers[cpu]->pages_touched);
+
+ if (WARN_ON_ONCE(cnt < lost))
+ return 0;
+
+ cnt -= lost;
+
/* The reader can read an empty page, but not more than that */
if (cnt < read) {
WARN_ON_ONCE(read > cnt + 1);
@@ -870,6 +916,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
return cnt - read;
}
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ size_t nr_pages;
+ size_t dirty;
+
+ nr_pages = cpu_buffer->nr_pages;
+ if (!nr_pages || !full)
+ return true;
+
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+
+ return (dirty * 100) > (full * nr_pages);
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -881,13 +942,56 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
- if (rbwork->wakeup_full) {
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
rbwork->wakeup_full = false;
+ rbwork->full_waiters_pending = false;
wake_up_all(&rbwork->full_waiters);
}
}
/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *rbwork;
+
+ if (!buffer)
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+
+ /* Wake up individual ones too. One level recursion */
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_wake_waiters(buffer, cpu);
+
+ rbwork = &buffer->irq_work;
+ } else {
+ if (WARN_ON_ONCE(!buffer->buffers))
+ return;
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ /* The CPU buffer may not have been initialized yet */
+ if (!cpu_buffer)
+ return;
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ rbwork->wait_index++;
+ /* make sure the waiters see the new index */
+ smp_wmb();
+
+ rb_wake_up_waiters(&rbwork->work);
+}
+
+/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
@@ -902,6 +1006,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ long wait_index;
int ret = 0;
/*
@@ -920,6 +1025,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
+ wait_index = READ_ONCE(work->wait_index);
while (true) {
if (full)
@@ -964,26 +1070,29 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
- size_t nr_pages;
- size_t dirty;
+ bool done;
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ done = !pagebusy && full_hit(buffer, cpu, full);
+
if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full < full)
+ cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (!pagebusy &&
- (!nr_pages || (dirty * 100) > full * nr_pages))
+ if (done)
break;
}
schedule();
+
+ /* Make sure to see the new wait index */
+ smp_rmb();
+ if (wait_index != work->wait_index)
+ break;
}
if (full)
@@ -1000,6 +1109,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -1009,14 +1119,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table)
+ struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ full = 0;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
@@ -1024,8 +1135,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ if (full) {
+ poll_wait(filp, &work->full_waiters, poll_table);
+ work->full_waiters_pending = true;
+ } else {
+ poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ }
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1041,6 +1158,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
+ if (full)
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
+
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -1682,9 +1802,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(cpu_buffer->reader_page);
- rb_head_page_deactivate(cpu_buffer);
-
if (head) {
+ rb_head_page_deactivate(cpu_buffer);
+
list_for_each_entry_safe(bpage, tmp, head, list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
@@ -1920,6 +2040,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
*/
local_add(page_entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_inc(&cpu_buffer->pages_lost);
}
/*
@@ -2404,6 +2525,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_add(entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_inc(&cpu_buffer->pages_lost);
/*
* The entries will be zeroed out when we move the
@@ -2572,6 +2694,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Mark the rest of the page with padding */
rb_event_set_padding(event);
+ /* Make sure the padding is visible before the write update */
+ smp_wmb();
+
/* Set the write back to the previous setting */
local_sub(length, &tail_page->write);
return;
@@ -2583,6 +2708,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* time delta must be non zero */
event->time_delta = 1;
+ /* Make sure the padding is visible before the tail_page->write update */
+ smp_wmb();
+
/* Set write to end of buffer */
length = (tail + length) - BUF_PAGE_SIZE;
local_sub(length, &tail_page->write);
@@ -2754,8 +2882,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
if (unlikely(info->delta > (1ULL << 59))) {
+ /*
+ * Some timers can use more than 59 bits, and when a timestamp
+ * is added to the buffer, it will lose those bits.
+ */
+ if (abs && (info->ts & TS_MSB)) {
+ info->delta &= ABS_TS_MASK;
+
/* did the clock go backwards */
- if (info->before == info->after && info->before > info->ts) {
+ } else if (info->before == info->after && info->before > info->ts) {
/* not interrupted */
static int once;
@@ -3055,10 +3190,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- size_t nr_pages;
- size_t dirty;
- size_t full;
-
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -3082,10 +3213,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
- full = cpu_buffer->shortest_full;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
- if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
@@ -3304,7 +3432,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
break;
@@ -3380,7 +3508,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
break;
case RINGBUF_TYPE_PADDING:
@@ -4367,6 +4495,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
cpu_buffer->read_stamp = delta;
return;
@@ -4397,6 +4526,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, iter->read_stamp);
iter->read_stamp = delta;
return;
@@ -4542,6 +4672,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ /*
+ * The writer has preempt disable, wait for it. But not forever
+ * Although, 1 second is pretty much "forever"
+ */
+#define USECS_WAIT 1000000
+ for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+ /* If the write is past the end of page, a writer is still updating it */
+ if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ break;
+
+ udelay(1);
+
+ /* Get the latest version of the reader write value */
+ smp_rmb();
+ }
+
+ /* The writer is not moving forward? Something is wrong */
+ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
+ reader = NULL;
+
+ /*
+ * Make sure we see any padding after the write update
+ * (see rb_reset_tail())
+ */
+ smp_rmb();
+
+
return reader;
}
@@ -4650,6 +4807,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4741,6 +4899,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -5117,6 +5276,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
local_set(&cpu_buffer->pages_touched, 0);
+ local_set(&cpu_buffer->pages_lost, 0);
local_set(&cpu_buffer->pages_read, 0);
cpu_buffer->last_pages_touch = 0;
cpu_buffer->shortest_full = 0;
@@ -5185,7 +5345,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
- * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
@@ -5255,7 +5415,7 @@ void ring_buffer_reset(struct trace_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_reset);
/**
- * rind_buffer_empty - is the ring buffer empty?
+ * ring_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
@@ -5569,7 +5729,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int pos = 0;
unsigned int size;
- if (full)
+ /*
+ * If a full page is expected, this can still be returned
+ * if there's been a previous partial read and the
+ * rest of the page can be read and the commit page is off
+ * the reader page.
+ */
+ if (full &&
+ (!read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))
@@ -6011,10 +6179,10 @@ static __init int test_ringbuffer(void)
pr_info(" total events: %ld\n", total_lost + total_read);
pr_info(" recorded len bytes: %ld\n", total_len);
pr_info(" recorded size bytes: %ld\n", total_size);
- if (total_lost)
+ if (total_lost) {
pr_info(" With dropped events, record len and size may not match\n"
" alloced and written from above\n");
- if (!total_lost) {
+ } else {
if (RB_WARN_ON(buffer, total_len != total_alloc ||
total_size != total_written))
break;
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
new file mode 100644
index 000000000000..831779607e84
--- /dev/null
+++ b/kernel/trace/rv/Kconfig
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config DA_MON_EVENTS
+ bool
+
+config DA_MON_EVENTS_IMPLICIT
+ select DA_MON_EVENTS
+ bool
+
+config DA_MON_EVENTS_ID
+ select DA_MON_EVENTS
+ bool
+
+menuconfig RV
+ bool "Runtime Verification"
+ depends on TRACING
+ help
+ Enable the kernel runtime verification infrastructure. RV is a
+ lightweight (yet rigorous) method that complements classical
+ exhaustive verification techniques (such as model checking and
+ theorem proving). RV works by analyzing the trace of the system's
+ actual execution, comparing it against a formal specification of
+ the system behavior.
+
+ For further information, see:
+ Documentation/trace/rv/runtime-verification.rst
+
+config RV_MON_WIP
+ depends on RV
+ depends on PREEMPT_TRACER
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable wip (wakeup in preemptive) sample monitor that illustrates
+ the usage of per-cpu monitors, and one limitation of the
+ preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
+
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable wwnr (wakeup while not running) sample monitor, this is a
+ sample monitor that illustrates the usage of per-task monitor.
+ The model is borken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
+
+config RV_REACTORS
+ bool "Runtime verification reactors"
+ default y
+ depends on RV
+ help
+ Enables the online runtime verification reactors. A runtime
+ monitor can cause a reaction to the detection of an exception
+ on the model's execution. By default, the monitors have
+ tracing reactions, printing the monitor output via tracepoints,
+ but other reactions can be added (on-demand) via this interface.
+
+config RV_REACT_PRINTK
+ bool "Printk reactor"
+ depends on RV_REACTORS
+ default y
+ help
+ Enables the printk reactor. The printk reactor emits a printk()
+ message if an exception is found.
+
+config RV_REACT_PANIC
+ bool "Panic reactor"
+ depends on RV_REACTORS
+ default y
+ help
+ Enables the panic reactor. The panic reactor emits a printk()
+ message if an exception is found and panic()s the system.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
new file mode 100644
index 000000000000..963d14875b45
--- /dev/null
+++ b/kernel/trace/rv/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
+obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
+obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
+obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
new file mode 100644
index 000000000000..b2b49a27e886
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "wip"
+
+#include <trace/events/rv.h>
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+
+#include "wip.h"
+
+static struct rv_monitor rv_wip;
+DECLARE_DA_MON_PER_CPU(wip, unsigned char);
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_wip(preempt_disable_wip);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_wip(preempt_enable_wip);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+ da_handle_event_wip(sched_waking_wip);
+}
+
+static int enable_wip(void)
+{
+ int retval;
+
+ retval = da_monitor_init_wip();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("wip", sched_waking, handle_sched_waking);
+ rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable);
+
+ return 0;
+}
+
+static void disable_wip(void)
+{
+ rv_wip.enabled = 0;
+
+ rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("wip", sched_waking, handle_sched_waking);
+
+ da_monitor_destroy_wip();
+}
+
+static struct rv_monitor rv_wip = {
+ .name = "wip",
+ .description = "wakeup in preemptive per-cpu testing monitor.",
+ .enable = enable_wip,
+ .disable = disable_wip,
+ .reset = da_monitor_reset_all_wip,
+ .enabled = 0,
+};
+
+static int __init register_wip(void)
+{
+ rv_register_monitor(&rv_wip);
+ return 0;
+}
+
+static void __exit unregister_wip(void)
+{
+ rv_unregister_monitor(&rv_wip);
+}
+
+module_init(register_wip);
+module_exit(unregister_wip);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>");
+MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor.");
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
new file mode 100644
index 000000000000..dacc37b62a2c
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -0,0 +1,46 @@
+/*
+ * Automatically generated C representation of wip automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_wip {
+ preemptive_wip = 0,
+ non_preemptive_wip,
+ state_max_wip
+};
+
+#define INVALID_STATE state_max_wip
+
+enum events_wip {
+ preempt_disable_wip = 0,
+ preempt_enable_wip,
+ sched_waking_wip,
+ event_max_wip
+};
+
+struct automaton_wip {
+ char *state_names[state_max_wip];
+ char *event_names[event_max_wip];
+ unsigned char function[state_max_wip][event_max_wip];
+ unsigned char initial_state;
+ bool final_states[state_max_wip];
+};
+
+static struct automaton_wip automaton_wip = {
+ .state_names = {
+ "preemptive",
+ "non_preemptive"
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "sched_waking"
+ },
+ .function = {
+ { non_preemptive_wip, INVALID_STATE, INVALID_STATE },
+ { INVALID_STATE, preemptive_wip, non_preemptive_wip },
+ },
+ .initial_state = preemptive_wip,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
new file mode 100644
index 000000000000..0e43dd2db685
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "wwnr"
+
+#include <trace/events/rv.h>
+#include <trace/events/sched.h>
+
+#include "wwnr.h"
+
+static struct rv_monitor rv_wwnr;
+DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
+
+static void handle_switch(void *data, bool preempt, struct task_struct *p,
+ struct task_struct *n, unsigned int prev_state)
+{
+ /* start monitoring only after the first suspension */
+ if (prev_state == TASK_INTERRUPTIBLE)
+ da_handle_start_event_wwnr(p, switch_out_wwnr);
+ else
+ da_handle_event_wwnr(p, switch_out_wwnr);
+
+ da_handle_event_wwnr(n, switch_in_wwnr);
+}
+
+static void handle_wakeup(void *data, struct task_struct *p)
+{
+ da_handle_event_wwnr(p, wakeup_wwnr);
+}
+
+static int enable_wwnr(void)
+{
+ int retval;
+
+ retval = da_monitor_init_wwnr();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("wwnr", sched_switch, handle_switch);
+ rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
+
+ return 0;
+}
+
+static void disable_wwnr(void)
+{
+ rv_wwnr.enabled = 0;
+
+ rv_detach_trace_probe("wwnr", sched_switch, handle_switch);
+ rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
+
+ da_monitor_destroy_wwnr();
+}
+
+static struct rv_monitor rv_wwnr = {
+ .name = "wwnr",
+ .description = "wakeup while not running per-task testing model.",
+ .enable = enable_wwnr,
+ .disable = disable_wwnr,
+ .reset = da_monitor_reset_all_wwnr,
+ .enabled = 0,
+};
+
+static int __init register_wwnr(void)
+{
+ rv_register_monitor(&rv_wwnr);
+ return 0;
+}
+
+static void __exit unregister_wwnr(void)
+{
+ rv_unregister_monitor(&rv_wwnr);
+}
+
+module_init(register_wwnr);
+module_exit(unregister_wwnr);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>");
+MODULE_DESCRIPTION("wwnr: wakeup while not running monitor");
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
new file mode 100644
index 000000000000..118e576b91b4
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -0,0 +1,46 @@
+/*
+ * Automatically generated C representation of wwnr automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_wwnr {
+ not_running_wwnr = 0,
+ running_wwnr,
+ state_max_wwnr
+};
+
+#define INVALID_STATE state_max_wwnr
+
+enum events_wwnr {
+ switch_in_wwnr = 0,
+ switch_out_wwnr,
+ wakeup_wwnr,
+ event_max_wwnr
+};
+
+struct automaton_wwnr {
+ char *state_names[state_max_wwnr];
+ char *event_names[event_max_wwnr];
+ unsigned char function[state_max_wwnr][event_max_wwnr];
+ unsigned char initial_state;
+ bool final_states[state_max_wwnr];
+};
+
+static struct automaton_wwnr automaton_wwnr = {
+ .state_names = {
+ "not_running",
+ "running"
+ },
+ .event_names = {
+ "switch_in",
+ "switch_out",
+ "wakeup"
+ },
+ .function = {
+ { running_wwnr, INVALID_STATE, not_running_wwnr },
+ { INVALID_STATE, not_running_wwnr, INVALID_STATE },
+ },
+ .initial_state = not_running_wwnr,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
new file mode 100644
index 000000000000..d65f6c25a87c
--- /dev/null
+++ b/kernel/trace/rv/reactor_panic.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Panic RV reactor:
+ * Prints the exception msg to the kernel message log and panic().
+ */
+
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+static void rv_panic_reaction(char *msg)
+{
+ panic(msg);
+}
+
+static struct rv_reactor rv_panic = {
+ .name = "panic",
+ .description = "panic the system if an exception is found.",
+ .react = rv_panic_reaction
+};
+
+static int __init register_react_panic(void)
+{
+ rv_register_reactor(&rv_panic);
+ return 0;
+}
+
+static void __exit unregister_react_panic(void)
+{
+ rv_unregister_reactor(&rv_panic);
+}
+
+module_init(register_react_panic);
+module_exit(unregister_react_panic);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira");
+MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found.");
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
new file mode 100644
index 000000000000..4b6b7106a477
--- /dev/null
+++ b/kernel/trace/rv/reactor_printk.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Printk RV reactor:
+ * Prints the exception msg to the kernel message log.
+ */
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+static void rv_printk_reaction(char *msg)
+{
+ printk_deferred(msg);
+}
+
+static struct rv_reactor rv_printk = {
+ .name = "printk",
+ .description = "prints the exception msg to the kernel message log.",
+ .react = rv_printk_reaction
+};
+
+static int __init register_react_printk(void)
+{
+ rv_register_reactor(&rv_printk);
+ return 0;
+}
+
+static void __exit unregister_react_printk(void)
+{
+ rv_unregister_reactor(&rv_printk);
+}
+
+module_init(register_react_printk);
+module_exit(unregister_react_printk);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Bristot de Oliveira");
+MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit.");
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
new file mode 100644
index 000000000000..6c97cc2d754a
--- /dev/null
+++ b/kernel/trace/rv/rv.c
@@ -0,0 +1,799 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * This is the online Runtime Verification (RV) interface.
+ *
+ * RV is a lightweight (yet rigorous) method that complements classical
+ * exhaustive verification techniques (such as model checking and
+ * theorem proving) with a more practical approach to complex systems.
+ *
+ * RV works by analyzing the trace of the system's actual execution,
+ * comparing it against a formal specification of the system behavior.
+ * RV can give precise information on the runtime behavior of the
+ * monitored system while enabling the reaction for unexpected
+ * events, avoiding, for example, the propagation of a failure on
+ * safety-critical systems.
+ *
+ * The development of this interface roots in the development of the
+ * paper:
+ *
+ * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
+ * Silva. Efficient formal verification for the Linux kernel. In:
+ * International Conference on Software Engineering and Formal Methods.
+ * Springer, Cham, 2019. p. 315-332.
+ *
+ * And:
+ *
+ * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis
+ * and verification of the real-time Linux kernel. PhD Thesis, 2020.
+ *
+ * == Runtime monitor interface ==
+ *
+ * A monitor is the central part of the runtime verification of a system.
+ *
+ * The monitor stands in between the formal specification of the desired
+ * (or undesired) behavior, and the trace of the actual system.
+ *
+ * In Linux terms, the runtime verification monitors are encapsulated
+ * inside the "RV monitor" abstraction. A RV monitor includes a reference
+ * model of the system, a set of instances of the monitor (per-cpu monitor,
+ * per-task monitor, and so on), and the helper functions that glue the
+ * monitor to the system via trace. Generally, a monitor includes some form
+ * of trace output as a reaction for event parsing and exceptions,
+ * as depicted bellow:
+ *
+ * Linux +----- RV Monitor ----------------------------------+ Formal
+ * Realm | | Realm
+ * +-------------------+ +----------------+ +-----------------+
+ * | Linux kernel | | Monitor | | Reference |
+ * | Tracing | -> | Instance(s) | <- | Model |
+ * | (instrumentation) | | (verification) | | (specification) |
+ * +-------------------+ +----------------+ +-----------------+
+ * | | |
+ * | V |
+ * | +----------+ |
+ * | | Reaction | |
+ * | +--+--+--+-+ |
+ * | | | | |
+ * | | | +-> trace output ? |
+ * +------------------------|--|----------------------+
+ * | +----> panic ?
+ * +-------> <user-specified>
+ *
+ * This file implements the interface for loading RV monitors, and
+ * to control the verification session.
+ *
+ * == Registering monitors ==
+ *
+ * The struct rv_monitor defines a set of callback functions to control
+ * a verification session. For instance, when a given monitor is enabled,
+ * the "enable" callback function is called to hook the instrumentation
+ * functions to the kernel trace events. The "disable" function is called
+ * when disabling the verification session.
+ *
+ * A RV monitor is registered via:
+ * int rv_register_monitor(struct rv_monitor *monitor);
+ * And unregistered via:
+ * int rv_unregister_monitor(struct rv_monitor *monitor);
+ *
+ * == User interface ==
+ *
+ * The user interface resembles kernel tracing interface. It presents
+ * these files:
+ *
+ * "available_monitors"
+ * - List the available monitors, one per line.
+ *
+ * For example:
+ * # cat available_monitors
+ * wip
+ * wwnr
+ *
+ * "enabled_monitors"
+ * - Lists the enabled monitors, one per line;
+ * - Writing to it enables a given monitor;
+ * - Writing a monitor name with a '!' prefix disables it;
+ * - Truncating the file disables all enabled monitors.
+ *
+ * For example:
+ * # cat enabled_monitors
+ * # echo wip > enabled_monitors
+ * # echo wwnr >> enabled_monitors
+ * # cat enabled_monitors
+ * wip
+ * wwnr
+ * # echo '!wip' >> enabled_monitors
+ * # cat enabled_monitors
+ * wwnr
+ * # echo > enabled_monitors
+ * # cat enabled_monitors
+ * #
+ *
+ * Note that more than one monitor can be enabled concurrently.
+ *
+ * "monitoring_on"
+ * - It is an on/off general switcher for monitoring. Note
+ * that it does not disable enabled monitors or detach events,
+ * but stops the per-entity monitors from monitoring the events
+ * received from the instrumentation. It resembles the "tracing_on"
+ * switcher.
+ *
+ * "monitors/"
+ * Each monitor will have its own directory inside "monitors/". There
+ * the monitor specific files will be presented.
+ * The "monitors/" directory resembles the "events" directory on
+ * tracefs.
+ *
+ * For example:
+ * # cd monitors/wip/
+ * # ls
+ * desc enable
+ * # cat desc
+ * auto-generated wakeup in preemptive monitor.
+ * # cat enable
+ * 0
+ *
+ * For further information, see:
+ * Documentation/trace/rv/runtime-verification.rst
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_DA_MON_EVENTS
+#define CREATE_TRACE_POINTS
+#include <trace/events/rv.h>
+#endif
+
+#include "rv.h"
+
+DEFINE_MUTEX(rv_interface_lock);
+
+static struct rv_interface rv_root;
+
+struct dentry *get_monitors_root(void)
+{
+ return rv_root.monitors_dir;
+}
+
+/*
+ * Interface for the monitor register.
+ */
+static LIST_HEAD(rv_monitors_list);
+
+static int task_monitor_count;
+static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+
+int rv_get_task_monitor_slot(void)
+{
+ int i;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (task_monitor_count == RV_PER_TASK_MONITORS)
+ return -EBUSY;
+
+ task_monitor_count++;
+
+ for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+ if (task_monitor_slots[i] == false) {
+ task_monitor_slots[i] = true;
+ return i;
+ }
+ }
+
+ WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n");
+
+ return -EINVAL;
+}
+
+void rv_put_task_monitor_slot(int slot)
+{
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+ WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
+ return;
+ }
+
+ WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n",
+ slot);
+
+ task_monitor_count--;
+ task_monitor_slots[slot] = false;
+}
+
+/*
+ * This section collects the monitor/ files and folders.
+ */
+static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ struct rv_monitor_def *mdef = filp->private_data;
+ const char *buff;
+
+ buff = mdef->monitor->enabled ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+/*
+ * __rv_disable_monitor - disabled an enabled monitor
+ */
+static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+{
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mdef->monitor->enabled) {
+ mdef->monitor->enabled = 0;
+ mdef->monitor->disable();
+
+ /*
+ * Wait for the execution of all events to finish.
+ * Otherwise, the data used by the monitor could
+ * be inconsistent. i.e., if the monitor is re-enabled.
+ */
+ if (sync)
+ tracepoint_synchronize_unregister();
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * rv_disable_monitor - disable a given runtime monitor
+ *
+ * Returns 0 on success.
+ */
+int rv_disable_monitor(struct rv_monitor_def *mdef)
+{
+ __rv_disable_monitor(mdef, true);
+ return 0;
+}
+
+/**
+ * rv_enable_monitor - enable a given runtime monitor
+ *
+ * Returns 0 on success, error otherwise.
+ */
+int rv_enable_monitor(struct rv_monitor_def *mdef)
+{
+ int retval;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mdef->monitor->enabled)
+ return 0;
+
+ retval = mdef->monitor->enable();
+
+ if (!retval)
+ mdef->monitor->enabled = 1;
+
+ return retval;
+}
+
+/*
+ * interface for enabling/disabling a monitor.
+ */
+static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct rv_monitor_def *mdef = filp->private_data;
+ int retval;
+ bool val;
+
+ retval = kstrtobool_from_user(user_buf, count, &val);
+ if (retval)
+ return retval;
+
+ retval = count;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (val)
+ retval = rv_enable_monitor(mdef);
+ else
+ retval = rv_disable_monitor(mdef);
+
+ mutex_unlock(&rv_interface_lock);
+
+ return retval ? : count;
+}
+
+static const struct file_operations interface_enable_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = monitor_enable_write_data,
+ .read = monitor_enable_read_data,
+};
+
+/*
+ * Interface to read monitors description.
+ */
+static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ struct rv_monitor_def *mdef = filp->private_data;
+ char buff[256];
+
+ memset(buff, 0, sizeof(buff));
+
+ snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static const struct file_operations interface_desc_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .read = monitor_desc_read_data,
+};
+
+/*
+ * During the registration of a monitor, this function creates
+ * the monitor dir, where the specific options of the monitor
+ * are exposed.
+ */
+static int create_monitor_dir(struct rv_monitor_def *mdef)
+{
+ struct dentry *root = get_monitors_root();
+ const char *name = mdef->monitor->name;
+ struct dentry *tmp;
+ int retval;
+
+ mdef->root_d = rv_create_dir(name, root);
+ if (!mdef->root_d)
+ return -ENOMEM;
+
+ tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+ if (!tmp) {
+ retval = -ENOMEM;
+ goto out_remove_root;
+ }
+
+ tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+ if (!tmp) {
+ retval = -ENOMEM;
+ goto out_remove_root;
+ }
+
+ retval = reactor_populate_monitor(mdef);
+ if (retval)
+ goto out_remove_root;
+
+ return 0;
+
+out_remove_root:
+ rv_remove(mdef->root_d);
+ return retval;
+}
+
+/*
+ * Available/Enable monitor shared seq functions.
+ */
+static int monitors_show(struct seq_file *m, void *p)
+{
+ struct rv_monitor_def *mon_def = p;
+
+ seq_printf(m, "%s\n", mon_def->monitor->name);
+ return 0;
+}
+
+/*
+ * Used by the seq file operations at the end of a read
+ * operation.
+ */
+static void monitors_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&rv_interface_lock);
+}
+
+/*
+ * Available monitor seq functions.
+ */
+static void *available_monitors_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&rv_interface_lock);
+ return seq_list_start(&rv_monitors_list, *pos);
+}
+
+static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &rv_monitors_list, pos);
+}
+
+/*
+ * Enable monitor seq functions.
+ */
+static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct rv_monitor_def *m_def = p;
+
+ (*pos)++;
+
+ list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
+ if (m_def->monitor->enabled)
+ return m_def;
+ }
+
+ return NULL;
+}
+
+static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
+{
+ struct rv_monitor_def *m_def;
+ loff_t l;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (list_empty(&rv_monitors_list))
+ return NULL;
+
+ m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+
+ for (l = 0; l <= *pos; ) {
+ m_def = enabled_monitors_next(m, m_def, &l);
+ if (!m_def)
+ break;
+ }
+
+ return m_def;
+}
+
+/*
+ * available/enabled monitors seq definition.
+ */
+static const struct seq_operations available_monitors_seq_ops = {
+ .start = available_monitors_start,
+ .next = available_monitors_next,
+ .stop = monitors_stop,
+ .show = monitors_show
+};
+
+static const struct seq_operations enabled_monitors_seq_ops = {
+ .start = enabled_monitors_start,
+ .next = enabled_monitors_next,
+ .stop = monitors_stop,
+ .show = monitors_show
+};
+
+/*
+ * available_monitors interface.
+ */
+static int available_monitors_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &available_monitors_seq_ops);
+};
+
+static const struct file_operations available_monitors_ops = {
+ .open = available_monitors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * enabled_monitors interface.
+ */
+static void disable_all_monitors(void)
+{
+ struct rv_monitor_def *mdef;
+ int enabled = 0;
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry(mdef, &rv_monitors_list, list)
+ enabled += __rv_disable_monitor(mdef, false);
+
+ if (enabled) {
+ /*
+ * Wait for the execution of all events to finish.
+ * Otherwise, the data used by the monitor could
+ * be inconsistent. i.e., if the monitor is re-enabled.
+ */
+ tracepoint_synchronize_unregister();
+ }
+
+ mutex_unlock(&rv_interface_lock);
+}
+
+static int enabled_monitors_open(struct inode *inode, struct file *file)
+{
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
+ disable_all_monitors();
+
+ return seq_open(file, &enabled_monitors_seq_ops);
+};
+
+static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
+ struct rv_monitor_def *mdef;
+ int retval = -EINVAL;
+ bool enable = true;
+ char *ptr = buff;
+ int len;
+
+ if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
+ return -EINVAL;
+
+ memset(buff, 0, sizeof(buff));
+
+ retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+ if (retval < 0)
+ return -EFAULT;
+
+ ptr = strim(buff);
+
+ if (ptr[0] == '!') {
+ enable = false;
+ ptr++;
+ }
+
+ len = strlen(ptr);
+ if (!len)
+ return count;
+
+ mutex_lock(&rv_interface_lock);
+
+ retval = -EINVAL;
+
+ list_for_each_entry(mdef, &rv_monitors_list, list) {
+ if (strcmp(ptr, mdef->monitor->name) != 0)
+ continue;
+
+ /*
+ * Monitor found!
+ */
+ if (enable)
+ retval = rv_enable_monitor(mdef);
+ else
+ retval = rv_disable_monitor(mdef);
+
+ if (!retval)
+ retval = count;
+
+ break;
+ }
+
+ mutex_unlock(&rv_interface_lock);
+ return retval;
+}
+
+static const struct file_operations enabled_monitors_ops = {
+ .open = enabled_monitors_open,
+ .read = seq_read,
+ .write = enabled_monitors_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Monitoring on global switcher!
+ */
+static bool __read_mostly monitoring_on;
+
+/**
+ * rv_monitoring_on - checks if monitoring is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_monitoring_on(void)
+{
+ /* Ensures that concurrent monitors read consistent monitoring_on */
+ smp_rmb();
+ return READ_ONCE(monitoring_on);
+}
+
+/*
+ * monitoring_on general switcher.
+ */
+static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ const char *buff;
+
+ buff = rv_monitoring_on() ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static void turn_monitoring_off(void)
+{
+ WRITE_ONCE(monitoring_on, false);
+ /* Ensures that concurrent monitors read consistent monitoring_on */
+ smp_wmb();
+}
+
+static void reset_all_monitors(void)
+{
+ struct rv_monitor_def *mdef;
+
+ list_for_each_entry(mdef, &rv_monitors_list, list) {
+ if (mdef->monitor->enabled)
+ mdef->monitor->reset();
+ }
+}
+
+static void turn_monitoring_on(void)
+{
+ WRITE_ONCE(monitoring_on, true);
+ /* Ensures that concurrent monitors read consistent monitoring_on */
+ smp_wmb();
+}
+
+static void turn_monitoring_on_with_reset(void)
+{
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (rv_monitoring_on())
+ return;
+
+ /*
+ * Monitors might be out of sync with the system if events were not
+ * processed because of !rv_monitoring_on().
+ *
+ * Reset all monitors, forcing a re-sync.
+ */
+ reset_all_monitors();
+ turn_monitoring_on();
+}
+
+static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int retval;
+ bool val;
+
+ retval = kstrtobool_from_user(user_buf, count, &val);
+ if (retval)
+ return retval;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (val)
+ turn_monitoring_on_with_reset();
+ else
+ turn_monitoring_off();
+
+ /*
+ * Wait for the execution of all events to finish
+ * before returning to user-space.
+ */
+ tracepoint_synchronize_unregister();
+
+ mutex_unlock(&rv_interface_lock);
+
+ return count;
+}
+
+static const struct file_operations monitoring_on_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = monitoring_on_write_data,
+ .read = monitoring_on_read_data,
+};
+
+static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+{
+ reactor_cleanup_monitor(mdef);
+ rv_remove(mdef->root_d);
+}
+
+/**
+ * rv_register_monitor - register a rv monitor.
+ * @monitor: The rv_monitor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_monitor(struct rv_monitor *monitor)
+{
+ struct rv_monitor_def *r;
+ int retval = 0;
+
+ if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
+ pr_info("Monitor %s has a name longer than %d\n", monitor->name,
+ MAX_RV_MONITOR_NAME_SIZE);
+ return -1;
+ }
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry(r, &rv_monitors_list, list) {
+ if (strcmp(monitor->name, r->monitor->name) == 0) {
+ pr_info("Monitor %s is already registered\n", monitor->name);
+ retval = -1;
+ goto out_unlock;
+ }
+ }
+
+ r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
+ if (!r) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ r->monitor = monitor;
+
+ retval = create_monitor_dir(r);
+ if (retval) {
+ kfree(r);
+ goto out_unlock;
+ }
+
+ list_add_tail(&r->list, &rv_monitors_list);
+
+out_unlock:
+ mutex_unlock(&rv_interface_lock);
+ return retval;
+}
+
+/**
+ * rv_unregister_monitor - unregister a rv monitor.
+ * @monitor: The rv_monitor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_monitor(struct rv_monitor *monitor)
+{
+ struct rv_monitor_def *ptr, *next;
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
+ if (strcmp(monitor->name, ptr->monitor->name) == 0) {
+ rv_disable_monitor(ptr);
+ list_del(&ptr->list);
+ destroy_monitor_dir(ptr);
+ }
+ }
+
+ mutex_unlock(&rv_interface_lock);
+ return 0;
+}
+
+int __init rv_init_interface(void)
+{
+ struct dentry *tmp;
+ int retval;
+
+ rv_root.root_dir = rv_create_dir("rv", NULL);
+ if (!rv_root.root_dir)
+ goto out_err;
+
+ rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir);
+ if (!rv_root.monitors_dir)
+ goto out_err;
+
+ tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL,
+ &available_monitors_ops);
+ if (!tmp)
+ goto out_err;
+
+ tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ &enabled_monitors_ops);
+ if (!tmp)
+ goto out_err;
+
+ tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL,
+ &monitoring_on_fops);
+ if (!tmp)
+ goto out_err;
+ retval = init_rv_reactors(rv_root.root_dir);
+ if (retval)
+ goto out_err;
+
+ turn_monitoring_on();
+
+ return 0;
+
+out_err:
+ rv_remove(rv_root.root_dir);
+ printk(KERN_ERR "RV: Error while creating the RV interface\n");
+ return 1;
+}
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
new file mode 100644
index 000000000000..db6cb0913dbd
--- /dev/null
+++ b/kernel/trace/rv/rv.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/mutex.h>
+
+struct rv_interface {
+ struct dentry *root_dir;
+ struct dentry *monitors_dir;
+};
+
+#include "../trace.h"
+#include <linux/tracefs.h>
+#include <linux/rv.h>
+
+#define RV_MODE_WRITE TRACE_MODE_WRITE
+#define RV_MODE_READ TRACE_MODE_READ
+
+#define rv_create_dir tracefs_create_dir
+#define rv_create_file tracefs_create_file
+#define rv_remove tracefs_remove
+
+#define MAX_RV_MONITOR_NAME_SIZE 32
+#define MAX_RV_REACTOR_NAME_SIZE 32
+
+extern struct mutex rv_interface_lock;
+
+#ifdef CONFIG_RV_REACTORS
+struct rv_reactor_def {
+ struct list_head list;
+ struct rv_reactor *reactor;
+ /* protected by the monitor interface lock */
+ int counter;
+};
+#endif
+
+struct rv_monitor_def {
+ struct list_head list;
+ struct rv_monitor *monitor;
+ struct dentry *root_d;
+#ifdef CONFIG_RV_REACTORS
+ struct rv_reactor_def *rdef;
+ bool reacting;
+#endif
+ bool task_monitor;
+};
+
+struct dentry *get_monitors_root(void);
+int rv_disable_monitor(struct rv_monitor_def *mdef);
+int rv_enable_monitor(struct rv_monitor_def *mdef);
+
+#ifdef CONFIG_RV_REACTORS
+int reactor_populate_monitor(struct rv_monitor_def *mdef);
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int init_rv_reactors(struct dentry *root_dir);
+#else
+static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+ return 0;
+}
+
+static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+ return;
+}
+
+static inline int init_rv_reactors(struct dentry *root_dir)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
new file mode 100644
index 000000000000..6aae106695b6
--- /dev/null
+++ b/kernel/trace/rv/rv_reactors.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Runtime reactor interface.
+ *
+ * A runtime monitor can cause a reaction to the detection of an
+ * exception on the model's execution. By default, the monitors have
+ * tracing reactions, printing the monitor output via tracepoints.
+ * But other reactions can be added (on-demand) via this interface.
+ *
+ * == Registering reactors ==
+ *
+ * The struct rv_reactor defines a callback function to be executed
+ * in case of a model exception happens. The callback function
+ * receives a message to be (optionally) printed before executing
+ * the reaction.
+ *
+ * A RV reactor is registered via:
+ * int rv_register_reactor(struct rv_reactor *reactor)
+ * And unregistered via:
+ * int rv_unregister_reactor(struct rv_reactor *reactor)
+ *
+ * These functions are exported to modules, enabling reactors to be
+ * dynamically loaded.
+ *
+ * == User interface ==
+ *
+ * The user interface resembles the kernel tracing interface and
+ * presents these files:
+ *
+ * "available_reactors"
+ * - List the available reactors, one per line.
+ *
+ * For example:
+ * # cat available_reactors
+ * nop
+ * panic
+ * printk
+ *
+ * "reacting_on"
+ * - It is an on/off general switch for reactors, disabling
+ * all reactions.
+ *
+ * "monitors/MONITOR/reactors"
+ * - List available reactors, with the select reaction for the given
+ * MONITOR inside []. The default one is the nop (no operation)
+ * reactor.
+ * - Writing the name of an reactor enables it to the given
+ * MONITOR.
+ *
+ * For example:
+ * # cat monitors/wip/reactors
+ * [nop]
+ * panic
+ * printk
+ * # echo panic > monitors/wip/reactors
+ * # cat monitors/wip/reactors
+ * nop
+ * [panic]
+ * printk
+ */
+
+#include <linux/slab.h>
+
+#include "rv.h"
+
+/*
+ * Interface for the reactor register.
+ */
+static LIST_HEAD(rv_reactors_list);
+
+static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+{
+ struct rv_reactor_def *r;
+
+ list_for_each_entry(r, &rv_reactors_list, list) {
+ if (strcmp(name, r->reactor->name) == 0)
+ return r;
+ }
+ return NULL;
+}
+
+/*
+ * Available reactors seq functions.
+ */
+static int reactors_show(struct seq_file *m, void *p)
+{
+ struct rv_reactor_def *rea_def = p;
+
+ seq_printf(m, "%s\n", rea_def->reactor->name);
+ return 0;
+}
+
+static void reactors_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&rv_interface_lock);
+}
+
+static void *reactors_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&rv_interface_lock);
+ return seq_list_start(&rv_reactors_list, *pos);
+}
+
+static void *reactors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &rv_reactors_list, pos);
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations available_reactors_seq_ops = {
+ .start = reactors_start,
+ .next = reactors_next,
+ .stop = reactors_stop,
+ .show = reactors_show
+};
+
+/*
+ * available_reactors interface.
+ */
+static int available_reactors_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &available_reactors_seq_ops);
+};
+
+static const struct file_operations available_reactors_ops = {
+ .open = available_reactors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * Monitor's reactor file.
+ */
+static int monitor_reactor_show(struct seq_file *m, void *p)
+{
+ struct rv_monitor_def *mdef = m->private;
+ struct rv_reactor_def *rdef = p;
+
+ if (mdef->rdef == rdef)
+ seq_printf(m, "[%s]\n", rdef->reactor->name);
+ else
+ seq_printf(m, "%s\n", rdef->reactor->name);
+ return 0;
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations monitor_reactors_seq_ops = {
+ .start = reactors_start,
+ .next = reactors_next,
+ .stop = reactors_stop,
+ .show = monitor_reactor_show
+};
+
+static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
+ bool reacting)
+{
+ bool monitor_enabled;
+
+ /* nothing to do */
+ if (mdef->rdef == rdef)
+ return;
+
+ monitor_enabled = mdef->monitor->enabled;
+ if (monitor_enabled)
+ rv_disable_monitor(mdef);
+
+ /* swap reactor's usage */
+ mdef->rdef->counter--;
+ rdef->counter++;
+
+ mdef->rdef = rdef;
+ mdef->reacting = reacting;
+ mdef->monitor->react = rdef->reactor->react;
+
+ if (monitor_enabled)
+ rv_enable_monitor(mdef);
+}
+
+static ssize_t
+monitor_reactors_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
+ struct rv_monitor_def *mdef;
+ struct rv_reactor_def *rdef;
+ struct seq_file *seq_f;
+ int retval = -EINVAL;
+ bool enable;
+ char *ptr;
+ int len;
+
+ if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1)
+ return -EINVAL;
+
+ memset(buff, 0, sizeof(buff));
+
+ retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+ if (retval < 0)
+ return -EFAULT;
+
+ ptr = strim(buff);
+
+ len = strlen(ptr);
+ if (!len)
+ return count;
+
+ /*
+ * See monitor_reactors_open()
+ */
+ seq_f = file->private_data;
+ mdef = seq_f->private;
+
+ mutex_lock(&rv_interface_lock);
+
+ retval = -EINVAL;
+
+ list_for_each_entry(rdef, &rv_reactors_list, list) {
+ if (strcmp(ptr, rdef->reactor->name) != 0)
+ continue;
+
+ if (rdef == get_reactor_rdef_by_name("nop"))
+ enable = false;
+ else
+ enable = true;
+
+ monitor_swap_reactors(mdef, rdef, enable);
+
+ retval = count;
+ break;
+ }
+
+ mutex_unlock(&rv_interface_lock);
+
+ return retval;
+}
+
+/*
+ * available_reactors interface.
+ */
+static int monitor_reactors_open(struct inode *inode, struct file *file)
+{
+ struct rv_monitor_def *mdef = inode->i_private;
+ struct seq_file *seq_f;
+ int ret;
+
+ ret = seq_open(file, &monitor_reactors_seq_ops);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * seq_open stores the seq_file on the file->private data.
+ */
+ seq_f = file->private_data;
+
+ /*
+ * Copy the create file "private" data to the seq_file private data.
+ */
+ seq_f->private = mdef;
+
+ return 0;
+};
+
+static const struct file_operations monitor_reactors_ops = {
+ .open = monitor_reactors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = monitor_reactors_write
+};
+
+static int __rv_register_reactor(struct rv_reactor *reactor)
+{
+ struct rv_reactor_def *r;
+
+ list_for_each_entry(r, &rv_reactors_list, list) {
+ if (strcmp(reactor->name, r->reactor->name) == 0) {
+ pr_info("Reactor %s is already registered\n", reactor->name);
+ return -EINVAL;
+ }
+ }
+
+ r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->reactor = reactor;
+ r->counter = 0;
+
+ list_add_tail(&r->list, &rv_reactors_list);
+
+ return 0;
+}
+
+/**
+ * rv_register_reactor - register a rv reactor.
+ * @reactor: The rv_reactor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_reactor(struct rv_reactor *reactor)
+{
+ int retval = 0;
+
+ if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) {
+ pr_info("Reactor %s has a name longer than %d\n",
+ reactor->name, MAX_RV_MONITOR_NAME_SIZE);
+ return -EINVAL;
+ }
+
+ mutex_lock(&rv_interface_lock);
+ retval = __rv_register_reactor(reactor);
+ mutex_unlock(&rv_interface_lock);
+ return retval;
+}
+
+/**
+ * rv_unregister_reactor - unregister a rv reactor.
+ * @reactor: The rv_reactor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_reactor(struct rv_reactor *reactor)
+{
+ struct rv_reactor_def *ptr, *next;
+ int ret = 0;
+
+ mutex_lock(&rv_interface_lock);
+
+ list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
+ if (strcmp(reactor->name, ptr->reactor->name) == 0) {
+
+ if (!ptr->counter) {
+ list_del(&ptr->list);
+ } else {
+ printk(KERN_WARNING
+ "rv: the rv_reactor %s is in use by %d monitor(s)\n",
+ ptr->reactor->name, ptr->counter);
+ printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
+ ptr->reactor->name);
+ ret = -EBUSY;
+ break;
+ }
+ }
+ }
+
+ mutex_unlock(&rv_interface_lock);
+ return ret;
+}
+
+/*
+ * reacting_on interface.
+ */
+static bool __read_mostly reacting_on;
+
+/**
+ * rv_reacting_on - checks if reacting is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_reacting_on(void)
+{
+ /* Ensures that concurrent monitors read consistent reacting_on */
+ smp_rmb();
+ return READ_ONCE(reacting_on);
+}
+
+static ssize_t reacting_on_read_data(struct file *filp,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char *buff;
+
+ buff = rv_reacting_on() ? "1\n" : "0\n";
+
+ return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+static void turn_reacting_off(void)
+{
+ WRITE_ONCE(reacting_on, false);
+ /* Ensures that concurrent monitors read consistent reacting_on */
+ smp_wmb();
+}
+
+static void turn_reacting_on(void)
+{
+ WRITE_ONCE(reacting_on, true);
+ /* Ensures that concurrent monitors read consistent reacting_on */
+ smp_wmb();
+}
+
+static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int retval;
+ bool val;
+
+ retval = kstrtobool_from_user(user_buf, count, &val);
+ if (retval)
+ return retval;
+
+ mutex_lock(&rv_interface_lock);
+
+ if (val)
+ turn_reacting_on();
+ else
+ turn_reacting_off();
+
+ /*
+ * Wait for the execution of all events to finish
+ * before returning to user-space.
+ */
+ tracepoint_synchronize_unregister();
+
+ mutex_unlock(&rv_interface_lock);
+
+ return count;
+}
+
+static const struct file_operations reacting_on_fops = {
+ .open = simple_open,
+ .llseek = no_llseek,
+ .write = reacting_on_write_data,
+ .read = reacting_on_read_data,
+};
+
+/**
+ * reactor_populate_monitor - creates per monitor reactors file
+ * @mdef: monitor's definition.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+ struct dentry *tmp;
+
+ tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+ if (!tmp)
+ return -ENOMEM;
+
+ /*
+ * Configure as the rv_nop reactor.
+ */
+ mdef->rdef = get_reactor_rdef_by_name("nop");
+ mdef->rdef->counter++;
+ mdef->reacting = false;
+
+ return 0;
+}
+
+/**
+ * reactor_cleanup_monitor - cleanup a monitor reference
+ * @mdef: monitor's definition.
+ */
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+ lockdep_assert_held(&rv_interface_lock);
+ mdef->rdef->counter--;
+ WARN_ON_ONCE(mdef->rdef->counter < 0);
+}
+
+/*
+ * Nop reactor register
+ */
+static void rv_nop_reaction(char *msg)
+{
+}
+
+static struct rv_reactor rv_nop = {
+ .name = "nop",
+ .description = "no-operation reactor: do nothing.",
+ .react = rv_nop_reaction
+};
+
+int init_rv_reactors(struct dentry *root_dir)
+{
+ struct dentry *available, *reacting;
+ int retval;
+
+ available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL,
+ &available_reactors_ops);
+ if (!available)
+ goto out_err;
+
+ reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
+ if (!reacting)
+ goto rm_available;
+
+ retval = __rv_register_reactor(&rv_nop);
+ if (retval)
+ goto rm_reacting;
+
+ turn_reacting_on();
+
+ return 0;
+
+rm_reacting:
+ rv_remove(reacting);
+rm_available:
+ rv_remove(available);
+out_err:
+ return -ENOMEM;
+}
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 0b15e975d2c2..8d77526892f4 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -120,15 +120,13 @@ static int __init test_gen_synth_cmd(void)
/* Now generate a gen_synth_test event */
ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals));
- out:
+ free:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
synth_event_delete("gen_synth_test");
- free:
- kfree(buf);
-
- goto out;
+ goto free;
}
/*
@@ -227,15 +225,13 @@ static int __init test_empty_synth_event(void)
/* Now trace an empty_synth_test event */
ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals));
- out:
+ free:
+ kfree(buf);
return ret;
delete:
/* We got an error after creating the event, delete it */
synth_event_delete("empty_synth_test");
- free:
- kfree(buf);
-
- goto out;
+ goto free;
}
static struct synth_field_desc create_synth_test_fields[] = {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c860f582b078..5cfc95a52bc3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
static int __init set_cmdline_ftrace(char *str)
{
@@ -230,12 +231,21 @@ static int __init boot_alloc_snapshot(char *str)
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- return 0;
+ return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -246,12 +256,16 @@ static int __init set_trace_boot_clock(char *str)
{
strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
trace_boot_clock = trace_boot_clock_buf;
- return 0;
+ return 1;
}
__setup("trace_clock=", set_trace_boot_clock);
static int __init set_tracepoint_printk(char *str)
{
+ /* Ignore the "tp_printk_stop_on_boot" param */
+ if (*str == '_')
+ return 0;
+
if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
tracepoint_printk = 1;
return 1;
@@ -707,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
pos = 0;
ret = trace_get_user(&parser, ubuf, cnt, &pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
+ if (ret < 0)
break;
read += ret;
ubuf += ret;
cnt -= ret;
+ if (!trace_parser_loaded(&parser))
+ break;
+
ret = -EINVAL;
if (kstrtoul(parser.buffer, 0, &val))
break;
@@ -739,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
if (!nr_pids) {
/* Cleared the list of pids */
trace_pid_list_free(pid_list);
- read = ret;
pid_list = NULL;
}
@@ -1160,7 +1176,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
/**
- * tracing_snapshot_cond_data - get the user data associated with a snapshot
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
* @tr: The tracing instance
*
* When the user enables a conditional snapshot using
@@ -1177,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
{
void *cond_data = NULL;
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
cond_data = tr->cond_snapshot->cond_data;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
return cond_data;
}
@@ -1318,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
goto fail_unlock;
}
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
tr->cond_snapshot = cond_snapshot;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
mutex_unlock(&trace_types_lock);
@@ -1347,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
{
int ret = 0;
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (!tr->cond_snapshot)
@@ -1357,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
return ret;
}
@@ -1470,10 +1492,12 @@ static int __init set_buf_size(char *str)
if (!str)
return 0;
buf_size = memparse(str, &str);
- /* nr_entries can not be zero */
- if (buf_size == 0)
- return 0;
- trace_buf_size = buf_size;
+ /*
+ * nr_entries can not be zero and the startup
+ * tests require some buffer space. Therefore
+ * ensure we have at least 4096 bytes of buffer.
+ */
+ trace_buf_size = max(4096UL, buf_size);
return 1;
}
__setup("trace_buf_size=", set_buf_size);
@@ -1526,6 +1550,7 @@ static struct {
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
{ ktime_get_boot_fast_ns, "boot", 1 },
+ { ktime_get_tai_fast_ns, "tai", 1 },
ARCH_TRACE_CLOCKS
};
@@ -2155,10 +2180,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -2170,6 +2197,13 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
/*
* The tgid_map array maps from pid to tgid; i.e. the value stored at index i
* is the tgid last observed corresponding to pid=i.
@@ -2181,6 +2215,11 @@ static size_t tgid_map_max;
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -2393,7 +2432,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
+ *
+ * This is called within the scheduler and wake up, so interrupts
+ * had better been disabled and run queue lock been held.
*/
+ lockdep_assert_preemption_disabled();
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
@@ -2819,7 +2862,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
-static DEFINE_SPINLOCK(tracepoint_iter_lock);
+static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock);
static DEFINE_MUTEX(tracepoint_printk_mutex);
static void output_printk(struct trace_event_buffer *fbuffer)
@@ -2847,14 +2890,14 @@ static void output_printk(struct trace_event_buffer *fbuffer)
event = &fbuffer->trace_file->event_call->event;
- spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ raw_spin_lock_irqsave(&tracepoint_iter_lock, flags);
trace_seq_init(&iter->seq);
iter->ent = fbuffer->entry;
event_call->event.funcs->trace(iter, 0, event);
trace_seq_putc(&iter->seq, 0);
printk("%s", iter->seq.buffer);
- spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+ raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
@@ -3086,17 +3129,17 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
}
/*
- * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
+ * When an NMI triggers, RCU is enabled via ct_nmi_enter(),
* but if the above rcu_is_watching() failed, then the NMI
- * triggered someplace critical, and rcu_irq_enter() should
+ * triggered someplace critical, and ct_irq_enter() should
* not be called from NMI.
*/
if (unlikely(in_nmi()))
return;
- rcu_irq_enter_irqson();
+ ct_irq_enter_irqson();
__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
- rcu_irq_exit_irqson();
+ ct_irq_exit_irqson();
}
/**
@@ -3657,12 +3700,17 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
}
/* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+ bool star, int len)
{
unsigned long addr = (unsigned long)str;
struct trace_event *trace_event;
struct trace_event_call *event;
+ /* Ignore strings with no length */
+ if (star && !len)
+ return true;
+
/* OK if part of the event data */
if ((addr >= (unsigned long)iter->ent) &&
(addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3848,7 +3896,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str),
+ if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
"fmt: '%s' current_buffer: '%s'",
fmt, show_buffer(&iter->seq))) {
int ret;
@@ -4228,7 +4276,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
- const char *space = " ";
+ static const char space[] = " ";
int prec = tgid ? 12 : 2;
print_event_info(buf, m);
@@ -4252,9 +4300,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
struct tracer *type = iter->trace;
unsigned long entries;
unsigned long total;
- const char *name = "preemption";
-
- name = type->name;
+ const char *name = type->name;
get_total_entries(buf, &total, &entries);
@@ -4268,17 +4314,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
-#if defined(CONFIG_PREEMPT_NONE)
- "server",
-#elif defined(CONFIG_PREEMPT_VOLUNTARY)
- "desktop",
-#elif defined(CONFIG_PREEMPT)
- "preempt",
-#elif defined(CONFIG_PREEMPT_RT)
- "preempt_rt",
-#else
+ preempt_model_none() ? "server" :
+ preempt_model_voluntary() ? "desktop" :
+ preempt_model_full() ? "preempt" :
+ preempt_model_rt() ? "preempt_rt" :
"unknown",
-#endif
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
@@ -5454,7 +5494,7 @@ static const char readme_msg[] =
" error_log\t- error log for failed commands (that support it)\n"
" buffer_size_kb\t- view and modify size of per cpu buffer\n"
" buffer_total_size_kb - view total size of all cpu buffers\n\n"
- " trace_clock\t\t-change the clock used to order events\n"
+ " trace_clock\t\t- change the clock used to order events\n"
" local: Per cpu clock but may not be synced across CPUs\n"
" global: Synced across CPUs but slows tracing down.\n"
" counter: Not a clock, but just an increment\n"
@@ -5463,7 +5503,7 @@ static const char readme_msg[] =
#ifdef CONFIG_X86_64
" x86-tsc: TSC cycle counter\n"
#endif
- "\n timestamp_mode\t-view the mode used to timestamp events\n"
+ "\n timestamp_mode\t- view the mode used to timestamp events\n"
" delta: Delta difference against a buffer-wide timestamp\n"
" absolute: Absolute (standalone) timestamp\n"
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
@@ -5553,13 +5593,13 @@ static const char readme_msg[] =
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
- "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
- "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
+ "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n"
+ "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
- "\t e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]\n"
- "\t -:[<group>/]<event>\n"
+ "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n"
+ "\t -:[<group>/][<event>]\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
"place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n"
@@ -5874,9 +5914,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
char buf[64];
int r;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -5901,10 +5943,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -6311,12 +6355,18 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
+static bool tracer_options_updated;
+
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
+ /* Only create trace option files after update_tracer_options finish */
+ if (!tracer_options_updated)
+ return;
+
create_trace_option_files(tr, t);
}
@@ -6351,10 +6401,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
if (ret)
goto out;
}
@@ -6385,12 +6437,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ had_max_tr = tr->current_trace->use_max_tr;
+
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
-#ifdef CONFIG_TRACER_MAX_TRACE
- had_max_tr = tr->allocated_snapshot;
-
if (had_max_tr && !t->use_max_tr) {
/*
* We need to make sure that the update_max_tr sees that
@@ -6402,14 +6454,14 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
synchronize_rcu();
free_snapshot(tr);
}
-#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (t->use_max_tr && !had_max_tr) {
+ if (t->use_max_tr && !tr->allocated_snapshot) {
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
+#else
+ tr->current_trace = &nop_trace;
#endif
if (t->init) {
@@ -6433,7 +6485,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
- int i;
+ char *name;
size_t ret;
int err;
@@ -6447,11 +6499,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
- /* strip ending whitespace. */
- for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
- buf[i] = 0;
+ name = strim(buf);
- err = tracing_set_tracer(tr, buf);
+ err = tracing_set_tracer(tr, name);
if (err)
return err;
@@ -6616,6 +6666,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
+ kfree(iter->fmt);
mutex_destroy(&iter->mutex);
kfree(iter);
@@ -6640,7 +6691,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
- filp, poll_table);
+ filp, poll_table, iter->tr->buffer_percent);
}
static __poll_t
@@ -7418,10 +7469,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
goto out;
}
+ local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
if (ret)
goto out;
@@ -7719,7 +7772,7 @@ const struct file_operations trace_min_max_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7727,26 +7780,53 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
{
struct tracing_log_err *err;
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
+{
+ struct tracing_log_err *err;
+ char *cmd;
+
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- else
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
tr->n_err_log_entries++;
return err;
}
-
+ cmd = kzalloc(len, GFP_KERNEL);
+ if (!cmd)
+ return ERR_PTR(-ENOMEM);
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = cmd;
list_del(&err->list);
return err;
@@ -7807,22 +7887,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7840,7 +7923,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7868,9 +7951,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -8090,6 +8173,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8243,6 +8332,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ long wait_index;
+
if (ret)
goto out;
@@ -8250,10 +8341,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
+ wait_index = READ_ONCE(iter->wait_index);
+
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
if (ret)
goto out;
+ /* No need to wait after waking up when tracing is off */
+ if (!tracer_tracing_is_on(iter->tr))
+ goto out;
+
+ /* Make sure we see the new wait_index */
+ smp_rmb();
+ if (wait_index != iter->wait_index)
+ goto out;
+
goto again;
}
@@ -8264,12 +8366,34 @@ out:
return ret;
}
+/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ if (cmd)
+ return -ENOIOCTLCMD;
+
+ mutex_lock(&trace_types_lock);
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ mutex_unlock(&trace_types_lock);
+ return 0;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
+ .unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
};
@@ -8958,6 +9082,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
tracer_tracing_off(tr);
if (tr->current_trace->stop)
tr->current_trace->stop(tr);
+ /* Wake up any waiters */
+ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
mutex_unlock(&trace_types_lock);
}
@@ -9054,6 +9180,16 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
return 0;
}
+static void free_trace_buffer(struct array_buffer *buf)
+{
+ if (buf->buffer) {
+ ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
+ free_percpu(buf->data);
+ buf->data = NULL;
+ }
+}
+
static int allocate_trace_buffers(struct trace_array *tr, int size)
{
int ret;
@@ -9066,10 +9202,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
- ring_buffer_free(tr->array_buffer.buffer);
- tr->array_buffer.buffer = NULL;
- free_percpu(tr->array_buffer.data);
- tr->array_buffer.data = NULL;
+ free_trace_buffer(&tr->array_buffer);
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
@@ -9084,16 +9217,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return 0;
}
-static void free_trace_buffer(struct array_buffer *buf)
-{
- if (buf->buffer) {
- ring_buffer_free(buf->buffer);
- buf->buffer = NULL;
- free_percpu(buf->data);
- buf->data = NULL;
- }
-}
-
static void free_trace_buffers(struct trace_array *tr)
{
if (!tr)
@@ -9126,6 +9249,7 @@ static void __update_tracer_options(struct trace_array *tr)
static void update_tracer_options(struct trace_array *tr)
{
mutex_lock(&trace_types_lock);
+ tracer_options_updated = true;
__update_tracer_options(tr);
mutex_unlock(&trace_types_lock);
}
@@ -9558,6 +9682,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[];
static struct workqueue_struct *eval_map_wq __initdata;
static struct work_struct eval_map_work __initdata;
+static struct work_struct tracerfs_init_work __initdata;
static void __init eval_map_work_func(struct work_struct *work)
{
@@ -9583,6 +9708,8 @@ static int __init trace_eval_init(void)
return 0;
}
+subsys_initcall(trace_eval_init);
+
static int __init trace_eval_sync(void)
{
/* Make sure the eval map updates are finished */
@@ -9665,15 +9792,8 @@ static struct notifier_block trace_module_nb = {
};
#endif /* CONFIG_MODULES */
-static __init int tracer_init_tracefs(void)
+static __init void tracer_init_tracefs_work_func(struct work_struct *work)
{
- int ret;
-
- trace_access_lock_init();
-
- ret = tracing_init_dentry();
- if (ret)
- return 0;
event_trace_init();
@@ -9695,8 +9815,6 @@ static __init int tracer_init_tracefs(void)
trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_tgids_fops);
- trace_eval_init();
-
trace_create_eval_file(NULL);
#ifdef CONFIG_MODULES
@@ -9711,6 +9829,26 @@ static __init int tracer_init_tracefs(void)
create_trace_instances(NULL);
update_tracer_options(&global_trace);
+}
+
+static __init int tracer_init_tracefs(void)
+{
+ int ret;
+
+ trace_access_lock_init();
+
+ ret = tracing_init_dentry();
+ if (ret)
+ return 0;
+
+ if (eval_map_wq) {
+ INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
+ queue_work(eval_map_wq, &tracerfs_init_work);
+ } else {
+ tracer_init_tracefs_work_func(NULL);
+ }
+
+ rv_init_interface();
return 0;
}
@@ -9804,6 +9942,12 @@ void trace_init_global_iter(struct trace_iterator *iter)
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
if (trace_clocks[iter->tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ /* Can not use kmalloc for iter.temp and iter.fmt */
+ iter->temp = static_temp_buf;
+ iter->temp_size = STATIC_TEMP_BUF_SIZE;
+ iter->fmt = static_fmt_buf;
+ iter->fmt_size = STATIC_FMT_BUF_SIZE;
}
void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
@@ -9836,11 +9980,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* Simulate the iterator */
trace_init_global_iter(&iter);
- /* Can not use kmalloc for iter.temp and iter.fmt */
- iter.temp = static_temp_buf;
- iter.temp_size = STATIC_TEMP_BUF_SIZE;
- iter.fmt = static_fmt_buf;
- iter.fmt_size = STATIC_FMT_BUF_SIZE;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -10031,7 +10170,7 @@ __init static int tracer_alloc_buffers(void)
* buffer. The memory will be removed once the "instance" is removed.
*/
ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
- "trace/RB:preapre", trace_rb_cpu_prepare,
+ "trace/RB:prepare", trace_rb_cpu_prepare,
NULL);
if (ret < 0)
goto out_free_cpumask;
@@ -10116,6 +10255,14 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+ if (snapshot_at_boot) {
+ tracing_snapshot();
+ internal_trace_puts("** Boot snapshot taken **\n");
+ }
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d038ddbf1bea..d42e24507152 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,7 +136,6 @@ struct kprobe_trace_entry_head {
struct eprobe_trace_entry_head {
struct trace_entry ent;
- unsigned int type;
};
struct kretprobe_trace_entry_head {
@@ -581,6 +580,7 @@ int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
@@ -1436,8 +1436,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
struct filter_pred;
struct regex;
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
-
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
enum regex_type {
@@ -1456,17 +1454,6 @@ struct regex {
regex_match_func match;
};
-struct filter_pred {
- filter_pred_fn_t fn;
- u64 val;
- struct regex regex;
- unsigned short *ops;
- struct ftrace_event_field *field;
- int offset;
- int not;
- int op;
-};
-
static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
@@ -1574,13 +1561,12 @@ struct enable_trigger_data {
};
extern int event_enable_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
- struct event_trigger_data *data);
-extern void event_enable_trigger_free(struct event_trigger_ops *ops,
struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_data *data);
extern int event_enable_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+ char *glob, char *cmd,
+ char *param_and_filter);
extern int event_enable_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file);
@@ -1588,8 +1574,7 @@ extern void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file);
extern void trigger_data_free(struct event_trigger_data *data);
-extern int event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
+extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
int trigger_enable);
extern void update_cond_flag(struct trace_event_file *file);
@@ -1630,10 +1615,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops,
extern int event_trigger_register(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob,
- char *cmd,
- char *trigger,
- struct event_trigger_data *trigger_data,
- int *n_registered);
+ struct event_trigger_data *trigger_data);
+extern void event_trigger_unregister(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data);
/**
* struct event_trigger_ops - callbacks for trace event triggers
@@ -1687,12 +1673,9 @@ struct event_trigger_ops {
struct trace_buffer *buffer,
void *rec,
struct ring_buffer_event *rbe);
- int (*init)(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
- void (*free)(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
+ int (*init)(struct event_trigger_data *data);
+ void (*free)(struct event_trigger_data *data);
int (*print)(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data);
};
@@ -1878,7 +1861,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
@@ -2010,4 +1993,13 @@ struct trace_min_max_param {
extern const struct file_operations trace_min_max_fops;
+#ifdef CONFIG_RV
+extern int rv_init_interface(void);
+#else
+static inline int rv_init_interface(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 801c2a7f7605..54d5fa35c90a 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
local_irq_disable();
start = trace_clock_local();
- trace_benchmark_event(bm_str);
+ trace_benchmark_event(bm_str, bm_last);
stop = trace_clock_local();
local_irq_enable();
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index 79e6fbe5b365..c3e91060dc94 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
TRACE_EVENT_FN(benchmark_event,
- TP_PROTO(const char *str),
+ TP_PROTO(const char *str, u64 delta),
- TP_ARGS(str),
+ TP_ARGS(str, delta),
TP_STRUCT__entry(
__array( char, str, BENCHMARK_EVENT_STRLEN )
+ __field( u64, delta)
),
TP_fast_assign(
memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ __entry->delta = delta;
),
- TP_printk("%s", __entry->str),
+ TP_printk("%s delta=%llu", __entry->str, __entry->delta),
trace_benchmark_reg, trace_benchmark_unreg
);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 0580287d7a0d..778200dd8ede 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
{
struct xbc_node *node;
const char *p, *handler;
- int ret;
+ int ret = 0;
handler = xbc_node_get_data(hnode);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index e34e8182ee4b..4376887e0d8a 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -101,7 +101,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (event[0] == '\0') {
+ if (!system && event[0] == '\0') {
ret = -EINVAL;
goto out;
}
@@ -118,6 +118,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
if (ret)
break;
}
+ tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
out:
argv_free(argv);
@@ -214,6 +215,7 @@ int dyn_events_release_all(struct dyn_event_operations *type)
break;
}
out:
+ tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
return ret;
@@ -255,19 +257,14 @@ static const struct file_operations dynamic_events_ops = {
/* Make a tracefs interface for controlling dynamic events */
static __init int init_dynamic_event(void)
{
- struct dentry *entry;
int ret;
ret = tracing_init_dentry();
if (ret)
return 0;
- entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
- NULL, &dynamic_events_ops);
-
- /* Event list interface */
- if (!entry)
- pr_warn("Could not create tracefs 'dynamic_events' entry\n");
+ trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
+ NULL, &dynamic_events_ops);
return 0;
}
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 191db32dec46..352b65e2b910 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -16,6 +16,7 @@
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -26,6 +27,9 @@ struct trace_eprobe {
/* tracepoint event */
const char *event_name;
+ /* filter string for the tracepoint */
+ char *filter_str;
+
struct trace_event_call *event;
struct dyn_event devent;
@@ -48,6 +52,7 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep)
kfree(ep->event_system);
if (ep->event)
trace_event_put_ref(ep->event);
+ kfree(ep->filter_str);
kfree(ep);
}
@@ -125,6 +130,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event,
* We match the following:
* event only - match all eprobes with event name
* system and event only - match all system/event probes
+ * system only - match all system probes
*
* The below has the above satisfied with more arguments:
*
@@ -143,7 +149,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event,
return false;
/* Must match the event name */
- if (strcmp(trace_probe_name(&ep->tp), event) != 0)
+ if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0)
return false;
/* No arguments match all */
@@ -226,6 +232,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
struct probe_arg *parg = &ep->tp.args[i];
struct ftrace_event_field *field;
struct list_head *head;
+ int ret = -ENOENT;
head = trace_get_fields(ep->event);
list_for_each_entry(field, head, link) {
@@ -235,14 +242,24 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
return 0;
}
}
+
+ /*
+ * Argument not found on event. But allow for comm and COMM
+ * to be used to get the current->comm.
+ */
+ if (strcmp(parg->code->data, "COMM") == 0 ||
+ strcmp(parg->code->data, "comm") == 0) {
+ parg->code->op = FETCH_OP_COMM;
+ ret = 0;
+ }
+
kfree(parg->code->data);
parg->code->data = NULL;
- return -ENOENT;
+ return ret;
}
static int eprobe_event_define_fields(struct trace_event_call *event_call)
{
- int ret;
struct eprobe_trace_entry_head field;
struct trace_probe *tp;
@@ -250,8 +267,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call)
if (WARN_ON_ONCE(!tp))
return -ENOENT;
- DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0);
-
return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}
@@ -270,7 +285,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
struct trace_event_call *pevent;
struct trace_event *probed_event;
struct trace_seq *s = &iter->seq;
+ struct trace_eprobe *ep;
struct trace_probe *tp;
+ unsigned int type;
field = (struct eprobe_trace_entry_head *)iter->ent;
tp = trace_probe_primary_from_call(
@@ -278,15 +295,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
if (WARN_ON_ONCE(!tp))
goto out;
+ ep = container_of(tp, struct trace_eprobe, tp);
+ type = ep->event->event.type;
+
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- probed_event = ftrace_find_event(field->type);
+ probed_event = ftrace_find_event(type);
if (probed_event) {
pevent = container_of(probed_event, struct trace_event_call, event);
trace_seq_printf(s, "%s.%s", pevent->class->system,
trace_event_name(pevent));
} else {
- trace_seq_printf(s, "%u", field->type);
+ trace_seq_printf(s, "%u", type);
}
trace_seq_putc(s, ')');
@@ -308,6 +328,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec)
addr = rec + field->offset;
+ if (is_string_field(field)) {
+ switch (field->filter_type) {
+ case FILTER_DYN_STRING:
+ val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_RDYN_STRING:
+ val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_STATIC_STRING:
+ val = (unsigned long)addr;
+ break;
+ case FILTER_PTR_STRING:
+ val = (unsigned long)(*(char *)addr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+ return val;
+ }
+
switch (field->size) {
case 1:
if (field->is_signed)
@@ -339,16 +380,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec)
static int get_eprobe_size(struct trace_probe *tp, void *rec)
{
+ struct fetch_insn *code;
struct probe_arg *arg;
int i, len, ret = 0;
for (i = 0; i < tp->nr_args; i++) {
arg = tp->args + i;
- if (unlikely(arg->dynamic)) {
+ if (arg->dynamic) {
unsigned long val;
- val = get_event_field(arg->code, rec);
- len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL);
+ code = arg->code;
+ retry:
+ switch (code->op) {
+ case FETCH_OP_TP_ARG:
+ val = get_event_field(code, rec);
+ break;
+ case FETCH_OP_IMM:
+ val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ continue;
+ }
+ code++;
+ len = process_fetch_insn_bottom(code, val, NULL, NULL);
if (len > 0)
ret += len;
}
@@ -366,8 +429,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
{
unsigned long val;
- val = get_event_field(code, rec);
- return process_fetch_insn_bottom(code + 1, val, dest, base);
+ retry:
+ switch (code->op) {
+ case FETCH_OP_TP_ARG:
+ val = get_event_field(code, rec);
+ break;
+ case FETCH_OP_IMM:
+ val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ return -EILSEQ;
+ }
+ code++;
+ return process_fetch_insn_bottom(code, val, dest, base);
}
NOKPROBE_SYMBOL(process_fetch_insn)
@@ -375,29 +458,14 @@ NOKPROBE_SYMBOL(process_fetch_insn)
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
+ return kern_fetch_store_strlen(addr);
}
/*
@@ -407,21 +475,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -431,29 +485,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
@@ -498,10 +530,6 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
return;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
- if (edata->ep->event)
- entry->type = edata->ep->event->event.type;
- else
- entry->type = 0;
store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
@@ -513,20 +541,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
* functions are just stubs to fulfill what is needed to use the trigger
* infrastructure.
*/
-static int eprobe_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int eprobe_trigger_init(struct event_trigger_data *data)
{
return 0;
}
-static void eprobe_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void eprobe_trigger_free(struct event_trigger_data *data)
{
}
static int eprobe_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
/* Do not print eprobe event triggers */
@@ -539,6 +564,9 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
{
struct eprobe_data *edata = data->private_data;
+ if (unlikely(!rec))
+ return;
+
__eprobe_trace_func(edata, rec);
}
@@ -551,7 +579,8 @@ static struct event_trigger_ops eprobe_trigger_ops = {
static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd,
+ char *param_and_filter)
{
return -1;
}
@@ -592,14 +621,15 @@ static struct event_trigger_data *
new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
{
struct event_trigger_data *trigger;
+ struct event_filter *filter = NULL;
struct eprobe_data *edata;
+ int ret;
edata = kzalloc(sizeof(*edata), GFP_KERNEL);
trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
if (!trigger || !edata) {
- kfree(edata);
- kfree(trigger);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto error;
}
trigger->flags = EVENT_TRIGGER_FL_PROBE;
@@ -614,13 +644,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
trigger->cmd_ops = &event_trigger_cmd;
INIT_LIST_HEAD(&trigger->list);
- RCU_INIT_POINTER(trigger->filter, NULL);
+
+ if (ep->filter_str) {
+ ret = create_event_filter(file->tr, ep->event,
+ ep->filter_str, false, &filter);
+ if (ret)
+ goto error;
+ }
+ RCU_INIT_POINTER(trigger->filter, filter);
edata->file = file;
edata->ep = ep;
trigger->private_data = edata;
return trigger;
+error:
+ free_event_filter(filter);
+ kfree(edata);
+ kfree(trigger);
+ return ERR_PTR(ret);
}
static int enable_eprobe(struct trace_eprobe *ep,
@@ -652,22 +694,25 @@ static struct trace_event_functions eprobe_funcs = {
static int disable_eprobe(struct trace_eprobe *ep,
struct trace_array *tr)
{
- struct event_trigger_data *trigger;
+ struct event_trigger_data *trigger = NULL, *iter;
struct trace_event_file *file;
+ struct event_filter *filter;
struct eprobe_data *edata;
file = find_event_file(tr, ep->event_system, ep->event_name);
if (!file)
return -ENOENT;
- list_for_each_entry(trigger, &file->triggers, list) {
- if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (!(iter->flags & EVENT_TRIGGER_FL_PROBE))
continue;
- edata = trigger->private_data;
- if (edata->ep == ep)
+ edata = iter->private_data;
+ if (edata->ep == ep) {
+ trigger = iter;
break;
+ }
}
- if (list_entry_is_head(trigger, &file->triggers, list))
+ if (!trigger)
return -ENODEV;
list_del_rcu(&trigger->list);
@@ -678,6 +723,10 @@ static int disable_eprobe(struct trace_eprobe *ep,
/* Make sure nothing is using the edata or trigger */
tracepoint_synchronize_unregister();
+ filter = rcu_access_pointer(trigger->filter);
+
+ if (filter)
+ free_event_filter(filter);
kfree(edata);
kfree(trigger);
@@ -840,9 +889,66 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
if (ret)
return ret;
- if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG)
+ if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) {
ret = trace_eprobe_tp_arg_update(ep, i);
+ if (ret)
+ trace_probe_log_err(0, BAD_ATTACH_ARG);
+ }
+
+ /* Handle symbols "@" */
+ if (!ret)
+ ret = traceprobe_update_arg(&ep->tp.args[i]);
+
+ return ret;
+}
+
+static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
+{
+ struct event_filter *dummy = NULL;
+ int i, ret, len = 0;
+ char *p;
+
+ if (argc == 0) {
+ trace_probe_log_err(0, NO_EP_FILTER);
+ return -EINVAL;
+ }
+
+ /* Recover the filter string */
+ for (i = 0; i < argc; i++)
+ len += strlen(argv[i]) + 1;
+
+ ep->filter_str = kzalloc(len, GFP_KERNEL);
+ if (!ep->filter_str)
+ return -ENOMEM;
+
+ p = ep->filter_str;
+ for (i = 0; i < argc; i++) {
+ ret = snprintf(p, len, "%s ", argv[i]);
+ if (ret < 0)
+ goto error;
+ if (ret > len) {
+ ret = -E2BIG;
+ goto error;
+ }
+ p += ret;
+ len -= ret;
+ }
+ p[-1] = '\0';
+
+ /*
+ * Ensure the filter string can be parsed correctly. Note, this
+ * filter string is for the original event, not for the eprobe.
+ */
+ ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
+ true, &dummy);
+ free_event_filter(dummy);
+ if (ret)
+ goto error;
+ return 0;
+error:
+ kfree(ep->filter_str);
+ ep->filter_str = NULL;
return ret;
}
@@ -850,8 +956,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
- * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS]
- * Fetch args:
+ * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
+ * Fetch args (no space):
* <name>=$<field>[:TYPE]
*/
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
@@ -860,8 +966,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
struct trace_eprobe *ep = NULL;
char buf1[MAX_EVENT_NAME_LEN];
char buf2[MAX_EVENT_NAME_LEN];
- int ret = 0;
- int i;
+ char gbuf[MAX_EVENT_NAME_LEN];
+ int ret = 0, filter_idx = 0;
+ int i, filter_cnt;
if (argc < 2 || argv[0][0] != 'e')
return -ECANCELED;
@@ -871,25 +978,33 @@ static int __trace_eprobe_create(int argc, const char *argv[])
event = strchr(&argv[0][1], ':');
if (event) {
event++;
- ret = traceprobe_parse_event_name(&event, &group, buf1,
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
goto parse_error;
- } else {
- strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
- sanitize_event_name(buf1);
- event = buf1;
}
- if (!is_good_name(event) || !is_good_name(group))
- goto parse_error;
+ trace_probe_log_set_index(1);
sys_event = argv[1];
- ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2,
- sys_event - argv[1]);
- if (ret || !sys_name)
- goto parse_error;
- if (!is_good_name(sys_event) || !is_good_name(sys_name))
+ ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
+ if (ret || !sys_event || !sys_name) {
+ trace_probe_log_err(0, NO_EVENT_INFO);
goto parse_error;
+ }
+
+ if (!event) {
+ strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
+ event = buf1;
+ }
+
+ for (i = 2; i < argc; i++) {
+ if (!strcmp(argv[i], "if")) {
+ filter_idx = i + 1;
+ filter_cnt = argc - filter_idx;
+ argc = i;
+ break;
+ }
+ }
mutex_lock(&event_mutex);
event_call = find_and_get_event(sys_name, sys_event);
@@ -898,12 +1013,22 @@ static int __trace_eprobe_create(int argc, const char *argv[])
if (IS_ERR(ep)) {
ret = PTR_ERR(ep);
+ if (ret == -ENODEV)
+ trace_probe_log_err(0, BAD_ATTACH_EVENT);
/* This must return -ENOMEM or missing event, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
ep = NULL;
goto error;
}
+ if (filter_idx) {
+ trace_probe_log_set_index(filter_idx);
+ ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
+ if (ret)
+ goto parse_error;
+ } else
+ ep->filter_str = NULL;
+
argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a114549720d6..61e3a2620fa3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
int i;
if (--tp_event->perf_refcount > 0)
- goto out;
+ return;
tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
@@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
perf_trace_buf[i] = NULL;
}
}
-out:
- trace_event_put_ref(tp_event);
}
static int perf_trace_event_open(struct perf_event *p_event)
@@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
}
@@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
destroy_local_trace_kprobe(p_event->tp_event);
@@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
destroy_local_trace_uprobe(p_event->tp_event);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..f71ea6e79b3c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
static bool eventdir_initialized;
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
+
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
static struct kmem_cache *field_cachep;
@@ -168,6 +176,7 @@ static int trace_define_generic_fields(void)
__generic_field(int, CPU, FILTER_CPU);
__generic_field(int, cpu, FILTER_CPU);
+ __generic_field(int, common_cpu, FILTER_CPU);
__generic_field(char *, COMM, FILTER_COMM);
__generic_field(char *, comm, FILTER_COMM);
@@ -399,7 +408,14 @@ static void test_event_printk(struct trace_event_call *call)
a = strchr(fmt + i, '&');
if ((a && (a < r)) || test_field(r, call))
dereference_flags &= ~(1ULL << arg);
+ } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
+ (!c || r < c)) {
+ dereference_flags &= ~(1ULL << arg);
+ } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
+ (!c || r < c)) {
+ dereference_flags &= ~(1ULL << arg);
}
+
next_arg:
i--;
arg++;
@@ -759,7 +775,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +801,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -1705,9 +1725,9 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
+ struct trace_subsystem_dir *dir = NULL, *iter_dir;
+ struct trace_array *tr = NULL, *iter_tr;
struct event_subsystem *system = NULL;
- struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
- struct trace_array *tr;
int ret;
if (tracing_is_disabled())
@@ -1716,10 +1736,12 @@ static int subsystem_open(struct inode *inode, struct file *filp)
/* Make sure the system still exists */
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- list_for_each_entry(dir, &tr->systems, list) {
- if (dir == inode->i_private) {
+ list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(iter_dir, &iter_tr->systems, list) {
+ if (iter_dir == inode->i_private) {
/* Don't open systems with no events */
+ tr = iter_tr;
+ dir = iter_dir;
if (dir->nr_events) {
__get_system_dir(dir);
system = dir->subsystem;
@@ -1735,9 +1757,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
if (!system)
return -ENODEV;
- /* Some versions of gcc think dir can be uninitialized here */
- WARN_ON(!dir);
-
/* Still need to increment the ref count of the system */
if (trace_array_get(tr) < 0) {
put_system(dir);
@@ -2262,8 +2281,8 @@ static struct dentry *
event_subsystem_dir(struct trace_array *tr, const char *name,
struct trace_event_file *file, struct dentry *parent)
{
+ struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- struct event_subsystem *system;
struct dentry *entry;
/* First see if we did not already create this dir */
@@ -2277,13 +2296,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
/* Now see if the system itself exists. */
- list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0)
+ system = NULL;
+ list_for_each_entry(iter, &event_subsystems, list) {
+ if (strcmp(iter->name, name) == 0) {
+ system = iter;
break;
+ }
}
- /* Reset system variable when not found */
- if (&system->list == &event_subsystems)
- system = NULL;
dir = kmalloc(sizeof(*dir), GFP_KERNEL);
if (!dir)
@@ -2633,6 +2652,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2668,6 +2757,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
}
@@ -2758,6 +2848,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2789,7 +2880,10 @@ static int probe_remove_event_call(struct trace_event_call *call)
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
- return -EBUSY;
+ goto busy;
+
+ if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+ tr->clear_trace = true;
/*
* The do_for_each_event_file_safe() is
* a double loop. After finding the call for this
@@ -2802,6 +2896,12 @@ static int probe_remove_event_call(struct trace_event_call *call)
__trace_remove_event_call(call);
return 0;
+ busy:
+ /* No need to clear the trace now */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ tr->clear_trace = false;
+ }
+ return -EBUSY;
}
/* Remove an event_call */
@@ -2819,6 +2919,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2853,6 +2954,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2861,6 +2963,14 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
@@ -2871,7 +2981,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
@@ -3446,12 +3556,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_fops);
- if (!entry) {
- pr_warn("Could not create tracefs 'set_event' entry\n");
+ entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry)
return -ENOMEM;
- }
d_events = tracefs_create_dir("events", parent);
if (!d_events) {
@@ -3466,16 +3574,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
/* There are not as crucial, just warn if they are not created */
- entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_pid_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'set_event_pid' entry\n");
+ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_pid_fops);
- entry = tracefs_create_file("set_event_notrace_pid",
- TRACE_MODE_WRITE, parent, tr,
- &ftrace_set_event_notrace_pid_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
+ trace_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
/* ring buffer internal formats */
trace_create_file("header_page", TRACE_MODE_READ, d_events,
@@ -3690,17 +3794,14 @@ static __init int event_trace_init_fields(void)
__init int event_trace_init(void)
{
struct trace_array *tr;
- struct dentry *entry;
int ret;
tr = top_trace_array();
if (!tr)
return -ENODEV;
- entry = tracefs_create_file("available_events", TRACE_MODE_READ,
- NULL, tr, &ftrace_avail_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'available_events' entry\n");
+ trace_create_file("available_events", TRACE_MODE_READ,
+ NULL, tr, &ftrace_avail_fops);
ret = early_event_add_tracer(NULL, tr);
if (ret)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b458a9afa2c0..96acc2b71ac7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -43,6 +43,42 @@ enum filter_op_ids { OPS };
static const char * ops[] = { OPS };
+enum filter_pred_fn {
+ FILTER_PRED_FN_NOP,
+ FILTER_PRED_FN_64,
+ FILTER_PRED_FN_S64,
+ FILTER_PRED_FN_U64,
+ FILTER_PRED_FN_32,
+ FILTER_PRED_FN_S32,
+ FILTER_PRED_FN_U32,
+ FILTER_PRED_FN_16,
+ FILTER_PRED_FN_S16,
+ FILTER_PRED_FN_U16,
+ FILTER_PRED_FN_8,
+ FILTER_PRED_FN_S8,
+ FILTER_PRED_FN_U8,
+ FILTER_PRED_FN_COMM,
+ FILTER_PRED_FN_STRING,
+ FILTER_PRED_FN_STRLOC,
+ FILTER_PRED_FN_STRRELLOC,
+ FILTER_PRED_FN_PCHAR_USER,
+ FILTER_PRED_FN_PCHAR,
+ FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_,
+ FILTER_PRED_TEST_VISITED,
+};
+
+struct filter_pred {
+ enum filter_pred_fn fn_num;
+ u64 val;
+ struct regex regex;
+ unsigned short *ops;
+ struct ftrace_event_field *field;
+ int offset;
+ int not;
+ int op;
+};
+
/*
* pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
* pred_funcs_##type below must match the order of them above.
@@ -590,44 +626,48 @@ out_free:
return ERR_PTR(ret);
}
+enum pred_cmp_types {
+ PRED_CMP_TYPE_NOP,
+ PRED_CMP_TYPE_LT,
+ PRED_CMP_TYPE_LE,
+ PRED_CMP_TYPE_GT,
+ PRED_CMP_TYPE_GE,
+ PRED_CMP_TYPE_BAND,
+};
+
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr < val; \
-} \
-static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr <= val; \
-} \
-static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
+static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr > val; \
-} \
-static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return *addr >= val; \
-} \
-static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- return !!(*addr & val); \
-} \
-static const filter_pred_fn_t pred_funcs_##type[] = { \
- filter_pred_LE_##type, \
- filter_pred_LT_##type, \
- filter_pred_GE_##type, \
- filter_pred_GT_##type, \
- filter_pred_BAND_##type, \
-};
+ switch (pred->op) { \
+ case OP_LT: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr < val; \
+ } \
+ case OP_LE: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr <= val; \
+ } \
+ case OP_GT: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr > val; \
+ } \
+ case OP_GE: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return *addr >= val; \
+ } \
+ case OP_BAND: { \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ return !!(*addr & val); \
+ } \
+ default: \
+ return 0; \
+ } \
+}
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
@@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
return cmp ^ pred->not;
}
-static int filter_pred_none(struct filter_pred *pred, void *event)
-{
- return 0;
-}
-
/*
* regex_match_foo - Basic regex callbacks
*
@@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred)
}
}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int test_pred_visited_fn(struct filter_pred *pred, void *event);
+#else
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+ return 0;
+}
+#endif
+
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event);
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
@@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
for (i = 0; prog[i].pred; i++) {
struct filter_pred *pred = prog[i].pred;
- int match = pred->fn(pred, rec);
+ int match = filter_pred_fn_call(pred, rec);
if (match == prog[i].when_to_branch)
i = prog[i].target;
}
@@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
- int field_size, int field_is_signed)
+static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
+ int field_size, int field_is_signed)
{
- filter_pred_fn_t fn = NULL;
+ enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
int pred_func_index = -1;
switch (op) {
@@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
break;
default:
if (WARN_ON_ONCE(op < PRED_FUNC_START))
- return NULL;
+ return fn;
pred_func_index = op - PRED_FUNC_START;
if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
- return NULL;
+ return fn;
}
switch (field_size) {
case 8:
if (pred_func_index < 0)
- fn = filter_pred_64;
+ fn = FILTER_PRED_FN_64;
else if (field_is_signed)
- fn = pred_funcs_s64[pred_func_index];
+ fn = FILTER_PRED_FN_S64;
else
- fn = pred_funcs_u64[pred_func_index];
+ fn = FILTER_PRED_FN_U64;
break;
case 4:
if (pred_func_index < 0)
- fn = filter_pred_32;
+ fn = FILTER_PRED_FN_32;
else if (field_is_signed)
- fn = pred_funcs_s32[pred_func_index];
+ fn = FILTER_PRED_FN_S32;
else
- fn = pred_funcs_u32[pred_func_index];
+ fn = FILTER_PRED_FN_U32;
break;
case 2:
if (pred_func_index < 0)
- fn = filter_pred_16;
+ fn = FILTER_PRED_FN_16;
else if (field_is_signed)
- fn = pred_funcs_s16[pred_func_index];
+ fn = FILTER_PRED_FN_S16;
else
- fn = pred_funcs_u16[pred_func_index];
+ fn = FILTER_PRED_FN_U16;
break;
case 1:
if (pred_func_index < 0)
- fn = filter_pred_8;
+ fn = FILTER_PRED_FN_8;
else if (field_is_signed)
- fn = pred_funcs_s8[pred_func_index];
+ fn = FILTER_PRED_FN_S8;
else
- fn = pred_funcs_u8[pred_func_index];
+ fn = FILTER_PRED_FN_U8;
break;
}
return fn;
}
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event)
+{
+ switch (pred->fn_num) {
+ case FILTER_PRED_FN_64:
+ return filter_pred_64(pred, event);
+ case FILTER_PRED_FN_S64:
+ return filter_pred_s64(pred, event);
+ case FILTER_PRED_FN_U64:
+ return filter_pred_u64(pred, event);
+ case FILTER_PRED_FN_32:
+ return filter_pred_32(pred, event);
+ case FILTER_PRED_FN_S32:
+ return filter_pred_s32(pred, event);
+ case FILTER_PRED_FN_U32:
+ return filter_pred_u32(pred, event);
+ case FILTER_PRED_FN_16:
+ return filter_pred_16(pred, event);
+ case FILTER_PRED_FN_S16:
+ return filter_pred_s16(pred, event);
+ case FILTER_PRED_FN_U16:
+ return filter_pred_u16(pred, event);
+ case FILTER_PRED_FN_8:
+ return filter_pred_8(pred, event);
+ case FILTER_PRED_FN_S8:
+ return filter_pred_s8(pred, event);
+ case FILTER_PRED_FN_U8:
+ return filter_pred_u8(pred, event);
+ case FILTER_PRED_FN_COMM:
+ return filter_pred_comm(pred, event);
+ case FILTER_PRED_FN_STRING:
+ return filter_pred_string(pred, event);
+ case FILTER_PRED_FN_STRLOC:
+ return filter_pred_strloc(pred, event);
+ case FILTER_PRED_FN_STRRELLOC:
+ return filter_pred_strrelloc(pred, event);
+ case FILTER_PRED_FN_PCHAR_USER:
+ return filter_pred_pchar_user(pred, event);
+ case FILTER_PRED_FN_PCHAR:
+ return filter_pred_pchar(pred, event);
+ case FILTER_PRED_FN_CPU:
+ return filter_pred_cpu(pred, event);
+ case FILTER_PRED_TEST_VISITED:
+ return test_pred_visited_fn(pred, event);
+ default:
+ return 0;
+ }
+}
+
/* Called when a predicate is encountered by predicate_parse() */
static int parse_pred(const char *str, void *data,
int pos, struct filter_parse_error *pe,
@@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data,
parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
goto err_free;
}
- pred->fn = filter_pred_none;
+ pred->fn_num = FILTER_PRED_FN_NOP;
/*
* Quotes are not required, but if they exist then we need
@@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data,
filter_build_regex(pred);
if (field->filter_type == FILTER_COMM) {
- pred->fn = filter_pred_comm;
+ pred->fn_num = FILTER_PRED_FN_COMM;
} else if (field->filter_type == FILTER_STATIC_STRING) {
- pred->fn = filter_pred_string;
+ pred->fn_num = FILTER_PRED_FN_STRING;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
- pred->fn = filter_pred_strloc;
+ pred->fn_num = FILTER_PRED_FN_STRLOC;
} else if (field->filter_type == FILTER_RDYN_STRING)
- pred->fn = filter_pred_strrelloc;
+ pred->fn_num = FILTER_PRED_FN_STRRELLOC;
else {
if (!ustring_per_cpu) {
@@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data,
}
if (ustring)
- pred->fn = filter_pred_pchar_user;
+ pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
else
- pred->fn = filter_pred_pchar;
+ pred->fn_num = FILTER_PRED_FN_PCHAR;
}
/* go past the last quote */
i++;
@@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data,
pred->val = val;
if (field->filter_type == FILTER_CPU)
- pred->fn = filter_pred_cpu;
+ pred->fn_num = FILTER_PRED_FN_CPU;
else {
- pred->fn = select_comparison_fn(pred->op, field->size,
- field->is_signed);
+ pred->fn_num = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
if (pred->op == OP_NE)
pred->not = 1;
}
@@ -1816,7 +1913,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
* create_filter - create a filter for a trace_event_call
* @tr: the trace array associated with these events
* @call: trace_event_call to create a filter for
- * @filter_str: filter string
+ * @filter_string: filter string
* @set_str: remember @filter_str and enable detailed error in filter
* @filterp: out param for created filter (always updated on return)
* Must be a pointer that references a NULL pointer.
@@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
struct filter_pred *pred = prog[i].pred;
struct ftrace_event_field *field = pred->field;
- WARN_ON_ONCE(!pred->fn);
+ WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
if (!field) {
WARN_ONCE(1, "all leafs should have field defined %d", i);
@@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
if (!strchr(fields, *field->name))
continue;
- pred->fn = test_pred_visited_fn;
+ pred->fn_num = FILTER_PRED_TEST_VISITED;
}
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ada87bfb5bb8..1c82478e8dff 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -104,6 +104,38 @@ enum field_op_id {
FIELD_OP_MULT,
};
+enum hist_field_fn {
+ HIST_FIELD_FN_NOP,
+ HIST_FIELD_FN_VAR_REF,
+ HIST_FIELD_FN_COUNTER,
+ HIST_FIELD_FN_CONST,
+ HIST_FIELD_FN_LOG2,
+ HIST_FIELD_FN_BUCKET,
+ HIST_FIELD_FN_TIMESTAMP,
+ HIST_FIELD_FN_CPU,
+ HIST_FIELD_FN_STRING,
+ HIST_FIELD_FN_DYNSTRING,
+ HIST_FIELD_FN_RELDYNSTRING,
+ HIST_FIELD_FN_PSTRING,
+ HIST_FIELD_FN_S64,
+ HIST_FIELD_FN_U64,
+ HIST_FIELD_FN_S32,
+ HIST_FIELD_FN_U32,
+ HIST_FIELD_FN_S16,
+ HIST_FIELD_FN_U16,
+ HIST_FIELD_FN_S8,
+ HIST_FIELD_FN_U8,
+ HIST_FIELD_FN_UMINUS,
+ HIST_FIELD_FN_MINUS,
+ HIST_FIELD_FN_PLUS,
+ HIST_FIELD_FN_DIV,
+ HIST_FIELD_FN_MULT,
+ HIST_FIELD_FN_DIV_POWER2,
+ HIST_FIELD_FN_DIV_NOT_POWER2,
+ HIST_FIELD_FN_DIV_MULT_SHIFT,
+ HIST_FIELD_FN_EXECNAME,
+};
+
/*
* A hist_var (histogram variable) contains variable information for
* hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
@@ -123,15 +155,15 @@ struct hist_var {
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
- hist_field_fn_t fn;
- unsigned int ref;
- unsigned int size;
- unsigned int offset;
- unsigned int is_signed;
unsigned long buckets;
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
+ enum hist_field_fn fn_num;
+ unsigned int ref;
+ unsigned int size;
+ unsigned int offset;
+ unsigned int is_signed;
/*
* Variable fields contain variable-specific info in var.
@@ -166,14 +198,11 @@ struct hist_field {
u64 div_multiplier;
};
-static u64 hist_field_none(struct hist_field *field,
- struct tracing_map_elt *elt,
- struct trace_buffer *buffer,
- struct ring_buffer_event *rbe,
- void *event)
-{
- return 0;
-}
+static u64 hist_fn_call(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event);
static u64 hist_field_const(struct hist_field *field,
struct tracing_map_elt *elt,
@@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field,
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, elt, buffer, rbe, event);
+ u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
@@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field,
struct hist_field *operand = hist_field->operands[0];
unsigned long buckets = hist_field->buckets;
- u64 val = operand->fn(operand, elt, buffer, rbe, event);
+ u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
if (WARN_ON_ONCE(!buckets))
return val;
@@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 + val2;
}
@@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 - val2;
}
@@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
/* Return -1 for the undefined case */
if (!val2)
@@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
return val1 >> __ffs64(operand2->constant);
}
@@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
return div64_u64(val1, operand2->constant);
}
@@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
/*
* If the divisor is a constant, do a multiplication and shift instead.
@@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+ u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+ u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 * val2;
}
@@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
{
struct hist_field *operand = hist_field->operands[0];
- s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
+ s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
u64 val = (u64)-sval;
return val;
@@ -657,19 +686,19 @@ struct snapshot_context {
* Returns the specific division function to use if the divisor
* is constant. This avoids extra branches when the trigger is hit.
*/
-static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
{
u64 div = divisor->constant;
if (!(div & (div - 1)))
- return div_by_power_of_two;
+ return HIST_FIELD_FN_DIV_POWER2;
/* If the divisor is too large, do a regular division */
if (div > (1 << HIST_DIV_SHIFT))
- return div_by_not_power_of_two;
+ return HIST_FIELD_FN_DIV_NOT_POWER2;
divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
- return div_by_mult_and_shift;
+ return HIST_FIELD_FN_DIV_MULT_SHIFT;
}
static void track_data_free(struct track_data *track_data)
@@ -727,11 +756,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -739,12 +773,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
+ int len;
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ /* sizeof() contains the nul byte */
+ len = sizeof(HIST_PREFIX) + strlen(str);
+ kfree(last_cmd);
+ last_cmd = kzalloc(len, GFP_KERNEL);
+ if (!last_cmd)
+ return;
+
+ strcpy(last_cmd, HIST_PREFIX);
+ /* Again, sizeof() contains the nul byte */
+ len -= sizeof(HIST_PREFIX);
+ strncat(last_cmd, str, len);
if (file) {
call = file->event_call;
@@ -757,18 +801,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -935,7 +983,7 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
* A trigger can define one or more variables. If any one of them is
* currently referenced by any other trigger, this function will
* determine that.
-
+ *
* Typically used to determine whether or not a trigger can be removed
* - if there are any references to a trigger's variables, it cannot.
*
@@ -1315,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field,
return field_name;
}
-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
{
- hist_field_fn_t fn = NULL;
-
switch (field_size) {
case 8:
if (field_is_signed)
- fn = hist_field_s64;
+ return HIST_FIELD_FN_S64;
else
- fn = hist_field_u64;
- break;
+ return HIST_FIELD_FN_U64;
case 4:
if (field_is_signed)
- fn = hist_field_s32;
+ return HIST_FIELD_FN_S32;
else
- fn = hist_field_u32;
- break;
+ return HIST_FIELD_FN_U32;
case 2:
if (field_is_signed)
- fn = hist_field_s16;
+ return HIST_FIELD_FN_S16;
else
- fn = hist_field_u16;
- break;
+ return HIST_FIELD_FN_U16;
case 1:
if (field_is_signed)
- fn = hist_field_s8;
+ return HIST_FIELD_FN_S8;
else
- fn = hist_field_u8;
- break;
+ return HIST_FIELD_FN_U8;
}
- return fn;
+ return HIST_FIELD_FN_NOP;
}
static int parse_map_size(char *str)
@@ -1903,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out; /* caller will populate */
if (flags & HIST_FIELD_FL_VAR_REF) {
- hist_field->fn = hist_field_var_ref;
+ hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
goto out;
}
if (flags & HIST_FIELD_FL_HITCOUNT) {
- hist_field->fn = hist_field_counter;
+ hist_field->fn_num = HIST_FIELD_FN_COUNTER;
hist_field->size = sizeof(u64);
hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CONST) {
- hist_field->fn = hist_field_const;
+ hist_field->fn_num = HIST_FIELD_FN_CONST;
hist_field->size = sizeof(u64);
hist_field->type = kstrdup("u64", GFP_KERNEL);
if (!hist_field->type)
@@ -1924,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_STACKTRACE) {
- hist_field->fn = hist_field_none;
+ hist_field->fn_num = HIST_FIELD_FN_NOP;
goto out;
}
if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
- hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
- hist_field_bucket;
+ hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
+ HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
@@ -1941,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_TIMESTAMP) {
- hist_field->fn = hist_field_timestamp;
+ hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
hist_field->size = sizeof(u64);
hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
- hist_field->fn = hist_field_cpu;
+ hist_field->fn_num = HIST_FIELD_FN_CPU;
hist_field->size = sizeof(int);
hist_field->type = "unsigned int";
goto out;
@@ -1968,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto free;
if (field->filter_type == FILTER_STATIC_STRING) {
- hist_field->fn = hist_field_string;
+ hist_field->fn_num = HIST_FIELD_FN_STRING;
hist_field->size = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
- hist_field->fn = hist_field_dynstring;
+ hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
} else if (field->filter_type == FILTER_RDYN_STRING)
- hist_field->fn = hist_field_reldynstring;
+ hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
else
- hist_field->fn = hist_field_pstring;
+ hist_field->fn_num = HIST_FIELD_FN_PSTRING;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
@@ -1983,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field->type)
goto free;
- hist_field->fn = select_value_fn(field->size,
- field->is_signed);
- if (!hist_field->fn) {
+ hist_field->fn_num = select_value_fn(field->size,
+ field->is_signed);
+ if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
destroy_hist_field(hist_field, 0);
return NULL;
}
@@ -2074,8 +2116,11 @@ static int init_var_ref(struct hist_field *ref_field,
return err;
free:
kfree(ref_field->system);
+ ref_field->system = NULL;
kfree(ref_field->event_name);
+ ref_field->event_name = NULL;
kfree(ref_field->name);
+ ref_field->name = NULL;
goto out;
}
@@ -2289,9 +2334,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
/*
* For backward compatibility, if field_name
* was "cpu", then we treat this the same as
- * common_cpu.
+ * common_cpu. This also works for "CPU".
*/
- if (strcmp(field_name, "cpu") == 0) {
+ if (field && field->filter_type == FILTER_CPU) {
*flags |= HIST_FIELD_FL_CPU;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
@@ -2318,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
if (!alias)
return NULL;
- alias->fn = var_ref->fn;
+ alias->fn_num = var_ref->fn_num;
alias->operands[0] = var_ref;
if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
@@ -2501,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
- expr->fn = hist_field_unary_minus;
+ expr->fn_num = HIST_FIELD_FN_UMINUS;
expr->operands[0] = operand1;
expr->size = operand1->size;
expr->is_signed = operand1->is_signed;
@@ -2573,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
unsigned long operand_flags, operand2_flags;
int field_op, ret = -EINVAL;
char *sep, *operand1_str;
- hist_field_fn_t op_fn;
+ enum hist_field_fn op_fn;
bool combine_consts;
if (*n_subexprs > 3) {
@@ -2632,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
switch (field_op) {
case FIELD_OP_MINUS:
- op_fn = hist_field_minus;
+ op_fn = HIST_FIELD_FN_MINUS;
break;
case FIELD_OP_PLUS:
- op_fn = hist_field_plus;
+ op_fn = HIST_FIELD_FN_PLUS;
break;
case FIELD_OP_DIV:
- op_fn = hist_field_div;
+ op_fn = HIST_FIELD_FN_DIV;
break;
case FIELD_OP_MULT:
- op_fn = hist_field_mult;
+ op_fn = HIST_FIELD_FN_MULT;
break;
default:
ret = -EINVAL;
@@ -2697,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
op_fn = hist_field_get_div_fn(operand2);
}
+ expr->fn_num = op_fn;
+
if (combine_consts) {
if (var1)
expr->operands[0] = var1;
if (var2)
expr->operands[1] = var2;
- expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
+ expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
+ expr->fn_num = HIST_FIELD_FN_CONST;
expr->operands[0] = NULL;
expr->operands[1] = NULL;
@@ -2717,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->name = expr_str(expr, 0);
} else {
- expr->fn = op_fn;
-
/* The operand sizes should be the same, so just pick one */
expr->size = operand1->size;
expr->is_signed = operand1->is_signed;
@@ -2766,7 +2812,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
static struct event_command trigger_hist_cmd;
static int event_hist_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+ char *glob, char *cmd,
+ char *param_and_filter);
static bool compatible_keys(struct hist_trigger_data *target_hist_data,
struct hist_trigger_data *hist_data,
@@ -3042,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
struct hist_field *var = field_var->var;
struct hist_field *val = field_var->val;
- var_val = val->fn(val, elt, buffer, rbe, rec);
+ var_val = hist_fn_call(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
if (val->flags & HIST_FIELD_FL_STRING) {
@@ -3179,7 +3226,7 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* This function creates a field variable with the name var_name on
* the hist trigger currently being defined on the target event. If
* subsys_name and event_name are specified, this function simply
@@ -4142,7 +4189,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
-static const char *no_comm = "(no comm)";
+static const char no_comm[] = "(no comm)";
static u64 hist_field_execname(struct hist_field *hist_field,
struct tracing_map_elt *elt,
@@ -4163,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field,
return (u64)(unsigned long)(elt_data->comm);
}
+static u64 hist_fn_call(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ switch (hist_field->fn_num) {
+ case HIST_FIELD_FN_VAR_REF:
+ return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_COUNTER:
+ return hist_field_counter(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_CONST:
+ return hist_field_const(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_LOG2:
+ return hist_field_log2(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_BUCKET:
+ return hist_field_bucket(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_TIMESTAMP:
+ return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_CPU:
+ return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_STRING:
+ return hist_field_string(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DYNSTRING:
+ return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_RELDYNSTRING:
+ return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_PSTRING:
+ return hist_field_pstring(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S64:
+ return hist_field_s64(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U64:
+ return hist_field_u64(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S32:
+ return hist_field_s32(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U32:
+ return hist_field_u32(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S16:
+ return hist_field_s16(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U16:
+ return hist_field_u16(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_S8:
+ return hist_field_s8(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_U8:
+ return hist_field_u8(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_UMINUS:
+ return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_MINUS:
+ return hist_field_minus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_PLUS:
+ return hist_field_plus(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV:
+ return hist_field_div(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_MULT:
+ return hist_field_mult(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_POWER2:
+ return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_NOT_POWER2:
+ return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_DIV_MULT_SHIFT:
+ return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_EXECNAME:
+ return hist_field_execname(hist_field, elt, buffer, rbe, event);
+ default:
+ return 0;
+ }
+}
+
/* Convert a var that points to common_pid.execname to a string */
static void update_var_execname(struct hist_field *hist_field)
{
@@ -4174,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field)
kfree_const(hist_field->type);
hist_field->type = "char[]";
- hist_field->fn = hist_field_execname;
+ hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
}
static int create_var_field(struct hist_trigger_data *hist_data,
@@ -4407,6 +4522,8 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
s = kstrdup(field_str, GFP_KERNEL);
if (!s) {
+ kfree(hist_data->attrs->var_defs.name[n_vars]);
+ hist_data->attrs->var_defs.name[n_vars] = NULL;
ret = -ENOMEM;
goto free;
}
@@ -4430,7 +4547,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
ret = parse_var_defs(hist_data);
if (ret)
- goto out;
+ return ret;
ret = create_val_fields(hist_data, file);
if (ret)
@@ -4441,8 +4558,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
goto out;
ret = create_key_fields(hist_data, file);
- if (ret)
- goto out;
+
out:
free_var_defs(hist_data);
@@ -4832,7 +4948,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
- else if (!field)
+ else if (!field || hist_field->flags & HIST_FIELD_FL_CPU)
cmp_fn = tracing_map_cmp_num(hist_field->size,
hist_field->is_signed);
else if (is_string_field(field))
@@ -4932,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+ hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
@@ -4963,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_key_field(i, hist_data) {
hist_field = hist_data->fields[i];
if (hist_field->flags & HIST_FIELD_FL_VAR) {
- hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+ hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
var_idx = hist_field->var.idx;
tracing_map_set_var(elt, var_idx, hist_val);
}
@@ -5027,6 +5143,9 @@ static void event_hist_trigger(struct event_trigger_data *data,
void *key = NULL;
unsigned int i;
+ if (unlikely(!rbe))
+ return;
+
memset(compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
@@ -5038,7 +5157,7 @@ static void event_hist_trigger(struct event_trigger_data *data,
HIST_STACKTRACE_SKIP);
key = entries;
} else {
- field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
+ field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -5233,7 +5352,7 @@ static void hist_trigger_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -5465,7 +5584,7 @@ static void hist_trigger_debug_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -5602,7 +5721,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
}
static int event_hist_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5610,7 +5728,7 @@ static int event_hist_trigger_print(struct seq_file *m,
bool have_var = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
@@ -5710,8 +5828,7 @@ static int event_hist_trigger_print(struct seq_file *m,
return 0;
}
-static int event_hist_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5739,8 +5856,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
}
}
-static void event_hist_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void event_hist_trigger_free(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5769,25 +5885,23 @@ static struct event_trigger_ops event_hist_trigger_ops = {
.free = event_hist_trigger_free,
};
-static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int event_hist_trigger_named_init(struct event_trigger_data *data)
{
data->ref++;
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(ops, data->named_data);
+ event_hist_trigger_init(data->named_data);
return 0;
}
-static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void event_hist_trigger_named_free(struct event_trigger_data *data)
{
if (WARN_ON_ONCE(data->ref <= 0))
return;
- event_hist_trigger_free(ops, data->named_data);
+ event_hist_trigger_free(data->named_data);
data->ref--;
if (!data->ref) {
@@ -5914,6 +6028,48 @@ static bool hist_trigger_match(struct event_trigger_data *data,
return true;
}
+static bool existing_hist_update_only(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ bool updated = false;
+
+ if (!hist_data->attrs->pause && !hist_data->attrs->cont &&
+ !hist_data->attrs->clear)
+ goto out;
+
+ if (hist_data->attrs->name) {
+ named_data = find_named_trigger(hist_data->attrs->name);
+ if (named_data) {
+ if (!hist_trigger_match(data, named_data, named_data,
+ true))
+ goto out;
+ }
+ }
+
+ if (hist_data->attrs->name && !named_data)
+ goto out;
+
+ list_for_each_entry(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ if (hist_data->attrs->pause)
+ test->paused = true;
+ else if (hist_data->attrs->cont)
+ test->paused = false;
+ else if (hist_data->attrs->clear)
+ hist_clear(test);
+ updated = true;
+ goto out;
+ }
+ }
+ out:
+ return updated;
+}
+
static int hist_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
@@ -5942,19 +6098,11 @@ static int hist_register_trigger(char *glob,
list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
- if (!hist_trigger_match(data, test, named_data, false))
- continue;
- if (hist_data->attrs->pause)
- test->paused = true;
- else if (hist_data->attrs->cont)
- test->paused = false;
- else if (hist_data->attrs->clear)
- hist_clear(test);
- else {
+ if (hist_trigger_match(data, test, named_data, false)) {
hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
ret = -EEXIST;
+ goto out;
}
- goto out;
}
}
new:
@@ -5974,7 +6122,7 @@ static int hist_register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
@@ -5993,8 +6141,6 @@ static int hist_register_trigger(char *glob,
if (named_data)
destroy_hist_data(hist_data);
-
- ret++;
out:
return ret;
}
@@ -6070,20 +6216,19 @@ static void hist_unregister_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
+ struct event_trigger_data *test = NULL, *iter, *named_data = NULL;
struct hist_trigger_data *hist_data = data->private_data;
- struct event_trigger_data *test, *named_data = NULL;
- bool unregistered = false;
lockdep_assert_held(&event_mutex);
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry(test, &file->triggers, list) {
- if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
- if (!hist_trigger_match(data, test, named_data, false))
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, iter, named_data, false))
continue;
- unregistered = true;
+ test = iter;
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -6091,11 +6236,11 @@ static void hist_unregister_trigger(char *glob,
}
}
- if (unregistered && test->ops->free)
- test->ops->free(test->ops, test);
+ if (test && test->ops->free)
+ test->ops->free(test);
if (hist_data->enable_timestamps) {
- if (!hist_data->remove || unregistered)
+ if (!hist_data->remove || test)
tracing_set_filter_buffering(file->tr, false);
}
}
@@ -6145,57 +6290,57 @@ static void hist_unreg_all(struct trace_event_file *file)
if (hist_data->enable_timestamps)
tracing_set_filter_buffering(file->tr, false);
if (test->ops->free)
- test->ops->free(test->ops, test);
+ test->ops->free(test);
}
}
}
static int event_hist_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd,
+ char *param_and_filter)
{
unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
struct event_trigger_data *trigger_data;
struct hist_trigger_attrs *attrs;
- struct event_trigger_ops *trigger_ops;
struct hist_trigger_data *hist_data;
+ char *param, *filter, *p, *start;
struct synth_event *se;
const char *se_name;
- bool remove = false;
- char *trigger, *p, *start;
+ bool remove;
int ret = 0;
lockdep_assert_held(&event_mutex);
- WARN_ON(!glob);
+ if (WARN_ON(!glob))
+ return -EINVAL;
- if (strlen(glob)) {
+ if (glob[0]) {
hist_err_clear();
- last_cmd_set(file, param);
+ last_cmd_set(file, param_and_filter);
}
- if (!param)
- return -EINVAL;
+ remove = event_trigger_check_remove(glob);
- if (glob[0] == '!')
- remove = true;
+ if (event_trigger_empty_param(param_and_filter))
+ return -EINVAL;
/*
* separate the trigger from the filter (k:v [if filter])
* allowing for whitespace in the trigger
*/
- p = trigger = param;
+ p = param = param_and_filter;
do {
p = strstr(p, "if");
if (!p)
break;
- if (p == param)
+ if (p == param_and_filter)
return -EINVAL;
if (*(p - 1) != ' ' && *(p - 1) != '\t') {
p++;
continue;
}
- if (p >= param + strlen(param) - (sizeof("if") - 1) - 1)
+ if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1)
return -EINVAL;
if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') {
p++;
@@ -6205,24 +6350,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
} while (1);
if (!p)
- param = NULL;
+ filter = NULL;
else {
*(p - 1) = '\0';
- param = strstrip(p);
- trigger = strstrip(trigger);
+ filter = strstrip(p);
+ param = strstrip(param);
}
/*
* To simplify arithmetic expression parsing, replace occurrences of
* '.sym-offset' modifier with '.symXoffset'
*/
- start = strstr(trigger, ".sym-offset");
+ start = strstr(param, ".sym-offset");
while (start) {
*(start + 4) = 'X';
start = strstr(start + 11, ".sym-offset");
}
- attrs = parse_hist_trigger_attrs(file->tr, trigger);
+ attrs = parse_hist_trigger_attrs(file->tr, param);
if (IS_ERR(attrs))
return PTR_ERR(attrs);
@@ -6235,29 +6380,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
}
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
-
- INIT_LIST_HEAD(&trigger_data->list);
- RCU_INIT_POINTER(trigger_data->filter, NULL);
-
- trigger_data->private_data = hist_data;
-
- /* if param is non-empty, it's supposed to be a filter */
- if (param && cmd_ops->set_filter) {
- ret = cmd_ops->set_filter(param, trigger_data, file);
- if (ret < 0)
- goto out_free;
- }
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
+ if (ret < 0)
+ goto out_free;
if (remove) {
if (!have_hist_trigger_match(trigger_data, file))
@@ -6268,7 +6399,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
goto out_free;
}
- cmd_ops->unreg(glob+1, trigger_data, file);
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
@@ -6277,17 +6408,11 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
goto out_free;
}
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of triggers registered,
- * but if it didn't register any it returns zero. Consider no
- * triggers registered a failure too.
- */
- if (!ret) {
- if (!(attrs->pause || attrs->cont || attrs->clear))
- ret = -ENOENT;
+ if (existing_hist_update_only(glob, trigger_data, file))
goto out_free;
- } else if (ret < 0)
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret < 0)
goto out_free;
if (get_named_trigger_data(trigger_data))
@@ -6312,18 +6437,15 @@ enable:
se = find_synth_event(se_name);
if (se)
se->ref++;
- /* Just return zero, not the number of registered triggers */
- ret = 0;
out:
if (ret == 0)
hist_err_clear();
return ret;
out_unreg:
- cmd_ops->unreg(glob+1, trigger_data, file);
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
remove_hist_vars(hist_data);
@@ -6444,7 +6566,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
if (test->ops->free)
- test->ops->free(test->ops, test);
+ test->ops->free(test);
}
}
}
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..c3b582d19b62 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -17,6 +17,8 @@
/* for gfp flag names */
#include <linux/trace_events.h>
#include <trace/events/mmflags.h>
+#include "trace_probe.h"
+#include "trace_probe_kernel.h"
#include "trace_synth.h"
@@ -42,10 +44,13 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
static int errpos(const char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -54,11 +59,16 @@ static void last_cmd_set(const char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(last_cmd);
+
+ last_cmd = kstrdup(str, GFP_KERNEL);
}
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
}
@@ -401,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
{
unsigned int len = 0;
char *str_field;
+ int ret;
if (is_dynamic) {
u32 data_offset;
@@ -409,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry,
data_offset += event->n_u64 * sizeof(u64);
data_offset += data_size;
- str_field = (char *)entry + data_offset;
-
- len = strlen(str_val) + 1;
- strscpy(str_field, str_val, len);
+ len = kern_fetch_store_strlen((unsigned long)str_val);
data_offset |= len << 16;
*(u32 *)&entry->fields[*n_u64] = data_offset;
+ ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
+
(*n_u64)++;
} else {
str_field = (char *)&entry->fields[*n_u64];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)str_val < TASK_SIZE)
+ ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+ else
+#endif
+ ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+
+ if (ret < 0)
+ strcpy(str_field, FAULT_STRING);
+
(*n_u64) += STR_VAR_LEN_MAX / sizeof(u64);
}
@@ -454,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
- len = strlen(str_val) + 1;
+ len = kern_fetch_store_strlen((unsigned long)str_val);
fields_size += len;
}
@@ -809,10 +828,9 @@ static int register_synth_event(struct synth_event *event)
}
ret = set_synth_event_print_fmt(call);
- if (ret < 0) {
+ /* unregister_trace_event() will be called inside */
+ if (ret < 0)
trace_remove_event_call(call);
- goto err;
- }
out:
return ret;
err:
@@ -1407,7 +1425,6 @@ int synth_event_delete(const char *event_name)
mutex_unlock(&event_mutex);
if (mod) {
- mutex_lock(&trace_types_lock);
/*
* It is safest to reset the ring buffer if the module
* being unloaded registered any events that were
@@ -1419,7 +1436,6 @@ int synth_event_delete(const char *event_name)
* occur.
*/
tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
}
return ret;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d00fee705f9c..918730d74932 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -84,6 +84,20 @@ event_triggers_call(struct trace_event_file *file,
}
EXPORT_SYMBOL_GPL(event_triggers_call);
+bool __trace_trigger_soft_disabled(struct trace_event_file *file)
+{
+ unsigned long eflags = file->flags;
+
+ if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
+ event_triggers_call(file, NULL, NULL, NULL);
+ if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
+ return true;
+ if (eflags & EVENT_FILE_FL_PID_FILTER)
+ return trace_event_ignore_this_pid(file);
+ return false;
+}
+EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled);
+
/**
* event_triggers_post_call - Call 'post_triggers' for a trace event
* @file: The trace_event_file associated with the event
@@ -128,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file)
{
struct event_trigger_data *data;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ list_for_each_entry_rcu(data, &file->triggers, list,
+ lockdep_is_held(&event_mutex)) {
if (data->flags & EVENT_TRIGGER_FL_PROBE)
continue;
return true;
@@ -174,7 +189,7 @@ static int trigger_show(struct seq_file *m, void *v)
}
data = list_entry(v, struct event_trigger_data, list);
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
return 0;
}
@@ -418,7 +433,6 @@ event_trigger_print(const char *name, struct seq_file *m,
/**
* event_trigger_init - Generic event_trigger_ops @init implementation
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data
*
* Common implementation of event trigger initialization.
@@ -428,8 +442,7 @@ event_trigger_print(const char *name, struct seq_file *m,
*
* Return: 0 on success, errno otherwise
*/
-int event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+int event_trigger_init(struct event_trigger_data *data)
{
data->ref++;
return 0;
@@ -437,7 +450,6 @@ int event_trigger_init(struct event_trigger_ops *ops,
/**
* event_trigger_free - Generic event_trigger_ops @free implementation
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data
*
* Common implementation of event trigger de-initialization.
@@ -446,8 +458,7 @@ int event_trigger_init(struct event_trigger_ops *ops,
* implementations.
*/
static void
-event_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+event_trigger_free(struct event_trigger_data *data)
{
if (WARN_ON_ONCE(data->ref <= 0))
return;
@@ -501,7 +512,7 @@ clear_event_triggers(struct trace_array *tr)
trace_event_trigger_enable_disable(file, 0);
list_del_rcu(&data->list);
if (data->ops->free)
- data->ops->free(data->ops, data);
+ data->ops->free(data);
}
}
}
@@ -567,19 +578,18 @@ static int register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
list_add_rcu(&data->list, &file->triggers);
- ret++;
update_cond_flag(file);
- if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ ret = trace_event_trigger_enable_disable(file, 1);
+ if (ret < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
- ret--;
}
out:
return ret;
@@ -600,14 +610,13 @@ static void unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file)
{
- struct event_trigger_data *data;
- bool unregistered = false;
+ struct event_trigger_data *data = NULL, *iter;
lockdep_assert_held(&event_mutex);
- list_for_each_entry(data, &file->triggers, list) {
- if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
- unregistered = true;
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
+ data = iter;
list_del_rcu(&data->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -615,8 +624,8 @@ static void unregister_trigger(char *glob,
}
}
- if (unregistered && data->ops->free)
- data->ops->free(data->ops, data);
+ if (data && data->ops->free)
+ data->ops->free(data);
}
/*
@@ -730,15 +739,15 @@ bool event_trigger_empty_param(const char *param)
/**
* event_trigger_separate_filter - separate an event trigger from a filter
- * @param: The param string containing trigger and possibly filter
- * @trigger: outparam, will be filled with a pointer to the trigger
+ * @param_and_filter: String containing trigger and possibly filter
+ * @param: outparam, will be filled with a pointer to the trigger
* @filter: outparam, will be filled with a pointer to the filter
* @param_required: Specifies whether or not the param string is required
*
* Given a param string of the form '[trigger] [if filter]', this
* function separates the filter from the trigger and returns the
- * trigger in *trigger and the filter in *filter. Either the *trigger
- * or the *filter may be set to NULL by this function - if not set to
+ * trigger in @param and the filter in @filter. Either the @param
+ * or the @filter may be set to NULL by this function - if not set to
* NULL, they will contain strings corresponding to the trigger and
* filter.
*
@@ -913,48 +922,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops,
* @cmd_ops: The event_command operations for the trigger
* @file: The event file for the trigger's event
* @glob: The trigger command string, with optional remove(!) operator
- * @cmd: The cmd string
- * @param: The param string
* @trigger_data: The trigger_data for the trigger
- * @n_registered: optional outparam, the number of triggers registered
*
* Register an event trigger. The @cmd_ops are used to call the
- * cmd_ops->reg() function which actually does the registration. The
- * cmd_ops->reg() function returns the number of triggers registered,
- * which is assigned to n_registered, if n_registered is non-NULL.
+ * cmd_ops->reg() function which actually does the registration.
*
* Return: 0 on success, errno otherwise
*/
int event_trigger_register(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob,
- char *cmd,
- char *param,
- struct event_trigger_data *trigger_data,
- int *n_registered)
+ struct event_trigger_data *trigger_data)
{
- int ret;
-
- if (n_registered)
- *n_registered = 0;
-
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- cmd_ops->unreg(glob, trigger_data, file);
- ret = -ENOENT;
- } else if (ret > 0) {
- if (n_registered)
- *n_registered = ret;
- /* Just return zero, not the number of enabled functions */
- ret = 0;
- }
+ return cmd_ops->reg(glob, trigger_data, file);
+}
- return ret;
+/**
+ * event_trigger_unregister - unregister an event trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @glob: The trigger command string, with optional remove(!) operator
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Unregister an event trigger. The @cmd_ops are used to call the
+ * cmd_ops->unreg() function which actually does the unregistration.
+ */
+void event_trigger_unregister(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data)
+{
+ cmd_ops->unreg(glob, trigger_data, file);
}
/*
@@ -967,7 +965,7 @@ int event_trigger_register(struct event_command *cmd_ops,
* @file: The trace_event_file associated with the event
* @glob: The raw string used to register the trigger
* @cmd: The cmd portion of the string used to register the trigger
- * @param: The params portion of the string used to register the trigger
+ * @param_and_filter: The param and filter portion of the string used to register the trigger
*
* Common implementation for event command parsing and trigger
* instantiation.
@@ -980,94 +978,53 @@ int event_trigger_register(struct event_command *cmd_ops,
static int
event_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd, char *param_and_filter)
{
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
- char *trigger = NULL;
- char *number;
+ char *param, *filter;
+ bool remove;
int ret;
- /* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0])) {
- trigger = strsep(&param, " \t");
- if (param) {
- param = skip_spaces(param);
- if (!*param)
- param = NULL;
- }
- }
+ remove = event_trigger_check_remove(glob);
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+ ret = event_trigger_separate_filter(param_and_filter, &param, &filter, false);
+ if (ret)
+ return ret;
ret = -ENOMEM;
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
if (!trigger_data)
goto out;
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
- trigger_data->private_data = file;
- INIT_LIST_HEAD(&trigger_data->list);
- INIT_LIST_HEAD(&trigger_data->named_list);
-
- if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_data, file);
+ if (remove) {
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
kfree(trigger_data);
ret = 0;
goto out;
}
- if (trigger) {
- number = strsep(&trigger, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &trigger_data->count);
- if (ret)
- goto out_free;
- }
-
- if (!param) /* if param is non-empty, it's supposed to be a filter */
- goto out_reg;
-
- if (!cmd_ops->set_filter)
- goto out_reg;
+ ret = event_trigger_parse_num(param, trigger_data);
+ if (ret)
+ goto out_free;
- ret = cmd_ops->set_filter(param, trigger_data, file);
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
if (ret < 0)
goto out_free;
- out_reg:
/* Up the trigger_data count to make sure reg doesn't free it on failure */
- event_trigger_init(trigger_ops, trigger_data);
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- cmd_ops->unreg(glob, trigger_data, file);
- ret = -ENOENT;
- } else if (ret > 0)
- ret = 0;
+ event_trigger_init(trigger_data);
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret)
+ goto out_free;
/* Down the counter of trigger_data or free it if not used anymore */
- event_trigger_free(trigger_ops, trigger_data);
+ event_trigger_free(trigger_data);
out:
return ret;
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
kfree(trigger_data);
goto out;
}
@@ -1295,6 +1252,16 @@ traceon_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_on(file->tr);
+ return;
+ }
+
if (tracing_is_on())
return;
@@ -1306,8 +1273,15 @@ traceon_count_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- if (tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -1315,7 +1289,10 @@ traceon_count_trigger(struct event_trigger_data *data,
if (data->count != -1)
(data->count)--;
- tracing_on();
+ if (file)
+ tracer_tracing_on(file->tr);
+ else
+ tracing_on();
}
static void
@@ -1323,6 +1300,16 @@ traceoff_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_off(file->tr);
+ return;
+ }
+
if (!tracing_is_on())
return;
@@ -1334,8 +1321,15 @@ traceoff_count_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- if (!tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (!tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -1343,20 +1337,21 @@ traceoff_count_trigger(struct event_trigger_data *data,
if (data->count != -1)
(data->count)--;
- tracing_off();
+ if (file)
+ tracer_tracing_off(file->tr);
+ else
+ tracing_off();
}
static int
-traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("traceon", m, (void *)data->count,
data->filter_str);
}
static int
-traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("traceoff", m, (void *)data->count,
data->filter_str);
@@ -1467,8 +1462,7 @@ register_snapshot_trigger(char *glob,
}
static int
-snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("snapshot", m, (void *)data->count,
data->filter_str);
@@ -1540,7 +1534,12 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- trace_dump_stack(STACK_SKIP);
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ else
+ trace_dump_stack(STACK_SKIP);
}
static void
@@ -1558,8 +1557,7 @@ stacktrace_count_trigger(struct event_trigger_data *data,
}
static int
-stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("stacktrace", m, (void *)data->count,
data->filter_str);
@@ -1649,7 +1647,6 @@ event_enable_count_trigger(struct event_trigger_data *data,
}
int event_enable_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1674,8 +1671,7 @@ int event_enable_trigger_print(struct seq_file *m,
return 0;
}
-void event_enable_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1722,39 +1718,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
int event_enable_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd, char *param_and_filter)
{
struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
struct trace_array *tr = file->tr;
+ char *param, *filter;
+ bool enable, remove;
const char *system;
const char *event;
bool hist = false;
- char *trigger;
- char *number;
- bool enable;
int ret;
- if (!param)
- return -EINVAL;
+ remove = event_trigger_check_remove(glob);
- /* separate the trigger from the filter (s:e:n [if filter]) */
- trigger = strsep(&param, " \t");
- if (!trigger)
+ if (event_trigger_empty_param(param_and_filter))
return -EINVAL;
- if (param) {
- param = skip_spaces(param);
- if (!*param)
- param = NULL;
- }
- system = strsep(&trigger, ":");
- if (!trigger)
+ ret = event_trigger_separate_filter(param_and_filter, &param, &filter, true);
+ if (ret)
+ return ret;
+
+ system = strsep(&param, ":");
+ if (!param)
return -EINVAL;
- event = strsep(&trigger, ":");
+ event = strsep(&param, ":");
ret = -EINVAL;
event_enable_file = find_event_file(tr, system, event);
@@ -1770,32 +1760,24 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
#else
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
#endif
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
ret = -ENOMEM;
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
- if (!trigger_data)
- goto out;
enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
- if (!enable_data) {
- kfree(trigger_data);
+ if (!enable_data)
goto out;
- }
-
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
- INIT_LIST_HEAD(&trigger_data->list);
- RCU_INIT_POINTER(trigger_data->filter, NULL);
enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data->private_data = enable_data;
- if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_data, file);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ if (!trigger_data) {
+ kfree(enable_data);
+ goto out;
+ }
+
+ if (remove) {
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
kfree(trigger_data);
kfree(enable_data);
ret = 0;
@@ -1803,35 +1785,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
}
/* Up the trigger_data count to make sure nothing frees it on failure */
- event_trigger_init(trigger_ops, trigger_data);
-
- if (trigger) {
- number = strsep(&trigger, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &trigger_data->count);
- if (ret)
- goto out_free;
- }
+ event_trigger_init(trigger_data);
- if (!param) /* if param is non-empty, it's supposed to be a filter */
- goto out_reg;
-
- if (!cmd_ops->set_filter)
- goto out_reg;
+ ret = event_trigger_parse_num(param, trigger_data);
+ if (ret)
+ goto out_free;
- ret = cmd_ops->set_filter(param, trigger_data, file);
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
if (ret < 0)
goto out_free;
- out_reg:
/* Don't let event modules unload while probe registered */
ret = trace_event_try_get_ref(event_enable_file->event_call);
if (!ret) {
@@ -1842,32 +1805,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
ret = trace_event_enable_disable(event_enable_file, 1, 1);
if (ret < 0)
goto out_put;
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret)
goto out_disable;
- /* Just return zero, not the number of enabled functions */
- ret = 0;
- event_trigger_free(trigger_ops, trigger_data);
+
+ event_trigger_free(trigger_data);
out:
return ret;
-
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
out_put:
trace_event_put_ref(event_enable_file->event_call);
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
- event_trigger_free(trigger_ops, trigger_data);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
+ event_trigger_free(trigger_data);
kfree(enable_data);
+
goto out;
}
@@ -1894,19 +1848,18 @@ int event_enable_register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
list_add_rcu(&data->list, &file->triggers);
- ret++;
update_cond_flag(file);
- if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ ret = trace_event_trigger_enable_disable(file, 1);
+ if (ret < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
- ret--;
}
out:
return ret;
@@ -1917,19 +1870,18 @@ void event_enable_unregister_trigger(char *glob,
struct trace_event_file *file)
{
struct enable_trigger_data *test_enable_data = test->private_data;
+ struct event_trigger_data *data = NULL, *iter;
struct enable_trigger_data *enable_data;
- struct event_trigger_data *data;
- bool unregistered = false;
lockdep_assert_held(&event_mutex);
- list_for_each_entry(data, &file->triggers, list) {
- enable_data = data->private_data;
+ list_for_each_entry(iter, &file->triggers, list) {
+ enable_data = iter->private_data;
if (enable_data &&
- (data->cmd_ops->trigger_type ==
+ (iter->cmd_ops->trigger_type ==
test->cmd_ops->trigger_type) &&
(enable_data->file == test_enable_data->file)) {
- unregistered = true;
+ data = iter;
list_del_rcu(&data->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -1937,8 +1889,8 @@ void event_enable_unregister_trigger(char *glob,
}
}
- if (unregistered && data->ops->free)
- data->ops->free(data->ops, data);
+ if (data && data->ops->free)
+ data->ops->free(data);
}
static struct event_trigger_ops *
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..539b08ae7020
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1914 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/refcount.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+/* Reminder to move to uapi when everything works */
+#ifdef CONFIG_COMPILE_TEST
+#include <linux/user_events.h>
+#else
+#include <uapi/linux/user_events.h>
+#endif
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be a power of two of PAGE_SIZE.
+ */
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
+#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
+#define MAX_EVENTS (MAX_BYTES * 8)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+
+/*
+ * The MAP_STATUS_* macros are used for taking a index and determining the
+ * appropriate byte and the bit in the byte to set/reset for an event.
+ *
+ * The lower 3 bits of the index decide which bit to set.
+ * The remaining upper bits of the index decide which byte to use for the bit.
+ *
+ * This is used when an event has a probe attached/removed to reflect live
+ * status of the event wanting tracing or not to user-programs via shared
+ * memory maps.
+ */
+#define MAP_STATUS_BYTE(index) ((index) >> 3)
+#define MAP_STATUS_MASK(index) BIT((index) & 7)
+
+/*
+ * Internal bits (kernel side only) to keep track of connected probes:
+ * These are used when status is requested in text form about an event. These
+ * bits are compared against an internal byte on the event to determine which
+ * probes to print out to the user.
+ *
+ * These do not reflect the mapped bytes between the user and kernel space.
+ */
+#define EVENT_STATUS_FTRACE BIT(0)
+#define EVENT_STATUS_PERF BIT(1)
+#define EVENT_STATUS_OTHER BIT(7)
+
+/*
+ * Stores the pages, tables, and locks for a group of events.
+ * Each logical grouping of events has its own group, with a
+ * matching page for status checks within user programs. This
+ * allows for isolation of events to user programs by various
+ * means.
+ */
+struct user_event_group {
+ struct page *pages;
+ char *register_page_data;
+ char *system_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
+ DECLARE_HASHTABLE(register_table, 8);
+ DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+};
+
+/* Group for init_user_ns mapping, top-most group */
+static struct user_event_group *init_group;
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches one.
+ */
+struct user_event {
+ struct user_event_group *group;
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ refcount_t refcnt;
+ int index;
+ int flags;
+ int min_size;
+ char status;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+struct user_event_file_info {
+ struct user_event_group *group;
+ struct user_event_refs *refs;
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head link;
+ int offset;
+ int flags;
+};
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(struct user_event_group *group, char *name,
+ char *args, char *flags,
+ struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static void set_page_reservations(char *pages, bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = pages + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static void user_event_group_destroy(struct user_event_group *group)
+{
+ if (group->register_page_data)
+ set_page_reservations(group->register_page_data, false);
+
+ if (group->pages)
+ __free_pages(group->pages, MAX_PAGE_ORDER);
+
+ kfree(group->system_name);
+ kfree(group);
+}
+
+static char *user_event_group_system_name(struct user_namespace *user_ns)
+{
+ char *system_name;
+ int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+
+ if (user_ns != &init_user_ns) {
+ /*
+ * Unexpected at this point:
+ * We only currently support init_user_ns.
+ * When we enable more, this will trigger a failure so log.
+ */
+ pr_warn("user_events: Namespace other than init_user_ns!\n");
+ return NULL;
+ }
+
+ system_name = kmalloc(len, GFP_KERNEL);
+
+ if (!system_name)
+ return NULL;
+
+ snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+
+ return system_name;
+}
+
+static inline struct user_event_group
+*user_event_group_from_user_ns(struct user_namespace *user_ns)
+{
+ if (user_ns == &init_user_ns)
+ return init_group;
+
+ return NULL;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ struct user_event_group *group = NULL;
+
+ while (user_ns) {
+ group = user_event_group_from_user_ns(user_ns);
+
+ if (group)
+ break;
+
+ user_ns = user_ns->parent;
+ }
+
+ return group;
+}
+
+static struct user_event_group
+*user_event_group_create(struct user_namespace *user_ns)
+{
+ struct user_event_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+ if (!group)
+ return NULL;
+
+ group->system_name = user_event_group_system_name(user_ns);
+
+ if (!group->system_name)
+ goto error;
+
+ group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+
+ if (!group->pages)
+ goto error;
+
+ group->register_page_data = page_address(group->pages);
+
+ set_page_reservations(group->register_page_data, true);
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(group->page_bitmap, MAX_EVENTS);
+ set_bit(0, group->page_bitmap);
+
+ mutex_init(&group->reg_mutex);
+ hash_init(group->register_table);
+
+ return group;
+error:
+ if (group)
+ user_event_group_destroy(group);
+
+ return NULL;
+};
+
+static __always_inline
+void user_event_register_set(struct user_event *user)
+{
+ int i = user->index;
+
+ user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+}
+
+static __always_inline
+void user_event_register_clear(struct user_event *user)
+{
+ int i = user->index;
+
+ user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+}
+
+static __always_inline __must_check
+bool user_event_last_ref(struct user_event *user)
+{
+ return refcount_read(&user->refcnt) == 1;
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(struct user_event_group *group,
+ char *raw_command, struct user_event **newuser)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(group, name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, link) {
+ list_del(&validator->link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != NULL)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != NULL)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != NULL;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
+ char *buf, int len, bool *colon)
+{
+ int pos = 0, i = *iout;
+
+ *colon = false;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ *colon = true;
+ break;
+ }
+ }
+
+ /* Actual set, advance i */
+ if (len != 0)
+ *iout = i;
+
+ return pos + 1;
+}
+
+static int user_field_set_string(struct ftrace_event_field *field,
+ char *buf, int len, bool colon)
+{
+ int pos = 0;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+
+ if (colon)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+ return pos + 1;
+}
+
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ /* Unsafe to try to decode user provided print_fmt, use hex */
+ trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+ 1, iter->ent, iter->ent_size, true);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+
+ user_event_register_clear(user);
+ clear_bit(user->index, user->group->page_bitmap);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ return ret;
+}
+
+static struct user_event *find_user_event(struct user_event_group *group,
+ char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(group->register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name)) {
+ refcount_inc(&user->refcnt);
+ return user;
+ }
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Writes the user supplied payload out to perf ring buffer.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ if (status)
+ user_event_register_set(user);
+ else
+ user_event_register_clear(user);
+
+ user->status = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ refcount_inc(&user->refcnt);
+ update_reg_page_for(user);
+ return 0;
+dec:
+ update_reg_page_for(user);
+ refcount_dec(&user->refcnt);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event_group *group;
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL);
+
+ if (!name)
+ return -ENOMEM;
+
+ group = current_user_event_group();
+
+ if (!group) {
+ kfree(name);
+ return -ENOENT;
+ }
+
+ mutex_lock(&group->reg_mutex);
+
+ ret = user_event_parse_cmd(group, name, &user);
+
+ if (!ret)
+ refcount_dec(&user->refcnt);
+
+ mutex_unlock(&group->reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return !user_event_last_ref(user);
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (!user_event_last_ref(user))
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name = NULL, *dyn_field_name = NULL;
+ bool colon = false, match = false;
+ int dyn_len, len;
+
+ if (*iout >= argc)
+ return false;
+
+ dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+ 0, &colon);
+
+ len = user_field_set_string(field, field_name, 0, colon);
+
+ if (dyn_len != len)
+ return false;
+
+ dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
+ field_name = kmalloc(len, GFP_KERNEL);
+
+ if (!dyn_field_name || !field_name)
+ goto out;
+
+ user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+ dyn_len, &colon);
+
+ user_field_set_string(field, field_name, len, colon);
+
+ match = strcmp(dyn_field_name, field_name) == 0;
+out:
+ kfree(dyn_field_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_safe_reverse(field, next, head, link)
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(struct user_event_group *group, char *name,
+ char *args, char *flags,
+ struct user_event **newuser)
+{
+ int ret;
+ int index;
+ u32 key;
+ struct user_event *user;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(group, name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ return 0;
+ }
+
+ index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
+
+ if (index == MAX_EVENTS)
+ return -EMFILE;
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->group = group;
+ user->tracepoint.name = name;
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+ user->class.system = group->system_name;
+
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+
+ ret = user_event_trace_register(user);
+
+ if (ret)
+ goto put_user_lock;
+
+ user->index = index;
+
+ /* Ensure we track self ref and caller ref (2) */
+ refcount_set(&user->refcnt, 2);
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ set_bit(user->index, group->page_bitmap);
+ hash_add(group->register_table, &user->node, key);
+
+ mutex_unlock(&event_mutex);
+
+ *newuser = user;
+ return 0;
+put_user_lock:
+ mutex_unlock(&event_mutex);
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(struct user_event_group *group, char *name)
+{
+ u32 key;
+ struct user_event *user = find_user_event(group, name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ refcount_dec(&user->refcnt);
+
+ if (!user_event_last_ref(user))
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(info->refs);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static int user_events_open(struct inode *node, struct file *file)
+{
+ struct user_event_group *group;
+ struct user_event_file_info *info;
+
+ group = current_user_event_group();
+
+ if (!group)
+ return -ENOENT;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return -ENOMEM;
+
+ info->group = group;
+
+ file->private_data = info;
+
+ return 0;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
+ count, &iov, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct user_event_file_info *info,
+ struct user_event *user)
+{
+ struct user_event_group *group = info->group;
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(info->refs,
+ lockdep_is_held(&group->reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user;
+
+ refcount_inc(&user->refcnt);
+
+ rcu_assign_pointer(info->refs, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ if (size < offsetofend(struct user_reg, write_index))
+ return -EINVAL;
+
+ ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+ if (ret)
+ return ret;
+
+ kreg->size = size;
+
+ return 0;
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct user_event_file_info *info,
+ unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ char *name;
+ long ret;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(info->group, name, &user);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(info, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ refcount_dec(&user->refcnt);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ put_user((u32)ret, &ureg->write_index);
+ put_user(user->index, &ureg->status_bit);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct user_event_file_info *info,
+ unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(info->group, name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_group *group = info->group;
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_reg(info, uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_del(info, uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_file_info *info = file->private_data;
+ struct user_event_group *group;
+ struct user_event_refs *refs;
+ struct user_event *user;
+ int i;
+
+ if (!info)
+ return -EINVAL;
+
+ group = info->group;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&group->reg_mutex);
+
+ refs = info->refs;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i) {
+ user = refs->events[i];
+
+ if (user)
+ refcount_dec(&user->refcnt);
+ }
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&group->reg_mutex);
+
+ kfree(refs);
+ kfree(info);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .open = user_events_open,
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+static struct user_event_group *user_status_group(struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ if (!m)
+ return NULL;
+
+ return m->private;
+}
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ char *pages;
+ struct user_event_group *group = user_status_group(file);
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (size != MAX_BYTES)
+ return -EINVAL;
+
+ if (!group)
+ return -EINVAL;
+
+ pages = group->register_page_data;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(pages) >> PAGE_SHIFT,
+ size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event_group *group = m->private;
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0, flags;
+
+ if (!group)
+ return -EINVAL;
+
+ mutex_lock(&group->reg_mutex);
+
+ hash_for_each(group->register_table, i, user, node) {
+ status = user->status;
+ flags = user->flags;
+
+ seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+ if (flags != 0 || status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&group->reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+ seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ struct user_event_group *group;
+ int ret;
+
+ group = current_user_event_group();
+
+ if (!group)
+ return -ENOENT;
+
+ ret = seq_open(file, &user_seq_ops);
+
+ if (!ret) {
+ /* Chain group to seq_file */
+ struct seq_file *m = file->private_data;
+
+ m->private = group;
+ }
+
+ return ret;
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .mmap = user_status_mmap,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ /* mmap with MAP_SHARED requires writable fd */
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static int __init trace_events_user_init(void)
+{
+ int ret;
+
+ init_group = user_event_group_create(&init_user_ns);
+
+ if (!init_group)
+ return -ENOMEM;
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ user_event_group_destroy(init_group);
+ init_group = NULL;
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 508f14af4f2c..5a75b039e586 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -20,6 +20,7 @@
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -32,7 +33,7 @@ static int __init set_kprobe_boot_events(char *str)
strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
- return 0;
+ return 1;
}
__setup("kprobe_event=", set_kprobe_boot_events);
@@ -163,7 +164,8 @@ static bool trace_kprobe_match(const char *system, const char *event,
{
struct trace_kprobe *tk = to_trace_kprobe(ev);
- return strcmp(trace_probe_name(&tk->tp), event) == 0 &&
+ return (event[0] == '\0' ||
+ strcmp(trace_probe_name(&tk->tp), event) == 0) &&
(!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) &&
trace_kprobe_match_command_head(tk, argc, argv);
}
@@ -708,11 +710,11 @@ static int __trace_kprobe_create(int argc, const char *argv[])
/*
* Argument syntax:
* - Add kprobe:
- * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
+ * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
* - Add kretprobe:
- * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
+ * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS]
* Or
- * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS]
+ * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS]
*
* Fetch args:
* $retval : fetch return value
@@ -739,6 +741,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
long offset = 0;
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
+ char gbuf[MAX_EVENT_NAME_LEN];
unsigned int flags = TPARG_FL_KERNEL;
switch (argv[0][0]) {
@@ -833,11 +836,13 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_set_index(0);
if (event) {
- ret = traceprobe_parse_event_name(&event, &group, buf,
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
goto parse_error;
- } else {
+ }
+
+ if (!event) {
/* Make a new event name */
if (symbol)
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
@@ -1219,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = {
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
+ return kern_fetch_store_strlen(addr);
}
/*
@@ -1251,21 +1241,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -1275,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
@@ -1433,7 +1387,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
fbuffer.regs = regs;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
@@ -1628,7 +1582,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
return;
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
@@ -1718,8 +1672,17 @@ static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct kretprobe *rp = get_kretprobe(ri);
- struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp);
+ struct trace_kprobe *tk;
+ /*
+ * There is a small chance that get_kretprobe(ri) returns NULL when
+ * the kretprobe is unregister on another CPU between kretprobe's
+ * trampoline_handler and this function.
+ */
+ if (unlikely(!rp))
+ return 0;
+
+ tk = container_of(rp, struct trace_kprobe, rp);
raw_cpu_inc(*tk->nhit);
if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
@@ -1907,25 +1870,18 @@ core_initcall(init_kprobe_trace_early);
static __init int init_kprobe_trace(void)
{
int ret;
- struct dentry *entry;
ret = tracing_init_dentry();
if (ret)
return 0;
- entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE,
- NULL, NULL, &kprobe_events_ops);
-
/* Event list interface */
- if (!entry)
- pr_warn("Could not create tracefs 'kprobe_events' entry\n");
+ trace_create_file("kprobe_events", TRACE_MODE_WRITE,
+ NULL, NULL, &kprobe_events_ops);
/* Profile interface */
- entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ,
- NULL, NULL, &kprobe_profile_ops);
-
- if (!entry)
- pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
+ trace_create_file("kprobe_profile", TRACE_MODE_READ,
+ NULL, NULL, &kprobe_profile_ops);
setup_boot_kprobe_events();
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 870a08da5b48..4300c5dc4e5d 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -917,7 +917,7 @@ void osnoise_trace_irq_entry(int id)
void osnoise_trace_irq_exit(int id, const char *desc)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- int duration;
+ s64 duration;
if (!osn_var->sampling)
return;
@@ -1048,7 +1048,7 @@ static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- int duration;
+ s64 duration;
if (!osn_var->sampling)
return;
@@ -1144,7 +1144,7 @@ thread_entry(struct osnoise_variables *osn_var, struct task_struct *t)
static void
thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
{
- int duration;
+ s64 duration;
if (!osn_var->sampling)
return;
@@ -1167,8 +1167,10 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
- struct task_struct *n)
+trace_sched_switch_callback(void *data, bool preempt,
+ struct task_struct *p,
+ struct task_struct *n,
+ unsigned int prev_state)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
@@ -1387,6 +1389,26 @@ static int run_osnoise(void)
}
/*
+ * In some cases, notably when running on a nohz_full CPU with
+ * a stopped tick PREEMPT_RCU has no way to account for QSs.
+ * This will eventually cause unwarranted noise as PREEMPT_RCU
+ * will force preemption as the means of ending the current
+ * grace period. We avoid this problem by calling
+ * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * EQS allowing PREEMPT_RCU to end the current grace period.
+ * This call shouldn't be wrapped inside an RCU critical
+ * section.
+ *
+ * Note that in non PREEMPT_RCU kernels QSs are handled through
+ * cond_resched()
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ local_irq_disable();
+ rcu_momentary_dyntick_idle();
+ local_irq_enable();
+ }
+
+ /*
* For the non-preemptive kernel config: let threads runs, if
* they so wish.
*/
@@ -1437,6 +1459,37 @@ static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
/*
+ * osnoise_sleep - sleep until the next period
+ */
+static void osnoise_sleep(void)
+{
+ u64 interval;
+ ktime_t wake_time;
+
+ mutex_lock(&interface_lock);
+ interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+ mutex_unlock(&interface_lock);
+
+ /*
+ * differently from hwlat_detector, the osnoise tracer can run
+ * without a pause because preemption is on.
+ */
+ if (!interval) {
+ /* Let synchronize_rcu_tasks() make progress */
+ cond_resched_tasks_rcu_qs();
+ return;
+ }
+
+ wake_time = ktime_add_us(ktime_get(), interval);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) {
+ if (kthread_should_stop())
+ break;
+ }
+}
+
+/*
* osnoise_main - The osnoise detection kernel thread
*
* Calls run_osnoise() function to measure the osnoise for the configured runtime,
@@ -1444,30 +1497,10 @@ static struct cpumask save_cpumask;
*/
static int osnoise_main(void *data)
{
- u64 interval;
while (!kthread_should_stop()) {
-
run_osnoise();
-
- mutex_lock(&interface_lock);
- interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
- mutex_unlock(&interface_lock);
-
- do_div(interval, USEC_PER_MSEC);
-
- /*
- * differently from hwlat_detector, the osnoise tracer can run
- * without a pause because preemption is on.
- */
- if (interval < 1) {
- /* Let synchronize_rcu_tasks() make progress */
- cond_resched_tasks_rcu_qs();
- continue;
- }
-
- if (msleep_interruptible(interval))
- break;
+ osnoise_sleep();
}
return 0;
@@ -1545,11 +1578,27 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
trace_timerlat_sample(&s);
- notify_new_max_latency(diff);
+ if (osnoise_data.stop_tracing) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing) {
+
+ /*
+ * At this point, if stop_tracing is set and <= print_stack,
+ * print_stack is set and would be printed in the thread handler.
+ *
+ * Thus, print the stack trace as it is helpful to define the
+ * root cause of an IRQ latency.
+ */
+ if (osnoise_data.stop_tracing <= osnoise_data.print_stack) {
+ timerlat_save_stack(0);
+ timerlat_dump_stack(time_to_us(diff));
+ }
- if (osnoise_data.stop_tracing)
- if (time_to_us(diff) >= osnoise_data.stop_tracing)
osnoise_stop_tracing();
+ notify_new_max_latency(diff);
+
+ return HRTIMER_NORESTART;
+ }
+ }
wake_up_process(tlat->kthread);
@@ -1737,8 +1786,9 @@ static int start_per_cpu_kthreads(void)
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
if (retval) {
+ cpus_read_unlock();
stop_per_cpu_kthreads();
- break;
+ return retval;
}
}
@@ -2189,6 +2239,17 @@ static void osnoise_workload_stop(void)
if (osnoise_has_registered_instances())
return;
+ /*
+ * If callbacks were already disabled in a previous stop
+ * call, there is no need to disable then again.
+ *
+ * For instance, this happens when tracing is stopped via:
+ * echo 0 > tracing_on
+ * echo nop > current_tracer.
+ */
+ if (!trace_osnoise_callback_enabled)
+ return;
+
trace_osnoise_callback_enabled = false;
/*
* Make sure that ftrace_nmi_enter/exit() see
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8aa493d25c73..67f47ea27921 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list);
static int trace_search_list(struct list_head **list)
{
- struct trace_event *e;
+ struct trace_event *e = NULL, *iter;
int next = __TRACE_LAST_TYPE;
if (list_empty(&ftrace_event_list)) {
@@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list)
* We used up all possible max events,
* lets see if somebody freed one.
*/
- list_for_each_entry(e, &ftrace_event_list, list) {
- if (e->type != next)
+ list_for_each_entry(iter, &ftrace_event_list, list) {
+ if (iter->type != next) {
+ e = iter;
break;
+ }
next++;
}
@@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list)
if (next > TRACE_EVENT_TYPE_MAX)
return 0;
- *list = &e->list;
+ if (e)
+ *list = &e->list;
+ else
+ *list = &ftrace_event_list;
return next;
}
@@ -778,9 +783,8 @@ int register_trace_event(struct trace_event *event)
list_add_tail(&event->list, list);
- } else if (event->type > __TRACE_LAST_TYPE) {
- printk(KERN_WARNING "Need to add type to trace.h\n");
- WARN_ON(1);
+ } else if (WARN(event->type > __TRACE_LAST_TYPE,
+ "Need to add type to trace.h")) {
goto out;
} else {
/* Is this event already used */
@@ -1571,13 +1575,8 @@ __init static int init_events(void)
for (i = 0; events[i]; i++) {
event = events[i];
-
ret = register_trace_event(event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- event->type);
- WARN_ON_ONCE(1);
- }
+ WARN_ONCE(!ret, "event %d failed to register", event->type);
}
return 0;
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f4938040c228..1e130da1b742 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -46,7 +46,7 @@ void trace_hardirqs_on(void)
this_cpu_write(tracing_irq_cpu, 0);
}
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
@@ -94,15 +94,15 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
this_cpu_write(tracing_irq_cpu, 0);
}
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- lockdep_hardirqs_on(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
+ lockdep_hardirqs_on(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
- lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirqs_off(caller_addr);
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 73d90179b51b..36dff277de46 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -257,6 +257,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
}
len = strlen(event);
if (len == 0) {
+ if (slash) {
+ *pevent = NULL;
+ return 0;
+ }
trace_probe_log_err(offset, NO_EVENT_NAME);
return -EINVAL;
} else if (len > MAX_EVENT_NAME_LEN) {
@@ -279,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
int ret = 0;
int len;
- if (strcmp(arg, "retval") == 0) {
+ if (flags & TPARG_FL_TPOINT) {
+ if (code->data)
+ return -EFAULT;
+ code->data = kstrdup(arg, GFP_KERNEL);
+ if (!code->data)
+ return -ENOMEM;
+ code->op = FETCH_OP_TP_ARG;
+ } else if (strcmp(arg, "retval") == 0) {
if (flags & TPARG_FL_RETURN) {
code->op = FETCH_OP_RETVAL;
} else {
@@ -303,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
goto inval_var;
- } else if (strcmp(arg, "comm") == 0) {
+ } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
code->op = FETCH_OP_COMM;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
} else if (((flags & TPARG_FL_MASK) ==
@@ -319,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
#endif
- } else if (flags & TPARG_FL_TPOINT) {
- if (code->data)
- return -EFAULT;
- code->data = kstrdup(arg, GFP_KERNEL);
- if (!code->data)
- return -ENOMEM;
- code->op = FETCH_OP_TP_ARG;
} else
goto inval_var;
@@ -380,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
break;
case '%': /* named register */
+ if (flags & TPARG_FL_TPOINT) {
+ /* eprobes do not handle registers */
+ trace_probe_log_err(offs, BAD_VAR);
+ break;
+ }
ret = regs_query_register_offset(arg + 1);
if (ret >= 0) {
code->op = FETCH_OP_REG;
@@ -613,9 +622,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
/*
* Since $comm and immediate string can not be dereferenced,
- * we can find those by strcmp.
+ * we can find those by strcmp. But ignore for eprobes.
*/
- if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
+ if (!(flags & TPARG_FL_TPOINT) &&
+ (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
+ strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
goto out;
@@ -871,15 +882,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
switch (ptype) {
case PROBE_PRINT_NORMAL:
fmt = "(%lx)";
- arg = "REC->" FIELD_STRING_IP;
+ arg = ", REC->" FIELD_STRING_IP;
break;
case PROBE_PRINT_RETURN:
fmt = "(%lx <- %lx)";
- arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
break;
case PROBE_PRINT_EVENT:
- fmt = "(%u)";
- arg = "REC->" FIELD_STRING_TYPE;
+ fmt = "";
+ arg = "";
break;
default:
WARN_ON_ONCE(1);
@@ -903,7 +914,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
parg->type->fmt);
}
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg);
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 99e7a5df025e..de38f1c03776 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,7 +38,6 @@
#define FIELD_STRING_IP "__probe_ip"
#define FIELD_STRING_RETIP "__probe_ret_ip"
#define FIELD_STRING_FUNC "__probe_func"
-#define FIELD_STRING_TYPE "__probe_type"
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed) \
@@ -443,7 +442,11 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(FAIL_REG_PROBE, "Failed to register probe event"),\
C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\
C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\
- C(SAME_PROBE, "There is already the exact same probe event"),
+ C(SAME_PROBE, "There is already the exact same probe event"),\
+ C(NO_EVENT_INFO, "This requires both group and event name to attach"),\
+ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\
+ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\
+ C(NO_EP_FILTER, "No filter rule after 'if'"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
new file mode 100644
index 000000000000..77dbd9ff9782
--- /dev/null
+++ b/kernel/trace/trace_probe_kernel.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_PROBE_KERNEL_H_
+#define __TRACE_PROBE_KERNEL_H_
+
+#define FAULT_STRING "(fault)"
+
+/*
+ * This depends on trace_probe.h, but can not include it due to
+ * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
+ * Which means that any other user must include trace_probe.h before including
+ * this file.
+ */
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+kern_fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int ret;
+
+ ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ /*
+ * strnlen_user_nofault returns zero on fault, insert the
+ * FAULT_STRING when that occurs.
+ */
+ if (ret <= 0)
+ return strlen(FAULT_STRING) + 1;
+ return ret;
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+kern_fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return kern_fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ /* For faults, return enough to hold the FAULT_STRING */
+ return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
+}
+
+static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
+{
+ if (ret >= 0) {
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+ } else {
+ strscpy(__dest, FAULT_STRING, len);
+ ret = strlen(__dest) + 1;
+ }
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ set_data_loc(ret, dest, __dest, base, maxlen);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+kern_fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return kern_fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ set_data_loc(ret, dest, __dest, base, maxlen);
+
+ return ret;
+}
+
+#endif /* __TRACE_PROBE_KERNEL_H_ */
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index 4d4b78c8ca25..a520b11afb0d 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = {
__init static int create_recursed_functions(void)
{
- struct dentry *dentry;
- dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE,
- NULL, NULL, &recursed_functions_fops);
- if (!dentry)
- pr_warn("WARNING: Failed to create recursed_functions\n");
+ trace_create_file("recursed_functions", TRACE_MODE_WRITE,
+ NULL, NULL, &recursed_functions_fops);
return 0;
}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..c9ffdcfe622e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,7 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
{
int flags;
@@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..330aee1c1a49 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,7 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array_cpu *data;
u64 T0, T1, delta;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index afd937a46496..a2d301f58ced 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -784,9 +784,7 @@ static struct fgraph_ops fgraph_ops __initdata = {
.retfunc = &trace_graph_return,
};
-#if defined(CONFIG_DYNAMIC_FTRACE) && \
- defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
-#define TEST_DIRECT_TRAMP
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
noinline __noclone static void trace_direct_tramp(void) { }
#endif
@@ -849,7 +847,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
goto out;
}
-#ifdef TEST_DIRECT_TRAMP
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
tracing_reset_online_cpus(&tr->array_buffer);
set_graph_array(tr);
@@ -897,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
ret = -1;
goto out;
}
+
+ /* Enable tracing on all functions again */
+ ftrace_set_global_filter(NULL, 0, 1);
#endif
/* Don't test dynamic tracing, the function tracer already did */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f755bde42fd0..942ddbdace4a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
/* parameter types */
- if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
@@ -201,8 +201,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-extern char *__bad_type_size(void);
-
#define SYSCALL_FIELD(_type, _name) { \
.type = #_type, .name = #_name, \
.size = sizeof(_type), .align = __alignof__(_type), \
@@ -296,9 +294,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct trace_event_file *trace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- unsigned int trace_ctx;
+ struct trace_event_buffer fbuffer;
unsigned long args[6];
int syscall_nr;
int size;
@@ -321,20 +317,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- trace_ctx = tracing_gen_ctx();
-
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- sys_data->enter_event->event.type, size, trace_ctx);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
+ entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, args);
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
- event_trigger_unlock_commit(trace_file, buffer, event, entry,
- trace_ctx);
+ trace_event_buffer_commit(&fbuffer);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,9 +335,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_file *trace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- unsigned int trace_ctx;
+ struct trace_event_buffer fbuffer;
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -364,20 +354,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- trace_ctx = tracing_gen_ctx();
-
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- sys_data->exit_event->event.type, sizeof(*entry),
- trace_ctx);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
+ entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- event_trigger_unlock_commit(trace_file, buffer, event, entry,
- trace_ctx);
+ trace_event_buffer_commit(&fbuffer);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9711589273cd..fb58e86dd117 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/string.h>
#include <linux/rculist.h>
+#include <linux/filter.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
@@ -312,7 +313,8 @@ static bool trace_uprobe_match(const char *system, const char *event,
{
struct trace_uprobe *tu = to_trace_uprobe(ev);
- return strcmp(trace_probe_name(&tu->tp), event) == 0 &&
+ return (event[0] == '\0' ||
+ strcmp(trace_probe_name(&tu->tp), event) == 0) &&
(!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) &&
trace_uprobe_match_command_head(tu, argc, argv);
}
@@ -532,7 +534,7 @@ end:
/*
* Argument syntax:
- * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS]
+ * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS]
*/
static int __trace_uprobe_create(int argc, const char **argv)
{
@@ -540,13 +542,13 @@ static int __trace_uprobe_create(int argc, const char **argv)
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
char *arg, *filename, *rctr, *rctr_end, *tmp;
char buf[MAX_EVENT_NAME_LEN];
+ char gbuf[MAX_EVENT_NAME_LEN];
enum probe_print_type ptype;
struct path path;
unsigned long offset, ref_ctr_offset;
bool is_return = false;
int i, ret;
- ret = 0;
ref_ctr_offset = 0;
switch (argv[0][0]) {
@@ -645,11 +647,13 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* setup a probe */
trace_probe_log_set_index(0);
if (event) {
- ret = traceprobe_parse_event_name(&event, &group, buf,
+ ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
goto fail_address_parse;
- } else {
+ }
+
+ if (!event) {
char *tail;
char *ptr;
@@ -1343,15 +1347,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
int size, esize;
int rctx;
+#ifdef CONFIG_BPF_EVENTS
if (bpf_prog_array_valid(call)) {
u32 ret;
- preempt_disable();
- ret = trace_call_bpf(call, regs);
- preempt_enable();
+ ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
if (!ret)
return;
}
+#endif /* CONFIG_BPF_EVENTS */
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9628b5571846..c774e560f2f9 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
- unsigned int dups = 0, total_dups = 0;
+ unsigned int total_dups = 0;
int i;
void *key;
@@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
key = sort_entries[0]->key;
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
- dups++; total_dups++;
+ total_dups++;
continue;
}
key = sort_entries[i]->key;
- dups = 0;
}
WARN_ONCE(total_dups > 0,
@@ -1045,7 +1044,8 @@ static void sort_secondary(struct tracing_map *map,
/**
* tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
* @map: The tracing_map
- * @sort_key: The sort key to use for sorting
+ * @sort_keys: The sort key to use for sorting
+ * @n_sort_keys: hitcount, always have at least one
* @sort_entries: outval: pointer to allocated and sorted array of entries
*
* tracing_map_sort_entries() sorts the current set of entries in the