From 214c1b7f13954559cf09d5d04b934bf32ba4d618 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:52 +0100 Subject: sched/balancing: Switch the 'DEFINE_SPINLOCK(balancing)' spinlock into an 'atomic_t sched_balance_running' flag The 'balancing' spinlock added in: 08c183f31bdb ("[PATCH] sched: add option to serialize load balancing") ... is taken when the SD_SERIALIZE flag is set in a domain, but in reality it is a glorified global atomic flag serializing the load-balancing of those domains. It doesn't have any explicit locking semantics per se: we just spin_trylock() it. Turn it into a ... global atomic flag. This makes it more clear what is going on here, and reduces overhead and code size a bit: # kernel/sched/fair.o: [x86-64 defconfig] text data bss dec hex filename 60730 2721 104 63555 f843 fair.o.before 60718 2721 104 63543 f837 fair.o.after Also document the flag a bit. No change in functionality intended. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Cc: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308105901.1096078-2-mingo@kernel.org --- kernel/sched/fair.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a16129f9a5c..2ef89b36aed1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11633,7 +11633,20 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* * Scale the max load_balance interval with the number of CPUs in the system. @@ -11711,7 +11724,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } @@ -11729,7 +11742,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; -- cgit v1.2.3 From 02a61f325a8e62a7c76479c5f2f7ddcba16877e8 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 8 Mar 2024 11:58:53 +0100 Subject: sched/balancing: Remove reliance on 'enum cpu_idle_type' ordering when iterating [CPU_MAX_IDLE_TYPES] arrays in show_schedstat() show_schedstat() output breaks and doesn't print all entries if the ordering of the definitions in 'enum cpu_idle_type' is changed, because show_schedstat() assumes that 'CPU_IDLE' is 0. Fix it before we change the definition order & values. [ mingo: Added changelog. ] Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-3-mingo@kernel.org --- kernel/sched/stats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 857f837f52cb..85277953cc72 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %*pb", dcount++, cpumask_pr_args(sched_domain_span(sd))); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], -- cgit v1.2.3 From 38d707c54df4ca58cd9ceae2ddcbd6f606b99e9f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:54 +0100 Subject: sched/balancing: Change 'enum cpu_idle_type' to have more natural definitions The cpu_idle_type enum has the confusingly inverted property that 'not idle' is 1, and 'idle' is '0'. This resulted in a number of unnecessary complications in the code. Reverse the order, remove the CPU_NOT_IDLE type, and convert all code to a natural boolean form. It's much more readable: - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; -------------------------------- - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) -------------------------------- And gets rid of the double negation in these usages: - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) Furthermore, this makes code much more obvious where there's differentiation between CPU_IDLE and CPU_NEWLY_IDLE. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Cc: "Gautham R. Shenoy" Link: https://lore.kernel.org/r/20240308105901.1096078-4-mingo@kernel.org --- include/linux/sched/idle.h | 2 +- kernel/sched/fair.c | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 478084f9105e..e670ac282333 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -5,8 +5,8 @@ #include enum cpu_idle_type { + __CPU_NOT_IDLE = 0, CPU_IDLE, - CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_MAX_IDLE_TYPES }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ef89b36aed1..3a510cf1fb00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9070,7 +9070,7 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; env->loop++; @@ -9803,7 +9803,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) return false; /* @@ -9827,7 +9827,7 @@ static inline long sibling_imbalance(struct lb_env *env, int ncores_busiest, ncores_local; long imbalance; - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) return 0; ncores_busiest = sds->busiest->cores; @@ -9927,8 +9927,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_misfit_task_load = rq->misfit_task_load; *sg_status |= SG_OVERLOAD; } - } else if ((env->idle != CPU_NOT_IDLE) && - sched_reduced_capacity(rq, env->sd)) { + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ if (sgs->group_misfit_task_load < load) sgs->group_misfit_task_load = load; @@ -9940,7 +9939,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + if (!local_group && env->idle && sgs->sum_h_nr_running && sched_group_asym(env, sgs, group)) sgs->group_asym_packing = 1; @@ -10698,7 +10697,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -10913,7 +10912,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) { + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already @@ -11121,7 +11120,7 @@ asym_active_balance(struct lb_env *env) * the lower priority @env::dst_cpu help it. Do not follow * CPU priority. */ - return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) && + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && (sched_asym_prefer(env->dst_cpu, env->src_cpu) || !sched_use_asym_prio(env->sd, env->src_cpu)); } @@ -11159,7 +11158,7 @@ static int need_active_balance(struct lb_env *env) * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && + if (env->idle && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) @@ -11735,8 +11734,8 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); @@ -12416,9 +12415,7 @@ out: static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* * If this CPU has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle CPUs whose ticks are -- cgit v1.2.3 From 11b0bfa5d463b17cac5bf6b94fea4921713530c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:55 +0100 Subject: sched/debug: Increase SCHEDSTAT_VERSION to 16 We changed the order of definitions within 'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] columns in show_schedstat(). Suggested-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Cc: "Gautham R. Shenoy" Link: https://lore.kernel.org/r/20240308105901.1096078-5-mingo@kernel.org --- Documentation/scheduler/sched-stats.rst | 5 +++++ kernel/sched/stats.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index 03c062915998..73c412666655 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -2,6 +2,11 @@ Scheduler Statistics ==================== +Version 16 of schedstats changed the order of definitions within +'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] +columns in show_schedstat(). In particular the position of CPU_IDLE +and __CPU_NOT_IDLE changed places. The size of the array is unchanged. + Version 15 of schedstats dropped counters for some sched_yield: yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is identical to version 14. diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 85277953cc72..78e48f5426ee 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 16 static int show_schedstat(struct seq_file *seq, void *v) { -- cgit v1.2.3 From be8858dba9a2c3aec454a6b382671101fd0dc3b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:57 +0100 Subject: sched/balancing: Change comment formatting to not overlap Git conflict marker lines So the scheduler has two such comment blocks, with '=' used as a double underline: /* * VRUNTIME * ======== * '========' also happens to be a Git conflict marker, throwing off a simple search in an editor for this pattern. Change them to '-------' type of underline instead - it looks just as good. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-7-mingo@kernel.org --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a510cf1fb00..84d4791cf628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3679,7 +3679,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, /* * VRUNTIME - * ======== + * -------- * * COROLLARY #1: The virtual runtime of the entity needs to be * adjusted if re-weight at !0-lag point. @@ -3762,7 +3762,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, /* * DEADLINE - * ======== + * -------- * * When the weight changes, the virtual time slope changes and * we should adjust the relative virtual deadline accordingly. -- cgit v1.2.3 From 3a5fe9305719c680ccf63216781a4d4068c8e3f3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:58 +0100 Subject: sched/balancing: Fix comments (trying to) refer to NOHZ_BALANCE_KICK Fix two typos: - There's no such thing as 'nohz_balancing_kick', the flag is named 'BALANCE' and is capitalized: NOHZ_BALANCE_KICK. - Likewise there's no such thing as a 'pending nohz_balance_kick' either, the NOHZ_BALANCE_KICK flag is all upper-case. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-8-mingo@kernel.org --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84d4791cf628..f3c03c6db3c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12409,15 +12409,16 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + * This softirq may be triggered from the scheduler tick, or by + * any of the flags in NOHZ_KICK_MASK: NOHZ_BALANCE_KICK, + * NOHZ_STATS_KICK or NOHZ_NEXT_KICK. */ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to * give the idle CPUs a chance to load balance. Else we may -- cgit v1.2.3 From 3dc6f6c8efe2d9148865664fffbe9dd89fb0d157 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:59 +0100 Subject: sched/balancing: Update run_rebalance_domains() comments The first sentence of the comment explaining run_rebalance_domains() is historic and not true anymore: * run_rebalance_domains is triggered when needed from the scheduler tick. ... contradicted/modified by the second sentence: * Also triggered for NOHZ idle balancing (with NOHZ_BALANCE_KICK set). Avoid that kind of confusion straight away and explain from what places sched_balance_softirq() is triggered. Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Acked-by: Valentin Schneider Link: https://lore.kernel.org/r/20240308105901.1096078-9-mingo@kernel.org --- kernel/sched/fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3c03c6db3c8..b567c0790f44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12409,9 +12409,12 @@ out: } /* - * This softirq may be triggered from the scheduler tick, or by - * any of the flags in NOHZ_KICK_MASK: NOHZ_BALANCE_KICK, - * NOHZ_STATS_KICK or NOHZ_NEXT_KICK. + * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local scheduler_tick() for periodic load balancing + * + * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { -- cgit v1.2.3 From e492e1b0e0721f3929ef9d9708d029144b396dd7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:59:00 +0100 Subject: sched/balancing: Vertically align the comments of 'struct sg_lb_stats' and 'struct sd_lb_stats' Make them easier to read. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-10-mingo@kernel.org --- kernel/sched/fair.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b567c0790f44..40b98e43d794 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9436,19 +9436,19 @@ static void update_blocked_averages(int cpu) * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long avg_load; /* Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -9460,15 +9460,15 @@ struct sg_lb_stats { * during load balancing. */ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) -- cgit v1.2.3 From 33928ed8bde066316dc3cb6ccf7f74400073652f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:59:01 +0100 Subject: sched/balancing: Update comments in 'struct sg_lb_stats' and 'struct sd_lb_stats' - Align for readability - Capitalize consistently Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-11-mingo@kernel.org --- kernel/sched/fair.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40b98e43d794..116a640534b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9433,17 +9433,17 @@ static void update_blocked_averages(int cpu) /********** Helpers for find_busiest_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /* Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ @@ -9456,8 +9456,7 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. + * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ @@ -9465,7 +9464,7 @@ struct sd_lb_stats { unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ -- cgit v1.2.3 From 70a27d6d1b19392a23bb4a41de7788fbc539f18d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:07 +0100 Subject: sched/balancing: Rename run_rebalance_domains() => sched_balance_softirq() run_rebalance_domains() is a misnomer, as it doesn't only run rebalance_domains(), but since the introduction of the NOHZ code it also runs nohz_idle_balance(). Rename it to sched_balance_softirq(), reflecting its more generic purpose and that it's a softirq handler. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-2-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 2 +- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 2 +- kernel/sched/fair.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index e57ad28301bd..6577b068f921 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -34,7 +34,7 @@ out of balance are tasks moved between groups. In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU through scheduler_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load -balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run +balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run in softirq context (SCHED_SOFTIRQ). The latter function takes two arguments: the runqueue of current CPU and whether diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index e814d4c01141..fbc326668e37 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -36,7 +36,7 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过scheduler_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 -的工作由run_rebalance_domains()->rebalance_domains()完成,在软中断上下文中执行 +的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 后一个函数有两个入参:当前CPU的运行队列、它在scheduler_tick()调用时是否空闲。函数会从 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 116a640534b9..953f39deb68e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12415,7 +12415,7 @@ out: * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; @@ -13216,7 +13216,7 @@ __init void init_sched_fair_class(void) #endif } - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; -- cgit v1.2.3 From 86dd6c04ef9f213e14d60c9f64bce1cc019f816e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:08 +0100 Subject: sched/balancing: Rename scheduler_tick() => sched_tick() - Standardize on prefixing scheduler-internal functions defined in with sched_*() prefix. scheduler_tick() was the only function using the scheduler_ prefix. Harmonize it. - The other reason to rename it is the NOHZ scheduler tick handling functions are already named sched_tick_*(). Make the 'git grep sched_tick' more meaningful. Signed-off-by: Ingo Molnar Acked-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-3-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 4 ++-- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 4 ++-- include/linux/sched.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/loadavg.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 2 +- tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 6577b068f921..541d6c617971 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -32,13 +32,13 @@ load of each of its member CPUs, and only when the load of a group becomes out of balance are tasks moved between groups. In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU -through scheduler_tick(). It raises a softirq after the next regularly scheduled +through sched_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run in softirq context (SCHED_SOFTIRQ). The latter function takes two arguments: the runqueue of current CPU and whether -the CPU was idle at the time the scheduler_tick() happened and iterates over all +the CPU was idle at the time the sched_tick() happened and iterates over all sched domains our CPU is on, starting from its base domain and going up the ->parent chain. While doing that, it checks to see if the current domain has exhausted its rebalance interval. If so, it runs load_balance() on that domain. It then checks diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index fbc326668e37..fa0c0bcc6ba5 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -34,12 +34,12 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 调度域中的负载均衡发生在调度组中。也就是说,每个组被视为一个实体。组的负载被定义为它 管辖的每个CPU的负载之和。仅当组的负载不均衡后,任务才在组之间发生迁移。 -在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过scheduler_tick() +在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过sched_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 -后一个函数有两个入参:当前CPU的运行队列、它在scheduler_tick()调用时是否空闲。函数会从 +后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从 当前CPU所在的基调度域开始迭代执行,并沿着parent指针链向上进入更高层级的调度域。在迭代 过程中,函数会检查当前调度域是否已经耗尽了再平衡的时间间隔,如果是,它在该调度域运行 load_balance()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。 diff --git a/include/linux/sched.h b/include/linux/sched.h index 17cb0761ff65..7eb7f31af796 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -301,7 +301,7 @@ enum { TASK_COMM_LEN = 16, }; -extern void scheduler_tick(void); +extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d44efa0d0611..71b7a08a6502 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5662,7 +5662,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ -void scheduler_tick(void) +void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * paths. For example, see arch/x86/entry_64.S. * * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). + * interrupt handler sched_tick(). * * 3. Wakeups don't really cause entry into schedule(). They add a * task to the run-queue and that's it. diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 52c8f8226b0d..ca9da66cc894 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -379,7 +379,7 @@ void calc_global_load(void) } /* - * Called from scheduler_tick() to periodically update this CPU's + * Called from sched_tick() to periodically update this CPU's * active count. */ void calc_global_load_tick(struct rq *this_rq) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e69e75d3858c..ff49ddcc9800 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2478,7 +2478,7 @@ void update_process_times(int user_tick) if (in_irq()) irq_work_tick(); #endif - scheduler_tick(); + sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bf2bdac46843..8fbb0ec39079 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1464,7 +1464,7 @@ void wq_worker_sleeping(struct task_struct *task) * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * - * Called from scheduler_tick(). We're in the IRQ context and the current + * Called from sched_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 25432b8cd5bd..073a748b9380 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,7 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="scheduler_tick" +FUNC2="sched_tick" ALL_FUNCS="#### all functions enabled ####" -- cgit v1.2.3 From 983be0628c061989b6cc175d2f5e429b40699fbb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:09 +0100 Subject: sched/balancing: Rename trigger_load_balance() => sched_balance_trigger() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-4-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 2 +- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 2 +- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 541d6c617971..c7ea05f4107b 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -31,7 +31,7 @@ is treated as one entity. The load of a group is defined as the sum of the load of each of its member CPUs, and only when the load of a group becomes out of balance are tasks moved between groups. -In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU +In kernel/sched/core.c, sched_balance_trigger() is run periodically on each CPU through sched_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index fa0c0bcc6ba5..1a8587a971f9 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -34,7 +34,7 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 调度域中的负载均衡发生在调度组中。也就是说,每个组被视为一个实体。组的负载被定义为它 管辖的每个CPU的负载之和。仅当组的负载不均衡后,任务才在组之间发生迁移。 -在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过sched_tick() +在kernel/sched/core.c中,sched_balance_trigger()在每个CPU上通过sched_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 71b7a08a6502..929fce69f555 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5700,7 +5700,7 @@ void sched_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + sched_balance_trigger(rq); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 953f39deb68e..e377b675920a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12438,7 +12438,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { /* * Don't need to rebalance while attached to NULL domain or diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2242679239e..5b0ddb0e6017 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2397,7 +2397,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); -- cgit v1.2.3 From 14ff4dbd34f46cc6b6105f549983321241ccbba9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:10 +0100 Subject: sched/balancing: Rename rebalance_domains() => sched_balance_domains() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-5-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 2 +- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 2 +- arch/arm/kernel/topology.c | 2 +- kernel/sched/fair.c | 8 ++++---- kernel/sched/sched.h | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index c7ea05f4107b..5d8e8b8b269e 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -34,7 +34,7 @@ out of balance are tasks moved between groups. In kernel/sched/core.c, sched_balance_trigger() is run periodically on each CPU through sched_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load -balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run +balancing workhorse, sched_balance_softirq()->sched_balance_domains(), is then run in softirq context (SCHED_SOFTIRQ). The latter function takes two arguments: the runqueue of current CPU and whether diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index 1a8587a971f9..e6590fd80640 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -36,7 +36,7 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 在kernel/sched/core.c中,sched_balance_trigger()在每个CPU上通过sched_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 -的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 +的工作由sched_balance_softirq()->sched_balance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从 diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index ef0058de432b..2336ee2aa44a 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ * can take this difference into account during load balance. A per cpu * structure is preferred because each CPU updates its own cpu_capacity field * during the load balance except for idle cores. One idle core is selected - * to run the rebalance_domains for all idle cores and the cpu_capacity can be + * to run the sched_balance_domains for all idle cores and the cpu_capacity can be * updated during this sequence. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e377b675920a..330788b0c617 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11685,7 +11685,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -12161,7 +12161,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -12422,7 +12422,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) /* * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. @@ -12432,7 +12432,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) /* normal load balance */ update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_domains(this_rq, idle); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b0ddb0e6017..41024c1c49b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2904,7 +2904,7 @@ extern void cfs_bandwidth_usage_dec(void); #define NOHZ_NEWILB_KICK_BIT 2 #define NOHZ_NEXT_KICK_BIT 3 -/* Run rebalance_domains() */ +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) /* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -- cgit v1.2.3 From 4c3e509ea9f249458e8692f8298cceac73105948 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:11 +0100 Subject: sched/balancing: Rename load_balance() => sched_balance_rq() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also load_balance() has become somewhat of a misnomer: historically it was the first and primary load-balancing function that was called, but with the introduction of sched domains, it's become a lower layer function that balances runqueues. Rename it to sched_balance_rq() accordingly. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-6-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 4 +-- Documentation/scheduler/sched-stats.rst | 32 +++++++++++----------- .../translations/zh_CN/scheduler/sched-domains.rst | 4 +-- .../translations/zh_CN/scheduler/sched-stats.rst | 30 ++++++++++---------- include/linux/sched/topology.h | 2 +- kernel/sched/fair.c | 10 +++---- 6 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 5d8e8b8b269e..5e996fe973b1 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -41,11 +41,11 @@ The latter function takes two arguments: the runqueue of current CPU and whether the CPU was idle at the time the sched_tick() happened and iterates over all sched domains our CPU is on, starting from its base domain and going up the ->parent chain. While doing that, it checks to see if the current domain has exhausted its -rebalance interval. If so, it runs load_balance() on that domain. It then checks +rebalance interval. If so, it runs sched_balance_rq() on that domain. It then checks the parent sched_domain (if it exists), and the parent of the parent and so forth. -Initially, load_balance() finds the busiest group in the current sched domain. +Initially, sched_balance_rq() finds the busiest group in the current sched domain. If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in that group. If it manages to find such a runqueue, it locks both our initial CPU's runqueue and the newly found busiest one and starts moving tasks from it diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index 73c412666655..7c2b16c4729d 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -77,53 +77,53 @@ domain 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 The first field is a bit mask indicating what cpus this domain operates over. -The next 24 are a variety of load_balance() statistics in grouped into types +The next 24 are a variety of sched_balance_rq() statistics in grouped into types of idleness (idle, busy, and newly idle): - 1) # of times in this domain load_balance() was called when the + 1) # of times in this domain sched_balance_rq() was called when the cpu was idle - 2) # of times in this domain load_balance() checked but found + 2) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when the cpu was idle - 3) # of times in this domain load_balance() tried to move one or + 3) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was idle 4) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was idle + sched_balance_rq() in this domain when the cpu was idle 5) # of times in this domain pull_task() was called when the cpu was idle 6) # of times in this domain pull_task() was called even though the target task was cache-hot when idle - 7) # of times in this domain load_balance() was called but did + 7) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was idle 8) # of times in this domain a busier queue was found while the cpu was idle but no busier group was found - 9) # of times in this domain load_balance() was called when the + 9) # of times in this domain sched_balance_rq() was called when the cpu was busy - 10) # of times in this domain load_balance() checked but found the + 10) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when busy - 11) # of times in this domain load_balance() tried to move one or + 11) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was busy 12) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was busy + sched_balance_rq() in this domain when the cpu was busy 13) # of times in this domain pull_task() was called when busy 14) # of times in this domain pull_task() was called even though the target task was cache-hot when busy - 15) # of times in this domain load_balance() was called but did not + 15) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was busy 16) # of times in this domain a busier queue was found while the cpu was busy but no busier group was found - 17) # of times in this domain load_balance() was called when the + 17) # of times in this domain sched_balance_rq() was called when the cpu was just becoming idle - 18) # of times in this domain load_balance() checked but found the + 18) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when the cpu was just becoming idle - 19) # of times in this domain load_balance() tried to move one or more + 19) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was just becoming idle 20) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was just becoming idle + sched_balance_rq() in this domain when the cpu was just becoming idle 21) # of times in this domain pull_task() was called when newly idle 22) # of times in this domain pull_task() was called even though the target task was cache-hot when just becoming idle - 23) # of times in this domain load_balance() was called but did not + 23) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was just becoming idle 24) # of times in this domain a busier queue was found while the cpu was just becoming idle but no busier group was found diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index e6590fd80640..06363169c56b 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -42,9 +42,9 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从 当前CPU所在的基调度域开始迭代执行,并沿着parent指针链向上进入更高层级的调度域。在迭代 过程中,函数会检查当前调度域是否已经耗尽了再平衡的时间间隔,如果是,它在该调度域运行 -load_balance()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。 +sched_balance_rq()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。 -起初,load_balance()查找当前调度域中最繁忙的调度组。如果成功,在该调度组管辖的全部CPU +起初,sched_balance_rq()查找当前调度域中最繁忙的调度组。如果成功,在该调度组管辖的全部CPU 的运行队列中找出最繁忙的运行队列。如能找到,对当前的CPU运行队列和新找到的最繁忙运行 队列均加锁,并把任务从最繁忙队列中迁移到当前CPU上。被迁移的任务数量等于在先前迭代执行 中计算出的该调度域的调度组的不均衡值。 diff --git a/Documentation/translations/zh_CN/scheduler/sched-stats.rst b/Documentation/translations/zh_CN/scheduler/sched-stats.rst index c5e0be663837..09eee2517610 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-stats.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-stats.rst @@ -75,42 +75,42 @@ domain 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 繁忙,新空闲): - 1) 当CPU空闲时,load_balance()在这个调度域中被调用了#次 - 2) 当CPU空闲时,load_balance()在这个调度域中被调用,但是发现负载无需 + 1) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用了#次 + 2) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需 均衡#次 - 3) 当CPU空闲时,load_balance()在这个调度域中被调用,试图迁移1个或更多 + 3) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多 任务且失败了#次 - 4) 当CPU空闲时,load_balance()在这个调度域中被调用,发现不均衡(如果有) + 4) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有) #次 5) 当CPU空闲时,pull_task()在这个调度域中被调用#次 6) 当CPU空闲时,尽管目标任务是热缓存状态,pull_task()依然被调用#次 - 7) 当CPU空闲时,load_balance()在这个调度域中被调用,未能找到更繁忙的 + 7) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的 队列#次 8) 当CPU空闲时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组 #次 - 9) 当CPU繁忙时,load_balance()在这个调度域中被调用了#次 - 10) 当CPU繁忙时,load_balance()在这个调度域中被调用,但是发现负载无需 + 9) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用了#次 + 10) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需 均衡#次 - 11) 当CPU繁忙时,load_balance()在这个调度域中被调用,试图迁移1个或更多 + 11) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多 任务且失败了#次 - 12) 当CPU繁忙时,load_balance()在这个调度域中被调用,发现不均衡(如果有) + 12) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有) #次 13) 当CPU繁忙时,pull_task()在这个调度域中被调用#次 14) 当CPU繁忙时,尽管目标任务是热缓存状态,pull_task()依然被调用#次 - 15) 当CPU繁忙时,load_balance()在这个调度域中被调用,未能找到更繁忙的 + 15) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的 队列#次 16) 当CPU繁忙时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组 #次 - 17) 当CPU新空闲时,load_balance()在这个调度域中被调用了#次 - 18) 当CPU新空闲时,load_balance()在这个调度域中被调用,但是发现负载无需 + 17) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用了#次 + 18) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需 均衡#次 - 19) 当CPU新空闲时,load_balance()在这个调度域中被调用,试图迁移1个或更多 + 19) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多 任务且失败了#次 - 20) 当CPU新空闲时,load_balance()在这个调度域中被调用,发现不均衡(如果有) + 20) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有) #次 21) 当CPU新空闲时,pull_task()在这个调度域中被调用#次 22) 当CPU新空闲时,尽管目标任务是热缓存状态,pull_task()依然被调用#次 - 23) 当CPU新空闲时,load_balance()在这个调度域中被调用,未能找到更繁忙的 + 23) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的 队列#次 24) 当CPU新空闲时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组 #次 diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 18572c9ea724..c8fe9bab981b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -110,7 +110,7 @@ struct sched_domain { unsigned long last_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ + /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 330788b0c617..0d2753c50be9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6866,7 +6866,7 @@ dequeue_throttle: #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq, load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); @@ -11242,7 +11242,7 @@ static int should_we_balance(struct lb_env *env) * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -11647,7 +11647,7 @@ out_unlock: static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) @@ -11727,7 +11727,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle @@ -12353,7 +12353,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); -- cgit v1.2.3 From f1cd2e2e79d283e315356bd403c7f928e994f057 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Oct 2023 13:04:12 +0200 Subject: sched/balancing: Rename find_busiest_queue() => sched_balance_find_src_rq() The find_busiest_queue() naming has two small quirks: - Scheduler functions that deal with runqueues usually have a rq_ prefix or _rq postfix, but this function has neither. - Plus the 'busiest' qualifier to this function was historically correct, but has become somewhat of a misnomer: in quite a few cases we will not pick the busiest runqueue - but the best (possible) runqueue we can balance tasks from. So name it a bit more neutrally, similar to the 'src/dst' nomenclature we are already using when moving tasks between runqueues. To fix both quirks, and to standardize scheduler load-balancing function names on the sched_balance_() prefix, rename the function to sched_balance_find_src_rq(). Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-7-mingo@kernel.org --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d2753c50be9..1cd9a18b35e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10959,9 +10959,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -11280,7 +11280,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; -- cgit v1.2.3 From 82cf921432fc184adbbb9c1bced182564876ec5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:14 +0100 Subject: sched/balancing: Rename find_busiest_group() => sched_balance_find_src_group() Make two naming changes: 1) Standardize scheduler load-balancing function names on the sched_balance_() prefix. 2) Similar to find_busiest_queue(), the find_busiest_group() naming has become a bit of a misnomer: the 'busiest' qualifier to this function was historically correct but in the current code in quite a few cases we will not pick the 'busiest' group - but the best (possible) group we can balance from based on a complex set of constraints. So name it a bit more neutrally, similar to the 'src/dst' nomenclature we are already using when moving tasks between runqueues, and also use the sched_balance_ prefix: sched_balance_find_src_group(). Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-9-mingo@kernel.org --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cd9a18b35e0..96a81b2fa281 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9430,7 +9430,7 @@ static void update_blocked_averages(int cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* * sg_lb_stats - stats of a sched_group required for load-balancing: @@ -9637,7 +9637,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -10788,7 +10788,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -10811,7 +10811,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * @@ -10820,7 +10820,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * * Return: - The busiest group if imbalance exists. */ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -11274,7 +11274,7 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; @@ -11298,7 +11298,7 @@ redo: env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. -- cgit v1.2.3 From 391b7a5335c45b2bafe535cb440836ccd17515aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:15 +0100 Subject: sched/balancing: Rename update_blocked_averages() => sched_balance_update_blocked_averages() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-10-mingo@kernel.org --- kernel/sched/fair.c | 8 ++++---- kernel/sched/pelt.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96a81b2fa281..95f7092043f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9411,7 +9411,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); @@ -12079,7 +12079,7 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; - update_blocked_averages(cpu); + sched_balance_update_blocked_averages(cpu); return rq->has_blocked_load; } @@ -12339,7 +12339,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); - update_blocked_averages(this_cpu); + sched_balance_update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -12431,7 +12431,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); + sched_balance_update_blocked_averages(this_rq->cpu); sched_balance_domains(this_rq, idle); } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 63b6cf898220..f80955ecdce6 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -209,7 +209,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages(). + * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). */ -- cgit v1.2.3 From 7d058285cd77cc1411c91efd1b1673530bb1bee8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:16 +0100 Subject: sched/balancing: Rename newidle_balance() => sched_balance_newidle() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-11-mingo@kernel.org --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95f7092043f3..aa5ff0efcca8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4816,7 +4816,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { @@ -5136,7 +5136,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -8253,7 +8253,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (rq->nr_running) return 1; - return newidle_balance(rq, rf) != 0; + return sched_balance_newidle(rq, rf) != 0; } #endif /* CONFIG_SMP */ @@ -8505,10 +8505,10 @@ idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -11493,7 +11493,7 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. @@ -12277,7 +12277,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * newidle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -12285,7 +12285,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; -- cgit v1.2.3 From 646ebaf51c64c6416ca89765c20041363fc1b518 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:17 +0100 Subject: sched/balancing: Rename find_idlest_group_cpu() => sched_balance_find_dst_group_cpu() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also use 'dst' instead of 'idlest': while historically correct, today it's not really true anymore that we return the 'idlest' group or CPU, we sort by idle-exit latency and only return the idlest CPUs from the lowest-latency set of CPUs. The true 'idlest' CPUs often remain idle for a long time and are never returned as long as the system is under-loaded. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-12-mingo@kernel.org --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa5ff0efcca8..02ff0272b2e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7101,10 +7101,10 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -7191,7 +7191,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; -- cgit v1.2.3 From a88b17080294f735c4124acccfa2d803a6a7d46f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:18 +0100 Subject: sched/balancing: Rename find_idlest_group() => sched_balance_find_dst_group() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also use 'dst' instead of 'idlest', because it's not really true that we return the 'idlest' group or CPU, we sort by idle-exit latency and only return the idlest CPUs from the lowest-latency set of CPUs. The true 'idlest' CPUs often remain idle for a long time and are never returned as long as the system is under-loaded. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-13-mingo@kernel.org --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02ff0272b2e4..d0c3a091d7d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7098,7 +7098,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. @@ -7185,7 +7185,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -10296,13 +10296,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; -- cgit v1.2.3 From 686d148cbb5a1c2891914b8d11147d3c5556a29a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:19 +0100 Subject: sched/balancing: Rename find_idlest_cpu() => sched_balance_find_dst_cpu() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also use 'dst' instead of 'idlest', because it's not really true that we return the 'idlest' group or CPU, we sort by idle-exit latency and only return the idlest CPUs from the lowest-latency set of CPUs. The true 'idlest' CPUs often remain idle for a long time and are never returned as long as the system is under-loaded. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-14-mingo@kernel.org --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d0c3a091d7d1..4b3c4a181a91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7160,7 +7160,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct * return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -7936,7 +7936,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -8201,7 +8201,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.2.3 From d72cf62438d67b911212f8d4cf65d6167c1541ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Mar 2024 11:33:50 +0100 Subject: sched/balancing: Fix a couple of outdated function names in comments The 'idle_balance()' function hasn't existed for years, and there's no load_balance_newidle() either - both are sched_balance_newidle() today. Reported-by: Honglei Wang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ZfAwNufbiyt/5biu@gmail.com --- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b3c4a181a91..a19ea290b790 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6866,7 +6866,7 @@ dequeue_throttle: #ifdef CONFIG_SMP -/* Working cpumask for: sched_balance_rq, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index f80955ecdce6..3a96da25b67c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -208,7 +208,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * se has been already dequeued but cfs_rq->curr still points to it. * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls + * this happens during sched_balance_newidle() which calls * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). -- cgit v1.2.3 From b9e6e28663928cab836a19abbdec3d036a07db3b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Mar 2024 12:06:03 +0100 Subject: sched/fair: Fix typos in comments So I made all speling mistakes / typos red in my editor. Big mistake... Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org --- kernel/sched/fair.c | 68 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a19ea290b790..c8e50fbac345 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -388,8 +388,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -406,7 +406,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq) SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -595,13 +595,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the - * virtual time has non-continguous motion equivalent to: + * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * - * However, since v_i is u64, and the multiplcation could easily overflow + * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 @@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) } if (load) { - /* sign flips effective floor / ceil */ + /* sign flips effective floor / ceiling */ if (avg < 0) avg -= (load - 1); avg = div_s64(avg, load); @@ -721,7 +721,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * - * Note: using 'avg_vruntime() > se->vruntime' is inacurate due + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. */ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) @@ -1024,7 +1024,7 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } /* @@ -1616,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -3284,7 +3284,7 @@ retry_pids: /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || @@ -3295,7 +3295,7 @@ retry_pids: /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ if (!vma_is_accessible(vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); @@ -3327,7 +3327,7 @@ retry_pids: } /* - * Scanning the VMA's of short lived tasks add more overhead. So + * Scanning the VMAs of short lived tasks add more overhead. So * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, @@ -3371,7 +3371,7 @@ retry_pids: /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. @@ -4733,7 +4733,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -5014,14 +5014,14 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * In the above example if a task is capped to a specific performance * point, y, then when: * - * * util = 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util = 80% of y then it is forced to fit on cpu1 to honour + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour * uclamp_max request. * * which is what we're enforcing here. A task always fits if @@ -5052,7 +5052,7 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | (region c, boosted, util < uclamp_min) * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * a) If util > uclamp_max, then we're capped, we don't care about * actual fitness value here. We only care if uclamp_max fits @@ -5242,7 +5242,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; /* - * When joining the competition; the exisiting tasks will be, + * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition. */ @@ -5391,7 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); @@ -5427,7 +5427,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && @@ -7503,7 +7503,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * On asymmetric system, update task utilization because we will check - * that the task fits with cpu's capacity. + * that the task fits with CPU's capacity. */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); @@ -8027,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { /* * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation + * the clamp() part. I.e.: apply max aggregation * only. util_fits_cpu() logic requires to * operate on non clamped util but must use the * max-aggregated uclamp_{min, max}. @@ -8586,7 +8586,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); yield_task_fair(rq); @@ -8924,7 +8924,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - /* Disregard pcpu kthreads; they are where they need to be. */ + /* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0; @@ -10076,7 +10076,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -10715,7 +10715,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; env->imbalance = max_t(long, 0, @@ -11900,7 +11900,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. * - * When balancing betwen cores, all the SMT siblings of the + * When balancing between cores, all the SMT siblings of the * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { @@ -12061,7 +12061,7 @@ void nohz_balance_enter_idle(int cpu) out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } @@ -12085,7 +12085,7 @@ static bool update_nohz_stats(struct rq *rq) } /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ @@ -12190,7 +12190,7 @@ abort: /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -12221,7 +12221,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * called from this function on (this) CPU that's not yet in the mask. That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of - * those idle CPUs and outside the preempt disable / irq off phase of the local + * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) @@ -12232,7 +12232,7 @@ void nohz_run_idle_balance(int cpu) /* * Update the blocked load only if no SCHED_SOFTIRQ is about to happen - * (ie NOHZ_STATS_KICK set) and will do the same. + * (i.e. NOHZ_STATS_KICK set) and will do the same. */ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); -- cgit v1.2.3 From 4d8926a0407cff0c864b759b59104f4fb6f8efab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 Mar 2024 17:01:27 -0700 Subject: bpf: preserve sleepable bit in subprog info Copy over main program's sleepable bit into subprog's info. This might be important for, e.g., freplace cases. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Message-ID: <20240314000127.3881569-1-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 63749ad5ac6b..7b208e5d38f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19158,6 +19158,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->sleepable = prog->sleepable; func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; -- cgit v1.2.3 From 7d2cc63eca0c993c99d18893214abf8f85d566d8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 8 Mar 2024 06:38:07 +0100 Subject: bpf: Take return from set_memory_ro() into account with bpf_prog_lock_ro() set_memory_ro() can fail, leaving memory unprotected. Check its return and take it into account as an error. Link: https://github.com/KSPP/linux/issues/7 Signed-off-by: Christophe Leroy Cc: linux-hardening@vger.kernel.org Reviewed-by: Kees Cook Message-ID: <286def78955e04382b227cb3e4b6ba272a7442e3.1709850515.git.christophe.leroy@csgroup.eu> Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 5 +++-- kernel/bpf/core.c | 4 +++- kernel/bpf/verifier.c | 8 ++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index c99bc3df2d28..9107ee1de66f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -887,14 +887,15 @@ bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) -static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); - set_memory_ro((unsigned long)fp, fp->pages); + return set_memory_ro((unsigned long)fp, fp->pages); } #endif + return 0; } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de8e8..63f100def31b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2403,7 +2403,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } finalize: - bpf_prog_lock_ro(fp); + *err = bpf_prog_lock_ro(fp); + if (*err) + return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b208e5d38f6..de7813947981 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19263,10 +19263,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) * bpf_prog_load will add the kallsyms for the main program. */ for (i = 1; i < env->subprog_cnt; i++) { - bpf_prog_lock_ro(func[i]); - bpf_prog_kallsyms_add(func[i]); + err = bpf_prog_lock_ro(func[i]); + if (err) + goto out_free; } + for (i = 1; i < env->subprog_cnt; i++) + bpf_prog_kallsyms_add(func[i]); + /* Last step: make now unused interpreter insns from main * prog consistent for later dump requests, so they can * later look the same as if they were interpreted only. -- cgit v1.2.3 From 7f3edd0c72c3f7214f8f28495f2e6466348eb128 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 15 Mar 2024 12:21:12 -0700 Subject: bpf: Remove unnecessary err < 0 check in bpf_struct_ops_map_update_elem There is a "if (err)" check earlier, so the "if (err < 0)" check that this patch removing is unnecessary. It was my overlook when making adjustments to the bpf_struct_ops_prepare_trampoline() such that the caller does not have to worry about the new page when the function returns error. Fixes: 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.") Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315192112.2825039-1-martin.lau@linux.dev --- kernel/bpf/bpf_struct_ops.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 43356faaa057..3fcd35314ce5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -728,8 +728,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, cur_image = image; trampoline_start = 0; } - if (err < 0) - goto reset_unlock; *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); -- cgit v1.2.3 From e3362acd796789dc0562eb1a3937007b0beb0c5b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 16 Mar 2024 08:35:40 +0100 Subject: bpf: Remove arch_unprotect_bpf_trampoline() Last user of arch_unprotect_bpf_trampoline() was removed by commit 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.") Remove arch_unprotect_bpf_trampoline() Reported-by: Daniel Borkmann Fixes: 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.") Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/42c635bb54d3af91db0f9b85d724c7c290069f67.1710574353.git.christophe.leroy@csgroup.eu Signed-off-by: Martin KaFai Lau --- arch/arm64/net/bpf_jit_comp.c | 4 ---- arch/x86/net/bpf_jit_comp.c | 4 ---- include/linux/bpf.h | 1 - kernel/bpf/trampoline.c | 7 ------- 4 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c5b461dda438..132c8ffba109 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2180,10 +2180,6 @@ void arch_protect_bpf_trampoline(void *image, unsigned int size) { } -void arch_unprotect_bpf_trampoline(void *image, unsigned int size) -{ -} - int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a7ba8e178645..7a56d2d84512 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3008,10 +3008,6 @@ void arch_protect_bpf_trampoline(void *image, unsigned int size) { } -void arch_unprotect_bpf_trampoline(void *image, unsigned int size) -{ -} - int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..d89bdefb42e2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1117,7 +1117,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); -void arch_unprotect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index db7599c59c78..04fd1abd3661 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1078,13 +1078,6 @@ void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) set_memory_rox((long)image, 1); } -void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) -{ - WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_nx((long)image, 1); - set_memory_rw((long)image, 1); -} - int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { -- cgit v1.2.3 From c733239f8f530872a1f80d8c45dcafbaff368737 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 16 Mar 2024 08:35:41 +0100 Subject: bpf: Check return from set_memory_rox() arch_protect_bpf_trampoline() and alloc_new_pack() call set_memory_rox() which can fail, leading to unprotected memory. Take into account return from set_memory_rox() function and add __must_check flag to arch_protect_bpf_trampoline(). Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/fe1c163c83767fde5cab31d209a4a6be3ddb3a73.1710574353.git.christophe.leroy@csgroup.eu Signed-off-by: Martin KaFai Lau --- arch/arm64/net/bpf_jit_comp.c | 3 ++- arch/x86/net/bpf_jit_comp.c | 3 ++- include/linux/bpf.h | 2 +- kernel/bpf/bpf_struct_ops.c | 8 ++++++-- kernel/bpf/core.c | 28 +++++++++++++++++++++------- kernel/bpf/trampoline.c | 8 +++++--- net/bpf/bpf_dummy_struct_ops.c | 4 +++- 7 files changed, 40 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 132c8ffba109..bc16eb694657 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2176,8 +2176,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) bpf_prog_pack_free(image, size); } -void arch_protect_bpf_trampoline(void *image, unsigned int size) +int arch_protect_bpf_trampoline(void *image, unsigned int size) { + return 0; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7a56d2d84512..4900b1ee019f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3004,8 +3004,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) bpf_prog_pack_free(image, size); } -void arch_protect_bpf_trampoline(void *image, unsigned int size) +int arch_protect_bpf_trampoline(void *image, unsigned int size) { + return 0; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d89bdefb42e2..17843e66a1d3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1116,7 +1116,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); -void arch_protect_bpf_trampoline(void *image, unsigned int size); +int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 3fcd35314ce5..86c7884abaf8 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -740,8 +740,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - for (i = 0; i < st_map->image_pages_cnt; i++) - arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); + for (i = 0; i < st_map->image_pages_cnt; i++) { + err = arch_protect_bpf_trampoline(st_map->image_pages[i], + PAGE_SIZE); + if (err) + goto reset_unlock; + } if (st_map->map.map_flags & BPF_F_LINK) { err = 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 63f100def31b..5aacb1d3c4cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -908,23 +908,30 @@ static LIST_HEAD(pack_list); static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; + int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); - if (!pack->ptr) { - kfree(pack); - return NULL; - } + if (!pack->ptr) + goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); - list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + err = set_memory_rox((unsigned long)pack->ptr, + BPF_PROG_PACK_SIZE / PAGE_SIZE); + if (err) + goto out; + list_add_tail(&pack->list, &pack_list); return pack; + +out: + bpf_jit_free_exec(pack->ptr); + kfree(pack); + return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) @@ -939,9 +946,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { + int err; + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); + err = set_memory_rox((unsigned long)ptr, + size / PAGE_SIZE); + if (err) { + bpf_jit_free_exec(ptr); + ptr = NULL; + } } goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 04fd1abd3661..cc50607f8d8c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -456,7 +456,9 @@ again: if (err < 0) goto out_free; - arch_protect_bpf_trampoline(im->image, im->size); + err = arch_protect_bpf_trampoline(im->image, im->size); + if (err) + goto out_free; WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -1072,10 +1074,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size) bpf_jit_free_exec(image); } -void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) +int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_rox((long)image, 1); + return set_memory_rox((long)image, 1); } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index de33dc1b0daa..25b75844891a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -133,7 +133,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - arch_protect_bpf_trampoline(image, PAGE_SIZE); + err = arch_protect_bpf_trampoline(image, PAGE_SIZE); + if (err) + goto out; prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); -- cgit v1.2.3 From 1a4a0cb7985f921548f1a7ac17686afbefe67f87 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 18 Mar 2024 14:25:26 +0100 Subject: bpf/lpm_trie: Inline longest_prefix_match for fastpath The BPF map type LPM (Longest Prefix Match) is used heavily in production by multiple products that have BPF components. Perf data shows trie_lookup_elem() and longest_prefix_match() being part of kernels perf top. For every level in the LPM tree trie_lookup_elem() calls out to longest_prefix_match(). The compiler is free to inline this call, but chooses not to inline, because other slowpath callers (that can be invoked via syscall) exists like trie_update_elem(), trie_delete_elem() or trie_get_next_key(). bcc/tools/funccount -Ti 1 'trie_lookup_elem|longest_prefix_match.isra.0' FUNC COUNT trie_lookup_elem 664945 longest_prefix_match.isra.0 8101507 Observation on a single random machine shows a factor 12 between the two functions. Given an average of 12 levels in the trie being searched. This patch force inlining longest_prefix_match(), but only for the lookup fastpath to balance object instruction size. In production with AMD CPUs, measuring the function latency of 'trie_lookup_elem' (bcc/tools/funclatency) we are seeing an improvement function latency reduction 7-8% with this patch applied (to production kernels 6.6 and 6.1). Analyzing perf data, we can explain this rather large improvement due to reducing the overhead for AMD side-channel mitigation SRSO (Speculative Return Stack Overflow). Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/171076828575.2141737.18370644069389889027.stgit@firesoul --- kernel/bpf/lpm_trie.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 050fe1ebf0f7..939620b91c0e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -155,16 +155,17 @@ static inline int extract_bit(const u8 *data, size_t index) } /** - * longest_prefix_match() - determine the longest prefix + * __longest_prefix_match() - determine the longest prefix * @trie: The trie to get internal sizes from * @node: The node to operate on * @key: The key to compare to @node * * Determine the longest prefix of @node that matches the bits in @key. */ -static size_t longest_prefix_match(const struct lpm_trie *trie, - const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key_u8 *key) +static __always_inline +size_t __longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; @@ -224,6 +225,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, return prefixlen; } +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) +{ + return __longest_prefix_match(trie, node, key); +} + /* Called from syscall or from eBPF program */ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { @@ -245,7 +253,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * If it's the maximum possible prefix for this trie, we have * an exact match and can return it directly. */ - matchlen = longest_prefix_match(trie, node, key); + matchlen = __longest_prefix_match(trie, node, key); if (matchlen == trie->max_prefixlen) { found = node; break; -- cgit v1.2.3 From eb166e522c77699fc19bfa705652327a1e51a117 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:48:54 -0700 Subject: bpf: Allow helper bpf_get_[ns_]current_pid_tgid() for all prog types Currently bpf_get_current_pid_tgid() is allowed in tracing, cgroup and sk_msg progs while bpf_get_ns_current_pid_tgid() is only allowed in tracing progs. We have an internal use case where for an application running in a container (with pid namespace), user wants to get the pid associated with the pid namespace in a cgroup bpf program. Currently, cgroup bpf progs already allow bpf_get_current_pid_tgid(). Let us allow bpf_get_ns_current_pid_tgid() as well. With auditing the code, bpf_get_current_pid_tgid() is also used by sk_msg prog. But there are no side effect to expose these two helpers to all prog types since they do not reveal any kernel specific data. The detailed discussion is in [1]. So with this patch, both bpf_get_current_pid_tgid() and bpf_get_ns_current_pid_tgid() are put in bpf_base_func_proto(), making them available to all program types. [1] https://lore.kernel.org/bpf/20240307232659.1115872-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240315184854.2975190-1-yonghong.song@linux.dev --- kernel/bpf/cgroup.c | 2 -- kernel/bpf/helpers.c | 4 ++++ kernel/trace/bpf_trace.c | 4 ---- net/core/filter.c | 2 -- 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 82243cb6c54d..8ba73042a239 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2575,8 +2575,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUP_NET_CLASSID diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a89587859571..9234174ccb21 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1730,6 +1730,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a5c4efc73c3..1b041911b1d8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1525,8 +1525,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: @@ -1582,8 +1580,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_get_ns_current_pid_tgid: - return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/net/core/filter.c b/net/core/filter.c index 8adf95765cdd..0c66e4a3fc5b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8342,8 +8342,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: -- cgit v1.2.3 From 6b9c2950c912780ce113079c9c52041b1e2a611a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:48 -0700 Subject: bpf: flatten bpf_probe_register call chain bpf_probe_register() and __bpf_probe_register() have identical signatures and bpf_probe_register() just redirect to __bpf_probe_register(). So get rid of this extra function call step to simplify following the source code. It has no difference at runtime due to inlining, of course. Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-2-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1b041911b1d8..30ecf62f8a17 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2425,7 +2425,7 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { struct tracepoint *tp = btp->tp; @@ -2443,11 +2443,6 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * prog); } -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - return __bpf_probe_register(btp, prog); -} - int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); -- cgit v1.2.3 From d4dfc5700e867b22ab94f960f9a9972696a637d5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:49 -0700 Subject: bpf: pass whole link instead of prog when triggering raw tracepoint Instead of passing prog as an argument to bpf_trace_runX() helpers, that are called from tracepoint triggering calls, store BPF link itself (struct bpf_raw_tp_link for raw tracepoints). This will allow to pass extra information like BPF cookie into raw tracepoint registration. Instead of replacing `struct bpf_prog *prog = __data;` with corresponding `struct bpf_raw_tp_link *link = __data;` assignment in `__bpf_trace_##call` I just passed `__data` through into underlying bpf_trace_runX() call. This works well because we implicitly cast `void *`, and it also avoids naming clashes with arguments coming from tracepoint's "proto" list. We could have run into the same problem with "prog", we just happened to not have a tracepoint that has "prog" input argument. We are less lucky with "link", as there are tracepoints using "link" argument name already. So instead of trying to avoid naming conflicts, let's just remove intermediate local variable. It doesn't hurt readibility, it's either way a bit of a maze of calls and macros, that requires careful reading. Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-3-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/trace_events.h | 36 ++++++++++++++++++++---------------- include/trace/bpf_probe.h | 3 +-- kernel/bpf/syscall.c | 9 ++------- kernel/trace/bpf_trace.c | 18 ++++++++++-------- 5 files changed, 38 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 17843e66a1d3..2ea8ce59f582 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1607,6 +1607,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_raw_tp_link { + struct bpf_link link; + struct bpf_raw_event_map *btp; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d68ff9b1247f..a7fc6fb6de3c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -759,8 +759,11 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); + +struct bpf_raw_tp_link; +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link); + struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, @@ -788,11 +791,12 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } -static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +struct bpf_raw_tp_link; +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return -EOPNOTSUPP; } -static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return -EOPNOTSUPP; } @@ -903,31 +907,31 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); -void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); -void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); -void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run1(struct bpf_raw_tp_link *link, u64 arg1); +void bpf_trace_run2(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2); +void bpf_trace_run3(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3); -void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run4(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4); -void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run5(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5); -void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run6(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6); -void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run7(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); -void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run8(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8); -void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run9(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9); -void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run10(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10); -void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run11(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11); -void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, +void bpf_trace_run12(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index e609cd7da47e..a2ea11cc912e 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -46,8 +46,7 @@ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ - struct bpf_prog *prog = __data; \ - CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ } #undef DECLARE_EVENT_CLASS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..1cb4c3809af4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3469,17 +3469,12 @@ out_put_prog: return err; } -struct bpf_raw_tp_link { - struct bpf_link link; - struct bpf_raw_event_map *btp; -}; - static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); - bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); + bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } @@ -3833,7 +3828,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } - err = bpf_probe_register(link->btp, prog); + err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 30ecf62f8a17..17de91ad4a1f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2366,8 +2366,10 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) } static __always_inline -void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct bpf_prog *prog = link->link.prog; + cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); @@ -2404,12 +2406,12 @@ out: #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ - void bpf_trace_run##x(struct bpf_prog *prog, \ + void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ - __bpf_trace_run(prog, args); \ + __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); @@ -2425,9 +2427,10 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; + struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's @@ -2439,13 +2442,12 @@ int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; - return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, - prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { - return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, -- cgit v1.2.3 From 68ca5d4eebb8c4de246ee5f634eee26bc689562d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:50 -0700 Subject: bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs Wire up BPF cookie for raw tracepoint programs (both BTF and non-BTF aware variants). This brings them up to part w.r.t. BPF cookie usage with classic tracepoint and fentry/fexit programs. Acked-by: Stanislav Fomichev Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-4-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++-- kernel/bpf/syscall.c | 13 +++++++++---- kernel/trace/bpf_trace.c | 13 +++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2ea8ce59f582..62762390c93d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1610,6 +1610,7 @@ struct bpf_tracing_link { struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; + u64 cookie; }; struct bpf_link_primer { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c42b9f1bada..9585f5345353 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1662,8 +1662,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1cb4c3809af4..e44c276e8617 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3774,7 +3774,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name) + const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3821,6 +3821,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog); link->btp = btp; + link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -3841,11 +3842,13 @@ out_put_btp: return err; } -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; + void __user *tp_name; + __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) @@ -3855,7 +3858,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); + cookie = attr->raw_tracepoint.cookie; + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; @@ -5193,7 +5198,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 17de91ad4a1f..434e3ece6688 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2004,6 +2004,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2066,6 +2068,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); @@ -2369,15 +2374,23 @@ static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { struct bpf_prog *prog = link->link.prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } + + run_ctx.bpf_cookie = link->cookie; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c42b9f1bada..bf80b614c4db 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1664,6 +1664,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ -- cgit v1.2.3 From 4c2a26fc80bcb851dc630590f2eec157991eccbf Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Wed, 20 Mar 2024 20:29:54 -0400 Subject: bpf-next: Avoid goto in regs_refine_cond_op() In case of GE/GT/SGE/JST instructions, regs_refine_cond_op() reuses the logic that does analysis of LE/LT/SLE/SLT instructions. This commit avoids the use of a goto to perform the reuse. Signed-off-by: Harishankar Vishwanathan Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240321002955.808604-1-harishankar.vishwanathan@gmail.com --- kernel/bpf/verifier.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de7813947981..ca6cacf7b42f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14544,7 +14544,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state struct tnum t; u64 val; -again: + /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */ + switch (opcode) { + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + opcode = flip_opcode(opcode); + swap(reg1, reg2); + break; + default: + break; + } + switch (opcode) { case BPF_JEQ: if (is_jmp32) { @@ -14687,14 +14699,6 @@ again: reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - case BPF_JGE: - case BPF_JGT: - case BPF_JSGE: - case BPF_JSGT: - /* just reuse LE/LT logic above */ - opcode = flip_opcode(opcode); - swap(reg1, reg2); - goto again; default: return; } -- cgit v1.2.3 From 3774b28d8f3b9e8a946beb9550bee85e5454fc9f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 18 Mar 2024 20:50:04 -0400 Subject: locking/qspinlock: Always evaluate lockevent* non-event parameter once The 'inc' parameter of lockevent_add() and the cond parameter of lockevent_cond_inc() are only evaluated when CONFIG_LOCK_EVENT_COUNTS is on. That can cause problem if those parameters are expressions with side effect like a "++". Fix this by evaluating those non-event parameters once even if CONFIG_LOCK_EVENT_COUNTS is off. This will also eliminate the need of the __maybe_unused attribute to the wait_early local variable in pv_wait_node(). Suggested-by: Ingo Molnar Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Reviewed-by: Boqun Feng Link: https://lore.kernel.org/r/20240319005004.1692705-1-longman@redhat.com --- kernel/locking/lock_events.h | 4 ++-- kernel/locking/qspinlock_paravirt.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index a6016b91803d..d2345e9c0190 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -53,8 +53,8 @@ static inline void __lockevent_add(enum lock_events event, int inc) #else /* CONFIG_LOCK_EVENT_COUNTS */ #define lockevent_inc(ev) -#define lockevent_add(ev, c) -#define lockevent_cond_inc(ev, c) +#define lockevent_add(ev, c) do { (void)(c); } while (0) +#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0) #endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ae2b12f68b90..169950fe1aad 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -294,7 +294,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - bool __maybe_unused wait_early; + bool wait_early; int loop; for (;;) { -- cgit v1.2.3 From 91a1d97ef482c1e4c9d4c1c656a53b0f6b16d0ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Mar 2024 19:01:03 +0100 Subject: jump_label,module: Don't alloc static_key_mod for __ro_after_init keys When a static_key is marked ro_after_init, its state will never change (after init), therefore jump_label_update() will never need to iterate the entries, and thus module load won't actually need to track this -- avoiding the static_key::next write. Therefore, mark these keys such that jump_label_add_module() might recognise them and avoid the modification. Use the special state: 'static_key_linked(key) && !static_key_mod(key)' to denote such keys. jump_label_add_module() does not exist under CONFIG_JUMP_LABEL=n, so the newly-introduced jump_label_init_ro() can be defined as a nop for that configuration. [ mingo: Renamed jump_label_ro() to jump_label_init_ro() ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-2-vschneid@redhat.com --- include/asm-generic/sections.h | 5 ++++ include/linux/jump_label.h | 3 +++ init/main.c | 1 + kernel/jump_label.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+) (limited to 'kernel') diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index db13bb620f52..c768de6f19a9 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -180,6 +180,11 @@ static inline bool is_kernel_rodata(unsigned long addr) addr < (unsigned long)__end_rodata; } +static inline bool is_kernel_ro_after_init(unsigned long addr) +{ + return addr >= (unsigned long)__start_ro_after_init && + addr < (unsigned long)__end_ro_after_init; +} /** * is_kernel_inittext - checks if the pointer address is located in the * .init.text section diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f0a949b7c973..f5a2727ca4a9 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -216,6 +216,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); +extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -265,6 +266,8 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } +static __always_inline void jump_label_init_ro(void) { } + static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) diff --git a/init/main.c b/init/main.c index 2ca52474d0c3..6c3f251d6ef8 100644 --- a/init/main.c +++ b/init/main.c @@ -1408,6 +1408,7 @@ static void mark_readonly(void) * insecure pages which are W+X. */ flush_module_init_free_work(); + jump_label_init_ro(); mark_rodata_ro(); debug_checkwx(); rodata_test(); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d9c822bbffb8..3218fa5688b9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -530,6 +530,45 @@ void __init jump_label_init(void) cpus_read_unlock(); } +static inline bool static_key_sealed(struct static_key *key) +{ + return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); +} + +static inline void static_key_seal(struct static_key *key) +{ + unsigned long type = key->type & JUMP_TYPE_TRUE; + key->type = JUMP_TYPE_LINKED | type; +} + +void jump_label_init_ro(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + if (WARN_ON_ONCE(!static_key_initialized)) + return; + + cpus_read_lock(); + jump_label_lock(); + + for (iter = iter_start; iter < iter_stop; iter++) { + struct static_key *iterk = jump_entry_key(iter); + + if (!is_kernel_ro_after_init((unsigned long)iterk)) + continue; + + if (static_key_sealed(iterk)) + continue; + + static_key_seal(iterk); + } + + jump_label_unlock(); + cpus_read_unlock(); +} + #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -650,6 +689,15 @@ static int jump_label_add_module(struct module *mod) static_key_set_entries(key, iter); continue; } + + /* + * If the key was sealed at init, then there's no need to keep a + * reference to its module entries - just patch them now and be + * done with it. + */ + if (static_key_sealed(key)) + goto do_poke; + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; @@ -675,6 +723,7 @@ static int jump_label_add_module(struct module *mod) static_key_set_linked(key); /* Only update if we've changed from our initial state */ +do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } @@ -699,6 +748,10 @@ static void jump_label_del_module(struct module *mod) if (within_module((unsigned long)key, mod)) continue; + /* No @jlm allocated because key was sealed at init. */ + if (static_key_sealed(key)) + continue; + /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; -- cgit v1.2.3 From 4eab44a8ae98df53e59f6af3c860142177235507 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 13 Mar 2024 19:01:04 +0100 Subject: context_tracking: Make context_tracking_key __ro_after_init context_tracking_key is only ever enabled in __init ct_cpu_tracker_user(), so mark it as __ro_after_init. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-3-vschneid@redhat.com --- kernel/context_tracking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 70ae70d03823..24b1e1143260 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -432,7 +432,7 @@ static __always_inline void ct_kernel_enter(bool user, int offset) { } #define CREATE_TRACE_POINTS #include -DEFINE_STATIC_KEY_FALSE(context_tracking_key); +DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key); EXPORT_SYMBOL_GPL(context_tracking_key); static noinstr bool context_tracking_recursion_enter(void) -- cgit v1.2.3 From 77222b0d12e8ae6f082261842174cc2e981bf99c Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:49 +0000 Subject: sched/topology: Export asym_cap_list So that we can use it to iterate through available capacities in the system. Sort asym_cap_list in descending order as expected users are likely to be interested on the highest capacity first. Make the list RCU protected to allow for cheap access in hot paths. Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-2-qyousef@layalina.io --- kernel/sched/sched.h | 14 ++++++++++++++ kernel/sched/topology.c | 43 +++++++++++++++++++++++++------------------ 2 files changed, 39 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 41024c1c49b4..f77c00dddfe1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -109,6 +109,20 @@ extern int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; +/* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + /* * Helpers for converting nanosecond timing to jiffy resolution */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..44ed3d0812ab 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1329,24 +1329,13 @@ next: update_group_capacity(sd, cpu); } -/* - * Asymmetric CPU capacity bits - */ -struct asym_cap_data { - struct list_head link; - unsigned long capacity; - unsigned long cpus[]; -}; - /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same * capacity. * The lifespan of data is unlimited. */ -static LIST_HEAD(asym_cap_list); - -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) +LIST_HEAD(asym_cap_list); /* * Verify whether there is any CPU capacity asymmetry in a given sched domain. @@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span, } +static void free_asym_cap_entry(struct rcu_head *head) +{ + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); + kfree(entry); +} + static inline void asym_cpu_capacity_update_data(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - struct asym_cap_data *entry = NULL; + struct asym_cap_data *insert_entry = NULL; + struct asym_cap_data *entry; + /* + * Search if capacity already exits. If not, track which the entry + * where we should insert to keep the list ordered descendingly. + */ list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) goto done; + else if (!insert_entry && capacity > entry->capacity) + insert_entry = list_prev_entry(entry, link); } entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) return; entry->capacity = capacity; - list_add(&entry->link, &asym_cap_list); + + /* If NULL then the new capacity is the smallest, add last. */ + if (!insert_entry) + list_add_tail_rcu(&entry->link, &asym_cap_list); + else + list_add_rcu(&entry->link, &insert_entry->link); done: __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); } @@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry_safe(entry, next, &asym_cap_list, link) { if (cpumask_empty(cpu_capacity_span(entry))) { - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void) */ if (list_is_singular(&asym_cap_list)) { entry = list_first_entry(&asym_cap_list, typeof(*entry), link); - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } -- cgit v1.2.3 From 22d5607400c62c72da9b60e3324744be83e147a4 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:50 +0000 Subject: sched/fair: Check if a task has a fitting CPU when updating misfit If a misfit task is affined to a subset of the possible CPUs, we need to verify that one of these CPUs can fit it. Otherwise the load balancer code will continuously trigger needlessly leading the balance_interval to increase in return and eventually end up with a situation where real imbalances take a long time to address because of this impossible imbalance situation. This can happen in Android world where it's common for background tasks to be restricted to little cores. Similarly if we can't fit the biggest core, triggering misfit is pointless as it is the best we can ever get on this system. To be able to detect that; we use asym_cap_list to iterate through capacities in the system to see if the task is able to run at a higher capacity level based on its p->cpus_ptr. We do that when the affinity change, a fair task is forked, or when a task switched to fair policy. We store the max_allowed_capacity in task_struct to allow for cheap comparison in the fast path. Improve check_misfit_status() function by removing redundant checks. misfit_task_load will be 0 if the task can't move to a bigger CPU. And nohz_balancer_kick() already checks for cpu_check_capacity() before calling check_misfit_status(). Test: ===== Add trace_printk("balance_interval = %lu\n", interval) in get_sd_balance_interval(). run if [ "$MASK" != "0" ]; then adb shell "taskset -a $MASK cat /dev/zero > /dev/null" fi sleep 10 // parse ftrace buffer counting the occurrence of each valaue Where MASK is either: * 0: no busy task running * 1: busy task is pinned to 1 cpu; handled today to not cause misfit * f: busy task pinned to little cores, simulates busy background task, demonstrates the problem to be fixed Results: ======== Note how occurrence of balance_interval = 128 overshoots for MASK = f. BEFORE ------ MASK=0 1 balance_interval = 175 120 balance_interval = 128 846 balance_interval = 64 55 balance_interval = 63 215 balance_interval = 32 2 balance_interval = 31 2 balance_interval = 16 4 balance_interval = 8 1870 balance_interval = 4 65 balance_interval = 2 MASK=1 27 balance_interval = 175 37 balance_interval = 127 840 balance_interval = 64 167 balance_interval = 63 449 balance_interval = 32 84 balance_interval = 31 304 balance_interval = 16 1156 balance_interval = 8 2781 balance_interval = 4 428 balance_interval = 2 MASK=f 1 balance_interval = 175 1328 balance_interval = 128 44 balance_interval = 64 101 balance_interval = 63 25 balance_interval = 32 5 balance_interval = 31 23 balance_interval = 16 23 balance_interval = 8 4306 balance_interval = 4 177 balance_interval = 2 AFTER ----- Note how the high values almost disappear for all MASK values. The system has background tasks that could trigger the problem without simulate it even with MASK=0. MASK=0 103 balance_interval = 63 19 balance_interval = 31 194 balance_interval = 8 4827 balance_interval = 4 179 balance_interval = 2 MASK=1 131 balance_interval = 63 1 balance_interval = 31 87 balance_interval = 8 3600 balance_interval = 4 7 balance_interval = 2 MASK=f 8 balance_interval = 127 182 balance_interval = 63 3 balance_interval = 31 9 balance_interval = 16 415 balance_interval = 8 3415 balance_interval = 4 21 balance_interval = 2 Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-3-qyousef@layalina.io --- include/linux/sched.h | 1 + init/init_task.c | 1 + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ed40e9f6155..c75fd46506df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct task_struct { #endif unsigned int policy; + unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; diff --git a/init/init_task.c b/init/init_task.c index 4daee6d761c8..2558b719e053 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, + .max_allowed_capacity = SCHED_CAPACITY_SCALE, .nr_cpus_allowed= NR_CPUS, .mm = NULL, .active_mm = &init_mm, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8270e2e15cb..c47c4f2e28f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5098,15 +5098,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + int cpu = cpu_of(rq); + if (!sched_asym_cpucap_active()) return; - if (!p || p->nr_cpus_allowed == 1) { - rq->misfit_task_load = 0; - return; - } + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { - if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -8253,6 +8257,36 @@ static void task_dead_fair(struct task_struct *p) remove_entity_load_avg(&p->se); } +/* + * Set the max capacity the task is allowed to run at for misfit detection. + */ +static void set_task_max_allowed_capacity(struct task_struct *p) +{ + struct asym_cap_data *entry; + + if (!sched_asym_cpucap_active()) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; + + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); +} + +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) +{ + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} + static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -8261,6 +8295,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) @@ -9610,16 +9646,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -11923,7 +11953,7 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { + if (check_misfit_status(rq)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12648,6 +12678,8 @@ static void task_fork_fair(struct task_struct *p) rq_lock(rq, &rf); update_rq_clock(rq); + set_task_max_allowed_capacity(p); + cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; if (curr) @@ -12771,6 +12803,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) { attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so @@ -13142,7 +13176,7 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, -- cgit v1.2.3 From fa427e8e53d8db15090af7e952a55870dc2a453f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:51 +0000 Subject: sched/topology: Remove root_domain::max_cpu_capacity The value is no longer used as we now keep track of max_allowed_capacity for each task instead. Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-4-qyousef@layalina.io --- kernel/sched/sched.h | 2 -- kernel/sched/topology.c | 13 ++----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f77c00dddfe1..4f9e952d4fad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -917,8 +917,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 44ed3d0812ab..63aecd2a7a9f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2514,16 +2514,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - unsigned long capacity; - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - capacity = arch_scale_cpu_capacity(i); - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, capacity); - cpu_attach_domain(sd, d.rd, i); if (lowest_flag_domain(i, SD_CLUSTER)) @@ -2537,10 +2530,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_cluster) static_branch_inc_cpuslocked(&sched_cluster_active); - if (rq && sched_debug_verbose) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); ret = 0; error: -- cgit v1.2.3 From 58eeb2d79b542c678c46e245dba6b66936368a99 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:52 +0000 Subject: sched/fair: Don't double balance_interval for migrate_misfit It is not necessarily an indication of the system being busy and requires a backoff of the load balancer activities. But pushing it high could mean generally delaying other misfit activities or other type of imbalances. Also don't pollute nr_balance_failed because of misfit failures. The value is used for enabling cache hot migration and in migrate_util/load types. None of which should be impacted (skewed) by misfit failures. Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-5-qyousef@layalina.io --- kernel/sched/fair.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c47c4f2e28f7..dbf4f1c44259 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11449,8 +11449,12 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. + * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -11533,8 +11537,13 @@ out_one_pinned: * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ -- cgit v1.2.3 From a8497506cd2c0fc90a64f6f5d2744a0ddb2c81eb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 14:20:13 -0700 Subject: bpf: Avoid get_kernel_nofault() to fetch kprobe entry IP get_kernel_nofault() (or, rather, underlying copy_from_kernel_nofault()) is not free and it does pop up in performance profiles when kprobes are heavily utilized with CONFIG_X86_KERNEL_IBT=y config. Let's avoid using it if we know that fentry_ip - 4 can't cross page boundary. We do that by masking lowest 12 bits and checking if they are Another benefit (and actually what caused a closer look at this part of code) is that now LBR record is (typically) not wasted on copy_from_kernel_nofault() call and code, which helps tools like retsnoop that grab LBR records from inside BPF code in kretprobes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/bpf/20240319212013.1046779-1-andrii@kernel.org --- kernel/trace/bpf_trace.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 434e3ece6688..6d0c95638e1b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1053,9 +1053,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip) { u32 instr; - /* Being extra safe in here in case entry ip is on the page-edge. */ - if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) - return fentry_ip; + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { + if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) + return fentry_ip; + } else { + instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); + } if (is_endbr(instr)) fentry_ip -= ENDBR_INSN_SIZE; return fentry_ip; -- cgit v1.2.3 From 1211f3b21c2aa0d22d8d7f050e3a5930a91cd0e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:02 -1000 Subject: workqueue: Preserve OFFQ bits in cancel[_sync] paths The cancel[_sync] paths acquire and release WORK_STRUCT_PENDING, and manipulate WORK_OFFQ_CANCELING. However, they assume that all the OFFQ bit values except for the pool ID are statically known and don't preserve them, which is not wrong in the current code as the pool ID and CANCELING are the only information carried. However, the planned disable/enable support will add more fields and need them to be preserved. This patch updates work data handling so that only the bits which need updating are updated. - struct work_offq_data is added along with work_offqd_unpack() and work_offqd_pack_flags() to help manipulating multiple fields contained in work->data. Note that the helpers look a bit silly right now as there isn't that much to pack. The next patch will add more. - mark_work_canceling() which is used only by __cancel_work_sync() is replaced by open-coded usage of work_offq_data and set_work_pool_and_keep_pending() in __cancel_work_sync(). - __cancel_work[_sync]() uses offq_data helpers to preserve other OFFQ bits when clearing WORK_STRUCT_PENDING and WORK_OFFQ_CANCELING at the end. - This removes all users of get_work_pool_id() which is dropped. Note that get_work_pool_id() could handle both WORK_STRUCT_PWQ and !WORK_STRUCT_PWQ cases; however, it was only being called after try_to_grab_pending() succeeded, in which case WORK_STRUCT_PWQ is never set and thus it's safe to use work_offqd_unpack() instead. No behavior changes intended. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 51 ++++++++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 158784dd189a..ae7ae4a51499 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -97,6 +97,7 @@ enum wq_misc_consts { /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) +#define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0066c8f6c154..d8f37cfa9935 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -392,6 +392,11 @@ struct wq_pod_type { int *cpu_pod; /* cpu -> pod */ }; +struct work_offq_data { + u32 pool_id; + u32 flags; +}; + static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", @@ -892,29 +897,23 @@ static struct worker_pool *get_work_pool(struct work_struct *work) return idr_find(&worker_pool_idr, pool_id); } -/** - * get_work_pool_id - return the worker pool ID a given work is associated with - * @work: the work item of interest - * - * Return: The worker_pool ID @work was last associated with. - * %WORK_OFFQ_POOL_NONE if none. - */ -static int get_work_pool_id(struct work_struct *work) +static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { - unsigned long data = atomic_long_read(&work->data); + return (v >> shift) & ((1 << bits) - 1); +} - if (data & WORK_STRUCT_PWQ) - return work_struct_pwq(data)->pool->id; +static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) +{ + WARN_ON_ONCE(data & WORK_STRUCT_PWQ); - return data >> WORK_OFFQ_POOL_SHIFT; + offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, + WORK_OFFQ_POOL_BITS); + offqd->flags = data & WORK_OFFQ_FLAG_MASK; } -static void mark_work_canceling(struct work_struct *work) +static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { - unsigned long pool_id = get_work_pool_id(work); - - pool_id <<= WORK_OFFQ_POOL_SHIFT; - set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING); + return (unsigned long)offqd->flags; } static bool work_is_canceling(struct work_struct *work) @@ -4271,6 +4270,7 @@ EXPORT_SYMBOL(flush_rcu_work); static bool __cancel_work(struct work_struct *work, u32 cflags) { + struct work_offq_data offqd; unsigned long irq_flags; int ret; @@ -4281,19 +4281,26 @@ static bool __cancel_work(struct work_struct *work, u32 cflags) if (unlikely(ret < 0)) return false; - set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0); + work_offqd_unpack(&offqd, *work_data_bits(work)); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { + struct work_offq_data offqd; unsigned long irq_flags; bool ret; /* claim @work and tell other tasks trying to grab @work to back off */ ret = work_grab_pending(work, cflags, &irq_flags); - mark_work_canceling(work); + + work_offqd_unpack(&offqd, *work_data_bits(work)); + offqd.flags |= WORK_OFFQ_CANCELING; + set_work_pool_and_keep_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); /* @@ -4303,12 +4310,16 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) if (wq_online) __flush_work(work, true); + work_offqd_unpack(&offqd, *work_data_bits(work)); + /* * smp_mb() at the end of set_work_pool_and_clear_pending() is paired * with prepare_to_wait() above so that either waitqueue_active() is * visible here or !work_is_canceling() is visible there. */ - set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0); + offqd.flags &= ~WORK_OFFQ_CANCELING; + set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, + work_offqd_pack_flags(&offqd)); if (waitqueue_active(&wq_cancel_waitq)) __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); -- cgit v1.2.3 From 86898fa6b8cd942505860556f3a0bf52eae57fe8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: workqueue: Implement disable/enable for (delayed) work items While (delayed) work items could be flushed and canceled, there was no way to prevent them from being queued in the future. While this didn't lead to functional deficiencies, it sometimes required a bit more effort from the workqueue users to e.g. sequence shutdown steps with more care. Workqueue is currently in the process of replacing tasklet which does support disabling and enabling. The feature is used relatively widely to, for example, temporarily suppress main path while a control plane operation (reset or config change) is in progress. To enable easy conversion of tasklet users and as it seems like an inherent useful feature, this patch implements disabling and enabling of work items. - A work item carries 16bit disable count in work->data while not queued. The access to the count is synchronized by the PENDING bit like all other parts of work->data. - If the count is non-zero, the work item cannot be queued. Any attempt to queue the work item fails and returns %false. - disable_work[_sync](), enable_work(), disable_delayed_work[_sync]() and enable_delayed_work() are added. v3: enable_work() was using local_irq_enable() instead of local_irq_restore() to undo IRQ-disable by work_grab_pending(). This is awkward now and will become incorrect as enable_work() will later be used from IRQ context too. (Lai) v2: Lai noticed that queue_work_node() wasn't checking the disable count. Fixed. queue_rcu_work() is updated to trigger warning if the inner work item is disabled. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 18 ++++- kernel/workqueue.c | 177 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 182 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ae7ae4a51499..bd80e66298a0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -51,20 +51,23 @@ enum work_bits { * data contains off-queue information when !WORK_STRUCT_PWQ. * * MSB - * [ pool ID ] [ OFFQ flags ] [ STRUCT flags ] - * 1 bit 4 or 5 bits + * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] + * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, + WORK_OFFQ_DISABLE_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, + WORK_OFFQ_DISABLE_BITS = 16, + /* * When a work item is off queue, the high bits encode off-queue flags * and the last pool it was on. Cap pool ID to 31 bits and use the * highest number to indicate that no pool is associated. */ - WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, + WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, }; @@ -98,6 +101,7 @@ enum wq_misc_consts { /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) +#define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) @@ -560,6 +564,14 @@ extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); +extern bool disable_work(struct work_struct *work); +extern bool disable_work_sync(struct work_struct *work); +extern bool enable_work(struct work_struct *work); + +extern bool disable_delayed_work(struct delayed_work *dwork); +extern bool disable_delayed_work_sync(struct delayed_work *dwork); +extern bool enable_delayed_work(struct delayed_work *dwork); + extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d8f37cfa9935..5c53dde877fd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,6 +99,7 @@ enum worker_flags { enum work_cancel_flags { WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ + WORK_CANCEL_DISABLE = 1 << 1, /* canceling to disable */ }; enum wq_internal_consts { @@ -394,6 +395,7 @@ struct wq_pod_type { struct work_offq_data { u32 pool_id; + u32 disable; u32 flags; }; @@ -908,12 +910,15 @@ static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS); + offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, + WORK_OFFQ_DISABLE_BITS); offqd->flags = data & WORK_OFFQ_FLAG_MASK; } static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { - return (unsigned long)offqd->flags; + return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | + ((unsigned long)offqd->flags); } static bool work_is_canceling(struct work_struct *work) @@ -2408,6 +2413,21 @@ out: rcu_read_unlock(); } +static bool clear_pending_if_disabled(struct work_struct *work) +{ + unsigned long data = *work_data_bits(work); + struct work_offq_data offqd; + + if (likely((data & WORK_STRUCT_PWQ) || + !(data & WORK_OFFQ_DISABLE_MASK))) + return false; + + work_offqd_unpack(&offqd, data); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + return true; +} + /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on @@ -2430,7 +2450,8 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } @@ -2508,7 +2529,8 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); @@ -2590,7 +2612,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, /* read the comment in __queue_work() */ local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } @@ -2663,7 +2686,12 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + /* + * rcu_work can't be canceled or disabled. Warn if the user reached + * inside @rwork and disabled the inner work. + */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; @@ -4268,20 +4296,46 @@ bool flush_rcu_work(struct rcu_work *rwork) } EXPORT_SYMBOL(flush_rcu_work); +static void work_offqd_disable(struct work_offq_data *offqd) +{ + const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; + + if (likely(offqd->disable < max)) + offqd->disable++; + else + WARN_ONCE(true, "workqueue: work disable count overflowed\n"); +} + +static void work_offqd_enable(struct work_offq_data *offqd) +{ + if (likely(offqd->disable > 0)) + offqd->disable--; + else + WARN_ONCE(true, "workqueue: work disable count underflowed\n"); +} + static bool __cancel_work(struct work_struct *work, u32 cflags) { struct work_offq_data offqd; unsigned long irq_flags; int ret; - do { - ret = try_to_grab_pending(work, cflags, &irq_flags); - } while (unlikely(ret == -EAGAIN)); + if (cflags & WORK_CANCEL_DISABLE) { + ret = work_grab_pending(work, cflags, &irq_flags); + } else { + do { + ret = try_to_grab_pending(work, cflags, &irq_flags); + } while (unlikely(ret == -EAGAIN)); - if (unlikely(ret < 0)) - return false; + if (unlikely(ret < 0)) + return false; + } work_offqd_unpack(&offqd, *work_data_bits(work)); + + if (cflags & WORK_CANCEL_DISABLE) + work_offqd_disable(&offqd); + set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); @@ -4298,6 +4352,10 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); + + if (cflags & WORK_CANCEL_DISABLE) + work_offqd_disable(&offqd); + offqd.flags |= WORK_OFFQ_CANCELING; set_work_pool_and_keep_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); @@ -4397,6 +4455,105 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); +/** + * disable_work - Disable and cancel a work item + * @work: work item to disable + * + * Disable @work by incrementing its disable count and cancel it if currently + * pending. As long as the disable count is non-zero, any attempt to queue @work + * will fail and return %false. The maximum supported disable depth is 2 to the + * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. + * + * Must be called from a sleepable context. Returns %true if @work was pending, + * %false otherwise. + */ +bool disable_work(struct work_struct *work) +{ + return __cancel_work(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work); + +/** + * disable_work_sync - Disable, cancel and drain a work item + * @work: work item to disable + * + * Similar to disable_work() but also wait for @work to finish if currently + * executing. + * + * Must be called from a sleepable context. Returns %true if @work was pending, + * %false otherwise. + */ +bool disable_work_sync(struct work_struct *work) +{ + return __cancel_work_sync(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work_sync); + +/** + * enable_work - Enable a work item + * @work: work item to enable + * + * Undo disable_work[_sync]() by decrementing @work's disable count. @work can + * only be queued if its disable count is 0. + * + * Must be called from a sleepable context. Returns %true if the disable count + * reached 0. Otherwise, %false. + */ +bool enable_work(struct work_struct *work) +{ + struct work_offq_data offqd; + unsigned long irq_flags; + + work_grab_pending(work, 0, &irq_flags); + + work_offqd_unpack(&offqd, *work_data_bits(work)); + work_offqd_enable(&offqd); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + local_irq_restore(irq_flags); + + return !offqd.disable; +} +EXPORT_SYMBOL_GPL(enable_work); + +/** + * disable_delayed_work - Disable and cancel a delayed work item + * @dwork: delayed work item to disable + * + * disable_work() for delayed work items. + */ +bool disable_delayed_work(struct delayed_work *dwork) +{ + return __cancel_work(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work); + +/** + * disable_delayed_work_sync - Disable, cancel and drain a delayed work item + * @dwork: delayed work item to disable + * + * disable_work_sync() for delayed work items. + */ +bool disable_delayed_work_sync(struct delayed_work *dwork) +{ + return __cancel_work_sync(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work_sync); + +/** + * enable_delayed_work - Enable a delayed work item + * @dwork: delayed work item to enable + * + * enable_work() for delayed work items. + */ +bool enable_delayed_work(struct delayed_work *dwork) +{ + return enable_work(&dwork->work); +} +EXPORT_SYMBOL_GPL(enable_delayed_work); + /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call -- cgit v1.2.3 From f09b10b6f442656524d2ee26e45966401a14f54b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: workqueue: Remove WORK_OFFQ_CANCELING cancel[_delayed]_work_sync() guarantees that it can shut down self-requeueing work items. To achieve that, it grabs and then holds WORK_STRUCT_PENDING bit set while flushing the currently executing instance. As the PENDING bit is set, all queueing attempts including the self-requeueing ones fail and once the currently executing instance is flushed, the work item should be idle as long as someone else isn't actively queueing it. This means that the cancel_work_sync path may hold the PENDING bit set while flushing the target work item. This isn't a problem for the queueing path - it can just fail which is the desired effect. It doesn't affect flush. It doesn't matter to cancel_work either as it can just report that the work item has successfully canceled. However, if there's another cancel_work_sync attempt on the work item, it can't simply fail or report success and that would breach the guarantee that it should provide. cancel_work_sync has to wait for and grab that PENDING bit and go through the motions. WORK_OFFQ_CANCELING and wq_cancel_waitq are what implement this cancel_work_sync to cancel_work_sync wait mechanism. When a work item is being canceled, WORK_OFFQ_CANCELING is also set on it and other cancel_work_sync attempts wait on the bit to be cleared using the wait queue. While this works, it's an isolated wart which doesn't jive with the rest of flush and cancel mechanisms and forces enable_work() and disable_work() to require a sleepable context, which hampers their usability. Now that a work item can be disabled, we can use that to block queueing while cancel_work_sync is in progress. Instead of holding PENDING the bit, it can temporarily disable the work item, flush and then re-enable it as that'd achieve the same end result of blocking queueings while canceling and thus enable canceling of self-requeueing work items. - WORK_OFFQ_CANCELING and the surrounding mechanims are removed. - work_grab_pending() is now simpler, no longer has to wait for a blocking operation and thus can be called from any context. - With work_grab_pending() simplified, no need to use try_to_grab_pending() directly. All users are converted to use work_grab_pending(). - __cancel_work_sync() is updated to __cancel_work() with WORK_CANCEL_DISABLE to cancel and plug racing queueing attempts. It then flushes and re-enables the work item if necessary. - These changes allow disable_work() and enable_work() to be called from any context. v2: Lai pointed out that mod_delayed_work_on() needs to check the disable count before queueing the delayed work item. Added clear_pending_if_disabled() call. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 4 +- kernel/workqueue.c | 140 +++++++--------------------------------------- 2 files changed, 20 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bd80e66298a0..a5075969931b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -52,10 +52,9 @@ enum work_bits { * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] - * 16 bits 1 bit 4 or 5 bits + * 16 bits 0 bits 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, - WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, @@ -99,7 +98,6 @@ enum wq_misc_consts { }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ -#define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5c53dde877fd..e80a815ec172 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -496,12 +496,6 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; -/* - * Used to synchronize multiple cancel_sync attempts on the same work item. See - * work_grab_pending() and __cancel_work_sync(). - */ -static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq); - /* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread @@ -783,11 +777,6 @@ static int work_next_color(int color) * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. - * - * %WORK_OFFQ_CANCELING is used to mark a work item which is being - * canceled. While being canceled, a work item may have its PENDING set - * but stay off timer and worklist for arbitrarily long and nobody should - * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { @@ -921,13 +910,6 @@ static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) ((unsigned long)offqd->flags); } -static bool work_is_canceling(struct work_struct *work) -{ - unsigned long data = atomic_long_read(&work->data); - - return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); -} - /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that @@ -2058,8 +2040,6 @@ out_put: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry - * -ENOENT if someone else is canceling @work, this state may persist - * for arbitrarily long * ======== ================================================================ * * Note: @@ -2155,26 +2135,9 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, fail: rcu_read_unlock(); local_irq_restore(*irq_flags); - if (work_is_canceling(work)) - return -ENOENT; - cpu_relax(); return -EAGAIN; } -struct cwt_wait { - wait_queue_entry_t wait; - struct work_struct *work; -}; - -static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) -{ - struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); - - if (cwait->work != key) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal @@ -2184,7 +2147,7 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * - * Must be called in process context. IRQ is disabled on return with IRQ state + * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * @@ -2193,41 +2156,14 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { - struct cwt_wait cwait; int ret; - might_sleep(); -repeat: - ret = try_to_grab_pending(work, cflags, irq_flags); - if (likely(ret >= 0)) - return ret; - if (ret != -ENOENT) - goto repeat; - - /* - * Someone is already canceling. Wait for it to finish. flush_work() - * doesn't work for PREEMPT_NONE because we may get woken up between - * @work's completion and the other canceling task resuming and clearing - * CANCELING - flush_work() will return false immediately as @work is no - * longer busy, try_to_grab_pending() will return -ENOENT as @work is - * still being canceled and the other canceling task won't be able to - * clear CANCELING as we're hogging the CPU. - * - * Let's wait for completion using a waitqueue. As this may lead to the - * thundering herd problem, use a custom wake function which matches - * @work along with exclusive wait and wakeup. - */ - init_wait(&cwait.wait); - cwait.wait.func = cwt_wakefn; - cwait.work = work; - - prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait, - TASK_UNINTERRUPTIBLE); - if (work_is_canceling(work)) - schedule(); - finish_wait(&wq_cancel_waitq, &cwait.wait); - - goto repeat; + while (true) { + ret = try_to_grab_pending(work, cflags, irq_flags); + if (ret >= 0) + return ret; + cpu_relax(); + } } /** @@ -2645,19 +2581,14 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; - int ret; + bool ret; - do { - ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, - &irq_flags); - } while (unlikely(ret == -EAGAIN)); + ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); - if (likely(ret >= 0)) { + if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(irq_flags); - } - /* -ENOENT from try_to_grab_pending() becomes %true */ + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); @@ -4320,16 +4251,7 @@ static bool __cancel_work(struct work_struct *work, u32 cflags) unsigned long irq_flags; int ret; - if (cflags & WORK_CANCEL_DISABLE) { - ret = work_grab_pending(work, cflags, &irq_flags); - } else { - do { - ret = try_to_grab_pending(work, cflags, &irq_flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - } + ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); @@ -4344,22 +4266,9 @@ static bool __cancel_work(struct work_struct *work, u32 cflags) static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { - struct work_offq_data offqd; - unsigned long irq_flags; bool ret; - /* claim @work and tell other tasks trying to grab @work to back off */ - ret = work_grab_pending(work, cflags, &irq_flags); - - work_offqd_unpack(&offqd, *work_data_bits(work)); - - if (cflags & WORK_CANCEL_DISABLE) - work_offqd_disable(&offqd); - - offqd.flags |= WORK_OFFQ_CANCELING; - set_work_pool_and_keep_pending(work, offqd.pool_id, - work_offqd_pack_flags(&offqd)); - local_irq_restore(irq_flags); + ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); /* * Skip __flush_work() during early boot when we know that @work isn't @@ -4368,19 +4277,8 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) if (wq_online) __flush_work(work, true); - work_offqd_unpack(&offqd, *work_data_bits(work)); - - /* - * smp_mb() at the end of set_work_pool_and_clear_pending() is paired - * with prepare_to_wait() above so that either waitqueue_active() is - * visible here or !work_is_canceling() is visible there. - */ - offqd.flags &= ~WORK_OFFQ_CANCELING; - set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, - work_offqd_pack_flags(&offqd)); - - if (waitqueue_active(&wq_cancel_waitq)) - __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); + if (!(cflags & WORK_CANCEL_DISABLE)) + enable_work(work); return ret; } @@ -4464,8 +4362,8 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); * will fail and return %false. The maximum supported disable depth is 2 to the * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. * - * Must be called from a sleepable context. Returns %true if @work was pending, - * %false otherwise. + * Can be called from any context. Returns %true if @work was pending, %false + * otherwise. */ bool disable_work(struct work_struct *work) { @@ -4496,8 +4394,8 @@ EXPORT_SYMBOL_GPL(disable_work_sync); * Undo disable_work[_sync]() by decrementing @work's disable count. @work can * only be queued if its disable count is 0. * - * Must be called from a sleepable context. Returns %true if the disable count - * reached 0. Otherwise, %false. + * Can be called from any context. Returns %true if the disable count reached 0. + * Otherwise, %false. */ bool enable_work(struct work_struct *work) { -- cgit v1.2.3 From 456a78eef2670d0e9521e87f35a056de8fec7fb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: workqueue: Remember whether a work item was on a BH workqueue Add an off-queue flag, WORK_OFFQ_BH, that indicates whether the last workqueue the work item was on was a BH one. This will be used to test whether a work item is BH in cancel_sync path to implement atomic cancel_sync'ing for BH work items. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 4 +++- kernel/workqueue.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a5075969931b..777b0186317e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -52,9 +52,10 @@ enum work_bits { * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] - * 16 bits 0 bits 4 or 5 bits + * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_BH_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, @@ -98,6 +99,7 @@ enum wq_misc_consts { }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ +#define WORK_OFFQ_BH (1ul << WORK_OFFQ_BH_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e80a815ec172..baf7495338bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -764,6 +764,11 @@ static int work_next_color(int color) return (color + 1) % WORK_NR_COLORS; } +static unsigned long pool_offq_flags(struct worker_pool *pool) +{ + return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; +} + /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag @@ -2122,7 +2127,8 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * this destroys work->data needed by the next step, stash it. */ work_data = *work_data_bits(work); - set_work_pool_and_keep_pending(work, pool->id, 0); + set_work_pool_and_keep_pending(work, pool->id, + pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); @@ -3175,7 +3181,7 @@ __acquires(&pool->lock) * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_pool_and_clear_pending(work, pool->id, 0); + set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); -- cgit v1.2.3 From 134874e2eee9380c2700411d4844cbc29297bc01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Mar 2024 07:21:03 -1000 Subject: workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items Now that work_grab_pending() can always grab the PENDING bit without sleeping, the only thing that prevents allowing cancel_work_sync() of a BH work item from an atomic context is the flushing of the in-flight instance. When we're flushing a BH work item for cancel_work_sync(), we know that the work item is not queued and must be executing in a BH context, which means that it's safe to busy-wait for its completion from a non-hardirq atomic context. This patch updates __flush_work() so that it busy-waits when flushing a BH work item for cancel_work_sync(). might_sleep() is pushed from start_flush_work() to its callers - when operating on a BH work item, __cancel_work_sync() now enforces !in_hardirq() instead of might_sleep(). This allows cancel_work_sync() and disable_work() to be called from non-hardirq atomic contexts on BH work items. v3: In __flush_work(), test WORK_OFFQ_BH to tell whether a work item being canceled can be busy waited instead of making start_flush_work() return the pool. (Lai) v2: Lai pointed out that __flush_work() was accessing pool->flags outside the RCU critical section protecting the pool pointer. Fix it by testing and remembering the result inside the RCU critical section. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 74 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index baf7495338bc..c0cc8b209d5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4105,8 +4105,6 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, struct pool_workqueue *pwq; struct workqueue_struct *wq; - might_sleep(); - rcu_read_lock(); pool = get_work_pool(work); if (!pool) { @@ -4158,6 +4156,7 @@ already_gone: static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; + unsigned long data; if (WARN_ON(!wq_online)) return false; @@ -4165,13 +4164,41 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (start_flush_work(work, &barr, from_cancel)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { + if (!start_flush_work(work, &barr, from_cancel)) return false; + + /* + * start_flush_work() returned %true. If @from_cancel is set, we know + * that @work must have been executing during start_flush_work() and + * can't currently be queued. Its data must contain OFFQ bits. If @work + * was queued on a BH workqueue, we also know that it was running in the + * BH context and thus can be busy-waited. + */ + data = *work_data_bits(work); + if (from_cancel && + !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { + /* + * On RT, prevent a live lock when %current preempted soft + * interrupt processing or prevents ksoftirqd from running by + * keeping flipping BH. If the BH work item runs on a different + * CPU then this has no effect other than doing the BH + * disable/enable dance for nothing. This is copied from + * kernel/softirq.c::tasklet_unlock_spin_wait(). + */ + while (!try_wait_for_completion(&barr.done)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } + } + } else { + wait_for_completion(&barr.done); } + + destroy_work_on_stack(&barr.work); + return true; } /** @@ -4187,6 +4214,7 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) */ bool flush_work(struct work_struct *work) { + might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -4276,6 +4304,11 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); + if (*work_data_bits(work) & WORK_OFFQ_BH) + WARN_ON_ONCE(in_hardirq()); + else + might_sleep(); + /* * Skip __flush_work() during early boot when we know that @work isn't * executing. This allows canceling during early boot. @@ -4302,19 +4335,19 @@ EXPORT_SYMBOL(cancel_work); * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * - * Cancel @work and wait for its execution to finish. This function - * can be used even if the work re-queues itself or migrates to - * another workqueue. On return from this function, @work is - * guaranteed to be not pending or executing on any CPU. + * Cancel @work and wait for its execution to finish. This function can be used + * even if the work re-queues itself or migrates to another workqueue. On return + * from this function, @work is guaranteed to be not pending or executing on any + * CPU as long as there aren't racing enqueues. * - * cancel_work_sync(&delayed_work->work) must not be used for - * delayed_work's. Use cancel_delayed_work_sync() instead. + * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. + * Use cancel_delayed_work_sync() instead. * - * The caller must ensure that the workqueue on which @work was last - * queued can't be destroyed before this function returns. + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. * - * Return: - * %true if @work was pending, %false otherwise. + * Returns %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { @@ -4384,8 +4417,11 @@ EXPORT_SYMBOL_GPL(disable_work); * Similar to disable_work() but also wait for @work to finish if currently * executing. * - * Must be called from a sleepable context. Returns %true if @work was pending, - * %false otherwise. + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. + * + * Returns %true if @work was pending, %false otherwise. */ bool disable_work_sync(struct work_struct *work) { -- cgit v1.2.3 From e7cc3be6fdb57d98fc399a856fc3b05cce1ca754 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2024 17:42:50 +0800 Subject: workqueue: Use INIT_WORK_ONSTACK in workqueue_softirq_dead() dead_work is a stack variable. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0cc8b209d5c..45d2aae73c96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3651,7 +3651,7 @@ void workqueue_softirq_dead(unsigned int cpu) if (!need_more_worker(pool)) continue; - INIT_WORK(&dead_work.work, drain_dead_softirq_workfn); + INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); -- cgit v1.2.3 From ae1296a7bfe4f8e446677ccb761d9419926557bc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2024 17:42:52 +0800 Subject: workqueue: Move attrs->cpumask out of worker_pool's properties when attrs->affn_strict Allow more pools can be shared when attrs->affn_strict. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 777b0186317e..bfcf8d38f4b1 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -185,6 +185,9 @@ struct workqueue_attrs { * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't * participate in pool hash calculations or equality comparisons. + * + * If @affn_strict is set, @cpumask isn't a property of a worker_pool + * either. */ /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45d2aae73c96..f03960f094fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4625,6 +4625,8 @@ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; + if (attrs->affn_strict) + cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ @@ -4633,11 +4635,12 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) u32 hash = 0; hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - hash = jhash_1word(attrs->affn_strict, hash); + if (!attrs->affn_strict) + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -4647,11 +4650,11 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, { if (a->nice != b->nice) return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) + if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; - if (a->affn_strict != b->affn_strict) + if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } -- cgit v1.2.3 From d70f5d5778e88addcc3e56858d5e9c635c1e420e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2024 17:42:53 +0800 Subject: workqueue: Use list_last_entry() to get the last idle worker It is clearer than open code. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f03960f094fa..d1ccc3d05b7a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2904,7 +2904,7 @@ static void idle_worker_timeout(struct timer_list *t) unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); @@ -2946,7 +2946,7 @@ static void idle_cull_fn(struct work_struct *work) struct worker *worker; unsigned long expires; - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { -- cgit v1.2.3 From 79202591a55a365251496162ced3004a0a1fa1cf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 7 Mar 2024 21:39:32 -0800 Subject: workqueue: Cleanup subsys attribute registration While reviewing users of subsys_virtual_register() I noticed that wq_sysfs_init() ignores the @groups argument. This looks like a historical artifact as the original wq_subsys only had one attribute to register. On the way to building up an @groups argument to pass to subsys_virtual_register() a few more cleanups fell out: * Use DEVICE_ATTR_RO() and DEVICE_ATTR_RW() for cpumask_{isolated,requested} and cpumask respectively. Rename the @show and @store methods accordingly. * Co-locate the attribute definition with the methods. This required moving wq_unbound_cpumask_show down next to wq_unbound_cpumask_store (renamed to cpumask_show() and cpumask_store()) * Use ATTRIBUTE_GROUPS() to skip some boilerplate declarations Signed-off-by: Dan Williams Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 49 +++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d1ccc3d05b7a..a8cbaede1e22 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7246,25 +7246,27 @@ static ssize_t __wq_cpumask_show(struct device *dev, return written; } -static ssize_t wq_unbound_cpumask_show(struct device *dev, +static ssize_t cpumask_requested_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); } +static DEVICE_ATTR_RO(cpumask_requested); -static ssize_t wq_requested_cpumask_show(struct device *dev, +static ssize_t cpumask_isolated_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); } +static DEVICE_ATTR_RO(cpumask_isolated); -static ssize_t wq_isolated_cpumask_show(struct device *dev, +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } -static ssize_t wq_unbound_cpumask_store(struct device *dev, +static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; @@ -7280,36 +7282,19 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, free_cpumask_var(cpumask); return ret ? ret : count; } +static DEVICE_ATTR_RW(cpumask); -static struct device_attribute wq_sysfs_cpumask_attrs[] = { - __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store), - __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), - __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), - __ATTR_NULL, +static struct attribute *wq_sysfs_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + &dev_attr_cpumask_requested.attr, + &dev_attr_cpumask_isolated.attr, + NULL, }; +ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { - struct device *dev_root; - int err; - - err = subsys_virtual_register(&wq_subsys, NULL); - if (err) - return err; - - dev_root = bus_get_dev_root(&wq_subsys); - if (dev_root) { - struct device_attribute *attr; - - for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { - err = device_create_file(dev_root, attr); - if (err) - break; - } - put_device(dev_root); - } - return err; + return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); -- cgit v1.2.3 From be3a51e68f2f1b17250ce40d8872c7645b7a2991 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 7 Mar 2024 14:27:23 +0530 Subject: sched/fair: Add EAS checks before updating root_domain::overutilized root_domain::overutilized is only used for EAS(energy aware scheduler) to decide whether to do load balance or not. It is not used if EAS not possible. Currently enqueue_task_fair and task_tick_fair accesses, sometime updates this field. In update_sd_lb_stats it is updated often. This causes cache contention due to true sharing and burns a lot of cycles. ::overload and ::overutilized are part of the same cacheline. Updating it often invalidates the cacheline. That causes access to ::overload to slow down due to false sharing. Hence add EAS check before accessing/updating this field. EAS check is optimized at compile time or it is a static branch. Hence it shouldn't cost much. With the patch, both enqueue_task_fair and newidle_balance don't show up as hot routines in perf profile. 6.8-rc4: 7.18% swapper [kernel.vmlinux] [k] enqueue_task_fair 6.78% s [kernel.vmlinux] [k] newidle_balance +patch: 0.14% swapper [kernel.vmlinux] [k] enqueue_task_fair 0.00% swapper [kernel.vmlinux] [k] newidle_balance While at it: trace_sched_overutilized_tp expect that second argument to be bool. So do a int to bool conversion for that. Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator") Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Srikar Dronamraju Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240307085725.444486-2-sshegde@linux.ibm.com --- kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dbf4f1c44259..1afa4f87fb25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6673,22 +6673,42 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + unsigned long rq_util_min, rq_util_max; + + if (!sched_energy_enabled()) + return false; + + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } -static inline void update_overutilized_status(struct rq *rq) +static inline void set_rd_overutilized_status(struct root_domain *rd, + unsigned int status) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); - } + if (!sched_energy_enabled()) + return; + + WRITE_ONCE(rd->overutilized, status); + trace_sched_overutilized_tp(rd, !!status); +} + +static inline void check_update_overutilized_status(struct rq *rq) +{ + /* + * overutilized field is used for load balancing decisions only + * if energy aware scheduler is being used + */ + if (!sched_energy_enabled()) + return; + + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); } #else -static inline void update_overutilized_status(struct rq *rq) { } +static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ @@ -6789,7 +6809,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * and the following generally works well enough in practice. */ if (!task_new) - update_overutilized_status(rq); + check_update_overutilized_status(rq); enqueue_throttle: assert_list_leaf_cfs_rq(rq); @@ -10630,19 +10650,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { - struct root_domain *rd = env->dst_rq->rd; - /* update overload indicator if we are at root domain */ - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); + set_rd_overutilized_status(env->dst_rq->rd, + sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - struct root_domain *rd = env->dst_rq->rd; - - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); } update_idle_cpu_scan(env, sum_util); @@ -12667,7 +12682,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); - update_overutilized_status(task_rq(curr)); + check_update_overutilized_status(task_rq(curr)); task_tick_core(rq, curr); } -- cgit v1.2.3 From d0f5d3cefc259f498456338d319098dc84393b24 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 7 Mar 2024 14:27:24 +0530 Subject: sched/fair: Introduce is_rd_overutilized() helper function to access root_domain::overutilized The root_domain::overutilized field is READ_ONCE() accessed in multiple places, which could be simplified with a helper function. This might also make it more apparent that it needs to be used only in case of EAS. No change in functionality intended. Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240307085725.444486-3-sshegde@linux.ibm.com --- kernel/sched/fair.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1afa4f87fb25..24a7530a7d3f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6685,6 +6685,15 @@ static inline bool cpu_overutilized(int cpu) return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } +/* + * Ensure that caller can do EAS. overutilized value + * make sense only if EAS is enabled + */ +static inline int is_rd_overutilized(struct root_domain *rd) +{ + return READ_ONCE(rd->overutilized); +} + static inline void set_rd_overutilized_status(struct root_domain *rd, unsigned int status) { @@ -6704,7 +6713,7 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!sched_energy_enabled()) return; - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); } #else @@ -7990,7 +7999,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) + if (!pd || is_rd_overutilized(rd)) goto unlock; /* @@ -10897,7 +10906,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + if (rcu_dereference(rd->pd) && !is_rd_overutilized(rd)) goto out_balanced; } -- cgit v1.2.3 From c829d6818b60c591f70c060b2bb75d76cf0cec6d Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 25 Mar 2024 21:09:26 +0530 Subject: sched/fair: Simplify the continue_balancing logic in sched_balance_newidle() newidle(CPU_NEWLY_IDLE) balancing doesn't stop the load-balancing if the continue_balancing flag is reset, but the other two balancing (IDLE, BUSY) cases do that. newidle balance stops the load balancing if rq has a task or there is wakeup pending. The same checks are present in should_we_balance for newidle. Hence use the return value and simplify continue_balancing mechanism for newidle. Update the comment surrounding it as well. No change in functionality intended. Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20240325153926.274284-1-sshegde@linux.ibm.com --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24a7530a7d3f..1856e58595c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12358,6 +12358,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; @@ -12372,8 +12373,9 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) return 0; /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); @@ -12412,7 +12414,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; u64 domain_cost; update_next_balance(sd, &next_balance); @@ -12438,8 +12439,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0 || - this_rq->ttwu_pending) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); -- cgit v1.2.3 From 902e786c4a54a2c4f7462b9026bb56610888db3d Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Tue, 26 Mar 2024 20:56:16 +0530 Subject: sched/fair: Combine EAS check with root_domain::overutilized access Access to root_domainoverutilized is always used with sched_energy_enabled in the pattern: if (sched_energy_enabled && !overutilized) do something So modify the helper function to utilize this pattern. This is more readable code as it would say, do something when root domain is not overutilized. This function always return true when EAS is disabled. No change in functionality intended. Suggested-by: Vincent Guittot Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240326152616.380999-1-sshegde@linux.ibm.com --- kernel/sched/fair.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1856e58595c3..38462305bc39 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6686,12 +6686,11 @@ static inline bool cpu_overutilized(int cpu) } /* - * Ensure that caller can do EAS. overutilized value - * make sense only if EAS is enabled + * overutilized value make sense only if EAS is enabled */ static inline int is_rd_overutilized(struct root_domain *rd) { - return READ_ONCE(rd->overutilized); + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } static inline void set_rd_overutilized_status(struct root_domain *rd, @@ -6710,8 +6709,6 @@ static inline void check_update_overutilized_status(struct rq *rq) * overutilized field is used for load balancing decisions only * if energy aware scheduler is being used */ - if (!sched_energy_enabled()) - return; if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); @@ -7999,7 +7996,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || is_rd_overutilized(rd)) + if (!pd) goto unlock; /* @@ -8202,7 +8199,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) cpumask_test_cpu(cpu, p->cpus_ptr)) return cpu; - if (sched_energy_enabled()) { + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -10903,12 +10900,9 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) if (busiest->group_type == group_misfit_task) goto force_balance; - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !is_rd_overutilized(rd)) - goto out_balanced; - } + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) -- cgit v1.2.3 From c628db0a6831f80e89873ee44f1b40e3ab3216c6 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 25 Mar 2024 11:15:04 +0530 Subject: sched/fair: Check root_domain::overload value before update The root_domain::overload flag is 1 when there's any rq in the root domain that has 2 or more running tasks. (Ie. it's overloaded.) The root_domain structure itself is a global structure per cpuset island. The ::overload flag is maintained the following way: - Set when adding a second task to the runqueue. - It is cleared in update_sd_lb_stats() during load balance, if none of the rqs have 2 or more running tasks. This flag is used during newidle balance to see if its worth doing a full load balance pass, which can be an expensive operation. If it is set, then newidle balance will try to aggressively pull a task. Since commit: 630246a06ae2 ("sched/fair: Clean-up update_sg_lb_stats parameters") ::overload is being written unconditionally, even if it has the same value. The change in value of this depends on the workload, but on typical workloads, it doesn't change all that often: a system is either dominantly overloaded for substantial amounts of time, or not. Extra writes to this semi-global structure cause unnecessary overhead, extra bus traffic, etc. - so avoid it as much as possible. Perf probe stats show that it's worth making this change (numbers are with patch applied): 1M probe:sched_balance_newidle_L38 139 probe:update_sd_lb_stats_L53 <====== 1->0 writes 129K probe:add_nr_running_L12 74 probe:add_nr_running_L13 <====== 0->1 writes 54K probe:update_sd_lb_stats_L50 <====== reads These numbers prove that actual change in the ::overload value is (much) less frequent: L50 is much larger at ~54,000 accesses vs L53+L13 of 139+74. [ mingo: Rewrote the changelog. ] Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Vincent Guittot Cc: Mel Gorman Link: https://lore.kernel.org/r/20240325054505.201995-2-sshegde@linux.ibm.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 38462305bc39..600fdde1e7ac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10657,7 +10657,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + if (READ_ONCE(env->dst_rq->rd->overload) != (sg_status & SG_OVERLOAD)) + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, -- cgit v1.2.3 From caac6291728ed5493d8a53f4b086c270849ce0c4 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 25 Mar 2024 11:15:05 +0530 Subject: sched/fair: Use helper functions to access root_domain::overload Introduce two helper functions to access & set the root_domain::overload flag: get_rd_overload() set_rd_overload() To make sure code is always following READ_ONCE()/WRITE_ONCE() access methods. No change in functionality intended. [ mingo: Renamed the accessors to get_/set_rd_overload(), tidied up the changelog. ] Suggested-by: Qais Yousef Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240325054505.201995-3-sshegde@linux.ibm.com --- kernel/sched/fair.c | 5 ++--- kernel/sched/sched.h | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 600fdde1e7ac..0cc058265308 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10657,8 +10657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (READ_ONCE(env->dst_rq->rd->overload) != (sg_status & SG_OVERLOAD)) - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + set_rd_overload(env->dst_rq->rd, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, @@ -12391,7 +12390,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!READ_ONCE(this_rq->rd->overload) || + if (!get_rd_overload(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4f9e952d4fad..e86ee262d2f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -930,6 +930,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overload(struct root_domain *rd) +{ + return READ_ONCE(rd->overload); +} + +static inline void set_rd_overload(struct root_domain *rd, int status) +{ + if (get_rd_overload(rd) != status) + WRITE_ONCE(rd->overload, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -2530,8 +2541,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); + set_rd_overload(rq->rd, SG_OVERLOAD); } #endif -- cgit v1.2.3 From dfb83ef7b8b064c15be19cf7fcbde0996712de8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:33:20 +0100 Subject: sched/fair: Rename root_domain::overload to ::overloaded It is silly to use an ambiguous noun instead of a clear adjective when naming such a flag ... Note how root_domain::overutilized already used a proper adjective. rd->overloaded is now set to 1 when the root domain is overloaded. Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e86ee262d2f3..cddc4795f72d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -874,7 +874,7 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + int overloaded; /* Indicate one or more cpus over-utilized (tipping point) */ int overutilized; @@ -932,13 +932,13 @@ extern void sched_put_rd(struct root_domain *rd); static inline int get_rd_overload(struct root_domain *rd) { - return READ_ONCE(rd->overload); + return READ_ONCE(rd->overloaded); } static inline void set_rd_overload(struct root_domain *rd, int status) { if (get_rd_overload(rd) != status) - WRITE_ONCE(rd->overload, status); + WRITE_ONCE(rd->overloaded, status); } #ifdef HAVE_RT_PUSH_IPI -- cgit v1.2.3 From 76cc4f91488af0a808bec97794bfe434dece7d67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:41:31 +0100 Subject: sched/fair: Rename {set|get}_rd_overload() to {set|get}_rd_overloaded() Follow the rename of the root_domain::overloaded flag. Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0cc058265308..bf10665b6f4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10657,7 +10657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - set_rd_overload(env->dst_rq->rd, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, @@ -12390,7 +12390,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!get_rd_overload(this_rq->rd) || + if (!get_rd_overloaded(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cddc4795f72d..c7e7ae17c340 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -930,14 +930,14 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); -static inline int get_rd_overload(struct root_domain *rd) +static inline int get_rd_overloaded(struct root_domain *rd) { return READ_ONCE(rd->overloaded); } -static inline void set_rd_overload(struct root_domain *rd, int status) +static inline void set_rd_overloaded(struct root_domain *rd, int status) { - if (get_rd_overload(rd) != status) + if (get_rd_overloaded(rd) != status) WRITE_ONCE(rd->overloaded, status); } @@ -2541,7 +2541,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { - set_rd_overload(rq->rd, SG_OVERLOAD); + set_rd_overloaded(rq->rd, SG_OVERLOAD); } #endif -- cgit v1.2.3 From 7bda10ba7f453729f210264dd07d38989fb858d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:44:16 +0100 Subject: sched/fair: Rename SG_OVERLOAD to SG_OVERLOADED Follow the rename of the root_domain::overloaded flag. Note that this also matches the SG_OVERUTILIZED flag better. Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf10665b6f4f..839a97a4ba2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9961,7 +9961,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (nr_running > 1) - *sg_status |= SG_OVERLOAD; + *sg_status |= SG_OVERLOADED; if (cpu_overutilized(i)) *sg_status |= SG_OVERUTILIZED; @@ -9986,7 +9986,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + *sg_status |= SG_OVERLOADED; } } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ @@ -10657,7 +10657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7e7ae17c340..07c6669b8250 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -851,7 +851,7 @@ struct perf_domain { }; /* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */ #define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ /* @@ -2541,7 +2541,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { - set_rd_overloaded(rq->rd, SG_OVERLOAD); + set_rd_overloaded(rq->rd, SG_OVERLOADED); } #endif -- cgit v1.2.3 From 4d0a63e5b841c759c9a306aff158420421ef016f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:54:42 +0100 Subject: sched/fair: Rename set_rd_overutilized_status() to set_rd_overutilized() The _status() postfix has no real meaning, simplify the naming and harmonize it with set_rd_overloaded(). Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 839a97a4ba2a..f29efd5f19f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6693,7 +6693,7 @@ static inline int is_rd_overutilized(struct root_domain *rd) return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } -static inline void set_rd_overutilized_status(struct root_domain *rd, +static inline void set_rd_overutilized(struct root_domain *rd, unsigned int status) { if (!sched_energy_enabled()) @@ -6711,7 +6711,7 @@ static inline void check_update_overutilized_status(struct rq *rq) */ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) - set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(rq->rd, SG_OVERUTILIZED); } #else static inline void check_update_overutilized_status(struct rq *rq) { } @@ -10660,10 +10660,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized_status(env->dst_rq->rd, + set_rd_overutilized(env->dst_rq->rd, sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, SG_OVERUTILIZED); } update_idle_cpu_scan(env, sum_util); -- cgit v1.2.3 From 786bf0e7e2ec90b349a7bab6e97947982ab31f2c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 25 Mar 2024 15:22:10 +0000 Subject: bpf: improve error message for unsupported helper BPF verifier emits "unknown func" message when given BPF program type does not support BPF helper. This message may be confusing for users, as important context that helper is unknown only to current program type is not provided. This patch changes message to "program of this type cannot use helper " and aligns dependent code in libbpf and tests. Any suggestions on improving/changing this message are welcome. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20240325152210.377548-1-yatsenko@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- tools/bpf/bpftool/feature.c | 3 ++- tools/lib/bpf/libbpf_probes.c | 6 ++++-- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- tools/testing/selftests/bpf/progs/verifier_helper_restricted.c | 8 ++++---- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17f26ba1a9e0..edb650667f44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10201,8 +10201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0ea06..c754a428c8c6 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 302188122439..9dfbe7750f56 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. */ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 94cb22b01482..99dc60fe59d5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -304,7 +304,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd090c..059aa716e3d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ -- cgit v1.2.3 From 55fc888ded83ed542f3de3e51bae03936a998349 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 27 Mar 2024 14:53:29 +0800 Subject: bpf,arena: Use helper sizeof_field in struct accessors Use the well defined helper sizeof_field() to calculate the size of a struct member, instead of doing custom calculations. Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240327065334.8140-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..6c81630c5293 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -37,7 +37,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { -- cgit v1.2.3 From 3124591f686115aca25d772c2ccb7b1e202c3197 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:50 -0700 Subject: bpf: add bpf_modify_return_test_tp() kfunc triggering tracepoint Add a simple bpf_modify_return_test_tp() kfunc, available to all program types, that is useful for various testing and benchmarking scenarios, as it allows to trigger most tracing BPF program types from BPF side, allowing to do complex testing and benchmarking scenarios. It is also attachable to for fmod_ret programs, making it a good and simple way to trigger fmod_ret program under test/benchmark. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/trace/events/bpf_test_run.h | 17 +++++++++++++++++ kernel/bpf/helpers.c | 1 + net/bpf/test_run.c | 8 ++++++++ 3 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h index 265447e3f71a..0c924d39b7cb 100644 --- a/include/trace/events/bpf_test_run.h +++ b/include/trace/events/bpf_test_run.h @@ -7,6 +7,23 @@ #include +TRACE_EVENT(bpf_trigger_tp, + + TP_PROTO(int nonce), + + TP_ARGS(nonce), + + TP_STRUCT__entry( + __field(int, nonce) + ), + + TP_fast_assign( + __entry->nonce = nonce; + ), + + TP_printk("nonce %d", __entry->nonce) +); + DECLARE_EVENT_CLASS(bpf_test_finish, TP_PROTO(int *err), diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f860adab3eb9..d9e7aca8ae9e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2625,6 +2625,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 61efeadaff8d..f6aad4ed2ab2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -575,6 +575,13 @@ __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, return a + *b + c + d + (long)e + f + g; } +__bpf_kfunc int bpf_modify_return_test_tp(int nonce) +{ + trace_bpf_trigger_tp(nonce); + + return nonce; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -622,6 +629,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) -- cgit v1.2.3 From ee3bad033d01066348913f3220ea81987721ed53 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 27 Mar 2024 11:20:22 +0800 Subject: bpf: Mitigate latency spikes associated with freeing non-preallocated htab Following the recent upgrade of one of our BPF programs, we encountered significant latency spikes affecting other applications running on the same host. After thorough investigation, we identified that these spikes were primarily caused by the prolonged duration required to free a non-preallocated htab with approximately 2 million keys. Notably, our kernel configuration lacks the presence of CONFIG_PREEMPT. In scenarios where kernel execution extends excessively, other threads might be starved of CPU time, resulting in latency issues across the system. To mitigate this, we've adopted a proactive approach by incorporating cond_resched() calls within the kernel code. This ensures that during lengthy kernel operations, the scheduler is invoked periodically to provide opportunities for other threads to execute. Signed-off-by: Yafang Shao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240327032022.78391-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a5349bc..e81059faae63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1490,6 +1490,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } -- cgit v1.2.3 From e8742081db7d01f980c6161ae1e8a1dbc1e30979 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 28 Mar 2024 11:58:01 -0700 Subject: bpf: Mark bpf prog stack with kmsan_unposion_memory in interpreter mode syzbot reported uninit memory usages during map_{lookup,delete}_elem. ========== BUG: KMSAN: uninit-value in __dev_map_lookup_elem kernel/bpf/devmap.c:441 [inline] BUG: KMSAN: uninit-value in dev_map_lookup_elem+0xf3/0x170 kernel/bpf/devmap.c:796 __dev_map_lookup_elem kernel/bpf/devmap.c:441 [inline] dev_map_lookup_elem+0xf3/0x170 kernel/bpf/devmap.c:796 ____bpf_map_lookup_elem kernel/bpf/helpers.c:42 [inline] bpf_map_lookup_elem+0x5c/0x80 kernel/bpf/helpers.c:38 ___bpf_prog_run+0x13fe/0xe0f0 kernel/bpf/core.c:1997 __bpf_prog_run256+0xb5/0xe0 kernel/bpf/core.c:2237 ========== The reproducer should be in the interpreter mode. The C reproducer is trying to run the following bpf prog: 0: (18) r0 = 0x0 2: (18) r1 = map[id:49] 4: (b7) r8 = 16777216 5: (7b) *(u64 *)(r10 -8) = r8 6: (bf) r2 = r10 7: (07) r2 += -229 ^^^^^^^^^^ 8: (b7) r3 = 8 9: (b7) r4 = 0 10: (85) call dev_map_lookup_elem#1543472 11: (95) exit It is due to the "void *key" (r2) passed to the helper. bpf allows uninit stack memory access for bpf prog with the right privileges. This patch uses kmsan_unpoison_memory() to mark the stack as initialized. This should address different syzbot reports on the uninit "void *key" argument during map_{lookup,delete}_elem. Reported-by: syzbot+603bcd9b0bf1d94dbb9b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000f9ce6d061494e694@google.com/ Reported-by: syzbot+eb02dc7f03dce0ef39f3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000a5c69c06147c2238@google.com/ Reported-by: syzbot+b4e65ca24fd4d0c734c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000ac56fb06143b6cfa@google.com/ Reported-by: syzbot+d2b113dc9fea5e1d2848@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/0000000000000d69b206142d1ff7@google.com/ Reported-by: syzbot+1a3cf6f08d68868f9db3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/0000000000006f876b061478e878@google.com/ Tested-by: syzbot+1a3cf6f08d68868f9db3@syzkaller.appspotmail.com Suggested-by: Yonghong Song Suggested-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240328185801.1843078-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5aacb1d3c4cc..ab400cdd7d7a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2218,6 +2218,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ @@ -2231,6 +2232,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ -- cgit v1.2.3 From 4475cd8bfd9bcb898953fcadb2f51b3432eb68a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 12:07:48 +0100 Subject: sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags SG_OVERLOADED and SG_OVERUTILIZED flags plus the sg_status bitmask are an unnecessary complication that only make the code harder to read and slower. We only ever set them separately: thule:~/tip> git grep SG_OVER kernel/sched/ kernel/sched/fair.c: set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); kernel/sched/fair.c: *sg_status |= SG_OVERLOADED; kernel/sched/fair.c: *sg_status |= SG_OVERUTILIZED; kernel/sched/fair.c: *sg_status |= SG_OVERLOADED; kernel/sched/fair.c: set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); kernel/sched/fair.c: sg_status & SG_OVERUTILIZED); kernel/sched/fair.c: } else if (sg_status & SG_OVERUTILIZED) { kernel/sched/fair.c: set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); kernel/sched/sched.h:#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */ kernel/sched/sched.h:#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ kernel/sched/sched.h: set_rd_overloaded(rq->rd, SG_OVERLOADED); And use them separately, which results in suboptimal code: /* update overload indicator if we are at root domain */ set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, Introduce separate sg_overloaded and sg_overutilized flags in update_sd_lb_stats() and its lower level functions, and change all of them to 'bool'. Remove the now unused SG_OVERLOADED and SG_OVERUTILIZED flags. Signed-off-by: Ingo Molnar Acked-by: Shrikanth Hegde Tested-by: Shrikanth Hegde Cc: Qais Yousef Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVPhODZ8/nbsqbP@gmail.com --- kernel/sched/fair.c | 36 ++++++++++++++++++------------------ kernel/sched/sched.h | 17 ++++++----------- 2 files changed, 24 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f29efd5f19f6..1dd37168da50 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6688,19 +6688,18 @@ static inline bool cpu_overutilized(int cpu) /* * overutilized value make sense only if EAS is enabled */ -static inline int is_rd_overutilized(struct root_domain *rd) +static inline bool is_rd_overutilized(struct root_domain *rd) { return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } -static inline void set_rd_overutilized(struct root_domain *rd, - unsigned int status) +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { if (!sched_energy_enabled()) return; - WRITE_ONCE(rd->overutilized, status); - trace_sched_overutilized_tp(rd, !!status); + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); } static inline void check_update_overutilized_status(struct rq *rq) @@ -6711,7 +6710,7 @@ static inline void check_update_overutilized_status(struct rq *rq) */ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) - set_rd_overutilized(rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(rq->rd, 1); } #else static inline void check_update_overutilized_status(struct rq *rq) { } @@ -9934,13 +9933,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. - * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { int i, nr_running, local_group; @@ -9961,10 +9962,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (nr_running > 1) - *sg_status |= SG_OVERLOADED; + *sg_overloaded = 1; if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -9986,7 +9987,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOADED; + *sg_overloaded = 1; } } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ @@ -10612,7 +10613,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; unsigned long sum_util = 0; - int sg_status = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -10628,7 +10629,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -10657,13 +10658,12 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized(env->dst_rq->rd, - sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - set_rd_overutilized(env->dst_rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overloaded); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } update_idle_cpu_scan(env, sum_util); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 07c6669b8250..7c39dbf31f75 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -713,7 +713,7 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ @@ -757,7 +757,7 @@ struct dl_rq { u64 next; } earliest_dl; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -850,10 +850,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -874,10 +870,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overloaded; + bool overloaded; /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -2540,9 +2536,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - set_rd_overloaded(rq->rd, SG_OVERLOADED); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); -- cgit v1.2.3 From 59f2f841179aa6a0899cb9cf53659149a35749b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 29 Mar 2024 10:14:39 -0700 Subject: bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie. syzbot reported the following lock sequence: cpu 2: grabs timer_base lock spins on bpf_lpm lock cpu 1: grab rcu krcp lock spins on timer_base lock cpu 0: grab bpf_lpm lock spins on rcu krcp lock bpf_lpm lock can be the same. timer_base lock can also be the same due to timer migration. but rcu krcp lock is always per-cpu, so it cannot be the same lock. Hence it's a false positive. To avoid lockdep complaining move kfree_rcu() after spin_unlock. Reported-by: syzbot+1fa663a2100308ab6eab@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240329171439.37813-1-alexei.starovoitov@gmail.com --- kernel/bpf/lpm_trie.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 939620b91c0e..0218a5132ab5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -316,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ -390,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -437,6 +438,7 @@ out: } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -445,6 +447,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -514,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -529,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } -- cgit v1.2.3 From 7d8296b250f2eed73f1758607926d4d258dea5d4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:42 +0100 Subject: bitops: make BYTES_TO_BITS() treewide-available Avoid open-coding that simple expression each time by moving BYTES_TO_BITS() from the probes code to to export it to the rest of the kernel. Simplify the macro while at it. `BITS_PER_LONG / sizeof(long)` always equals to %BITS_PER_BYTE, regardless of the target architecture. Do the same for the tools ecosystem as well (incl. its version of bitops.h). The previous implementation had its implicit type of long, while the new one is int, so adjust the format literal accordingly in the perf code. Suggested-by: Andy Shevchenko Reviewed-by: Przemek Kitszel Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitops.h | 2 ++ kernel/trace/trace_probe.c | 2 -- tools/include/linux/bitops.h | 2 ++ tools/perf/util/probe-finder.c | 4 +--- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index f7f5a783da2a..e0cd09eb91cd 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -21,6 +21,8 @@ #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dfe3ee6035ec..87337a0c8e03 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1180,8 +1180,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type, return ret; } -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 7319f6ced108..272f15d0e434 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -20,6 +20,8 @@ #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index c8923375e30d..630e16c54ed5 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -186,8 +186,6 @@ static_var: return ret2; } -#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_LONG / sizeof(long)) - static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, const char *cast, bool user_access) @@ -217,7 +215,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, total = dwarf_bytesize(vr_die); if (boffs < 0 || total < 0) return -ENOENT; - ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs, + ret = snprintf(buf, 16, "b%d@%d/%d", bsize, boffs, BYTES_TO_BITS(total)); goto formatted; } -- cgit v1.2.3 From 9dc182c58b5f5d4ac125ac85ad553f7142aa08d4 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 2 Apr 2024 07:33:47 +0000 Subject: bpf: Add a verbose message if map limit is reached When more than 64 maps are used by a program and its subprograms the verifier returns -E2BIG. Add a verbose message which highlights the source of the error and also print the actual limit. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240402073347.195920-1-aspsk@isovalent.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index edb650667f44..fcb62300f407 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18348,6 +18348,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (env->used_map_cnt >= MAX_USED_MAPS) { + verbose(env, "The total number of maps per program has reached the limit of %u\n", + MAX_USED_MAPS); fdput(f); return -E2BIG; } -- cgit v1.2.3 From ce09cbdd988887662546a1175bcfdfc6c8fdd150 Mon Sep 17 00:00:00 2001 From: Jose Fernandez Date: Mon, 1 Apr 2024 21:40:10 -0600 Subject: bpf: Improve program stats run-time calculation This patch improves the run-time calculation for program stats by capturing the duration as soon as possible after the program returns. Previously, the duration included u64_stats_t operations. While the instrumentation overhead is part of the total time spent when stats are enabled, distinguishing between the program's native execution time and the time spent due to instrumentation is crucial for accurate performance analysis. By making this change, the patch facilitates more precise optimization of BPF programs, enabling users to understand their performance in environments without stats enabled. I used a virtualized environment to measure the run-time over one minute for a basic raw_tracepoint/sys_enter program, which just increments a local counter. Although the virtualization introduced some performance degradation that could affect the results, I observed approximately a 16% decrease in average run-time reported by stats with this change (310 -> 260 nsec). Signed-off-by: Jose Fernandez Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402034010.25060-1-josef@netflix.com --- include/linux/filter.h | 6 ++++-- kernel/bpf/trampoline.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 44934b968b57..531b36090122 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -654,14 +654,16 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; - u64 start = sched_clock(); + u64 duration, start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + + duration = sched_clock() - start; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index cc50607f8d8c..26ae703d3c3b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -885,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog, * Hence check that 'start' is valid. */ start > NO_START_TIME) { + u64 duration = sched_clock() - start; unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } -- cgit v1.2.3 From 2e114248e086fb376405ed3f89b220f8586a2541 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Tue, 2 Apr 2024 23:52:50 +0000 Subject: bpf: Replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. bpf sym names get looked up and compared/cleaned with various string apis. This suggests they need to be NUL-terminated (strncpy() suggests this but does not guarantee it). | static int compare_symbol_name(const char *name, char *namebuf) | { | cleanup_symbol_name(namebuf); | return strcmp(name, namebuf); | } | static void cleanup_symbol_name(char *s) | { | ... | res = strstr(s, ".llvm."); | ... | } Use strscpy() as this method guarantees NUL-termination on the destination buffer. This patch also replaces two uses of strncpy() used in log.c. These are simple replacements as postfix has been zero-initialized on the stack and has source arguments with a size less than the destination's size. Note that this patch uses the new 2-argument version of strscpy introduced in commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Signed-off-by: Justin Stitt Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Link: https://lore.kernel.org/bpf/20240402-strncpy-kernel-bpf-core-c-v1-1-7cb07a426e78@google.com --- kernel/bpf/core.c | 4 ++-- kernel/bpf/log.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab400cdd7d7a..ae406a2814db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -747,7 +747,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -813,7 +813,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 2a243cf37c60..4bd8f17a9f24 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); + strscpy(postfix, "or_null_"); else - strncpy(postfix, "_or_null", 16); + strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", -- cgit v1.2.3 From 7bdbf7446305cb65c510c16d57cde82bc76b234a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:02 -0700 Subject: bpf: add special internal-only MOV instruction to resolve per-CPU addrs Add a new BPF instruction for resolving absolute addresses of per-CPU data from their per-CPU offsets. This instruction is internal-only and users are not allowed to use them directly. They will only be used for internal inlining optimizations for now between BPF verifier and BPF JITs. We use a special BPF_MOV | BPF_ALU64 | BPF_X form with insn->off field set to BPF_ADDR_PERCPU = -1. I used negative offset value to distinguish them from positive ones used by user-exposed instructions. Such instruction performs a resolution of a per-CPU offset stored in a register to a valid kernel address which can be dereferenced. It is useful in any use case where absolute address of a per-CPU data has to be resolved (e.g., in inlining bpf_map_lookup_elem()). BPF disassembler is also taught to recognize them to support dumping final BPF assembly code (non-JIT'ed version). Add arch-specific way for BPF JITs to mark support for this instructions. This patch also adds support for these instructions in x86-64 BPF JIT. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 16 ++++++++++++++++ include/linux/filter.h | 20 ++++++++++++++++++++ kernel/bpf/core.c | 5 +++++ kernel/bpf/disasm.c | 14 ++++++++++++++ 4 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3b639d6f2f54..af89dd117dce 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1382,6 +1382,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image maybe_emit_mod(&prog, AUX_REG, dst_reg, true); EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); break; + } else if (insn_is_mov_percpu_addr(insn)) { + u32 off = (u32)(unsigned long)&this_cpu_off; + + /* mov , (if necessary) */ + EMIT_mov(dst_reg, src_reg); + + /* add , gs:[] */ + EMIT2(0x65, add_1mod(0x48, dst_reg)); + EMIT3(0x03, add_1reg(0x04, dst_reg), 0x25); + EMIT(off, 4); + break; } fallthrough; case BPF_ALU | BPF_MOV | BPF_X: @@ -3365,6 +3376,11 @@ bool bpf_jit_supports_subprog_tailcalls(void) return true; } +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/include/linux/filter.h b/include/linux/filter.h index 531b36090122..161d5f7b64ed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -178,6 +178,25 @@ struct ctl_table_header; .off = 0, \ .imm = 0 }) +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +#define BPF_MOV64_PERCPU_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = BPF_ADDR_PERCPU, \ + .imm = 0 }) + +static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ @@ -972,6 +991,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); +bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ae406a2814db..7a33a3a7e63c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2945,6 +2945,11 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void) return false; } +bool __weak bpf_jit_supports_percpu_insn(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bd2e2dd04740..309c4aa1b026 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn) insn->off == BPF_ADDR_SPACE_CAST; } +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); + } else if (is_mov_percpu_addr(insn)) { + verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", + insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', -- cgit v1.2.3 From 1ae6921009e5d72787e07ccc04754514ccf6bc99 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:03 -0700 Subject: bpf: inline bpf_get_smp_processor_id() helper If BPF JIT supports per-CPU MOV instruction, inline bpf_get_smp_processor_id() to eliminate unnecessary function calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fcb62300f407..17c06f1505e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20074,6 +20074,30 @@ patch_map_ops_generic: goto next_insn; } +#ifdef CONFIG_X86_64 + /* Implement bpf_get_smp_processor_id() inline. */ + if (insn->imm == BPF_FUNC_get_smp_processor_id && + prog->jit_requested && bpf_jit_supports_percpu_insn()) { + /* BPF_FUNC_get_smp_processor_id inlining is an + * optimization, so if pcpu_hot.cpu_number is ever + * changed in some incompatible and hard to support + * way, it's fine to back out this inlining logic + */ + insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +#endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { -- cgit v1.2.3 From db69718b8efac802c7cc20d5a6c7dfc913f99c43 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:04 -0700 Subject: bpf: inline bpf_map_lookup_elem() for PERCPU_ARRAY maps Using new per-CPU BPF instruction implement inlining for per-CPU ARRAY map lookup helper, if BPF JIT support is present. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 13358675ff2e..8c1e6d7654bb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ +static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + + BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); + + *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); + if (!map->bypass_spec_v1) { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); + } + + *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); + return insn - insn_buf; +} + static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -776,6 +808,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, + .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, -- cgit v1.2.3 From 0b56e637f705836b9aa51e2b1058c3c814c121a8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:05 -0700 Subject: bpf: inline bpf_map_lookup_elem() helper for PERCPU_HASH map Using new per-CPU BPF instruction, partially inline bpf_map_lookup_elem() helper for per-CPU hashmap BPF map. Just like for normal HASH map, we still generate a call into __htab_map_lookup_elem(), but after that we resolve per-CPU element address using a new instruction, saving on extra functions calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e81059faae63..83a9a74260e9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2308,6 +2308,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call for per-CPU hashmap */ +static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct htab_elem, key) + map->key_size); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + + return insn - insn_buf; +} + static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; @@ -2436,6 +2456,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, -- cgit v1.2.3 From af682b767a41772499f8e54ca7d7e1deb3395f44 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2024 16:38:00 -0700 Subject: bpf: Optimize emit_mov_imm64(). Turned out that bpf prog callback addresses, bpf prog addresses used in bpf_trampoline, and in other cases the 64-bit address can be represented as sign extended 32-bit value. According to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82339 "Skylake has 0.64c throughput for mov r64, imm64, vs. 0.25 for mov r32, imm32." So use shorter encoding and faster instruction when possible. Special care is needed in jit_subprogs(), since bpf_pseudo_func() instruction cannot change its size during the last step of JIT. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/CAADnVQKFfpY-QZBrOU2CG8v2du8Lgyb7MNVmOZVK_yTyOdNbBA@mail.gmail.com Link: https://lore.kernel.org/bpf/20240401233800.42737-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 5 ++++- kernel/bpf/verifier.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 964e8154da66..6cf9a5697c09 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -816,9 +816,10 @@ done: static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { + u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo; u8 *prog = *pprog; - if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + if (is_uimm32(imm64)) { /* * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 @@ -826,6 +827,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, * 'mov %eax, imm32' instead. */ emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else if (is_simm32(imm64)) { + emit_mov_imm32(&prog, true, dst_reg, imm32_lo); } else { /* movabsq rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17c06f1505e4..1e03ba9ed07b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19147,12 +19147,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; - if (bpf_pseudo_func(insn)) + if (bpf_pseudo_func(insn)) { +#if defined(MODULES_VADDR) + u64 addr = MODULES_VADDR; +#else + u64 addr = VMALLOC_START; +#endif /* jit (e.g. x86_64) may emit fewer instructions * if it learns a u32 imm is the same as a u64 imm. - * Force a non zero here. + * Set close enough to possible prog address. */ - insn[1].imm = 1; + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + } } err = bpf_prog_alloc_jited_linfo(prog); -- cgit v1.2.3 From 230d97d39ee2eb9030309f04f98615aaeb420dac Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:47 +0200 Subject: fsnotify: create a wrapper fsnotify_find_inode_mark() In preparation to passing an object pointer to fsnotify_find_mark(), add a wrapper fsnotify_find_inode_mark() and use it where possible. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-4-amir73il@gmail.com> --- fs/nfsd/filecache.c | 4 ++-- fs/notify/dnotify/dnotify.c | 4 ++-- fs/notify/inotify/inotify_user.c | 2 +- include/linux/fsnotify_backend.h | 7 +++++++ kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 6 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ddd3e0d9cfa6..ad9083ca144b 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -159,8 +159,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) do { fsnotify_group_lock(nfsd_file_fsnotify_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, - nfsd_file_fsnotify_group); + mark = fsnotify_find_inode_mark(inode, + nfsd_file_fsnotify_group); if (mark) { nfm = nfsd_file_mark_get(container_of(mark, struct nfsd_file_mark, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 3464fa7e8538..f3669403fabf 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -162,7 +162,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); + fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 85d8fdd55329..4ffc30606e0b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -544,7 +544,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int create = (arg & IN_MASK_CREATE); int ret; - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); + fsn_mark = fsnotify_find_inode_mark(inode, group); if (!fsn_mark) return -ENOENT; else if (create) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d4e3bc55d174..992b57a7e95f 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -789,6 +789,13 @@ static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } +static inline struct fsnotify_mark *fsnotify_find_inode_mark( + struct inode *inode, + struct fsnotify_group *group) +{ + return fsnotify_find_mark(&inode->i_fsnotify_marks, group); +} + /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1b07e6f12a07..f2f38903b2fe 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -463,7 +463,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) int n; fsnotify_group_lock(audit_tree_group); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); + mark = fsnotify_find_inode_mark(inode, audit_tree_group); if (!mark) return create_chunk(inode, tree); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7a98cd176a12..7f358740e958 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -90,7 +90,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) struct audit_parent *parent = NULL; struct fsnotify_mark *entry; - entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group); + entry = fsnotify_find_inode_mark(inode, audit_watch_group); if (entry) parent = container_of(entry, struct audit_parent, mark); -- cgit v1.2.3 From 5e6a3c1ee693da1793739bb378b224bcf33d7f14 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Apr 2024 17:26:39 -0700 Subject: bpf: make bpf_get_branch_snapshot() architecture-agnostic perf_snapshot_branch_stack is set up in an architecture-agnostic way, so there is no reason for BPF subsystem to keep track of which architectures do support LBR or not. E.g., it looks like ARM64 might soon get support for BRBE ([0]), which (with proper integration) should be possible to utilize using this BPF helper. perf_snapshot_branch_stack static call will point to __static_call_return0() by default, which just returns zero, which will lead to -ENOENT, as expected. So no need to guard anything here. [0] https://lore.kernel.org/linux-arm-kernel/20240125094119.2542332-1-anshuman.khandual@arm.com/ Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404002640.1774210-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6d0c95638e1b..afb232b1d7c2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1188,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; @@ -1203,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return -ENOENT; return entry_cnt * br_entry_size; -#endif } static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { -- cgit v1.2.3 From 314a53623cd4e62e1b88126e5ed2bc87073d90ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Apr 2024 17:26:40 -0700 Subject: bpf: inline bpf_get_branch_snapshot() helper Inline bpf_get_branch_snapshot() helper using architecture-agnostic inline BPF code which calls directly into underlying callback of perf_snapshot_branch_stack static call. This callback is set early during kernel initialization and is never updated or reset, so it's ok to fetch actual implementation using static_call_query() and call directly into it. This change eliminates a full function call and saves one LBR entry in PERF_SAMPLE_BRANCH_ANY LBR mode. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404002640.1774210-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e03ba9ed07b..ffaa9f7f153c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20188,6 +20188,61 @@ patch_map_ops_generic: goto next_insn; } + /* Implement bpf_get_branch_snapshot inline. */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_get_branch_snapshot) { + /* We are dealing with the following func protos: + * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); + * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); + */ + const u32 br_entry_size = sizeof(struct perf_branch_entry); + + /* struct perf_branch_entry is part of UAPI and is + * used as an array element, so extremely unlikely to + * ever grow or shrink + */ + BUILD_BUG_ON(br_entry_size != 24); + + /* if (unlikely(flags)) return -EINVAL */ + insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); + + /* Transform size (bytes) into number of entries (cnt = size / 24). + * But to avoid expensive division instruction, we implement + * divide-by-3 through multiplication, followed by further + * division by 8 through 3-bit right shift. + * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., + * p. 227, chapter "Unsigned Divison by 3" for details and proofs. + * + * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. + */ + insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); + insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); + insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); + + /* call perf_snapshot_branch_stack implementation */ + insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); + /* if (entry_cnt == 0) return -ENOENT */ + insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); + /* return entry_cnt * sizeof(struct perf_branch_entry) */ + insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); + insn_buf[7] = BPF_JMP_A(3); + /* return -EINVAL; */ + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + insn_buf[9] = BPF_JMP_A(1); + /* return -ENOENT; */ + insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); + cnt = 11; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_kptr_xchg inline */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_kptr_xchg && -- cgit v1.2.3 From 1f2a74b41ea8b902687eb97c4e7e3f558801865b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Apr 2024 14:45:35 -0700 Subject: bpf: prevent r10 register from being marked as precise r10 is a special register that is not under BPF program's control and is always effectively precise. The rest of precision logic assumes that only r0-r9 SCALAR registers are marked as precise, so prevent r10 from being marked precise. This can happen due to signed cast instruction allowing to do something like `r0 = (s8)r10;`, which later, if r0 needs to be precise, would lead to an attempt to mark r10 as precise. Prevent this with an extra check during instruction backtracking. Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Reported-by: syzbot+148110ee7cf72f39f33e@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404214536.3551295-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffaa9f7f153c..41446bea8fab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3615,7 +3615,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * sreg needs precision before this insn */ bt_clear_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3631,7 +3632,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * both dreg and sreg need precision * before this insn */ - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ -- cgit v1.2.3 From 58babe27180c8d4cb54d831589cf801bd9268876 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2024 16:26:25 +0200 Subject: bpf: fix perf_snapshot_branch_stack link failure The newly added code to handle bpf_get_branch_snapshot fails to link when CONFIG_PERF_EVENTS is disabled: aarch64-linux-ld: kernel/bpf/verifier.o: in function `do_misc_fixups': verifier.c:(.text+0x1090c): undefined reference to `__SCK__perf_snapshot_branch_stack' Add a build-time check for that Kconfig symbol around the code to remove the link time dependency. Fixes: 314a53623cd4 ("bpf: inline bpf_get_branch_snapshot() helper") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240405142637.577046-1-arnd@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41446bea8fab..4353cb09c35b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20191,7 +20191,8 @@ patch_map_ops_generic: } /* Implement bpf_get_branch_snapshot inline. */ - if (prog->jit_requested && BITS_PER_LONG == 64 && + if (IS_ENABLED(CONFIG_PERF_EVENTS) && + prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_get_branch_snapshot) { /* We are dealing with the following func protos: * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); -- cgit v1.2.3 From 0a525621b7e5b49202b19d8f75382c6778fdd0c1 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:34 +0800 Subject: bpf: store both map ptr and state in bpf_insn_aux_data Currently, bpf_insn_aux_data->map_ptr_state is used to store either map_ptr or its poison state (i.e., BPF_MAP_PTR_POISON). Thus BPF_MAP_PTR_POISON must be checked before reading map_ptr. In certain cases, we may need valid map_ptr even in case of poison state. This will be explained in next patch with bpf_for_each_map_elem() helper. This patch changes map_ptr_state into a new struct including both map pointer and its state (poison/unpriv). It's in the same union with struct bpf_loop_inline_state, so there is no extra memory overhead. Besides, macros BPF_MAP_PTR_UNPRIV/BPF_MAP_PTR_POISON/BPF_MAP_PTR are no longer needed. This patch does not change any existing functionality. Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-2-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 9 ++++++++- kernel/bpf/verifier.c | 36 ++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7cb1b75eee38..36d19cd32eb5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -502,6 +502,13 @@ struct bpf_loop_inline_state { u32 callback_subprogno; /* valid when fit_for_inline is true */ }; +/* pointer and state for maps */ +struct bpf_map_ptr_state { + struct bpf_map *map_ptr; + bool poison; + bool unpriv; +}; + /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) @@ -514,7 +521,7 @@ struct bpf_loop_inline_state { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_ptr_state; /* pointer/poison value for maps */ + struct bpf_map_ptr_state map_ptr_state; s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4353cb09c35b..62fedb1f3312 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem { #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) -#define BPF_MAP_PTR_UNPRIV 1UL -#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ - POISON_POINTER_DELTA)) -#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); @@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; + return aux->map_ptr_state.poison; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state.unpriv; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, - const struct bpf_map *map, bool unpriv) + struct bpf_map *map, + bool unpriv, bool poison) { - BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_ptr_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state.unpriv = unpriv; + aux->map_ptr_state.poison = poison; + aux->map_ptr_state.map_ptr = map; } static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) @@ -9660,7 +9656,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return -EINVAL; } - map = BPF_MAP_PTR(insn_aux->map_ptr_state); + map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { verbose(env, "callback function not allowed for map\n"); @@ -10019,12 +10015,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_ptr_state)) + if (!aux->map_ptr_state.map_ptr) + bpf_map_ptr_store(aux, meta->map_ptr, + !meta->map_ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map_ptr) bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1); - else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) - bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - !meta->map_ptr->bypass_spec_v1); + !meta->map_ptr->bypass_spec_v1, true); return 0; } @@ -19840,7 +19836,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) !bpf_map_ptr_unpriv(aux)) { struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.map = aux->map_ptr_state.map_ptr, .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; @@ -19869,7 +19865,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -19977,7 +19973,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { -- cgit v1.2.3 From 9d482da9e17a4ddd5563428f74302a36b2610306 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:35 +0800 Subject: bpf: allow invoking bpf_for_each_map_elem with different maps Taking different maps within a single bpf_for_each_map_elem call is not allowed before, because from the second map, bpf_insn_aux_data->map_ptr_state will be marked as *poison*. In fact both map_ptr and state are needed to support this use case: map_ptr is used by set_map_elem_callback_state() while poison state is needed to determine whether to use direct call. Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-3-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62fedb1f3312..590db4e4c071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9651,11 +9651,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_map *map; int err; - if (bpf_map_ptr_poisoned(insn_aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - + /* valid map_ptr and poison value does not matter */ map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { -- cgit v1.2.3 From a8e03b6bbb2cc7cf387d1ce335e4ce4c3bdfef9b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Apr 2024 09:30:40 -0500 Subject: bpf: Allow invoking kfuncs from BPF_PROG_TYPE_SYSCALL progs Currently, a set of core BPF kfuncs (e.g. bpf_task_*, bpf_cgroup_*, bpf_cpumask_*, etc) cannot be invoked from BPF_PROG_TYPE_SYSCALL programs. The whitelist approach taken for enabling kfuncs makes sense: it not safe to call these kfuncs from every program type. For example, it may not be safe to call bpf_task_acquire() in an fentry to free_task(). BPF_PROG_TYPE_SYSCALL, on the other hand, is a perfectly safe program type from which to invoke these kfuncs, as it's a very controlled environment, and we should never be able to run into any of the typical problems such as recursive invoations, acquiring references on freeing kptrs, etc. Being able to invoke these kfuncs would be useful, as BPF_PROG_TYPE_SYSCALL can be invoked with BPF_PROG_RUN, and would therefore enable user space programs to synchronously call into BPF to manipulate these kptrs. This patch therefore enables invoking the aforementioned core kfuncs from BPF_PROG_TYPE_SYSCALL progs. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240405143041.632519-2-void@manifault.com --- kernel/bpf/cpumask.c | 1 + kernel/bpf/helpers.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index dad0fb1c8e87..33c473d676a5 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -474,6 +474,7 @@ static int __init cpumask_kfunc_init(void) ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, ARRAY_SIZE(cpumask_dtors), THIS_MODULE); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d9e7aca8ae9e..8cde717137bd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2653,6 +2653,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); -- cgit v1.2.3 From a2ea3cd78317ae8995b65b52299158bbae52a77f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 22:01:05 +0300 Subject: irqdomain: Check virq for 0 before use in irq_dispose_mapping() It's a bit hard to read the logic since the virq is used before checking it for 0. Rearrange the code to make it better to understand. This, in particular, should clearly answer the question whether the caller needs to perform this check or not, and there are plenty of places for both variants, confirming a confusion. Fun fact that the new code is shorter: Function old new delta irq_dispose_mapping 278 271 -7 Total: Before=11625, After=11618, chg -0.06% when compiled by GCC on Debian for x86_64. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240405190105.3932034-1-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3dd1c871e091..aadc8891cc16 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -909,10 +909,11 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping); */ void irq_dispose_mapping(unsigned int virq) { - struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_data *irq_data; struct irq_domain *domain; - if (!virq || !irq_data) + irq_data = virq ? irq_get_irq_data(virq) : NULL; + if (!irq_data) return; domain = irq_data->domain; -- cgit v1.2.3 From 82ccdf062a64f3c4ac575c16179ce68edbbbe8e4 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 22 Mar 2024 15:04:41 +0800 Subject: hrtimer: Remove unused function The function is defined, but not called anywhere: kernel/time/hrtimer.c:1880:20: warning: unused function '__hrtimer_peek_ahead_timers'. Remove it. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240322070441.29646-1-jiapeng.chong@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8611 --- kernel/time/hrtimer.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 70625dff62ce..cae9d04b5584 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1875,25 +1875,7 @@ retry: tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } - -/* called with interrupts disabled */ -static inline void __hrtimer_peek_ahead_timers(void) -{ - struct tick_device *td; - - if (!hrtimer_hres_active()) - return; - - td = this_cpu_ptr(&tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); -} - -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy -- cgit v1.2.3 From d2e58ab5cda2a225c406ac10d0a8b960bc5a39b6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:10 +0200 Subject: vdso: Add vdso_data:: Max_cycles Add vdso_data::max_cycles in preparation to use it to detect potential multiplication overflow. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-7-adrian.hunter@intel.com --- include/vdso/datapage.h | 4 ++++ kernel/time/vsyscall.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index c71ddb6d4691..d04d394db064 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -61,6 +61,7 @@ struct vdso_timestamp { * @seq: timebase sequence counter * @clock_mode: clock mode * @cycle_last: timebase at clocksource init + * @max_cycles: maximum cycles which won't overflow 64bit multiplication * @mask: clocksource mask * @mult: clocksource multiplier * @shift: clocksource shift @@ -92,6 +93,9 @@ struct vdso_data { s32 clock_mode; u64 cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + u64 max_cycles; +#endif u64 mask; u32 mult; u32 shift; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index f0d5062d9cbc..9193d6133e5d 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -22,10 +22,16 @@ static inline void update_vdso_data(struct vdso_data *vdata, u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; +#endif vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; +#endif vdata[CS_RAW].mask = tk->tkr_raw.mask; vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; -- cgit v1.2.3 From e98ab3d4159e6bab4e391f376a1e548dd4d32524 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:13 +0200 Subject: timekeeping: Move timekeeping helper functions Move timekeeping helper functions to prepare for their reuse. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-10-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b58dffc58a8f..3375f0a6400f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -381,23 +381,31 @@ static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 de return nsec; } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { u64 delta; - delta = timekeeping_get_delta(tkr); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); return timekeeping_delta_to_ns(tkr, delta); } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) { - u64 delta; + u64 delta, cycles = tk_clock_read(tkr); - /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); return timekeeping_delta_to_ns(tkr, delta); } +static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) +{ + u64 delta; + + delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} + /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update @@ -431,14 +439,6 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } -static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) -{ - u64 delta, cycles = tk_clock_read(tkr); - - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); -} - static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; -- cgit v1.2.3 From a729a63c6b2ebd8bc37646519d404f005ea8f1b2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:14 +0200 Subject: timekeeping: Rename fast_tk_get_delta_ns() to __timekeeping_get_ns() Rename fast_tk_get_delta_ns() to __timekeeping_get_ns() to prepare for its reuse as a general timekeeping helper function. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-11-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3375f0a6400f..63061332a75c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -390,7 +390,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c return timekeeping_delta_to_ns(tkr, delta); } -static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) +static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) { u64 delta, cycles = tk_clock_read(tkr); @@ -449,7 +449,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += fast_tk_get_delta_ns(tkr); + now += __timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -565,7 +565,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = fast_tk_get_delta_ns(tkr); + delta = __timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) -- cgit v1.2.3 From 9af4548e828aa2ea66f54433c5747f64124a6240 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:15 +0200 Subject: timekeeping: Tidy timekeeping_cycles_to_ns() slightly Put together declaration and initialization of the local variable 'delta'. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-12-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63061332a75c..c698219b152d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -383,10 +383,9 @@ static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 de static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { - u64 delta; + /* Calculate the delta since the last update_wall_time() */ + u64 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); return timekeeping_delta_to_ns(tkr, delta); } -- cgit v1.2.3 From 670be12ba8f5d20ee2fb0531be6977005cd62401 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:16 +0200 Subject: timekeeping: Reuse timekeeping_cycles_to_ns() Simplify __timekeeping_get_ns() by reusing timekeeping_cycles_to_ns(). No functional change. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-13-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c698219b152d..f81d675291e0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -391,10 +391,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) { - u64 delta, cycles = tk_clock_read(tkr); - - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); + return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) -- cgit v1.2.3 From e8e9d21a5df655a62ab4611fd437fb7510d2f85c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:17 +0200 Subject: timekeeping: Refactor timekeeping helpers Simplify the usage of timekeeping sanity checking, in preparation for consolidating timekeeping helpers. This works towards eliminating timekeeping_delta_to_ns() in favour of timekeeping_cycles_to_ns(). No functional change. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-14-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f81d675291e0..618328cd4bc4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -237,7 +237,7 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) } } -static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) +static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr) { struct timekeeper *tk = &tk_core.timekeeper; u64 now, last, mask, max, delta; @@ -281,17 +281,9 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) { } -static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) +static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr) { - u64 cycle_now, delta; - - /* read clocksource */ - cycle_now = tk_clock_read(tkr); - - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - - return delta; + BUG(); } #endif @@ -396,10 +388,10 @@ static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { - u64 delta; + if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING)) + return timekeeping_delta_to_ns(tkr, timekeeping_debug_get_delta(tkr)); - delta = timekeeping_get_delta(tkr); - return timekeeping_delta_to_ns(tkr, delta); + return __timekeeping_get_ns(tkr); } /** -- cgit v1.2.3 From e84f43e34faf85816587f80594541ec978449d6e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:18 +0200 Subject: timekeeping: Consolidate timekeeping helpers Consolidate timekeeping helpers, making use of timekeeping_cycles_to_ns() in preference to directly using timekeeping_delta_to_ns(). No functional change. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-15-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 618328cd4bc4..1bbfe1ff8d24 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -237,7 +237,9 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) } } -static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr) +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles); + +static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) { struct timekeeper *tk = &tk_core.timekeeper; u64 now, last, mask, max, delta; @@ -266,22 +268,22 @@ static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr) */ if (unlikely((~delta & mask) < (mask >> 3))) { tk->underflow_seen = 1; - delta = 0; + now = last; } /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > max)) { tk->overflow_seen = 1; - delta = tkr->clock->max_cycles; + now = last + max; } - return delta; + return timekeeping_cycles_to_ns(tkr, now); } #else static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) { } -static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr) +static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) { BUG(); } @@ -389,7 +391,7 @@ static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING)) - return timekeeping_delta_to_ns(tkr, timekeeping_debug_get_delta(tkr)); + return timekeeping_debug_get_ns(tkr); return __timekeeping_get_ns(tkr); } -- cgit v1.2.3 From 3094c6db1cba0bbca6ea19c777762c26fee747d7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:19 +0200 Subject: timekeeping: Fold in timekeeping_delta_to_ns() timekeeping_delta_to_ns() is now called only from timekeeping_cycles_to_ns(), and it is not useful otherwise. Simplify the code by folding it into timekeeping_cycles_to_ns(). No functional change. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-16-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1bbfe1ff8d24..749387f22f0f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -364,23 +364,12 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. */ - -static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) -{ - u64 nsec; - - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; - - return nsec; -} - static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); + return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) -- cgit v1.2.3 From e809a80aa0bcf802f99407c23fd6be6fd4eb250a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:20 +0200 Subject: timekeeping: Prepare timekeeping_cycles_to_ns() for overflow safety Open code clocksource_delta() in timekeeping_cycles_to_ns() so that overflow safety can be added efficiently. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-17-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 749387f22f0f..d17484082e2c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -367,7 +367,17 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ - u64 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; + + if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) { + /* + * Handle clocksource inconsistency between CPUs to prevent + * time from going backwards by checking for the MSB of the + * mask being set in the delta. + */ + if (unlikely(delta & ~(mask >> 1))) + return tkr->xtime_nsec >> tkr->shift; + } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -- cgit v1.2.3 From fcf190c369149c3b04539797cedf28741eb14164 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:21 +0200 Subject: timekeeping: Make delta calculation overflow safe Kernel timekeeping is designed to keep the change in cycles (since the last timer interrupt) below max_cycles, which prevents multiplication overflow when converting cycles to nanoseconds. However, if timer interrupts stop, the calculation will eventually overflow. Add protection against that. In timekeeping_cycles_to_ns() calculation, check against max_cycles, falling back to a slower higher precision calculation. In timekeeping_forward_now(), process delta in chunks of at most max_cycles. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-18-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d17484082e2c..111dfdbd488f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -364,19 +364,32 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. */ +static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) +{ + return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); +} + static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; - if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) { - /* - * Handle clocksource inconsistency between CPUs to prevent - * time from going backwards by checking for the MSB of the - * mask being set in the delta. - */ - if (unlikely(delta & ~(mask >> 1))) - return tkr->xtime_nsec >> tkr->shift; + /* + * This detects the case where the delta overflows the multiplication + * with tkr->mult. + */ + if (unlikely(delta > tkr->clock->max_cycles)) { + if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) { + /* + * Handle clocksource inconsistency between CPUs to prevent + * time from going backwards by checking for the MSB of the + * mask being set in the delta. + */ + if (unlikely(delta & ~(mask >> 1))) + return tkr->xtime_nsec >> tkr->shift; + } + + return delta_to_ns_safe(tkr, delta); } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; @@ -789,10 +802,15 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; - tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; - tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + while (delta > 0) { + u64 max = tk->tkr_mono.clock->max_cycles; + u64 incr = delta < max ? delta : max; - tk_normalize_xtime(tk); + tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; + tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; + tk_normalize_xtime(tk); + delta -= incr; + } } /** -- cgit v1.2.3 From 135225a363ae67bc90bde7a2cbbe1ea0f152ba22 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:22 +0200 Subject: timekeeping: Let timekeeping_cycles_to_ns() handle both under and overflow For the case !CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE, forego overflow protection in the range (mask << 1) < delta <= mask, and interpret it always as an inconsistency between CPU clock values. That allows slightly neater code, and it is on a slow path so has no effect on performance. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-19-adrian.hunter@intel.com --- kernel/time/timekeeping.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 111dfdbd488f..4e18db1819f8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -266,17 +266,14 @@ static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) * Try to catch underflows by checking if we are seeing small * mask-relative negative values. */ - if (unlikely((~delta & mask) < (mask >> 3))) { + if (unlikely((~delta & mask) < (mask >> 3))) tk->underflow_seen = 1; - now = last; - } - /* Cap delta value to the max_cycles values to avoid mult overflows */ - if (unlikely(delta > max)) { + /* Check for multiplication overflows */ + if (unlikely(delta > max)) tk->overflow_seen = 1; - now = last + max; - } + /* timekeeping_cycles_to_ns() handles both under and overflow */ return timekeeping_cycles_to_ns(tkr, now); } #else @@ -375,19 +372,17 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; /* - * This detects the case where the delta overflows the multiplication - * with tkr->mult. + * This detects both negative motion and the case where the delta + * overflows the multiplication with tkr->mult. */ if (unlikely(delta > tkr->clock->max_cycles)) { - if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) { - /* - * Handle clocksource inconsistency between CPUs to prevent - * time from going backwards by checking for the MSB of the - * mask being set in the delta. - */ - if (unlikely(delta & ~(mask >> 1))) - return tkr->xtime_nsec >> tkr->shift; - } + /* + * Handle clocksource inconsistency between CPUs to prevent + * time from going backwards by checking for the MSB of the + * mask being set in the delta. + */ + if (delta & ~(mask >> 1)) + return tkr->xtime_nsec >> tkr->shift; return delta_to_ns_safe(tkr, delta); } -- cgit v1.2.3 From d0304569fb019d1bcfbbbce1ce6df6b96f04079b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 25 Mar 2024 08:40:23 +0200 Subject: clocksource: Make watchdog and suspend-timing multiplication overflow safe Kernel timekeeping is designed to keep the change in cycles (since the last timer interrupt) below max_cycles, which prevents multiplication overflow when converting cycles to nanoseconds. However, if timer interrupts stop, the clocksource_cyc2ns() calculation will eventually overflow. Add protection against that. Simplify by folding together clocksource_delta() and clocksource_cyc2ns() into cycles_to_nsec_safe(). Check against max_cycles, falling back to a slower higher precision calculation. Suggested-by: Thomas Gleixner Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325064023.2997-20-adrian.hunter@intel.com --- kernel/time/clocksource.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e5b260aa0e02..4d50d53ac719 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -20,6 +20,16 @@ #include "tick-internal.h" #include "timekeeping_internal.h" +static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) +{ + u64 delta = clocksource_delta(end, start, cs->mask); + + if (likely(delta < cs->max_cycles)) + return clocksource_cyc2ns(delta, cs->mult, cs->shift); + + return mul_u64_u32_shr(delta, cs->mult, cs->shift); +} + /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable @@ -222,8 +232,8 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { unsigned int nretries, max_retries; - u64 wd_end, wd_end2, wd_delta; int64_t wd_delay, wd_seq_delay; + u64 wd_end, wd_end2; max_retries = clocksource_get_max_watchdog_retry(); for (nretries = 0; nretries <= max_retries; nretries++) { @@ -234,9 +244,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, wd_end2 = watchdog->read(watchdog); local_irq_enable(); - wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); - wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, - watchdog->shift); + wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); if (wd_delay <= WATCHDOG_MAX_SKEW) { if (nretries > 1 || nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", @@ -254,8 +262,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, * report system busy, reinit the watchdog and skip the current * watchdog test. */ - wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); - wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); + wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) goto skip_test; } @@ -366,8 +373,7 @@ void clocksource_verify_percpu(struct clocksource *cs) delta = (csnow_end - csnow_mid) & cs->mask; if (delta < 0) cpumask_set_cpu(cpu, &cpus_ahead); - delta = clocksource_delta(csnow_end, csnow_begin, cs->mask); - cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); if (cs_nsec > cs_nsec_max) cs_nsec_max = cs_nsec; if (cs_nsec < cs_nsec_min) @@ -398,8 +404,8 @@ static inline void clocksource_reset_watchdog(void) static void clocksource_watchdog(struct timer_list *unused) { - u64 csnow, wdnow, cslast, wdlast, delta; int64_t wd_nsec, cs_nsec, interval; + u64 csnow, wdnow, cslast, wdlast; int next_cpu, reset_pending; struct clocksource *cs; enum wd_read_status read_ret; @@ -456,12 +462,8 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); - wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, - watchdog->shift); - - delta = clocksource_delta(csnow, cs->cs_last, cs->mask); - cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); + cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); wdlast = cs->wd_last; /* save these in case we print them */ cslast = cs->cs_last; cs->cs_last = csnow; @@ -832,7 +834,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles) */ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) { - u64 now, delta, nsec = 0; + u64 now, nsec = 0; if (!suspend_clocksource) return 0; @@ -847,12 +849,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) else now = suspend_clocksource->read(suspend_clocksource); - if (now > suspend_start) { - delta = clocksource_delta(now, suspend_start, - suspend_clocksource->mask); - nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult, - suspend_clocksource->shift); - } + if (now > suspend_start) + nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now); /* * Disable the suspend timer to save power if current clocksource is -- cgit v1.2.3 From d61c2695bddf56f4527f71cc6ebc31897be36cff Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:05 +0100 Subject: PM: EM: Refactor em_adjust_new_capacity() Extract em_table_dup() and em_recalc_and_update() from em_adjust_new_capacity(). Both functions will be later reused by the 'update EM due to chip binning' functionality. Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 58 ++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 9e1c9aa399ea..6960dd7393b2 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -674,23 +674,15 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -/* - * Adjustment of CPU performance values after boot, when all CPUs capacites - * are correctly calculated. - */ -static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) +static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) { struct em_perf_table __rcu *em_table; struct em_perf_state *ps, *new_ps; - int ret, ps_size; + int ps_size; em_table = em_table_alloc(pd); - if (!em_table) { - dev_warn(dev, "EM: allocation failed\n"); - return; - } + if (!em_table) + return NULL; new_ps = em_table->state; @@ -702,24 +694,52 @@ static void em_adjust_new_capacity(struct device *dev, rcu_read_unlock(); - em_init_performance(dev, pd, new_ps, pd->nr_perf_states); - ret = em_compute_costs(dev, new_ps, NULL, pd->nr_perf_states, + return em_table; +} + +static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, + struct em_perf_table __rcu *em_table) +{ + int ret; + + ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, pd->flags); - if (ret) { - dev_warn(dev, "EM: compute costs failed\n"); - return; - } + if (ret) + goto free_em_table; ret = em_dev_update_perf_domain(dev, em_table); if (ret) - dev_warn(dev, "EM: update failed %d\n", ret); + goto free_em_table; /* * This is one-time-update, so give up the ownership in this updater. * The EM framework has incremented the usage counter and from now * will keep the reference (then free the memory when needed). */ +free_em_table: em_table_free(em_table); + return ret; +} + +/* + * Adjustment of CPU performance values after boot, when all CPUs capacites + * are correctly calculated. + */ +static void em_adjust_new_capacity(struct device *dev, + struct em_perf_domain *pd, + u64 max_cap) +{ + struct em_perf_table __rcu *em_table; + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return; + } + + em_init_performance(dev, pd, em_table->state, pd->nr_perf_states); + + em_recalc_and_update(dev, pd, em_table); } static void em_check_capacity_update(void) -- cgit v1.2.3 From cf61d53b026805e8222ca28ac2795611eb7fa547 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:06 +0100 Subject: PM: EM: Add em_dev_update_chip_binning() Add a function which allows to modify easily the EM after the new voltage information is available. The device drivers for the chip can adjust the voltage values after setup. The voltage for the same frequency in OPP can be different due to chip binning. The voltage impacts the power usage and the EM power values can be updated to reflect that. Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 5 +++++ kernel/power/energy_model.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 70cd7258cd29..1ff52020cf75 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -172,6 +172,7 @@ struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int nr_states); +int em_dev_update_chip_binning(struct device *dev); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -386,6 +387,10 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, { return -EINVAL; } +static inline int em_dev_update_chip_binning(struct device *dev) +{ + return -EINVAL; +} #endif #endif diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6960dd7393b2..927cc55ba0b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -808,3 +808,51 @@ static void em_update_workfn(struct work_struct *work) { em_check_capacity_update(); } + +/** + * em_dev_update_chip_binning() - Update Energy Model after the new voltage + * information is present in the OPPs. + * @dev : Device for which the Energy Model has to be updated. + * + * This function allows to update easily the EM with new values available in + * the OPP framework and DT. It can be used after the chip has been properly + * verified by device drivers and the voltages adjusted for the 'chip binning'. + */ +int em_dev_update_chip_binning(struct device *dev) +{ + struct em_perf_table __rcu *em_table; + struct em_perf_domain *pd; + int i, ret; + + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + pd = em_pd_get(dev); + if (!pd) { + dev_warn(dev, "Couldn't find Energy Model\n"); + return -EINVAL; + } + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return -ENOMEM; + } + + /* Update power values which might change due to new voltage in OPPs */ + for (i = 0; i < pd->nr_perf_states; i++) { + unsigned long freq = em_table->state[i].frequency; + unsigned long power; + + ret = dev_pm_opp_calc_power(dev, &power, &freq); + if (ret) { + em_table_free(em_table); + return ret; + } + + em_table->state[i].power = power; + } + + return em_recalc_and_update(dev, pd, em_table); +} +EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); -- cgit v1.2.3 From 2125c0034c5dfd61171b494bd309bb7637bff6eb Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2024 09:47:48 -0400 Subject: cgroup/cpuset: Make cpuset hotplug processing synchronous Since commit 3a5a6d0c2b03("cpuset: don't nest cgroup_mutex inside get_online_cpus()"), cpuset hotplug was done asynchronously via a work function. This is to avoid recursive locking of cgroup_mutex. Since then, the cgroup locking scheme has changed quite a bit. A cpuset_mutex was introduced to protect cpuset specific operations. The cpuset_mutex is then replaced by a cpuset_rwsem. With commit d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order"), cpu_hotplug_lock is acquired before cpuset_rwsem. Later on, cpuset_rwsem is reverted back to cpuset_mutex. All these locking changes allow the hotplug code to call into cpuset core directly. The following commits were also merged due to the asynchronous nature of cpuset hotplug processing. - commit b22afcdf04c9 ("cpu/hotplug: Cure the cpusets trainwreck") - commit 50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs") - commit 28b89b9e6f7b ("cpuset: handle race between CPU hotplug and cpuset_hotplug_work") Clean up all these bandages by making cpuset hotplug processing synchronous again with the exception that the call to cgroup_transfer_tasks() to transfer tasks out of an empty cgroup v1 cpuset, if necessary, will still be done via a work function due to the existing cgroup_mutex -> cpu_hotplug_lock dependency. It is possible to reverse that dependency, but that will require updating a number of different cgroup controllers. This special hotplug code path should be rarely taken anyway. As all the cpuset states will be updated by the end of the hotplug operation, we can revert most the above commits except commit 50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs") which is partially reverted. Also removing some cpus_read_lock trylock attempts in the cpuset partition code as they are no longer necessary since the cpu_hotplug_lock is now held for the whole duration of the cpuset hotplug code path. Signed-off-by: Waiman Long Tested-by: Valentin Schneider Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 3 -- kernel/cgroup/cpuset.c | 141 ++++++++++++++++++++----------------------------- kernel/cpu.c | 48 ----------------- kernel/power/process.c | 2 - 4 files changed, 56 insertions(+), 138 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 0ce6ff0d9c9a..de4cf0ee96f7 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -70,7 +70,6 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); @@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void) partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..d8d3439eda4e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -201,6 +201,14 @@ struct cpuset { struct list_head remote_sibling; }; +/* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + /* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ @@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; -/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); - static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) @@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk, rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void) /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1260,12 +1250,17 @@ static void rebuild_sched_domains_locked(void) } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); +} + +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } @@ -2079,14 +2074,11 @@ write_error: /* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - } notify_partition_change(cs, old_prs); return 0; @@ -3599,8 +3591,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * cpuset_handle_hotplug may call back into cgroup core asynchronously + * via cgroup_transfer_tasks() and waiting for it from a cgroupfs * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after @@ -3609,7 +3601,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, */ css_get(&cs->css); kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); cpus_read_lock(); mutex_lock(&cpuset_mutex); @@ -4354,6 +4345,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + static void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, @@ -4383,12 +4384,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. + * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); } } @@ -4421,30 +4431,6 @@ void cpuset_force_rebuild(void) force_rebuild = true; } -/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; -} - /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4493,13 +4479,11 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); - cpus_read_unlock(); } /* @@ -4519,18 +4503,8 @@ retry: else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; - /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4558,8 +4532,7 @@ unlock: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4573,8 +4546,10 @@ unlock: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4585,6 +4560,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -4666,7 +4642,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; - rebuild_sched_domains(); + rebuild_sched_domains_cpuslocked(); } free_cpumasks(NULL, ptmp); @@ -4679,12 +4655,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -4695,7 +4666,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 8f6affd051f7..49ce3f309688 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -/* - * - * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock - * protected region. - * - * The operation is still serialized against concurrent CPU hotplug via - * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_ - * serialized against other hotplug related activity like adding or - * removing of state callbacks and state instances, which invoke either the - * startup or the teardown callback of the affected state. - * - * This is required for subsystems which are unfixable vs. CPU hotplug and - * evade lock inversion problems by scheduling work which has to be - * completed _before_ cpu_up()/_cpu_down() returns. - * - * Don't even think about adding anything to this for any new code or even - * drivers. It's only purpose is to keep existing lock order trainwrecks - * working. - * - * For cpu_down() there might be valid reasons to finish cleanups which are - * not required to be done under cpu_hotplug_lock, but that's a different - * story and would be not invoked via this. - */ -static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen) -{ - /* - * cpusets delegate hotplug operations to a worker to "solve" the - * lock order problems. Wait for the worker, but only if tasks are - * _not_ frozen (suspend, hibernate) as that would wait forever. - * - * The wait is required because otherwise the hotplug operation - * returns with inconsistent state, which could even be observed in - * user space when a new CPU is brought up. The CPU plug uevent - * would be delivered and user space reacting on it would fail to - * move tasks to the newly plugged CPU up to the point where the - * work has finished because up to that point the newly plugged CPU - * is not assignable in cpusets/cgroups. On unplug that's not - * necessarily a visible issue, but it is still inconsistent state, - * which is the real problem which needs to be "fixed". This can't - * prevent the transient state between scheduling the work and - * returning from waiting for it. - */ - if (!tasks_frozen) - cpuset_wait_for_hotplug(); -} - #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) @@ -1494,7 +1448,6 @@ out: */ lockup_detector_cleanup(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } @@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) out: cpus_write_unlock(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } diff --git a/kernel/power/process.c b/kernel/power/process.c index cae81a87cc91..66ac067d9ae6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -194,8 +194,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ -- cgit v1.2.3 From 31103f40b1b5d4382446b4d5af37e61dce31f8d5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 8 Apr 2024 16:44:04 +0800 Subject: workqueue: Add destroy_work_on_stack() in workqueue_softirq_dead() This commit add missed destroy_work_on_stack() operations for dead_work.work. Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8cbaede1e22..3c3154b40698 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3661,6 +3661,7 @@ void workqueue_softirq_dead(unsigned int cpu) queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); + destroy_work_on_stack(&dead_work.work); } } -- cgit v1.2.3 From 8f0acb7f3a1331559e325566c00c26d1523dfe06 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 14 Mar 2024 18:04:01 +0800 Subject: clocksource: Convert s[n]printf() to sysfs_emit() Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). Signed-off-by: Li Zhijian Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240314100402.1326582-1-lizhijian@fujitsu.com --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4d50d53ac719..d25ba49e313c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1334,7 +1334,7 @@ static ssize_t current_clocksource_show(struct device *dev, ssize_t count = 0; mutex_lock(&clocksource_mutex); - count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + count = sysfs_emit(buf, "%s\n", curr_clocksource->name); mutex_unlock(&clocksource_mutex); return count; -- cgit v1.2.3 From 98fe0fcb326a923740cb8900aa7ed7fe538c984a Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 14 Mar 2024 18:04:02 +0800 Subject: clockevents: Convert s[n]printf() to sysfs_emit() Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). Signed-off-by: Li Zhijian Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240314100402.1326582-2-lizhijian@fujitsu.com --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a7ca458cdd9c..60a6484831b1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -677,7 +677,7 @@ static ssize_t current_device_show(struct device *dev, raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) - count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } -- cgit v1.2.3 From 3183059ad82a0daa8292daf43c325bac57daceb5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Feb 2024 15:07:25 -0800 Subject: rcu: Add lockdep checks and kernel-doc header to rcu_softirq_qs() There is some indications that rcu_softirq_qs() might be more generally used than anticipated. This commit therefore adds some lockdep assertions and some cautionary tales in a new kernel-doc header. Link: https://lore.kernel.org/all/Zd4DXTyCf17lcTfq@debian.debian/ Signed-off-by: Paul E. McKenney Cc: Eric Dumazet Cc: Jakub Kicinski Cc: "David S. Miller" Cc: Yan Zhai Cc: Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9642dd06c25..2795a1457acf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -240,8 +240,36 @@ static long rcu_get_n_cbs_cpu(int cpu) return 0; } +/** + * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing + * + * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU. + * This is a special-purpose function to be used in the softirq + * infrastructure and perhaps the occasional long-running softirq + * handler. + * + * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is + * equivalent to momentarily completely enabling preemption. For + * example, given this code:: + * + * local_bh_disable(); + * do_something(); + * rcu_softirq_qs(); // A + * do_something_else(); + * local_bh_enable(); // B + * + * A call to synchronize_rcu() that began concurrently with the + * call to do_something() would be guaranteed to wait only until + * execution reached statement A. Without that rcu_softirq_qs(), + * that same synchronize_rcu() would instead be guaranteed to wait + * until execution reached statement B. + */ void rcu_softirq_qs(void) { + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal rcu_softirq_qs() in RCU read-side critical section"); rcu_qs(); rcu_preempt_deferred_qs(current); rcu_tasks_qs(current, false); -- cgit v1.2.3 From 0a0467af0a4d8b4e3832945d1cc287d1cb52573e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Feb 2024 17:00:31 -0800 Subject: rcutorture: Dump # online CPUs on insufficient cb-flood laundering This commit adds the number of online CPUs to the state dump following an unsuccesful callback-flood test. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45d6b4c3d199..6611ef3e71c3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2833,12 +2833,12 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); - pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n", __func__, stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, - n_max_gps, n_max_cbs, cver, gps); + n_max_gps, n_max_cbs, cver, gps, num_online_cpus()); atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); mutex_lock(&rcu_fwd_mutex); // Serialize histograms. rcu_torture_fwd_cb_hist(rfp); -- cgit v1.2.3 From f8039457eedc378fa7a186054ee3032b8a41eaee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Feb 2024 09:11:08 -0800 Subject: rcutorture: Dump GP kthread state on insufficient cb-flood laundering If a callback flood prevents grace period from completing, rcutorture does a WARN_ON(). Avoiding this WARN_ON() currently requires that at least three grace periods elapse during an eight-second callback-flood interval. Unfortunately, the current debug information does not include anything about the grace-period state. This commit therefore adds a call to cur_ops->gp_kthread_dbg(), if this function pointer is non-NULL. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6611ef3e71c3..eff51a26216f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2832,7 +2832,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { - WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n", __func__, stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, -- cgit v1.2.3 From c507e195016ccca18e375d14cfeb1186dac60b2c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2024 17:00:39 -0800 Subject: rcutorture: ASSERT_EXCLUSIVE_WRITER() for ->rtort_pipe_count updates It turns out that only one CPU at a time will ever invoke rcu_torture_pipe_update_one() on a given rcu_torture structure. This commit therefore adds three ASSERT_EXCLUSIVE_WRITER() calls to enlist KCSAN's aid in checking this. Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eff51a26216f..d8c12eba35b7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -466,6 +466,7 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(rp->rtort_pipe_count, i + 1); + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; @@ -1399,6 +1400,7 @@ rcu_torture_writer(void *arg) if (rp == NULL) continue; rp->rtort_pipe_count = 0; + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); rcu_torture_writer_state = RTWS_DELAY; udelay(torture_random(&rand) & 0x3ff); rcu_torture_writer_state = RTWS_REPLACE; @@ -1414,6 +1416,7 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count); // Make sure readers block polled grace periods. if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { -- cgit v1.2.3 From c342b42fa47f4257fccfeadc8e32c51b1be17a1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Feb 2024 15:16:20 -0800 Subject: rcu-tasks: Make Tasks RCU wait idly for grace-period delays Currently, all waits for grace periods sleep at TASK_UNINTERRUPTIBLE, regardless of RCU flavor. This has worked well, but there have been cases where a longer-than-average Tasks RCU grace period has triggered softlockup splats, many of them, before the Tasks RCU CPU stall warning appears. These softlockup splats unnecessarily consume console bandwidth and complicate diagnosis of the underlying problem. Plus a long but not pathologically long Tasks RCU grace period might trigger a few softlockup splats before completing normally, which generates noise for no good reason. This commit therefore causes Tasks RCU grace periods to sleep at TASK_IDLE priority. If there really is a persistent problem, the eventual Tasks RCU CPU stall warning will flag it, and without the extra noise. Reported-by: Breno Leitao Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/rcupdate_wait.h | 18 +++++++++--------- kernel/rcu/tasks.h | 6 +++++- kernel/rcu/update.c | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index d07f0848802e..303ab9bee155 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -19,18 +19,18 @@ struct rcu_synchronize { }; void wakeme_after_rcu(struct rcu_head *head); -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, +void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array); -#define _wait_rcu_gp(checktiny, ...) \ -do { \ - call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ - struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ - __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ - __crcu_array, __rs_array); \ +#define _wait_rcu_gp(checktiny, state, ...) \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ + __wait_rcu_gp(checktiny, state, ARRAY_SIZE(__crcu_array), __crcu_array, __rs_array); \ } while (0) -#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) +#define wait_rcu_gp(...) _wait_rcu_gp(false, TASK_UNINTERRUPTIBLE, __VA_ARGS__) +#define wait_rcu_gp_state(state, ...) _wait_rcu_gp(false, state, __VA_ARGS__) /** * synchronize_rcu_mult - Wait concurrently for multiple grace periods @@ -54,7 +54,7 @@ do { \ * grace period. */ #define synchronize_rcu_mult(...) \ - _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), TASK_UNINTERRUPTIBLE, __VA_ARGS__) static inline void cond_resched_rcu(void) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 147b5945d67a..82e458ea0728 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -74,6 +74,7 @@ struct rcu_tasks_percpu { * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. + * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. @@ -107,6 +108,7 @@ struct rcu_tasks { holdouts_func_t holdouts_func; postgp_func_t postgp_func; call_rcu_func_t call_func; + unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; int percpu_enqueue_shift; int percpu_enqueue_lim; @@ -134,6 +136,7 @@ static struct rcu_tasks rt_name = \ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ .gp_func = gp, \ .call_func = call, \ + .wait_state = TASK_UNINTERRUPTIBLE, \ .rtpcpu = &rt_name ## __percpu, \ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ .name = n, \ @@ -638,7 +641,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) // If the grace-period kthread is running, use it. if (READ_ONCE(rtp->kthread_ptr)) { - wait_rcu_gp(rtp->call_func); + wait_rcu_gp_state(rtp->wait_state, rtp->call_func); return; } rcu_tasks_one_gp(rtp, true); @@ -1160,6 +1163,7 @@ static int __init rcu_spawn_tasks_kthread(void) rcu_tasks.postscan_func = rcu_tasks_postscan; rcu_tasks.holdouts_func = check_all_holdout_tasks; rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_tasks.wait_state = TASK_IDLE; rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 46aaaa9fe339..f8436969e0c8 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head) } EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, +void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; @@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, if (crcu_array[j] == crcu_array[i]) break; if (j == i) { - wait_for_completion(&rs_array[i].completion); + wait_for_completion_state(&rs_array[i].completion, state); destroy_rcu_head_on_stack(&rs_array[i].head); } } -- cgit v1.2.3 From b993115b44d769cb34b394d92658226df81a6c89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:16:17 -0800 Subject: bpf: Select new NEED_TASKS_RCU Kconfig option Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. A new NEED_TASKS_RCU Kconfig option has been created to allow this enablement logic to be in one place in kernel/rcu/Kconfig. Therefore, make BPF select the new NEED_TASKS_RCU Kconfig option. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Cc: Ankur Arora Cc: Thomas Gleixner Cc: Steven Rostedt Acked-by: Mark Rutland Signed-off-by: Uladzislau Rezki (Sony) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..4100df44c665 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -28,7 +28,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET -- cgit v1.2.3 From f03e8c1060f86c23eb49bafee99d9fcbd1c1bd77 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 27 Mar 2024 12:59:35 +0200 Subject: printk: Save console options for add_preferred_console_match() Driver subsystems may need to translate the preferred console name to the character device name used. We already do some of this in console_setup() with a few hardcoded names, but that does not scale well. The console options are parsed early in console_setup(), and the consoles are added with __add_preferred_console(). At this point we don't know much about the character device names and device drivers getting probed. To allow driver subsystems to set up a preferred console, let's save the kernel command line console options. To add a preferred console from a driver subsystem with optional character device name translation, let's add a new function add_preferred_console_match(). This allows the serial core layer to support console=DEVNAME:0.0 style hardware based addressing in addition to the current console=ttyS0 style naming. And we can start moving console_setup() character device parsing to the driver subsystem specific code. We use a separate array from the console_cmdline array as the character device name and index may be unknown at the console_setup() time. And eventually there's no need to call __add_preferred_console() until the subsystem is ready to handle the console. Adding the console name in addition to the character device name, and a flag for an added console, could be added to the struct console_cmdline. And the console_cmdline array handling could be modified accordingly. But that complicates things compared saving the console options, and then adding the consoles when the subsystems handling the consoles are ready. Co-developed-by: Andy Shevchenko Signed-off-by: Tony Lindgren Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240327110021.59793-2-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- include/linux/printk.h | 3 + kernel/printk/Makefile | 2 +- kernel/printk/conopt.c | 146 ++++++++++++++++++++++++++++++++++++++++ kernel/printk/console_cmdline.h | 6 ++ kernel/printk/printk.c | 14 +++- 5 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 kernel/printk/conopt.c (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 955e31860095..5e038e782256 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -60,6 +60,9 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET +int add_preferred_console_match(const char *match, const char *name, + const short idx); + extern int console_printk[]; #define console_loglevel (console_printk[0]) diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 39a2b61c7232..040fe7d1eda2 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y = printk.o +obj-y = printk.o conopt.o obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/conopt.c b/kernel/printk/conopt.c new file mode 100644 index 000000000000..9d507bac3657 --- /dev/null +++ b/kernel/printk/conopt.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kernel command line console options for hardware based addressing + * + * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ + * Author: Tony Lindgren + */ + +#include +#include +#include +#include + +#include + +#include "console_cmdline.h" + +/* + * Allow longer DEVNAME:0.0 style console naming such as abcd0000.serial:0.0 + * in addition to the legacy ttyS0 style naming. + */ +#define CONSOLE_NAME_MAX 32 + +#define CONSOLE_OPT_MAX 16 +#define CONSOLE_BRL_OPT_MAX 16 + +struct console_option { + char name[CONSOLE_NAME_MAX]; + char opt[CONSOLE_OPT_MAX]; + char brl_opt[CONSOLE_BRL_OPT_MAX]; + u8 has_brl_opt:1; +}; + +/* Updated only at console_setup() time, no locking needed */ +static struct console_option conopt[MAX_CMDLINECONSOLES]; + +/** + * console_opt_save - Saves kernel command line console option for driver use + * @str: Kernel command line console name and option + * @brl_opt: Braille console options + * + * Saves a kernel command line console option for driver subsystems to use for + * adding a preferred console during init. Called from console_setup() only. + * + * Return: 0 on success, negative error code on failure. + */ +int __init console_opt_save(const char *str, const char *brl_opt) +{ + struct console_option *con; + size_t namelen, optlen; + const char *opt; + int i; + + namelen = strcspn(str, ","); + if (namelen == 0 || namelen >= CONSOLE_NAME_MAX) + return -EINVAL; + + opt = str + namelen; + if (*opt == ',') + opt++; + + optlen = strlen(opt); + if (optlen >= CONSOLE_OPT_MAX) + return -EINVAL; + + for (i = 0; i < MAX_CMDLINECONSOLES; i++) { + con = &conopt[i]; + + if (con->name[0]) { + if (!strncmp(str, con->name, namelen)) + return 0; + continue; + } + + /* + * The name isn't terminated, only opt is. Empty opt is fine, + * but brl_opt can be either empty or NULL. For more info, see + * _braille_console_setup(). + */ + strscpy(con->name, str, namelen + 1); + strscpy(con->opt, opt, CONSOLE_OPT_MAX); + if (brl_opt) { + strscpy(con->brl_opt, brl_opt, CONSOLE_BRL_OPT_MAX); + con->has_brl_opt = 1; + } + + return 0; + } + + return -ENOMEM; +} + +static struct console_option *console_opt_find(const char *name) +{ + struct console_option *con; + int i; + + for (i = 0; i < MAX_CMDLINECONSOLES; i++) { + con = &conopt[i]; + if (!strcmp(name, con->name)) + return con; + } + + return NULL; +} + +/** + * add_preferred_console_match - Adds a preferred console if a match is found + * @match: Expected console on kernel command line, such as console=DEVNAME:0.0 + * @name: Name of the console character device to add such as ttyS + * @idx: Index for the console + * + * Allows driver subsystems to add a console after translating the command + * line name to the character device name used for the console. Options are + * added automatically based on the kernel command line. Duplicate preferred + * consoles are ignored by __add_preferred_console(). + * + * Return: 0 on success, negative error code on failure. + */ +int add_preferred_console_match(const char *match, const char *name, + const short idx) +{ + struct console_option *con; + char *brl_opt = NULL; + + if (!match || !strlen(match) || !name || !strlen(name) || + idx < 0) + return -EINVAL; + + con = console_opt_find(match); + if (!con) + return -ENOENT; + + /* + * See __add_preferred_console(). It checks for NULL brl_options to set + * the preferred_console flag. Empty brl_opt instead of NULL leads into + * the preferred_console flag not set, and CON_CONSDEV not being set, + * and the boot console won't get disabled at the end of console_setup(). + */ + if (con->has_brl_opt) + brl_opt = con->brl_opt; + + console_opt_add_preferred_console(name, idx, con->opt, brl_opt); + + return 0; +} diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 3ca74ad391d6..a125e0235589 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -2,6 +2,12 @@ #ifndef _CONSOLE_CMDLINE_H #define _CONSOLE_CMDLINE_H +#define MAX_CMDLINECONSOLES 8 + +int console_opt_save(const char *str, const char *brl_opt); +int console_opt_add_preferred_console(const char *name, const short idx, + char *options, char *brl_options); + struct console_cmdline { char name[16]; /* Name of the driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index adf99c05adca..05919491c5a2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -383,9 +383,6 @@ static int console_locked; /* * Array of consoles built from command line options (console=) */ - -#define MAX_CMDLINECONSOLES 8 - static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; @@ -2503,6 +2500,10 @@ static int __init console_setup(char *str) if (_braille_console_setup(&str, &brl_options)) return 1; + /* Save the console for driver subsystem use */ + if (console_opt_save(str, brl_options)) + return 1; + /* * Decode str into name, index, options. */ @@ -2533,6 +2534,13 @@ static int __init console_setup(char *str) } __setup("console=", console_setup); +/* Only called from add_preferred_console_match() */ +int console_opt_add_preferred_console(const char *name, const short idx, + char *options, char *brl_options) +{ + return __add_preferred_console(name, idx, options, brl_options, true); +} + /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name -- cgit v1.2.3 From 8a831c584e6e80cf68f79893dc395c16cdf47dc8 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 27 Mar 2024 12:59:36 +0200 Subject: printk: Don't try to parse DEVNAME:0.0 console options Currently console_setup() tries to make a console index out of any digits passed in the kernel command line for console. In the DEVNAME:0.0 case, the name can contain a device IO address, so bail out on console names with a ':'. Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20240327110021.59793-3-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 05919491c5a2..67937e5c6010 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2504,6 +2504,10 @@ static int __init console_setup(char *str) if (console_opt_save(str, brl_options)) return 1; + /* Don't attempt to parse a DEVNAME:0.0 style console */ + if (strchr(str, ':')) + return 1; + /* * Decode str into name, index, options. */ -- cgit v1.2.3 From b73c9cbe4f1fc02645228aa575998dd54067f8ef Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 27 Mar 2024 12:59:37 +0200 Subject: printk: Flag register_console() if console is set on command line If add_preferred_console() is not called early in setup_console(), we can end up having register_console() call try_enable_default_console() before a console device has called add_preferred_console(). Let's set console_set_on_cmdline flag in console_setup() to prevent this from happening. Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20240327110021.59793-4-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 67937e5c6010..1e3b21682547 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2504,6 +2504,9 @@ static int __init console_setup(char *str) if (console_opt_save(str, brl_options)) return 1; + /* Flag register_console() to not call try_enable_default_console() */ + console_set_on_cmdline = 1; + /* Don't attempt to parse a DEVNAME:0.0 style console */ if (strchr(str, ':')) return 1; @@ -3507,7 +3510,7 @@ void register_console(struct console *newcon) * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ - if (preferred_console < 0) { + if (preferred_console < 0 && !console_set_on_cmdline) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); -- cgit v1.2.3 From d503a04f8bc0c75dc9db9452d8cc79d748afb752 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Apr 2024 16:11:33 -0700 Subject: bpf: Add support for certain atomics in bpf_arena to x86 JIT Support atomics in bpf_arena that can be JITed as a single x86 instruction. Instructions that are JITed as loops are not supported at the moment, since they require more complex extable and loop logic. JITs can choose to do smarter things with bpf_jit_supports_insn(). Like arm64 may decide to support all bpf atomics instructions when emit_lse_atomic is available and none in ll_sc mode. bpf_jit_supports_percpu_insn(), bpf_jit_supports_ptr_xchg() and other such callbacks can be replaced with bpf_jit_supports_insn() in the future. Signed-off-by: Alexei Starovoitov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240405231134.17274-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- arch/x86/net/bpf_jit_comp.c | 72 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/filter.h | 4 +++ kernel/bpf/core.c | 5 ++++ kernel/bpf/verifier.c | 19 +++++++++++- 4 files changed, 99 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6cf9a5697c09..2b5a475c4dd0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1172,6 +1172,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } +static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + EMIT1(0xF0); /* lock prefix */ + switch (size) { + case BPF_W: + EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg)); + break; + case BPF_DW: + EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); + break; + default: + pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + return -EFAULT; + } + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock *(u32/u64*)(dst_reg + idx_reg + off) = src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off); + *pprog = prog; + return 0; +} + #define DONT_CLEAR 1 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) @@ -1982,6 +2030,15 @@ populate_extable: return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + start_of_ldx = prog; + err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, insn->off); + if (err) + return err; + goto populate_extable; + /* call */ case BPF_JMP | BPF_CALL: { int offs; @@ -3486,6 +3543,21 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) + return false; + } + return true; +} + bool bpf_jit_supports_ptr_xchg(void) { return true; diff --git a/include/linux/filter.h b/include/linux/filter.h index 161d5f7b64ed..7a27f19bf44d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -75,6 +75,9 @@ struct ctl_table_header; /* unused opcode to mark special load instruction. Same as BPF_MSH */ #define BPF_PROBE_MEM32 0xa0 +/* unused opcode to mark special atomic instruction */ +#define BPF_PROBE_ATOMIC 0xe0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 @@ -997,6 +1000,7 @@ bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7a33a3a7e63c..a41718eaeefe 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2965,6 +2965,11 @@ bool __weak bpf_jit_supports_arena(void) return false; } +bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 590db4e4c071..2aad6d90550f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6970,6 +6970,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_missmatch); + static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int load_reg; @@ -7030,7 +7033,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg) || - is_arena_reg(env, insn->dst_reg)) { + (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7066,6 +7069,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (is_arena_reg(env, insn->dst_reg)) { + err = save_aux_ptr_type(env, PTR_TO_ARENA, false); + if (err) + return err; + } /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); @@ -18955,6 +18963,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && + env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { + insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; } else { continue; } @@ -19226,6 +19240,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) BPF_CLASS(insn->code) == BPF_ST) && BPF_MODE(insn->code) == BPF_PROBE_MEM32) num_exentries++; + if (BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; -- cgit v1.2.3 From 699c23f02c65cbfc3e638f14ce0d70c23a2e1f02 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:27 -0700 Subject: bpf: Add bpf_link support for sk_msg and sk_skb progs Add bpf_link support for sk_msg and sk_skb programs. We have an internal request to support bpf_link for sk_msg programs so user space can have a uniform handling with bpf_link based libbpf APIs. Using bpf_link based libbpf API also has a benefit which makes system robust by decoupling prog life cycle and attachment life cycle. Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043527.3737160-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 + include/linux/skmsg.h | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/syscall.c | 4 + net/core/sock_map.c | 263 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 271 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 62762390c93d..5034c1b4ded7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2996,6 +2996,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); @@ -3094,6 +3095,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..9c8dd4c01412 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,10 @@ struct sk_psock_progs { struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; + struct bpf_link *msg_parser_link; + struct bpf_link *stream_parser_link; + struct bpf_link *stream_verdict_link; + struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fe9f11c8abe..cee0a7915c08 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -6724,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e44c276e8617..7d392ec83655 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f65e..63c016b4c169 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -1454,55 +1466,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1527,7 +1568,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1657,6 +1698,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. + */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6fe9f11c8abe..cee0a7915c08 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -6724,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From 693f75b91a9171e99f84fc193e39f48e21ba4a4f Mon Sep 17 00:00:00 2001 From: Sreenath Vijayan Date: Wed, 13 Mar 2024 15:50:52 +0530 Subject: printk: Add function to replay kernel log on consoles Add a generic function console_replay_all() for replaying the kernel log on consoles, in any context. It would allow viewing the logs on an unresponsive terminal via sysrq. Reuse the existing code from console_flush_on_panic() for resetting the sequence numbers, by introducing a new helper function __console_rewind_all(). It is safe to be called under console_lock(). Try to acquire lock on the console subsystem without waiting. If successful, reset the sequence number to oldest available record on all consoles and call console_unlock() which will automatically flush the messages to the consoles. Suggested-by: John Ogness Suggested-by: Petr Mladek Signed-off-by: Shimoyashiki Taichi Reviewed-by: John Ogness Signed-off-by: Sreenath Vijayan Link: https://lore.kernel.org/r/90ee131c643a5033d117b556c0792de65129d4c3.1710220326.git.sreenath.vijayan@sony.com Signed-off-by: Greg Kroah-Hartman --- include/linux/printk.h | 4 +++ kernel/printk/printk.c | 77 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 5e038e782256..1c5936ab991f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -195,6 +195,7 @@ void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); +void console_replay_all(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -274,6 +275,9 @@ static inline void dump_stack(void) static inline void printk_trigger_flush(void) { } +static inline void console_replay_all(void) +{ +} #endif bool this_cpu_in_panic(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1e3b21682547..6206804d275b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3161,6 +3161,40 @@ void console_unblank(void) pr_flush(1000, true); } +/* + * Rewind all consoles to the oldest available record. + * + * IMPORTANT: The function is safe only when called under + * console_lock(). It is not enforced because + * it is used as a best effort in panic(). + */ +static void __console_rewind_all(void) +{ + struct console *c; + short flags; + int cookie; + u64 seq; + + seq = prb_first_valid_seq(prb); + + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + flags = console_srcu_read_flags(c); + + if (flags & CON_NBCON) { + nbcon_seq_force(c, seq); + } else { + /* + * This assignment is safe only when called under + * console_lock(). On panic, legacy consoles are + * only best effort. + */ + c->seq = seq; + } + } + console_srcu_read_unlock(cookie); +} + /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones @@ -3189,30 +3223,8 @@ void console_flush_on_panic(enum con_flush_mode mode) */ console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) { - struct console *c; - short flags; - int cookie; - u64 seq; - - seq = prb_first_valid_seq(prb); - - cookie = console_srcu_read_lock(); - for_each_console_srcu(c) { - flags = console_srcu_read_flags(c); - - if (flags & CON_NBCON) { - nbcon_seq_force(c, seq); - } else { - /* - * This is an unsynchronized assignment. On - * panic legacy consoles are only best effort. - */ - c->seq = seq; - } - } - console_srcu_read_unlock(cookie); - } + if (mode == CONSOLE_REPLAY_ALL) + __console_rewind_all(); console_flush_all(false, &next_seq, &handover); } @@ -4301,6 +4313,23 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); +/** + * console_replay_all - replay kernel log on consoles + * + * Try to obtain lock on console subsystem and replay all + * available records in printk buffer on the consoles. + * Does nothing if lock is not obtained. + * + * Context: Any context. + */ +void console_replay_all(void) +{ + if (console_trylock()) { + __console_rewind_all(); + /* Consoles are flushed as part of console_unlock(). */ + console_unlock(); + } +} #endif #ifdef CONFIG_SMP -- cgit v1.2.3 From 79a34e3d8411050c3c7550c5163d6f9dc41e8f66 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 21 Mar 2024 20:52:47 +0100 Subject: locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail() Use atomic_try_cmpxchg_relaxed(*ptr, &old, new) instead of atomic_cmpxchg_relaxed (*ptr, old, new) == old in xchg_tail(). x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after CMPXCHG. No functional change intended. Since this code requires NR_CPUS >= 16k, I have tested it by unconditionally setting _Q_PENDING_BITS to 1 in . Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240321195309.484275-1-ubizjak@gmail.com --- kernel/locking/qspinlock.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ebe6b8ec7cb3..1df5fef8a656 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) */ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { - u32 old, new, val = atomic_read(&lock->val); + u32 old, new; - for (;;) { - new = (val & _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; /* * We can use relaxed semantics since the caller ensures that * the MCS node is properly initialized before updating the * tail. */ - old = atomic_cmpxchg_relaxed(&lock->val, val, new); - if (old == val) - break; + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - val = old; - } return old; } #endif /* _Q_PENDING_BITS == 8 */ -- cgit v1.2.3 From 66bc1a173328dec3e37c203a999f2a2914c96b56 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 6 Apr 2024 15:52:02 +0200 Subject: treewide: Use sysfs_bin_attr_simple_read() helper Deduplicate ->read() callbacks of bin_attributes which are backed by a simple buffer in memory: Use the newly introduced sysfs_bin_attr_simple_read() helper instead, either by referencing it directly or by declaring such bin_attributes with BIN_ATTR_SIMPLE_RO() or BIN_ATTR_SIMPLE_ADMIN_RO(). Aside from a reduction of LoC, this shaves off a few bytes from vmlinux (304 bytes on an x86_64 allyesconfig). No functional change intended. Signed-off-by: Lukas Wunner Acked-by: Zhi Wang Acked-by: Michael Ellerman Acked-by: Rafael J. Wysocki Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/92ee0a0e83a5a3f3474845db6c8575297698933a.1712410202.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/opal.c | 10 +-------- drivers/acpi/bgrt.c | 9 +------- drivers/firmware/dmi_scan.c | 12 ++-------- drivers/firmware/efi/rci2-table.c | 10 +-------- drivers/gpu/drm/i915/gvt/firmware.c | 26 +++++----------------- .../intel/int340x_thermal/int3400_thermal.c | 9 +------- init/initramfs.c | 10 +-------- kernel/module/sysfs.c | 13 +---------- 8 files changed, 14 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 45dd77e3ccf6..5d0f35bb917e 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -792,14 +792,6 @@ static int __init opal_sysfs_init(void) return 0; } -static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t off, size_t count) -{ - return memory_read_from_buffer(buf, count, &off, bin_attr->private, - bin_attr->size); -} - static int opal_add_one_export(struct kobject *parent, const char *export_name, struct device_node *np, const char *prop_name) { @@ -826,7 +818,7 @@ static int opal_add_one_export(struct kobject *parent, const char *export_name, sysfs_bin_attr_init(attr); attr->attr.name = name; attr->attr.mode = 0400; - attr->read = export_attr_read; + attr->read = sysfs_bin_attr_simple_read; attr->private = __va(vals[0]); attr->size = vals[1]; diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c index e4fb9e225ddf..d1d9c9289087 100644 --- a/drivers/acpi/bgrt.c +++ b/drivers/acpi/bgrt.c @@ -29,14 +29,7 @@ BGRT_SHOW(type, image_type); BGRT_SHOW(xoffset, image_offset_x); BGRT_SHOW(yoffset, image_offset_y); -static ssize_t image_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, loff_t off, size_t count) -{ - memcpy(buf, attr->private + off, count); - return count; -} - -static BIN_ATTR_RO(image, 0); /* size gets filled in later */ +static BIN_ATTR_SIMPLE_RO(image); static struct attribute *bgrt_attributes[] = { &bgrt_attr_version.attr, diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 015c95a825d3..3d0f773ab876 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -746,16 +746,8 @@ static void __init dmi_scan_machine(void) pr_info("DMI not present or invalid.\n"); } -static ssize_t raw_table_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t pos, size_t count) -{ - memcpy(buf, attr->private + pos, count); - return count; -} - -static BIN_ATTR(smbios_entry_point, S_IRUSR, raw_table_read, NULL, 0); -static BIN_ATTR(DMI, S_IRUSR, raw_table_read, NULL, 0); +static BIN_ATTR_SIMPLE_ADMIN_RO(smbios_entry_point); +static BIN_ATTR_SIMPLE_ADMIN_RO(DMI); static int __init dmi_init(void) { diff --git a/drivers/firmware/efi/rci2-table.c b/drivers/firmware/efi/rci2-table.c index de1a9a1f9f14..4fd45d6f69a4 100644 --- a/drivers/firmware/efi/rci2-table.c +++ b/drivers/firmware/efi/rci2-table.c @@ -40,15 +40,7 @@ static u8 *rci2_base; static u32 rci2_table_len; unsigned long rci2_table_phys __ro_after_init = EFI_INVALID_TABLE_ADDR; -static ssize_t raw_table_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t pos, size_t count) -{ - memcpy(buf, attr->private + pos, count); - return count; -} - -static BIN_ATTR(rci2, S_IRUSR, raw_table_read, NULL, 0); +static BIN_ATTR_SIMPLE_ADMIN_RO(rci2); static u16 checksum(void) { diff --git a/drivers/gpu/drm/i915/gvt/firmware.c b/drivers/gpu/drm/i915/gvt/firmware.c index 4dd52ac2043e..5e66a260df12 100644 --- a/drivers/gpu/drm/i915/gvt/firmware.c +++ b/drivers/gpu/drm/i915/gvt/firmware.c @@ -50,21 +50,7 @@ struct gvt_firmware_header { #define dev_to_drm_minor(d) dev_get_drvdata((d)) -static ssize_t -gvt_firmware_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t offset, size_t count) -{ - memcpy(buf, attr->private + offset, count); - return count; -} - -static struct bin_attribute firmware_attr = { - .attr = {.name = "gvt_firmware", .mode = (S_IRUSR)}, - .read = gvt_firmware_read, - .write = NULL, - .mmap = NULL, -}; +static BIN_ATTR_SIMPLE_ADMIN_RO(gvt_firmware); static int expose_firmware_sysfs(struct intel_gvt *gvt) { @@ -107,10 +93,10 @@ static int expose_firmware_sysfs(struct intel_gvt *gvt) crc32_start = offsetof(struct gvt_firmware_header, version); h->crc32 = crc32_le(0, firmware + crc32_start, size - crc32_start); - firmware_attr.size = size; - firmware_attr.private = firmware; + bin_attr_gvt_firmware.size = size; + bin_attr_gvt_firmware.private = firmware; - ret = device_create_bin_file(&pdev->dev, &firmware_attr); + ret = device_create_bin_file(&pdev->dev, &bin_attr_gvt_firmware); if (ret) { vfree(firmware); return ret; @@ -122,8 +108,8 @@ static void clean_firmware_sysfs(struct intel_gvt *gvt) { struct pci_dev *pdev = to_pci_dev(gvt->gt->i915->drm.dev); - device_remove_bin_file(&pdev->dev, &firmware_attr); - vfree(firmware_attr.private); + device_remove_bin_file(&pdev->dev, &bin_attr_gvt_firmware); + vfree(bin_attr_gvt_firmware.private); } /** diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 427d370648d5..6d4b51a122bc 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -73,14 +73,7 @@ struct odvp_attr { struct device_attribute attr; }; -static ssize_t data_vault_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, loff_t off, size_t count) -{ - memcpy(buf, attr->private + off, count); - return count; -} - -static BIN_ATTR_RO(data_vault, 0); +static BIN_ATTR_SIMPLE_RO(data_vault); static struct bin_attribute *data_attributes[] = { &bin_attr_data_vault, diff --git a/init/initramfs.c b/init/initramfs.c index da79760b8be3..5193fae330c3 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -575,15 +575,7 @@ extern unsigned long __initramfs_size; #include #include -static ssize_t raw_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t pos, size_t count) -{ - memcpy(buf, attr->private + pos, count); - return count; -} - -static BIN_ATTR(initrd, 0440, raw_read, NULL, 0); +static BIN_ATTR(initrd, 0440, sysfs_bin_attr_simple_read, NULL, 0); void __init reserve_initrd_mem(void) { diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index d964167c6658..26efe1305c12 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -146,17 +146,6 @@ struct module_notes_attrs { struct bin_attribute attrs[] __counted_by(notes); }; -static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. - */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - static void free_notes_attrs(struct module_notes_attrs *notes_attrs, unsigned int i) { @@ -205,7 +194,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) nattr->attr.mode = 0444; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *)info->sechdrs[i].sh_addr; - nattr->read = module_notes_read; + nattr->read = sysfs_bin_attr_simple_read; ++nattr; } ++loaded; -- cgit v1.2.3 From dfd458a95d78ce31855fe06bbfde4f4fe60c40db Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:04 +0100 Subject: rcu: Add data structures for synchronize_rcu() The synchronize_rcu() call is going to be reworked, thus this patch adds dedicated fields into the rcu_state structure. Reviewed-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df48160b3136..b942b9437438 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -315,6 +315,13 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +#define SR_NORMAL_GP_WAIT_HEAD_MAX 5 + +struct sr_wait_node { + atomic_t inuse; + struct llist_node node; +}; + /* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. The root (first level) @@ -400,6 +407,13 @@ struct rcu_state { /* Synchronize offline with */ /* GP pre-initialization. */ int nocb_is_setup; /* nocb is setup from boot */ + + /* synchronize_rcu() part. */ + struct llist_head srs_next; /* request a GP users. */ + struct llist_node *srs_wait_tail; /* wait for GP users. */ + struct llist_node *srs_done_tail; /* ready for GP users. */ + struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; + struct work_struct srs_cleanup_work; }; /* Values for rcu_state structure's gp_flags field. */ -- cgit v1.2.3 From 02b3c5fcdfe46f9c9dd5d3fc199612b13bf47c06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:25:44 -0800 Subject: tracing: Select new NEED_TASKS_RCU Kconfig option Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. A new NEED_TASKS_RCU Kconfig option has been created to allow this enablement logic to be in one place in kernel/rcu/Kconfig. Therefore, select the new NEED_TASKS_RCU Kconfig option instead of the old TASKS_RCU option. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Cc: Ankur Arora Cc: Thomas Gleixner Acked-by: Mark Rutland Signed-off-by: Uladzislau Rezki (Sony) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61c541c36596..6cdc5ff919b0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -163,7 +163,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU config GENERIC_TRACER bool @@ -204,7 +204,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done -- cgit v1.2.3 From 4cd47222e435dec8e3787614924174f53fcfb5ae Mon Sep 17 00:00:00 2001 From: George Stark Date: Thu, 11 Apr 2024 19:10:25 +0300 Subject: locking/mutex: Introduce devm_mutex_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using of devm API leads to a certain order of releasing resources. So all dependent resources which are not devm-wrapped should be deleted with respect to devm-release order. Mutex is one of such objects that often is bound to other resources and has no own devm wrapping. Since mutex_destroy() actually does nothing in non-debug builds frequently calling mutex_destroy() is just ignored which is safe for now but wrong formally and can lead to a problem if mutex_destroy() will be extended so introduce devm_mutex_init(). Suggested-by: Christophe Leroy Signed-off-by: George Stark Reviewed-by: Christophe Leroy Reviewed-by: Andy Shevchenko Reviewed-by: Marek Behún Acked-by: Waiman Long Link: https://lore.kernel.org/r/20240411161032.609544-2-gnstark@salutedevices.com Signed-off-by: Lee Jones --- include/linux/mutex.h | 27 +++++++++++++++++++++++++++ kernel/locking/mutex-debug.c | 12 ++++++++++++ 2 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 67edc4ca2bee..a561c629d89f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -22,6 +22,8 @@ #include #include +struct device; + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ , .dep_map = { \ @@ -117,6 +119,31 @@ do { \ } while (0) #endif /* CONFIG_PREEMPT_RT */ +#ifdef CONFIG_DEBUG_MUTEXES + +int __devm_mutex_init(struct device *dev, struct mutex *lock); + +#else + +static inline int __devm_mutex_init(struct device *dev, struct mutex *lock) +{ + /* + * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so + * no really need to register it in the devm subsystem. + */ + return 0; +} + +#endif + +#define devm_mutex_init(dev, mutex) \ +({ \ + typeof(mutex) mutex_ = (mutex); \ + \ + mutex_init(mutex_); \ + __devm_mutex_init(dev, mutex_); \ +}) + /* * See kernel/locking/mutex.c for detailed documentation of these APIs. * Also see Documentation/locking/mutex-design.rst. diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index bc8abb8549d2..6e6f6071cfa2 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include @@ -89,6 +90,17 @@ void debug_mutex_init(struct mutex *lock, const char *name, lock->magic = lock; } +static void devm_mutex_release(void *res) +{ + mutex_destroy(res); +} + +int __devm_mutex_init(struct device *dev, struct mutex *lock) +{ + return devm_add_action_or_reset(dev, devm_mutex_release, lock); +} +EXPORT_SYMBOL_GPL(__devm_mutex_init); + /*** * mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed -- cgit v1.2.3 From 58329c4312031603bb1786b44265c26d5065fe72 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 3 Apr 2024 17:36:18 +0800 Subject: padata: Disable BH when taking works lock on MT path As the old padata code can execute in softirq context, disable softirqs for the new padata_do_mutithreaded code too as otherwise lockdep will get antsy. Reported-by: syzbot+0cb5bb0f4bf9e79db3b3@syzkaller.appspotmail.com Signed-off-by: Herbert Xu Acked-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index e3f639ff1670..53f4bc912712 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -106,7 +106,7 @@ static int __init padata_work_alloc_mt(int nworks, void *data, { int i; - spin_lock(&padata_works_lock); + spin_lock_bh(&padata_works_lock); /* Start at 1 because the current task participates in the job. */ for (i = 1; i < nworks; ++i) { struct padata_work *pw = padata_work_alloc(); @@ -116,7 +116,7 @@ static int __init padata_work_alloc_mt(int nworks, void *data, padata_work_init(pw, padata_mt_helper, data, 0); list_add(&pw->pw_list, head); } - spin_unlock(&padata_works_lock); + spin_unlock_bh(&padata_works_lock); return i; } @@ -134,12 +134,12 @@ static void __init padata_works_free(struct list_head *works) if (list_empty(works)) return; - spin_lock(&padata_works_lock); + spin_lock_bh(&padata_works_lock); list_for_each_entry_safe(cur, next, works, pw_list) { list_del(&cur->pw_list); padata_work_free(cur); } - spin_unlock(&padata_works_lock); + spin_unlock_bh(&padata_works_lock); } static void padata_parallel_worker(struct work_struct *parallel_work) -- cgit v1.2.3 From f7842747d13d9f4eedba9edec5b834c9d9237717 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 5 Apr 2024 07:58:15 -0400 Subject: mm: replace set_pte_at_notify() with just set_pte_at() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the demise of the .change_pte() MMU notifier callback, there is no notification happening in set_pte_at_notify(). It is a synonym of set_pte_at() and can be replaced with it. Signed-off-by: Paolo Bonzini Reviewed-by: David Hildenbrand Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240405115815.3226315-5-pbonzini@redhat.com> Acked-by: Andrew Morton Signed-off-by: Paolo Bonzini --- include/linux/mmu_notifier.h | 2 -- kernel/events/uprobes.c | 6 +++--- mm/ksm.c | 4 ++-- mm/memory.c | 7 +------ mm/migrate_device.c | 8 ++------ 5 files changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 8c72bf651606..d39ebb10caeb 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -657,6 +657,4 @@ static inline void mmu_notifier_synchronize(void) #endif /* CONFIG_MMU_NOTIFIER */ -#define set_pte_at_notify set_pte_at - #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..1215bc299390 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -18,7 +18,7 @@ #include #include #include /* anon_vma_prepare */ -#include /* set_pte_at_notify */ +#include #include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ @@ -195,8 +195,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) diff --git a/mm/ksm.c b/mm/ksm.c index 8c001819cf10..108a4d167824 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1345,7 +1345,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_write(entry)) entry = pte_wrprotect(entry); - set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; @@ -1447,7 +1447,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, newpte); + set_pte_at(mm, addr, ptep, newpte); folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); diff --git a/mm/memory.c b/mm/memory.c index f2bc6dd15eb8..9a6f4d8aa379 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3327,13 +3327,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); folio_add_lru_vma(new_folio, vma); - /* - * We call the notify macro here because, when using secondary - * mmu page tables (such as kvm shadow page tables), we want the - * new page to be mapped directly into the secondary page table. - */ BUG_ON(unshare && pte_write(entry)); - set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b6c27c76e1a0..66206734b1b9 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -664,13 +664,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); } + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); pte_unmap_unlock(ptep, ptl); *src = MIGRATE_PFN_MIGRATE; -- cgit v1.2.3 From 1e52af7f023c44859868306fac1a4b6b556cf47b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Feb 2024 11:17:28 -0800 Subject: bpf: Choose RCU Tasks based on TASKS_RCU rather than PREEMPTION The advent of CONFIG_PREEMPT_AUTO, AKA lazy preemption, will mean that even kernels built with CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY might see the occasional preemption, and that this preemption just might happen within a trampoline. Therefore, update bpf_tramp_image_put() to choose call_rcu_tasks() based on CONFIG_TASKS_RCU instead of CONFIG_PREEMPTION. This change might enable further simplifications, but the goal of this effort is to make the code safe, not necessarily optimal. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: John Fastabend Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Ankur Arora Cc: Thomas Gleixner Cc: Signed-off-by: Uladzislau Rezki (Sony) --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index db7599c59c78..88673a4267eb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -333,7 +333,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, NULL, im->ip_epilogue); WARN_ON(err); - if (IS_ENABLED(CONFIG_PREEMPTION)) + if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); else percpu_ref_kill(&im->pcref); -- cgit v1.2.3 From 64ec8b6ad61997262fb3373970377f5fb709454b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Feb 2024 11:30:43 -0800 Subject: ftrace: Choose RCU Tasks based on TASKS_RCU rather than PREEMPTION The advent of CONFIG_PREEMPT_AUTO, AKA lazy preemption, will mean that even kernels built with CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY might see the occasional preemption, and that this preemption just might happen within a trampoline. Therefore, update ftrace_shutdown() to invoke synchronize_rcu_tasks() based on CONFIG_TASKS_RCU instead of CONFIG_PREEMPTION. [ paulmck: Apply Steven Rostedt feedback. ] Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Ankur Arora Cc: Thomas Gleixner Cc: Signed-off-by: Uladzislau Rezki (Sony) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da1710499698..6c96b30f3d63 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3157,8 +3157,7 @@ out: * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - synchronize_rcu_tasks(); + synchronize_rcu_tasks(); ftrace_trampoline_free(ops); } -- cgit v1.2.3 From 6a97734f2222e0352f1900e3eb3167e9069b91bb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 25 Mar 2024 15:09:32 +0100 Subject: locking/pvqspinlock: Use try_cmpxchg_acquire() in trylock_clear_pending() Replace this pattern in trylock_clear_pending(): cmpxchg_acquire(*ptr, old, new) == old ... with the simpler and faster: try_cmpxchg_acquire(*ptr, &old, new) The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after the CMPXCHG. Also change the return type of the function to bool and streamline the control flow in the _Q_PENDING_BITS == 8 variant a bit. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Reviewed-by: Linus Torvalds Link: https://lore.kernel.org/r/20240325140943.815051-1-ubizjak@gmail.com --- kernel/locking/qspinlock_paravirt.h | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 169950fe1aad..77ba80bd95f9 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -116,11 +116,12 @@ static __always_inline void set_pending(struct qspinlock *lock) * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the * lock just to be sure that it will get it. */ -static __always_inline int trylock_clear_pending(struct qspinlock *lock) +static __always_inline bool trylock_clear_pending(struct qspinlock *lock) { + u16 old = _Q_PENDING_VAL; + return !READ_ONCE(lock->locked) && - (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL, - _Q_LOCKED_VAL) == _Q_PENDING_VAL); + try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL); } #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) @@ -128,27 +129,21 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline int trylock_clear_pending(struct qspinlock *lock) +static __always_inline bool trylock_clear_pending(struct qspinlock *lock) { - int val = atomic_read(&lock->val); - - for (;;) { - int old, new; - - if (val & _Q_LOCKED_MASK) - break; + int old, new; + old = atomic_read(&lock->val); + do { + if (old & _Q_LOCKED_MASK) + return false; /* * Try to clear pending bit & set locked bit */ - old = val; - new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - val = atomic_cmpxchg_acquire(&lock->val, old, new); + new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new)); - if (val == old) - return 1; - } - return 0; + return true; } #endif /* _Q_PENDING_BITS == 8 */ -- cgit v1.2.3 From fea0e1820b51fff95c85518eb9cf3386f367908d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 11 Apr 2024 21:22:55 +0200 Subject: locking/pvqspinlock: Use try_cmpxchg() in qspinlock_paravirt.h Use try_cmpxchg(*ptr, &old, new) instead of cmpxchg(*ptr, old, new) == old in qspinlock_paravirt.h x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240411192317.25432-2-ubizjak@gmail.com --- kernel/locking/qspinlock_paravirt.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 77ba80bd95f9..f5a36e67b593 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -86,9 +86,10 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) */ for (;;) { int val = atomic_read(&lock->val); + u8 old = 0; if (!(val & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { + try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) { lockevent_inc(pv_lock_stealing); return true; } @@ -211,8 +212,9 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + struct qspinlock *old = NULL; hopcnt++; - if (!cmpxchg(&he->lock, NULL, lock)) { + if (try_cmpxchg(&he->lock, &old, lock)) { WRITE_ONCE(he->node, node); lockevent_pv_hop(hopcnt); return &he->lock; @@ -355,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - + enum vcpu_state old = vcpu_halted; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will @@ -372,8 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * subsequent writes. */ smp_mb__before_atomic(); - if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed) - != vcpu_halted) + if (!try_cmpxchg_relaxed(&pn->state, &old, vcpu_hashed)) return; /* @@ -541,15 +542,14 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #ifndef __pv_queued_spin_unlock __visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { - u8 locked; + u8 locked = _Q_LOCKED_VAL; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) + if (try_cmpxchg_release(&lock->locked, &locked, 0)) return; __pv_queued_spin_unlock_slowpath(lock, locked); -- cgit v1.2.3 From 86d2a2f51fbada84e377665df06b5a479a1edc99 Mon Sep 17 00:00:00 2001 From: Bitao Hu Date: Thu, 11 Apr 2024 15:41:30 +0800 Subject: genirq: Convert kstat_irqs to a struct The irq_desc::kstat_irqs member is a per-CPU variable of type int, which is only capable of counting. A snapshot mechanism for interrupt statistics will be added soon, which requires an additional variable to store the snapshot. To facilitate expansion, convert kstat_irqs here to a struct containing only the count. Originally-by: Thomas Gleixner Signed-off-by: Bitao Hu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240411074134.30922-2-yaoma@linux.alibaba.com --- arch/mips/dec/setup.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- include/linux/irqdesc.h | 12 ++++++++++-- kernel/irq/internals.h | 2 +- kernel/irq/irqdesc.c | 9 ++++----- kernel/irq/proc.c | 5 ++--- scripts/gdb/linux/interrupts.py | 6 +++--- 8 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c index 6c3704f51d0d..87f0a1436bf9 100644 --- a/arch/mips/dec/setup.c +++ b/arch/mips/dec/setup.c @@ -756,7 +756,7 @@ void __init arch_init_irq(void) NULL)) pr_err("Failed to register fpu interrupt\n"); desc_fpu = irq_to_desc(irq_fpu); - fpu_kstat_irq = this_cpu_ptr(desc_fpu->kstat_irqs); + fpu_kstat_irq = this_cpu_ptr(&desc_fpu->kstat_irqs->cnt); } if (dec_interrupt[DEC_IRQ_CASCADE] >= 0) { if (request_irq(dec_interrupt[DEC_IRQ_CASCADE], no_action, diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 444154271f23..800eb64e91ad 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -344,7 +344,7 @@ static int smp_boot_one_cpu(int cpuid, struct task_struct *idle) struct irq_desc *desc = irq_to_desc(i); if (desc && desc->kstat_irqs) - *per_cpu_ptr(desc->kstat_irqs, cpuid) = 0; + *per_cpu_ptr(desc->kstat_irqs, cpuid) = (struct irqstat) { }; } #endif diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index e42984878503..f2636414d82a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -837,7 +837,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr) */ static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) { - this_cpu_inc_rm(desc->kstat_irqs); + this_cpu_inc_rm(&desc->kstat_irqs->cnt); __this_cpu_inc(kstat.irqs_sum); } diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index d9451d456a73..c28612674acb 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -17,6 +17,14 @@ struct irq_desc; struct irq_domain; struct pt_regs; +/** + * struct irqstat - interrupt statistics + * @cnt: real-time interrupt count + */ +struct irqstat { + unsigned int cnt; +}; + /** * struct irq_desc - interrupt descriptor * @irq_common_data: per irq and chip data passed down to chip functions @@ -55,7 +63,7 @@ struct pt_regs; struct irq_desc { struct irq_common_data irq_common_data; struct irq_data irq_data; - unsigned int __percpu *kstat_irqs; + struct irqstat __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status_use_accessors; @@ -119,7 +127,7 @@ extern struct irq_desc irq_desc[NR_IRQS]; static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, unsigned int cpu) { - return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; + return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bcc7f21db9ee..1d92532c2aae 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -258,7 +258,7 @@ static inline void irq_state_set_masked(struct irq_desc *desc) static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { - __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(desc->kstat_irqs->cnt); __this_cpu_inc(kstat.irqs_sum); } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4c6b32318ce3..b59b79200ad7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -134,7 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) - *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; + *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; desc_smp_init(desc, node, affinity); } @@ -186,7 +186,7 @@ static int init_desc(struct irq_desc *desc, int irq, int node, const struct cpumask *affinity, struct module *owner) { - desc->kstat_irqs = alloc_percpu(unsigned int); + desc->kstat_irqs = alloc_percpu(struct irqstat); if (!desc->kstat_irqs) return -ENOMEM; @@ -968,8 +968,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc && desc->kstat_irqs ? - *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; + return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static bool irq_is_nmi(struct irq_desc *desc) @@ -991,7 +990,7 @@ static unsigned int kstat_irqs(unsigned int irq) return data_race(desc->tot_count); for_each_possible_cpu(cpu) - sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu)); + sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 623b8136e9af..6954e0a02047 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -490,7 +490,7 @@ int show_interrupts(struct seq_file *p, void *v) if (desc->kstat_irqs) { for_each_online_cpu(j) - any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j)); + any_count |= data_race(per_cpu(desc->kstat_irqs->cnt, j)); } if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) @@ -498,8 +498,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) - seq_printf(p, "%10u ", desc->kstat_irqs ? - *per_cpu_ptr(desc->kstat_irqs, j) : 0); + seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0); raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { diff --git a/scripts/gdb/linux/interrupts.py b/scripts/gdb/linux/interrupts.py index 66ae5c7690cf..616a5f26377a 100644 --- a/scripts/gdb/linux/interrupts.py +++ b/scripts/gdb/linux/interrupts.py @@ -37,7 +37,7 @@ def show_irq_desc(prec, irq): any_count = 0 if desc['kstat_irqs']: for cpu in cpus.each_online_cpu(): - any_count += cpus.per_cpu(desc['kstat_irqs'], cpu) + any_count += cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt'] if (desc['action'] == 0 or irq_desc_is_chained(desc)) and any_count == 0: return text; @@ -45,7 +45,7 @@ def show_irq_desc(prec, irq): text += "%*d: " % (prec, irq) for cpu in cpus.each_online_cpu(): if desc['kstat_irqs']: - count = cpus.per_cpu(desc['kstat_irqs'], cpu) + count = cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt'] else: count = 0 text += "%10u" % (count) @@ -177,7 +177,7 @@ def arm_common_show_interrupts(prec): if desc == 0: continue for cpu in cpus.each_online_cpu(): - text += "%10u" % (cpus.per_cpu(desc['kstat_irqs'], cpu)) + text += "%10u" % (cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt']) text += " %s" % (ipi_types[ipi].string()) text += "\n" return text -- cgit v1.2.3 From 99cf63c56661be0a0c42f79b56f37a4aa34b4779 Mon Sep 17 00:00:00 2001 From: Bitao Hu Date: Thu, 11 Apr 2024 15:41:31 +0800 Subject: genirq: Provide a snapshot mechanism for interrupt statistics The soft lockup detector lacks a mechanism to identify interrupt storms as root cause of a lockup. To enable this the detector needs a mechanism to snapshot the interrupt count statistics on a CPU when the detector observes a potential lockup scenario and compare that against the interrupt count when it warns about the lockup later on. The number of interrupts in that period give a hint whether the lockup might have been caused by an interrupt storm. Instead of having extra storage in the lockup detector and accessing the internals of the interrupt descriptor directly, add a snapshot member to the per CPU irq_desc::kstat_irq structure and provide interfaces to take a snapshot of all interrupts on the current CPU and to retrieve the delta of a specific interrupt later on. Originally-by: Thomas Gleixner Signed-off-by: Bitao Hu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240411074134.30922-3-yaoma@linux.alibaba.com --- include/linux/irqdesc.h | 4 ++++ include/linux/kernel_stat.h | 8 ++++++++ kernel/irq/Kconfig | 4 ++++ kernel/irq/irqdesc.c | 25 +++++++++++++++++++++++++ 4 files changed, 41 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index c28612674acb..fd091c35d572 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -20,9 +20,13 @@ struct pt_regs; /** * struct irqstat - interrupt statistics * @cnt: real-time interrupt count + * @ref: snapshot of interrupt count */ struct irqstat { unsigned int cnt; +#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT + unsigned int ref; +#endif }; /** diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 9935f7ecbfb9..9c042c6384bb 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -79,6 +79,14 @@ static inline unsigned int kstat_cpu_softirqs_sum(int cpu) return sum; } +#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT +extern void kstat_snapshot_irqs(void); +extern unsigned int kstat_get_irq_since_snapshot(unsigned int irq); +#else +static inline void kstat_snapshot_irqs(void) { } +static inline unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { return 0; } +#endif + /* * Number of interrupts per specific IRQ source, since bootup */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 2531f3496ab6..529adb1f5859 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -108,6 +108,10 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR config GENERIC_IRQ_RESERVATION_MODE bool +# Snapshot for interrupt statistics +config GENERIC_IRQ_STAT_SNAPSHOT + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b59b79200ad7..f348faffa7b4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -994,6 +994,31 @@ static unsigned int kstat_irqs(unsigned int irq) return sum; } +#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT + +void kstat_snapshot_irqs(void) +{ + struct irq_desc *desc; + unsigned int irq; + + for_each_irq_desc(irq, desc) { + if (!desc->kstat_irqs) + continue; + this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt)); + } +} + +unsigned int kstat_get_irq_since_snapshot(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !desc->kstat_irqs) + return 0; + return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref); +} + +#endif + /** * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number -- cgit v1.2.3 From 25a4a015118037809c97d089d69e927737e589e1 Mon Sep 17 00:00:00 2001 From: Bitao Hu Date: Thu, 11 Apr 2024 15:41:32 +0800 Subject: genirq: Avoid summation loops for /proc/interrupts show_interrupts() unconditionally accumulates the per CPU interrupt statistics to determine whether an interrupt was ever raised. This can be avoided for all interrupts which are not strictly per CPU and not of type NMI because those interrupts provide already an accumulated counter. The required logic is already implemented in kstat_irqs(). Split the inner access logic out of kstat_irqs() and use it for kstat_irqs() and show_interrupts() to avoid the accumulation loop when possible. Originally-by: Thomas Gleixner Signed-off-by: Bitao Hu Signed-off-by: Thomas Gleixner Reviewed-by: Liu Song Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20240411074134.30922-4-yaoma@linux.alibaba.com --- kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 16 +++++++++++----- kernel/irq/proc.c | 6 ++---- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1d92532c2aae..6c43ef3e7308 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -98,6 +98,8 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); +extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); + #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f348faffa7b4..382093196210 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -976,24 +976,30 @@ static bool irq_is_nmi(struct irq_desc *desc) return desc->istate & IRQS_NMI; } -static unsigned int kstat_irqs(unsigned int irq) +unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { - struct irq_desc *desc = irq_to_desc(irq); unsigned int sum = 0; int cpu; - if (!desc || !desc->kstat_irqs) - return 0; if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) return data_race(desc->tot_count); - for_each_possible_cpu(cpu) + for_each_cpu(cpu, cpumask) sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } +static unsigned int kstat_irqs(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !desc->kstat_irqs) + return 0; + return kstat_irqs_desc(desc, cpu_possible_mask); +} + #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT void kstat_snapshot_irqs(void) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6954e0a02047..5c320c3f10a7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -488,10 +488,8 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc || irq_settings_is_hidden(desc)) goto outsparse; - if (desc->kstat_irqs) { - for_each_online_cpu(j) - any_count |= data_race(per_cpu(desc->kstat_irqs->cnt, j)); - } + if (desc->kstat_irqs) + any_count = kstat_irqs_desc(desc, cpu_online_mask); if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) goto outsparse; -- cgit v1.2.3 From d7037381d00286aa4beb631c401da761ee564c94 Mon Sep 17 00:00:00 2001 From: Bitao Hu Date: Thu, 11 Apr 2024 15:41:33 +0800 Subject: watchdog/softlockup: Low-overhead detection of interrupt storm The following softlockup is caused by interrupt storm, but it cannot be identified from the call tree. Because the call tree is just a snapshot and doesn't fully capture the behavior of the CPU during the soft lockup. watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921] ... Call trace: __do_softirq+0xa0/0x37c __irq_exit_rcu+0x108/0x140 irq_exit+0x14/0x20 __handle_domain_irq+0x84/0xe0 gic_handle_irq+0x80/0x108 el0_irq_naked+0x50/0x58 Therefore, it is necessary to report CPU utilization during the softlockup_threshold period (report once every sample_period, for a total of 5 reportings), like this: watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921] CPU#28 Utilization every 4s during lockup: #1: 0% system, 0% softirq, 100% hardirq, 0% idle #2: 0% system, 0% softirq, 100% hardirq, 0% idle #3: 0% system, 0% softirq, 100% hardirq, 0% idle #4: 0% system, 0% softirq, 100% hardirq, 0% idle #5: 0% system, 0% softirq, 100% hardirq, 0% idle ... This is helpful in determining whether an interrupt storm has occurred or in identifying the cause of the softlockup. The criteria for determination are as follows: a. If the hardirq utilization is high, then interrupt storm should be considered and the root cause cannot be determined from the call tree. b. If the softirq utilization is high, then the call might not necessarily point at the root cause. c. If the system utilization is high, then analyzing the root cause from the call tree is possible in most cases. The mechanism requires a considerable amount of global storage space when configured for the maximum number of CPUs. Therefore, adding a SOFTLOCKUP_DETECTOR_INTR_STORM Kconfig knob that defaults to "yes" if the max number of CPUs is <= 128. Signed-off-by: Bitao Hu Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Reviewed-by: Liu Song Link: https://lore.kernel.org/r/20240411074134.30922-5-yaoma@linux.alibaba.com --- kernel/watchdog.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- lib/Kconfig.debug | 14 ++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d7b2125503af..ef8ebd31fdab 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +37,8 @@ static DEFINE_MUTEX(watchdog_mutex); # define WATCHDOG_HARDLOCKUP_DEFAULT 0 #endif +#define NUM_SAMPLE_PERIODS 5 + unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; @@ -333,6 +337,96 @@ __setup("watchdog_thresh=", watchdog_thresh_setup); static void __lockup_detector_cleanup(void); +#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM +enum stats_per_group { + STATS_SYSTEM, + STATS_SOFTIRQ, + STATS_HARDIRQ, + STATS_IDLE, + NUM_STATS_PER_GROUP, +}; + +static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = { + CPUTIME_SYSTEM, + CPUTIME_SOFTIRQ, + CPUTIME_IRQ, + CPUTIME_IDLE, +}; + +static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]); +static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]); +static DEFINE_PER_CPU(u8, cpustat_tail); + +/* + * We don't need nanosecond resolution. A granularity of 16ms is + * sufficient for our precision, allowing us to use u16 to store + * cpustats, which will roll over roughly every ~1000 seconds. + * 2^24 ~= 16 * 10^6 + */ +static u16 get_16bit_precision(u64 data_ns) +{ + return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ +} + +static void update_cpustat(void) +{ + int i; + u8 util; + u16 old_stat, new_stat; + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u8 tail = __this_cpu_read(cpustat_tail); + u16 sample_period_16 = get_16bit_precision(sample_period); + + kcpustat_cpu_fetch(&kcpustat, smp_processor_id()); + + for (i = 0; i < NUM_STATS_PER_GROUP; i++) { + old_stat = __this_cpu_read(cpustat_old[i]); + new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); + util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); + __this_cpu_write(cpustat_util[tail][i], util); + __this_cpu_write(cpustat_old[i], new_stat); + } + + __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS); +} + +static void print_cpustat(void) +{ + int i, group; + u8 tail = __this_cpu_read(cpustat_tail); + u64 sample_period_second = sample_period; + + do_div(sample_period_second, NSEC_PER_SEC); + + /* + * Outputting the "watchdog" prefix on every line is redundant and not + * concise, and the original alarm information is sufficient for + * positioning in logs, hence here printk() is used instead of pr_crit(). + */ + printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", + smp_processor_id(), sample_period_second); + + for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { + group = (tail + i) % NUM_SAMPLE_PERIODS; + printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t" + "%3u%% hardirq,\t%3u%% idle\n", i + 1, + __this_cpu_read(cpustat_util[group][STATS_SYSTEM]), + __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]), + __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]), + __this_cpu_read(cpustat_util[group][STATS_IDLE])); + } +} + +static void report_cpu_status(void) +{ + print_cpustat(); +} +#else +static inline void update_cpustat(void) { } +static inline void report_cpu_status(void) { } +#endif + /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally @@ -364,7 +458,7 @@ static void set_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS); watchdog_update_hrtimer_threshold(sample_period); } @@ -504,6 +598,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts)); + update_cpustat(); + /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { @@ -539,6 +635,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + report_cpu_status(); print_modules(); print_irqtrace_events(current); if (regs) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c63a5fbf1f1c..1f0ce712a671 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1029,6 +1029,20 @@ config SOFTLOCKUP_DETECTOR chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config SOFTLOCKUP_DETECTOR_INTR_STORM + bool "Detect Interrupt Storm in Soft Lockups" + depends on SOFTLOCKUP_DETECTOR && IRQ_TIME_ACCOUNTING + select GENERIC_IRQ_STAT_SNAPSHOT + default y if NR_CPUS <= 128 + help + Say Y here to enable the kernel to detect interrupt storm + during "soft lockups". + + "soft lockups" can be caused by a variety of reasons. If one is + caused by an interrupt storm, then the storming interrupts will not + be on the callstack. To detect this case, it is necessary to report + the CPU stats and the interrupt counts during the "soft lockups". + config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" depends on SOFTLOCKUP_DETECTOR -- cgit v1.2.3 From e9a9292e2368e9be4a48aae6ff8aafa3433133e6 Mon Sep 17 00:00:00 2001 From: Bitao Hu Date: Thu, 11 Apr 2024 15:41:34 +0800 Subject: watchdog/softlockup: Report the most frequent interrupts When the watchdog determines that the current soft lockup is due to an interrupt storm based on CPU utilization, reporting the most frequent interrupts could be good enough for further troubleshooting. Below is an example of interrupt storm. The call tree does not provide useful information, but analyzing which interrupt caused the soft lockup by comparing the counts of interrupts during the lockup period allows to identify the culprit. [ 638.870231] watchdog: BUG: soft lockup - CPU#9 stuck for 26s! [swapper/9:0] [ 638.870825] CPU#9 Utilization every 4s during lockup: [ 638.871194] #1: 0% system, 0% softirq, 100% hardirq, 0% idle [ 638.871652] #2: 0% system, 0% softirq, 100% hardirq, 0% idle [ 638.872107] #3: 0% system, 0% softirq, 100% hardirq, 0% idle [ 638.872563] #4: 0% system, 0% softirq, 100% hardirq, 0% idle [ 638.873018] #5: 0% system, 0% softirq, 100% hardirq, 0% idle [ 638.873494] CPU#9 Detect HardIRQ Time exceeds 50%. Most frequent HardIRQs: [ 638.873994] #1: 330945 irq#7 [ 638.874236] #2: 31 irq#82 [ 638.874493] #3: 10 irq#10 [ 638.874744] #4: 2 irq#89 [ 638.874992] #5: 1 irq#102 ... [ 638.875313] Call trace: [ 638.875315] __do_softirq+0xa8/0x364 Signed-off-by: Bitao Hu Signed-off-by: Thomas Gleixner Reviewed-by: Liu Song Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20240411074134.30922-6-yaoma@linux.alibaba.com --- kernel/watchdog.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ef8ebd31fdab..d12ff74889ed 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -12,22 +12,25 @@ #define pr_fmt(fmt) "watchdog: " fmt -#include #include -#include #include +#include +#include #include +#include #include +#include #include +#include +#include #include #include + #include #include #include -#include #include -#include static DEFINE_MUTEX(watchdog_mutex); @@ -418,13 +421,105 @@ static void print_cpustat(void) } } +#define HARDIRQ_PERCENT_THRESH 50 +#define NUM_HARDIRQ_REPORT 5 +struct irq_counts { + int irq; + u32 counts; +}; + +static DEFINE_PER_CPU(bool, snapshot_taken); + +/* Tabulate the most frequent interrupts. */ +static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank) +{ + int i; + struct irq_counts new_count = {irq, counts}; + + for (i = 0; i < rank; i++) { + if (counts > irq_counts[i].counts) + swap(new_count, irq_counts[i]); + } +} + +/* + * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period, + * then the cause of softlockup might be interrupt storm. In this case, it + * would be useful to start interrupt counting. + */ +static bool need_counting_irqs(void) +{ + u8 util; + int tail = __this_cpu_read(cpustat_tail); + + tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT; + util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); + return util > HARDIRQ_PERCENT_THRESH; +} + +static void start_counting_irqs(void) +{ + if (!__this_cpu_read(snapshot_taken)) { + kstat_snapshot_irqs(); + __this_cpu_write(snapshot_taken, true); + } +} + +static void stop_counting_irqs(void) +{ + __this_cpu_write(snapshot_taken, false); +} + +static void print_irq_counts(void) +{ + unsigned int i, count; + struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = { + {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0} + }; + + if (__this_cpu_read(snapshot_taken)) { + for_each_active_irq(i) { + count = kstat_get_irq_since_snapshot(i); + tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT); + } + + /* + * Outputting the "watchdog" prefix on every line is redundant and not + * concise, and the original alarm information is sufficient for + * positioning in logs, hence here printk() is used instead of pr_crit(). + */ + printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n", + smp_processor_id(), HARDIRQ_PERCENT_THRESH); + + for (i = 0; i < NUM_HARDIRQ_REPORT; i++) { + if (irq_counts_sorted[i].irq == -1) + break; + + printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n", + i + 1, irq_counts_sorted[i].counts, + irq_counts_sorted[i].irq); + } + + /* + * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last + * sample_period, then we suspect the interrupt storm might be subsiding. + */ + if (!need_counting_irqs()) + stop_counting_irqs(); + } +} + static void report_cpu_status(void) { print_cpustat(); + print_irq_counts(); } #else static inline void update_cpustat(void) { } static inline void report_cpu_status(void) { } +static inline bool need_counting_irqs(void) { return false; } +static inline void start_counting_irqs(void) { } +static inline void stop_counting_irqs(void) { } #endif /* @@ -528,6 +623,18 @@ static int is_softlockup(unsigned long touch_ts, unsigned long now) { if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) { + /* + * If period_ts has not been updated during a sample_period, then + * in the subsequent few sample_periods, period_ts might also not + * be updated, which could indicate a potential softlockup. In + * this case, if we suspect the cause of the potential softlockup + * might be interrupt storm, then we need to count the interrupts + * to find which interrupt is storming. + */ + if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) && + need_counting_irqs()) + start_counting_irqs(); + /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; @@ -550,6 +657,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); static int softlockup_fn(void *data) { update_touch_ts(); + stop_counting_irqs(); complete(this_cpu_ptr(&softlockup_completion)); return 0; -- cgit v1.2.3 From 37eacb9f6e89fb399a79e952bc9c78eb3e16290e Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 12 Apr 2024 16:11:00 +0200 Subject: bpf: Fix a verifier verbose message Long ago a map file descriptor in a pseudo ldimm64 instruction could only be present as an immediate value insn[0].imm, and thus this value was used in a verbose verifier message printed when the file descriptor wasn't valid. Since addition of BPF_PSEUDO_MAP_IDX_VALUE/BPF_PSEUDO_MAP_IDX the insn[0].imm field can also contain an index pointing to the file descriptor in the attr.fd_array array. However, if the file descriptor is invalid, the verifier still prints the verbose message containing value of insn[0].imm. Patch the verifier message to always print the actual file descriptor value. Fixes: 387544bfa291 ("bpf: Introduce fd_idx") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240412141100.3562942-1-aspsk@isovalent.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98188379d5c7..aa24beb65393 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18289,8 +18289,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) f = fdget(fd); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn[0].imm); + verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); return PTR_ERR(map); } -- cgit v1.2.3 From c1ec7c158090ab968ab9022a9f67e7d88d66ee61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Feb 2024 15:33:55 -0800 Subject: rcu: Make TINY_RCU depend on !PREEMPT_RCU rather than !PREEMPTION Right now, TINY_RCU depends on (!PREEMPTION && !SMP), which has served the kernel well for many years due to the fact that PREEMPT_RCU is normally a synonym for PREEMPTION. But with the advent of lazy preemption, it will be possible to have non-preemptible RCU in a preemptible kernel, so that kernels could be built with PREEMPT_RCU=n and PREEMPTION=y. This commit therefore makes TINY_RCU depend on (!PREEMPT_RCU && !SMP), thus allowing for a non-preemptible RCU in preemptible kernels. Signed-off-by: Paul E. McKenney Cc: Ankur Arora Cc: Thomas Gleixner Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index e7d2dd267593..7dca0138260c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -31,7 +31,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPTION && !SMP + default y if !PREEMPT_RCU && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response -- cgit v1.2.3 From 65b4a59557f6fb5d4355093ad5a2c1bd5598ee41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Feb 2024 09:04:30 -0800 Subject: srcu: Make Tiny SRCU explicitly disable preemption Because Tiny SRCU is used only in kernels built with either CONFIG_PREEMPT_NONE=y or CONFIG_PREEMPT_VOLUNTARY=y, there has not been any need for TINY SRCU to explicitly disable preemption. However, the prospect of lazy preemption changes that, and the lazy-preemption patches do result in rcutorture runs finding both too-short grace periods and grace-period hangs for Tiny SRCU. This commit therefore adds the needed preempt_disable() and preempt_enable() calls to Tiny SRCU. Signed-off-by: Paul E. McKenney Cc: Ankur Arora Cc: Thomas Gleixner Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcutiny.h | 2 ++ kernel/rcu/srcutiny.c | 31 ++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 447133171d95..4d96bbdb45f0 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -64,8 +64,10 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; + preempt_disable(); // Needed for PREEMPT_AUTO idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); + preempt_enable(); return idx; } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index c38e5933a5d6..5afd5cf494db 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -96,9 +96,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; + int newval; + preempt_disable(); // Needed for PREEMPT_AUTO + newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); + preempt_enable(); if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) swake_up_one(&ssp->srcu_wq); } @@ -117,8 +120,11 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + preempt_disable(); // Needed for PREEMPT_AUTO + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { return; /* Already running or nothing to do. */ + preempt_enable(); + } /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(ssp->srcu_gp_running, true); @@ -130,9 +136,12 @@ void srcu_drive_gp(struct work_struct *wp) idx = (ssp->srcu_idx & 0x2) / 2; WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + preempt_disable(); // Needed for PREEMPT_AUTO WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + preempt_enable(); /* Invoke the callbacks we removed above. */ while (lh) { @@ -150,8 +159,11 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ + preempt_disable(); // Needed for PREEMPT_AUTO WRITE_ONCE(ssp->srcu_gp_running, false); - if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); + preempt_enable(); + if (idx) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -160,9 +172,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; + preempt_disable(); // Needed for PREEMPT_AUTO cookie = get_state_synchronize_srcu(ssp); - if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { + preempt_enable(); return; + } WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) @@ -170,6 +185,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) else if (list_empty(&ssp->srcu_work.entry)) list_add(&ssp->srcu_work.entry, &srcu_boot_list); } + preempt_enable(); } /* @@ -183,11 +199,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; + preempt_disable(); // Needed for PREEMPT_AUTO local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); srcu_gp_start_if_needed(ssp); + preempt_enable(); } EXPORT_SYMBOL_GPL(call_srcu); @@ -241,9 +259,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); */ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { - unsigned long ret = get_state_synchronize_srcu(ssp); + unsigned long ret; + preempt_disable(); // Needed for PREEMPT_AUTO + ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); + preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); -- cgit v1.2.3 From 1b4e9fdf9ed489c779259734e0b47f408c7786f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:09:19 -0800 Subject: rcu: Create NEED_TASKS_RCU to factor out enablement logic Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. This commit therefore creates a new NEED_TASKS_RCU Kconfig option so that the default value of TASKS_RCU can depend on a combination of this new option and any needed enablement logic, so that this logic is in one place. While in the area, also anticipate a likely future change by adding PREEMPT_AUTO to that logic. Signed-off-by: Paul E. McKenney Cc: Ankur Arora Cc: Thomas Gleixner Cc: Steven Rostedt Acked-by: Mark Rutland Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 7dca0138260c..3e079de0f5b4 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -85,9 +85,13 @@ config FORCE_TASKS_RCU idle, and user-mode execution as quiescent states. Not for manual selection in most cases. -config TASKS_RCU +config NEED_TASKS_RCU bool default n + +config TASKS_RCU + bool + default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) select IRQ_WORK config FORCE_TASKS_RUDE_RCU -- cgit v1.2.3 From 3dbd8652f87b81fccb766bddb985ef46a1502f04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Mar 2024 17:16:53 -0800 Subject: rcu: Remove redundant BH disabling in TINY_RCU The TINY_RCU rcu_process_callbacks() function is only ever invoked from a softirq handler, which means that BH is already disabled. This commit therefore removes the redundant local_bh_disable() and local_bh_ennable() from this function. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tiny.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 705c0d16850a..4470af926a34 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -130,9 +130,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - local_bh_disable(); rcu_reclaim_tiny(list); - local_bh_enable(); list = next; } } -- cgit v1.2.3 From 11b8b378c58bdaf076c2724bee03157e3160af5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Mar 2024 15:33:33 -0800 Subject: rcu: Make Tiny RCU explicitly disable preemption Because Tiny RCU is used only in kernels built with either CONFIG_PREEMPT_NONE=y or CONFIG_PREEMPT_VOLUNTARY=y, there has not been any need for TINY RCU to explicitly disable preemption. However, the prospect of lazy preemption changes that, and preemption means that the non-atomic increment in synchronize_rcu() can be preempted, with the possibility that one of the increments is lost. This could cause failures for users of the APIs that poll RCU grace periods. This commit therefore adds the needed preempt_disable() and preempt_enable() call to Tiny RCU. Signed-off-by: Paul E. McKenney Cc: Ankur Arora Cc: Thomas Gleixner Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tiny.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4470af926a34..4402d6f5f857 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -153,7 +153,9 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + preempt_disable(); WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); + preempt_enable(); } EXPORT_SYMBOL_GPL(synchronize_rcu); -- cgit v1.2.3 From 62bb24c4b022b9ba9cf2e4a72f6cd8c3086f0cf8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Mar 2024 15:01:54 -0800 Subject: rcu: Remove redundant READ_ONCE() of rcu_state.gp_flags in tree.c Although it is functionally OK to do READ_ONCE() of a variable that cannot change, it is confusing and at best an accident waiting to happen. This commit therefore removes a number of READ_ONCE(rcu_state.gp_flags) instances from kernel/rcu/tree.c that are not needed due to updates to this field being excluded by virtue of holding the root rcu_node structure's ->lock. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/4857c5ef-bd8f-4670-87ac-0600a1699d05@paulmck-laptop/T/#mccb23c2a4902da4d3c750165329f8de056903c58 Reported-by: Julia Lawall Closes: https://lore.kernel.org/lkml/4857c5ef-bd8f-4670-87ac-0600a1699d05@paulmck-laptop/T/#md1b5c026584f9c3c7b0fbc9240dd7de584597b73 Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2795a1457acf..559f2d0d271f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1463,7 +1463,7 @@ static noinline_for_stack bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - if (!READ_ONCE(rcu_state.gp_flags)) { + if (!rcu_state.gp_flags) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq_rcu_node(rnp); return false; @@ -1648,8 +1648,7 @@ static void rcu_gp_fqs(bool first_time) /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq_rcu_node(rnp); - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq_rcu_node(rnp); } } @@ -1910,8 +1909,7 @@ static void rcu_report_qs_rsp(unsigned long flags) { raw_lockdep_assert_held_rcu_node(rcu_get_root()); WARN_ON_ONCE(!rcu_gp_in_progress()); - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags); rcu_gp_kthread_wake(); } @@ -2426,8 +2424,7 @@ void rcu_force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } -- cgit v1.2.3 From c90b9e49782424623e0006abbcab98f835c1f1d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Mar 2024 15:06:19 -0800 Subject: rcu: Bring diagnostic read of rcu_state.gp_flags into alignment This commit adds READ_ONCE() to a lockless diagnostic read from rcu_state.gp_flags to avoid giving the compiler any chance whatsoever of confusing the diagnostic state printed. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5d666428546b..62b2c4858028 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -579,7 +579,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void) pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n", rcu_state.name, (jiffies - jiffies_fqs), (long)rcu_seq_current(&rcu_state.gp_seq), - data_race(rcu_state.gp_flags), + data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, data_race(READ_ONCE(gpk->__state))); pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", -- cgit v1.2.3 From a542d116bab28b4bcade2f41488f4617373c620f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Mar 2024 20:11:21 -0800 Subject: rcu: Mark writes to rcu_sync ->gp_count field The rcu_sync structure's ->gp_count field is updated under the protection of ->rss_lock, but read locklessly, and KCSAN noted the data race. This commit therefore uses WRITE_ONCE() to do this update to clearly document its racy nature. Signed-off-by: Paul E. McKenney Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/sync.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 86df878a2fee..6c2bd9001adc 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * we are called at early boot time but this shouldn't happen. */ } - rsp->gp_count++; + WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1); spin_unlock_irq(&rsp->rss_lock); if (gp_state == GP_IDLE) { @@ -151,11 +151,15 @@ void rcu_sync_enter(struct rcu_sync *rsp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + int gpc; + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); spin_lock_irq(&rsp->rss_lock); - if (!--rsp->gp_count) { + gpc = rsp->gp_count - 1; + WRITE_ONCE(rsp->gp_count, gpc); + if (!gpc) { if (rsp->gp_state == GP_PASSED) { WRITE_ONCE(rsp->gp_state, GP_EXIT); rcu_sync_call(rsp); -- cgit v1.2.3 From 09e077cf22c4302ab4ca7932f56c5a8b20c9e32b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Mar 2024 20:14:55 -0800 Subject: rcu: Mark loads from rcu_state.n_online_cpus The rcu_state.n_online_cpus value is only ever updated by CPU-hotplug operations, which are serialized. However, this value is read locklessly. This commit therefore marks those reads. While in the area, it also adds ASSERT_EXCLUSIVE_WRITER() calls just in case parallel CPU hotplug becomes a thing. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 4 +++- kernel/rcu/tree_stall.h | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 559f2d0d271f..7149b2d5cdd6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4328,7 +4328,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); // whether spinlocks may be acquired safely. static bool rcu_init_invoked(void) { - return !!rcu_state.n_online_cpus; + return !!READ_ONCE(rcu_state.n_online_cpus); } /* @@ -4538,6 +4538,7 @@ int rcutree_prepare_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); + ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); return 0; @@ -4806,6 +4807,7 @@ void rcutree_migrate_callbacks(int cpu) */ int rcutree_dead_cpu(unsigned int cpu) { + ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 62b2c4858028..8a2edf6a1ef5 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -628,7 +628,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) totqlen += rcu_get_n_cbs_cpu(cpu); pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, + data_race(rcu_state.n_online_cpus)); // Diagnostic read if (ndetected) { rcu_dump_cpu_stacks(); @@ -689,7 +690,8 @@ static void print_cpu_stall(unsigned long gps) totqlen += rcu_get_n_cbs_cpu(cpu); pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, + data_race(rcu_state.n_online_cpus)); // Diagnostic read rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); -- cgit v1.2.3 From ae2b217ab542d0db0ca1a6de4f442201a1982f00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Mar 2024 11:15:01 -0800 Subject: rcu: Make hotplug operations track GP state, not flags Currently, there are rcu_data structure fields named ->rcu_onl_gp_seq and ->rcu_ofl_gp_seq that track the rcu_state.gp_flags field at the time of the corresponding CPU's last online or offline operation, respectively. However, this information is not particularly useful. It would be better to instead track the grace period state kept in rcu_state.gp_state. This would also be consistent with the initialization in rcu_boot_init_percpu_data(), which is to RCU_GP_CLEANED (an rcu_state.gp_state value), and also with the diagnostics in rcu_implicit_dynticks_qs(), whose format is consistent with an integer, not a bitmask. This commit therefore makes this change and changes the names to ->rcu_onl_gp_flags and ->rcu_ofl_gp_flags, respectively. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7149b2d5cdd6..306f55b81d10 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -841,8 +841,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)], - (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, - (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state); return 1; /* Break things loose after complaining. */ } @@ -4420,9 +4420,9 @@ rcu_boot_init_percpu_data(int cpu) WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; - rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; - rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_state = RCU_GP_CLEANED; rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); @@ -4682,7 +4682,7 @@ void rcutree_report_cpu_starting(unsigned int cpu) ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); - rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); + rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state); /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ @@ -4733,7 +4733,7 @@ void rcutree_report_cpu_dead(void) arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); - rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); + rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ rcu_disable_urgency_upon_qs(rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df48160b3136..ff4d8b60554b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -273,9 +273,9 @@ struct rcu_data { bool rcu_iw_pending; /* Is ->rcu_iw pending? */ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ - short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + short rcu_ofl_gp_state; /* ->gp_state at last offline. */ unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ - short rcu_onl_gp_flags; /* ->gp_flags at last online. */ + short rcu_onl_gp_state; /* ->gp_state at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 36a8b5dbf5b5..340bbefe5f65 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -805,8 +805,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) rdp = per_cpu_ptr(&rcu_data, cpu); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", cpu, ".o"[rcu_rdp_cpu_online(rdp)], - (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, - (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state); } } -- cgit v1.2.3 From fc2897d2abe5a307e3b28943bcb4c1401432ea26 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Mar 2024 13:26:26 -0800 Subject: rcu: Inform KCSAN of one-byte cmpxchg() in rcu_trc_cmpxchg_need_qs() Tasks Trace RCU needs a single-byte cmpxchg(), but no such thing exists. Therefore, rcu_trc_cmpxchg_need_qs() emulates one using field substitution and a four-byte cmpxchg(), such that the other three bytes are always atomically updated to their old values. This works, but results in false-positive KCSAN failures because as far as KCSAN knows, this cmpxchg() operation is updating all four bytes. This commit therefore encloses the cmpxchg() in a data_race() and adds a single-byte instrument_atomic_read_write(), thus telling KCSAN exactly what is going on so as to avoid the false positives. Signed-off-by: Paul E. McKenney Cc: Marco Elver Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 147b5945d67a..327fbfc999c8 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1457,6 +1457,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v) /* * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for * the four-byte operand-size restriction of some platforms. + * * Returns the old value, which is often ignored. */ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) @@ -1468,7 +1469,14 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) if (trs_old.b.need_qs != old) return trs_old.b.need_qs; trs_new.b.need_qs = new; - ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s); + + // Although cmpxchg() appears to KCSAN to update all four bytes, + // only the .b.need_qs byte actually changes. + instrument_atomic_read_write(&t->trc_reader_special.b.need_qs, + sizeof(t->trc_reader_special.b.need_qs)); + // Avoid false-positive KCSAN failures. + ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s)); + return ret.b.need_qs; } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); -- cgit v1.2.3 From 8db610c3bd93e6929348f6e5271f2f905f80d82e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Feb 2024 17:07:24 -0800 Subject: rcu-tasks: Replace exit_tasks_rcu_start() initialization with WARN_ON_ONCE() Because the Tasks RCU ->rtp_exit_list is initialized at rcu_init() time while there is only one CPU running with interrupts disabled, it is not possible for an exiting task to encounter an uninitialized list. This commit therefore replaces the conditional initialization with a WARN_ON_ONCE(). Reported-by: Frederic Weisbecker Closes: https://lore.kernel.org/all/ZdiNXmO3wRvmzPsr@lothringen/ Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 82e458ea0728..78d74c81cc24 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1203,8 +1203,7 @@ void exit_tasks_rcu_start(void) rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); t->rcu_tasks_exit_cpu = smp_processor_id(); raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - if (!rtpcp->rtp_exit_list.next) - INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + WARN_ON_ONCE(!rtpcp->rtp_exit_list.next); list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); preempt_enable(); -- cgit v1.2.3 From 5f48fa85fdb92569f58374b6e040c956628fdcf5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 26 Feb 2024 11:24:39 +0800 Subject: rcu-tasks: Fix the comments for tasks_rcu_exit_srcu_stall_timer The synchronize_srcu() has been removed by commit("rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks") in rcu_tasks_postscan. This commit therefore fixes the tasks_rcu_exit_srcu_stall_timer comment. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 78d74c81cc24..d5319bbe8c98 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -150,7 +150,7 @@ static struct rcu_tasks rt_name = \ #ifdef CONFIG_TASKS_RCU -/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ +/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); #endif -- cgit v1.2.3 From cc5645fddb0ce28492b15520306d092730dffa48 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Wed, 27 Mar 2024 20:47:47 +0300 Subject: rcu-tasks: Fix show_rcu_tasks_trace_gp_kthread buffer overflow There is a possibility of buffer overflow in show_rcu_tasks_trace_gp_kthread() if counters, passed to sprintf() are huge. Counter numbers, needed for this are unrealistically high, but buffer overflow is still possible. Use snprintf() with buffer size instead of sprintf(). Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: edf3775f0ad6 ("rcu-tasks: Add count for idle tasks on offline CPUs") Signed-off-by: Nikita Kiryushin Reviewed-by: Steven Rostedt (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5319bbe8c98..08a92bffeb84 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1997,7 +1997,7 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%lu h:%lu/%lu/%lu", + snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu", data_race(n_trc_holdouts), data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), -- cgit v1.2.3 From 3758f7d9917bd7ef0482c4184c0ad673b4c4e069 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Mon, 1 Apr 2024 22:43:15 +0300 Subject: rcu: Fix buffer overflow in print_cpu_stall_info() The rcuc-starvation output from print_cpu_stall_info() might overflow the buffer if there is a huge difference in jiffies difference. The situation might seem improbable, but computers sometimes get very confused about time, which can result in full-sized integers, and, in this case, buffer overflow. Also, the unsigned jiffies difference is printed using %ld, which is normally for signed integers. This is intentional for debugging purposes, but it is not obvious from the code. This commit therefore changes sprintf() to snprintf() and adds a clarifying comment about intention of %ld format. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 245a62982502 ("rcu: Dump rcuc kthread status for CPUs not reporting quiescent state") Signed-off-by: Nikita Kiryushin Reviewed-by: Steven Rostedt (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_stall.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 8a2edf6a1ef5..460efecd077b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -504,7 +504,8 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) - sprintf(buf, " rcuc=%ld jiffies(starved)", j); + // Print signed value, as negative values indicate a probable bug. + snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j); pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", cpu, "O."[!!cpu_online(cpu)], -- cgit v1.2.3 From 988f569ae041ccc93a79d98d1b0043dff4d7e9b7 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:05 +0100 Subject: rcu: Reduce synchronize_rcu() latency A call to a synchronize_rcu() can be optimized from a latency point of view. Workloads which depend on this can benefit of it. The delay of wakeme_after_rcu() callback, which unblocks a waiter, depends on several factors: - how fast a process of offloading is started. Combination of: - !CONFIG_RCU_NOCB_CPU/CONFIG_RCU_NOCB_CPU; - !CONFIG_RCU_LAZY/CONFIG_RCU_LAZY; - other. - when started, invoking path is interrupted due to: - time limit; - need_resched(); - if limit is reached. - where in a nocb list it is located; - how fast previous callbacks completed; Example: 1. On our embedded devices i can easily trigger the scenario when it is a last in the list out of ~3600 callbacks: <...>-29 [001] d..1. 21950.145313: rcu_batch_start: rcu_preempt CBs=3613 bl=28 ... <...>-29 [001] ..... 21950.152578: rcu_invoke_callback: rcu_preempt rhp=00000000b2d6dee8 func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152579: rcu_invoke_callback: rcu_preempt rhp=00000000a446f607 func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152580: rcu_invoke_callback: rcu_preempt rhp=00000000a5cab03b func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152581: rcu_invoke_callback: rcu_preempt rhp=0000000013b7e5ee func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152582: rcu_invoke_callback: rcu_preempt rhp=000000000a8ca6f9 func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152583: rcu_invoke_callback: rcu_preempt rhp=000000008f162ca8 func=wakeme_after_rcu.cfi_jt <...>-29 [001] d..1. 21950.152625: rcu_batch_end: rcu_preempt CBs-invoked=3612 idle=.... 2. We use cpuset/cgroup to classify tasks and assign them into different cgroups. For example "backgrond" group which binds tasks only to little CPUs or "foreground" which makes use of all CPUs. Tasks can be migrated between groups by a request if an acceleration is needed. See below an example how "surfaceflinger" task gets migrated. Initially it is located in the "system-background" cgroup which allows to run only on little cores. In order to speed it up it can be temporary moved into "foreground" cgroup which allows to use big/all CPUs: cgroup_attach_task(): -> cgroup_migrate_execute() -> cpuset_can_attach() -> percpu_down_write() -> rcu_sync_enter() -> synchronize_rcu() -> now move tasks to the new cgroup. -> cgroup_migrate_finish() rcuop/1-29 [000] ..... 7030.528570: rcu_invoke_callback: rcu_preempt rhp=00000000461605e0 func=wakeme_after_rcu.cfi_jt PERFD-SERVER-1855 [000] d..1. 7030.530293: cgroup_attach_task: dst_root=3 dst_id=22 dst_level=1 dst_path=/foreground pid=1900 comm=surfaceflinger TimerDispatch-2768 [002] d..5. 7030.537542: sched_migrate_task: comm=surfaceflinger pid=1900 prio=98 orig_cpu=0 dest_cpu=4 "Boosting a task" depends on synchronize_rcu() latency: - first trace shows a completion of synchronize_rcu(); - second shows attaching a task to a new group; - last shows a final step when migration occurs. 3. To address this drawback, maintain a separate track that consists of synchronize_rcu() callers only. After completion of a grace period users are deferred to a dedicated worker to process requests. 4. This patch reduces the latency of synchronize_rcu() approximately by ~30-40% on synthetic tests. The real test case, camera launch time, shows(time is in milliseconds): 1-run 542 vs 489 improvement 9% 2-run 540 vs 466 improvement 13% 3-run 518 vs 468 improvement 9% 4-run 531 vs 457 improvement 13% 5-run 548 vs 475 improvement 13% 6-run 509 vs 484 improvement 4% Synthetic test(no "noise" from other callbacks): Hardware: x86_64 64 CPUs, 64GB of memory Linux-6.6 - 10K tasks(simultaneous); - each task does(1000 loops) synchronize_rcu(); kfree(p); default: CONFIG_RCU_NOCB_CPU: takes 54 seconds to complete all users; patch: CONFIG_RCU_NOCB_CPU: takes 35 seconds to complete all users. Running 60K gives approximately same results on my setup. Please note it is without any interaction with another type of callbacks, otherwise it will impact a lot a default case. 5. By default it is disabled. To enable this perform one of the below sequence: echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1" Reviewed-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Co-developed-by: Neeraj Upadhyay (AMD) Signed-off-by: Neeraj Upadhyay (AMD) Signed-off-by: Uladzislau Rezki (Sony) --- Documentation/admin-guide/kernel-parameters.txt | 14 + kernel/rcu/tree.c | 331 +++++++++++++++++++++++- kernel/rcu/tree_exp.h | 2 +- 3 files changed, 345 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..0a3b0fd1910e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5091,6 +5091,20 @@ delay, memory pressure or callback list growing too big. + rcutree.rcu_normal_wake_from_gp= [KNL] + Reduces a latency of synchronize_rcu() call. This approach + maintains its own track of synchronize_rcu() callers, so it + does not interact with regular callbacks because it does not + use a call_rcu[_hurry]() path. Please note, this is for a + normal grace period. + + How to enable it: + + echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp + or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1" + + Default is 0. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9642dd06c25..f65255205e44 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -75,6 +75,7 @@ #define MODULE_PARAM_PREFIX "rcutree." /* Data structures. */ +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, @@ -93,6 +94,8 @@ static struct rcu_state rcu_state = { .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, + .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, + rcu_sr_normal_gp_cleanup_work), }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -1422,6 +1425,282 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * There is a single llist, which is used for handling + * synchronize_rcu() users' enqueued rcu_synchronize nodes. + * Within this llist, there are two tail pointers: + * + * wait tail: Tracks the set of nodes, which need to + * wait for the current GP to complete. + * done tail: Tracks the set of nodes, for which grace + * period has elapsed. These nodes processing + * will be done as part of the cleanup work + * execution by a kworker. + * + * At every grace period init, a new wait node is added + * to the llist. This wait node is used as wait tail + * for this new grace period. Given that there are a fixed + * number of wait nodes, if all wait nodes are in use + * (which can happen when kworker callback processing + * is delayed) and additional grace period is requested. + * This means, a system is slow in processing callbacks. + * + * TODO: If a slow processing is detected, a first node + * in the llist should be used as a wait-tail for this + * grace period, therefore users which should wait due + * to a slow process are handled by _this_ grace period + * and not next. + * + * Below is an illustration of how the done and wait + * tail pointers move from one set of rcu_synchronize nodes + * to the other, as grace periods start and finish and + * nodes are processed by kworker. + * + * + * a. Initial llist callbacks list: + * + * +----------+ +--------+ +-------+ + * | | | | | | + * | head |---------> | cb2 |--------->| cb1 | + * | | | | | | + * +----------+ +--------+ +-------+ + * + * + * + * b. New GP1 Start: + * + * WAIT TAIL + * | + * | + * v + * +----------+ +--------+ +--------+ +-------+ + * | | | | | | | | + * | head ------> wait |------> cb2 |------> | cb1 | + * | | | head1 | | | | | + * +----------+ +--------+ +--------+ +-------+ + * + * + * + * c. GP completion: + * + * WAIT_TAIL == DONE_TAIL + * + * DONE TAIL + * | + * | + * v + * +----------+ +--------+ +--------+ +-------+ + * | | | | | | | | + * | head ------> wait |------> cb2 |------> | cb1 | + * | | | head1 | | | | | + * +----------+ +--------+ +--------+ +-------+ + * + * + * + * d. New callbacks and GP2 start: + * + * WAIT TAIL DONE TAIL + * | | + * | | + * v v + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * | | | | | | | | | | | | | | + * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | + * | | | head2| | | | | |head1| | | | | + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * + * + * + * e. GP2 completion: + * + * WAIT_TAIL == DONE_TAIL + * DONE TAIL + * | + * | + * v + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * | | | | | | | | | | | | | | + * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | + * | | | head2| | | | | |head1| | | | | + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * + * + * While the llist state transitions from d to e, a kworker + * can start executing rcu_sr_normal_gp_cleanup_work() and + * can observe either the old done tail (@c) or the new + * done tail (@e). So, done tail updates and reads need + * to use the rel-acq semantics. If the concurrent kworker + * observes the old done tail, the newly queued work + * execution will process the updated done tail. If the + * concurrent kworker observes the new done tail, then + * the newly queued work will skip processing the done + * tail, as workqueue semantics guarantees that the new + * work is executed only after the previous one completes. + * + * f. kworker callbacks processing complete: + * + * + * DONE TAIL + * | + * | + * v + * +----------+ +--------+ + * | | | | + * | head ------> wait | + * | | | head2 | + * +----------+ +--------+ + * + */ +static bool rcu_sr_is_wait_head(struct llist_node *node) +{ + return &(rcu_state.srs_wait_nodes)[0].node <= node && + node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node; +} + +static struct llist_node *rcu_sr_get_wait_head(void) +{ + struct sr_wait_node *sr_wn; + int i; + + for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) { + sr_wn = &(rcu_state.srs_wait_nodes)[i]; + + if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1)) + return &sr_wn->node; + } + + return NULL; +} + +static void rcu_sr_put_wait_head(struct llist_node *node) +{ + struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node); + + atomic_set_release(&sr_wn->inuse, 0); +} + +/* Disabled by default. */ +static int rcu_normal_wake_from_gp; +module_param(rcu_normal_wake_from_gp, int, 0644); + +static void rcu_sr_normal_complete(struct llist_node *node) +{ + struct rcu_synchronize *rs = container_of( + (struct rcu_head *) node, struct rcu_synchronize, head); + unsigned long oldstate = (unsigned long) rs->head.func; + + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + !poll_state_synchronize_rcu(oldstate), + "A full grace period is not passed yet: %lu", + rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + + /* Finally. */ + complete(&rs->completion); +} + +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) +{ + struct llist_node *done, *rcu, *next, *head; + + /* + * This work execution can potentially execute + * while a new done tail is being updated by + * grace period kthread in rcu_sr_normal_gp_cleanup(). + * So, read and updates of done tail need to + * follow acq-rel semantics. + * + * Given that wq semantics guarantees that a single work + * cannot execute concurrently by multiple kworkers, + * the done tail list manipulations are protected here. + */ + done = smp_load_acquire(&rcu_state.srs_done_tail); + if (!done) + return; + + WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); + head = done->next; + done->next = NULL; + + /* + * The dummy node, which is pointed to by the + * done tail which is acq-read above is not removed + * here. This allows lockless additions of new + * rcu_synchronize nodes in rcu_sr_normal_add_req(), + * while the cleanup work executes. The dummy + * nodes is removed, in next round of cleanup + * work execution. + */ + llist_for_each_safe(rcu, next, head) { + if (!rcu_sr_is_wait_head(rcu)) { + rcu_sr_normal_complete(rcu); + continue; + } + + rcu_sr_put_wait_head(rcu); + } +} + +/* + * Helper function for rcu_gp_cleanup(). + */ +static void rcu_sr_normal_gp_cleanup(void) +{ + struct llist_node *wait_tail; + + wait_tail = rcu_state.srs_wait_tail; + if (wait_tail == NULL) + return; + + rcu_state.srs_wait_tail = NULL; + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + + // concurrent sr_normal_gp_cleanup work might observe this update. + smp_store_release(&rcu_state.srs_done_tail, wait_tail); + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + + schedule_work(&rcu_state.srs_cleanup_work); +} + +/* + * Helper function for rcu_gp_init(). + */ +static bool rcu_sr_normal_gp_init(void) +{ + struct llist_node *first; + struct llist_node *wait_head; + bool start_new_poll = false; + + first = READ_ONCE(rcu_state.srs_next.first); + if (!first || rcu_sr_is_wait_head(first)) + return start_new_poll; + + wait_head = rcu_sr_get_wait_head(); + if (!wait_head) { + // Kick another GP to retry. + start_new_poll = true; + return start_new_poll; + } + + /* Inject a wait-dummy-node. */ + llist_add(wait_head, &rcu_state.srs_next); + + /* + * A waiting list of rcu_synchronize nodes should be empty on + * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(), + * rolls it over. If not, it is a BUG, warn a user. + */ + WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL); + rcu_state.srs_wait_tail = wait_head; + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + + return start_new_poll; +} + +static void rcu_sr_normal_add_req(struct rcu_synchronize *rs) +{ + llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1432,6 +1711,7 @@ static noinline_for_stack bool rcu_gp_init(void) unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); + bool start_new_poll; WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -1456,10 +1736,24 @@ static noinline_for_stack bool rcu_gp_init(void) /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); + start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); + /* + * The "start_new_poll" is set to true, only when this GP is not able + * to handle anything and there are outstanding users. It happens when + * the rcu_sr_normal_gp_init() function was not able to insert a dummy + * separator to the llist, because there were no left any dummy-nodes. + * + * Number of dummy-nodes is fixed, it could be that we are run out of + * them, if so we start a new pool request to repeat a try. It is rare + * and it means that a system is doing a slow processing of callbacks. + */ + if (start_new_poll) + (void) start_poll_synchronize_rcu(); + /* * Apply per-leaf buffered online and offline operations to * the rcu_node tree. Note that this new grace period need not @@ -1825,6 +2119,9 @@ static noinline void rcu_gp_cleanup(void) } raw_spin_unlock_irq_rcu_node(rnp); + // Make synchronize_rcu() users aware of the end of old grace period. + rcu_sr_normal_gp_cleanup(); + // If strict, make all CPUs aware of the end of the old grace period. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) on_each_cpu(rcu_strict_gp_boundary, NULL, 0); @@ -3559,6 +3856,38 @@ static int rcu_blocking_is_gp(void) return true; } +/* + * Helper function for the synchronize_rcu() API. + */ +static void synchronize_rcu_normal(void) +{ + struct rcu_synchronize rs; + + if (!READ_ONCE(rcu_normal_wake_from_gp)) { + wait_rcu_gp(call_rcu_hurry); + return; + } + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + + /* + * This code might be preempted, therefore take a GP + * snapshot before adding a request. + */ + if (IS_ENABLED(CONFIG_PROVE_RCU)) + rs.head.func = (void *) get_state_synchronize_rcu(); + + rcu_sr_normal_add_req(&rs); + + /* Kick a GP and start waiting. */ + (void) start_poll_synchronize_rcu(); + + /* Now we can wait. */ + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -3610,7 +3939,7 @@ void synchronize_rcu(void) if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else - wait_rcu_gp(call_rcu_hurry); + synchronize_rcu_normal(); return; } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6b83537480b1..8a1d9c8bd9f7 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -930,7 +930,7 @@ void synchronize_rcu_expedited(void) /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_hurry); + synchronize_rcu_normal(); return; } -- cgit v1.2.3 From 2053937a310a3982de9d33af3db2dbd2b32b66e4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:06 +0100 Subject: rcu: Add a trace event for synchronize_rcu_normal() Add an rcu_sr_normal() trace event. It takes three arguments first one is the name of RCU flavour, second one is a user id which triggeres synchronize_rcu_normal() and last one is an event. There are two traces in the synchronize_rcu_normal(). On entry, when a new request is registered and on exit point when request is completed. Please note, CONFIG_RCU_TRACE=y is required to activate traces. Reviewed-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/trace/events/rcu.h | 27 +++++++++++++++++++++++++++ kernel/rcu/tree.c | 7 ++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 2ef9c719772a..31b3e0d3e65f 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -707,6 +707,33 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback, __entry->rcuname, __entry->p, __entry->nr_records) ); +/* + * Tracepoint for a normal synchronize_rcu() states. The first argument + * is the RCU flavor, the second argument is a pointer to rcu_head the + * last one is an event. + */ +TRACE_EVENT_RCU(rcu_sr_normal, + + TP_PROTO(const char *rcuname, struct rcu_head *rhp, const char *srevent), + + TP_ARGS(rcuname, rhp, srevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(void *, rhp) + __field(const char *, srevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->rhp = rhp; + __entry->srevent = srevent; + ), + + TP_printk("%s rhp=0x%p event=%s", + __entry->rcuname, __entry->rhp, __entry->srevent) +); + /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been * invoked. The first argument is the name of the RCU flavor, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f65255205e44..2e1c5be6d64b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3863,9 +3863,11 @@ static void synchronize_rcu_normal(void) { struct rcu_synchronize rs; + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); + if (!READ_ONCE(rcu_normal_wake_from_gp)) { wait_rcu_gp(call_rcu_hurry); - return; + goto trace_complete_out; } init_rcu_head_on_stack(&rs.head); @@ -3886,6 +3888,9 @@ static void synchronize_rcu_normal(void) /* Now we can wait. */ wait_for_completion(&rs.completion); destroy_rcu_head_on_stack(&rs.head); + +trace_complete_out: + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete")); } /** -- cgit v1.2.3 From 462df2f543ae360e79fcaa1b498d2a1a0c2a5b63 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:07 +0100 Subject: rcu: Support direct wake-up of synchronize_rcu() users This patch introduces a small enhancement which allows to do a direct wake-up of synchronize_rcu() callers. It occurs after a completion of grace period, thus by the gp-kthread. Number of clients is limited by the hard-coded maximum allowed threshold. The remaining part, if still exists is deferred to a main worker. Link: https://lore.kernel.org/lkml/Zd0ZtNu+Rt0qXkfS@lothringen/ Reviewed-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 24 +++++++++++++++++++++++- kernel/rcu/tree.h | 6 ++++++ 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2e1c5be6d64b..2a270abade4d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1645,7 +1645,8 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) */ static void rcu_sr_normal_gp_cleanup(void) { - struct llist_node *wait_tail; + struct llist_node *wait_tail, *next, *rcu; + int done = 0; wait_tail = rcu_state.srs_wait_tail; if (wait_tail == NULL) @@ -1653,11 +1654,32 @@ static void rcu_sr_normal_gp_cleanup(void) rcu_state.srs_wait_tail = NULL; ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail)); + + /* + * Process (a) and (d) cases. See an illustration. + */ + llist_for_each_safe(rcu, next, wait_tail->next) { + if (rcu_sr_is_wait_head(rcu)) + break; + + rcu_sr_normal_complete(rcu); + // It can be last, update a next on this step. + wait_tail->next = next; + + if (++done == SR_MAX_USERS_WAKE_FROM_GP) + break; + } // concurrent sr_normal_gp_cleanup work might observe this update. smp_store_release(&rcu_state.srs_done_tail, wait_tail); ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + /* + * We schedule a work in order to perform a final processing + * of outstanding users(if still left) and releasing wait-heads + * added by rcu_sr_normal_gp_init() call. + */ schedule_work(&rcu_state.srs_cleanup_work); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b942b9437438..2832787cee1d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -315,6 +315,12 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +/* + * A max threshold for synchronize_rcu() users which are + * awaken directly by the rcu_gp_kthread(). Left part is + * deferred to the main worker. + */ +#define SR_MAX_USERS_WAKE_FROM_GP 5 #define SR_NORMAL_GP_WAIT_HEAD_MAX 5 struct sr_wait_node { -- cgit v1.2.3 From 0fd210baa07a9e3f15df1bc687293eafb119283a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:09 +0100 Subject: rcu: Allocate WQ with WQ_MEM_RECLAIM bit set synchronize_rcu() users have to be processed regardless of memory pressure so our private WQ needs to have at least one execution context what WQ_MEM_RECLAIM flag guarantees. Reviewed-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a270abade4d..1d5c000e5c7a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1582,6 +1582,7 @@ static void rcu_sr_put_wait_head(struct llist_node *node) /* Disabled by default. */ static int rcu_normal_wake_from_gp; module_param(rcu_normal_wake_from_gp, int, 0644); +static struct workqueue_struct *sync_wq; static void rcu_sr_normal_complete(struct llist_node *node) { @@ -1680,7 +1681,7 @@ static void rcu_sr_normal_gp_cleanup(void) * of outstanding users(if still left) and releasing wait-heads * added by rcu_sr_normal_gp_init() call. */ - schedule_work(&rcu_state.srs_cleanup_work); + queue_work(sync_wq, &rcu_state.srs_cleanup_work); } /* @@ -5585,6 +5586,9 @@ void __init rcu_init(void) rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); + WARN_ON(!sync_wq); + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) -- cgit v1.2.3 From 8d0f9a6639f5083453602375ce9d3b71ef93ac73 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2024 19:14:46 -0800 Subject: rcutorture: Remove extraneous rcu_torture_pipe_update_one() READ_ONCE() The rcu_torture_pipe_update_one() cannot run concurrently with any updates of ->rtort_pipe_count, so this commit removes the extraneous READ_ONCE() from the read from this field. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/CAHk-=wiX_zF5Mpt8kUm_LFQpYY-mshrXJPOe+wKNwiVhEUcU9g@mail.gmail.com/ Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d8c12eba35b7..6b821a7037b0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -461,7 +461,7 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) WRITE_ONCE(rp->rtort_chkp, NULL); smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). } - i = READ_ONCE(rp->rtort_pipe_count); + i = rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); -- cgit v1.2.3 From 8b9b443fa860276822b25057cb3ff3b28734dec0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2024 19:21:47 -0800 Subject: rcutorture: Fix rcu_torture_one_read() pipe_count overflow comment The "pipe_count > RCU_TORTURE_PIPE_LEN" check has a comment saying "Should not happen, but...". This is only true when testing an RCU whose grace periods are always long enough. This commit therefore fixes this comment. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/CAHk-=wi7rJ-eGq+xaxVfzFEgbL9tdf6Kc8Z89rCpfcQOKm74Tw@mail.gmail.com/ Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b821a7037b0..0cb5452ecd94 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2000,7 +2000,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ + // Should not happen in a correct RCU implementation, + // happens quite often for torture_type=busted. pipe_count = RCU_TORTURE_PIPE_LEN; } completed = cur_ops->get_gp_seq(); -- cgit v1.2.3 From a10e3cbf32786cae437e4f370573318721e47c2c Mon Sep 17 00:00:00 2001 From: linke li Date: Wed, 6 Mar 2024 19:51:10 -0800 Subject: rcutorture: Re-use value stored to ->rtort_pipe_count instead of re-reading Currently, the rcu_torture_pipe_update_one() writes the value (i + 1) to rp->rtort_pipe_count, then immediately re-reads it in order to compare it to RCU_TORTURE_PIPE_LEN. This re-read is pointless because no other update to rp->rtort_pipe_count can occur at this point. This commit therefore instead re-uses the (i + 1) value stored in the comparison instead of re-reading rp->rtort_pipe_count. Signed-off-by: linke li Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0cb5452ecd94..dd7d5ba45740 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -467,7 +467,7 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(rp->rtort_pipe_count, i + 1); ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); - if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + if (i + 1 >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } -- cgit v1.2.3 From e38bf06d509a90c9fc185129ec913beb6de3d428 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 15 Mar 2024 15:17:10 +0800 Subject: rcutorture: Use the gp_kthread_dbg operation specified by cur_ops Despite there being a cur_ops->gp_kthread_dbg(), rcu_torture_writer() unconditionally invokes vanilla RCU's show_rcu_gp_kthreads(). This is not at all helpful when some other flavor of RCU is being tested. This commit therefore makes rcu_torture_writer() invoke cur_ops->gp_kthread_dbg() for RCU implementations providing this function. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dd7d5ba45740..2f43d31fb7a5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1589,7 +1589,8 @@ rcu_torture_writer(void *arg) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); } -- cgit v1.2.3 From dddcddef1414be3ebc37a40d13fcc0f6a672ba9f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 18 Mar 2024 17:34:11 +0800 Subject: rcutorture: Make rcutorture support print rcu-tasks gp state This commit make rcu-tasks related rcutorture test support rcu-tasks gp state printing when the writer stall occurs or the at the end of rcutorture test, and generate rcu_ops->get_gp_data() operation to simplify the acquisition of gp state for different types of rcutorture tests. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcu.h | 20 ++++++++++---------- kernel/rcu/rcutorture.c | 26 ++++++++++++++++++-------- kernel/rcu/srcutree.c | 5 +---- kernel/rcu/tasks.h | 21 +++++++++++++++++++++ kernel/rcu/tree.c | 13 +++---------- 5 files changed, 53 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 86fce206560e..38238e595a61 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -522,12 +522,18 @@ static inline void show_rcu_tasks_gp_kthreads(void) {} #ifdef CONFIG_TASKS_RCU struct task_struct *get_rcu_tasks_gp_kthread(void); +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RCU #ifdef CONFIG_TASKS_RUDE_RCU struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RUDE_RCU +#ifdef CONFIG_TASKS_TRACE_RCU +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); +#endif + #ifdef CONFIG_TASKS_RCU_GENERIC void tasks_cblist_init_generic(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -557,8 +563,7 @@ static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } #endif #if defined(CONFIG_TREE_RCU) -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq); +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, @@ -566,8 +571,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c); void rcu_gp_set_torture_wait(int duration); #else -static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, unsigned long *gp_seq) +static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { *flags = 0; *gp_seq = 0; @@ -587,20 +591,16 @@ static inline void rcu_gp_set_torture_wait(int duration) { } #ifdef CONFIG_TINY_SRCU -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2f43d31fb7a5..85ff8a32f75a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -381,6 +381,7 @@ struct rcu_torture_ops { void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + void (*get_gp_data)(int *flags, unsigned long *gp_seq); long cbflood_max; int irq_capable; int can_boost; @@ -569,6 +570,7 @@ static struct rcu_torture_ops rcu_ops = { .gp_kthread_dbg = show_rcu_gp_kthreads, .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, + .get_gp_data = rcutorture_get_gp_data, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -628,6 +630,11 @@ static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; +static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) +{ + srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); +} + static int srcu_torture_read_lock(void) { if (cur_ops == &srcud_ops) @@ -736,6 +743,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), @@ -774,6 +782,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), @@ -882,6 +891,7 @@ static struct rcu_torture_ops tasks_ops = { .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, + .get_gp_data = rcu_tasks_get_gp_data, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -922,6 +932,7 @@ static struct rcu_torture_ops tasks_rude_ops = { .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, .fqs = NULL, .stats = NULL, @@ -974,6 +985,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, .fqs = NULL, .stats = NULL, @@ -2264,10 +2276,8 @@ rcu_torture_stats_print(void) int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; - rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); wtp = READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n", rcu_torture_writer_state_getname(), @@ -3390,8 +3400,8 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", cur_ops->name, (long)gp_seq, flags, rcutorture_seq_diff(gp_seq, start_gp_seq)); @@ -3762,8 +3772,8 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); start_gp_seq = gp_seq; pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", cur_ops->name, (long)gp_seq, flags); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e4d673fc30f4..bc4b58b0204e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1826,12 +1826,9 @@ static void process_srcu(struct work_struct *work) srcu_reschedule(ssp, curdelay); } -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *ssp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 147b5945d67a..a1af7dadc0f7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1178,6 +1178,13 @@ struct task_struct *get_rcu_tasks_gp_kthread(void) } EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data); + /* * Protect against tasklist scan blind spot while the task is exiting and * may be removed from the tasklist. Do this by adding the task to yet @@ -1358,6 +1365,13 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void) } EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data); + #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// @@ -2010,6 +2024,13 @@ struct task_struct *get_rcu_tasks_trace_gp_kthread(void) } EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9642dd06c25..60e79ed73700 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -508,17 +508,10 @@ static struct rcu_node *rcu_get_root(void) /* * Send along grace-period-related data for rcutorture diagnostics. */ -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq) +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { - switch (test_type) { - case RCU_FLAVOR: - *flags = READ_ONCE(rcu_state.gp_flags); - *gp_seq = rcu_seq_current(&rcu_state.gp_seq); - break; - default: - break; - } + *flags = READ_ONCE(rcu_state.gp_flags); + *gp_seq = rcu_seq_current(&rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -- cgit v1.2.3 From 710cf51d3722012988b2b8d35dee3c553dcc5649 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 18 Mar 2024 17:34:12 +0800 Subject: rcutorture: Removing redundant function pointer initialization For these rcu_torture_ops structure's objects defined by using static, if the value of the function pointer in its member is not set, the default value will be NULL, this commit therefore remove the pre-existing initialization of function pointers to NULL. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 85ff8a32f75a..3f9c3766f52b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -566,7 +566,6 @@ static struct rcu_torture_ops rcu_ops = { .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, - .stats = NULL, .gp_kthread_dbg = show_rcu_gp_kthreads, .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, @@ -614,9 +613,6 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, - .cb_barrier = NULL, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "busted" }; @@ -847,8 +843,6 @@ static struct rcu_torture_ops trivial_ops = { .get_gp_seq = rcu_no_completed, .sync = synchronize_rcu_trivial, .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "trivial" }; @@ -892,8 +886,6 @@ static struct rcu_torture_ops tasks_ops = { .cb_barrier = rcu_barrier_tasks, .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, .get_gp_data = rcu_tasks_get_gp_data, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .slow_gps = 1, .name = "tasks" @@ -934,8 +926,6 @@ static struct rcu_torture_ops tasks_rude_ops = { .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "tasks-rude" }; @@ -987,8 +977,6 @@ static struct rcu_torture_ops tasks_tracing_ops = { .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .slow_gps = 1, .name = "tasks-tracing" -- cgit v1.2.3 From 431315a563015f259b28e34c5842f6166439e969 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 21 Mar 2024 16:28:50 +0800 Subject: rcutorture: Make stall-tasks directly exit when rcutorture tests end When the rcutorture tests start to exit, the rcu_torture_cleanup() is invoked to stop kthreads and release resources, if the stall-task kthreads exist, cpu-stall has started and the rcutorture.stall_cpu is set to a larger value, the rcu_torture_cleanup() will be blocked for a long time and the hung-task may occur, this commit therefore add kthread_should_stop() to the loop of cpu-stall operation, when rcutorture tests ends, no need to wait for cpu-stall to end, exit directly. Use the following command to test: insmod rcutorture.ko torture_type=srcu fwd_progress=0 stat_interval=4 stall_cpu_block=1 stall_cpu=200 stall_cpu_holdoff=10 read_exit_burst=0 object_debug=1 rmmod rcutorture [15361.918610] INFO: task rmmod:878 blocked for more than 122 seconds. [15361.918613] Tainted: G W 6.8.0-rc2-yoctodev-standard+ #25 [15361.918615] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [15361.918616] task:rmmod state:D stack:0 pid:878 tgid:878 ppid:773 flags:0x00004002 [15361.918621] Call Trace: [15361.918623] [15361.918626] __schedule+0xc0d/0x28f0 [15361.918631] ? __pfx___schedule+0x10/0x10 [15361.918635] ? rcu_is_watching+0x19/0xb0 [15361.918638] ? schedule+0x1f6/0x290 [15361.918642] ? __pfx_lock_release+0x10/0x10 [15361.918645] ? schedule+0xc9/0x290 [15361.918648] ? schedule+0xc9/0x290 [15361.918653] ? trace_preempt_off+0x54/0x100 [15361.918657] ? schedule+0xc9/0x290 [15361.918661] schedule+0xd0/0x290 [15361.918665] schedule_timeout+0x56d/0x7d0 [15361.918669] ? debug_smp_processor_id+0x1b/0x30 [15361.918672] ? rcu_is_watching+0x19/0xb0 [15361.918676] ? __pfx_schedule_timeout+0x10/0x10 [15361.918679] ? debug_smp_processor_id+0x1b/0x30 [15361.918683] ? rcu_is_watching+0x19/0xb0 [15361.918686] ? wait_for_completion+0x179/0x4c0 [15361.918690] ? __pfx_lock_release+0x10/0x10 [15361.918693] ? __kasan_check_write+0x18/0x20 [15361.918696] ? wait_for_completion+0x9d/0x4c0 [15361.918700] ? _raw_spin_unlock_irq+0x36/0x50 [15361.918703] ? wait_for_completion+0x179/0x4c0 [15361.918707] ? _raw_spin_unlock_irq+0x36/0x50 [15361.918710] ? wait_for_completion+0x179/0x4c0 [15361.918714] ? trace_preempt_on+0x54/0x100 [15361.918718] ? wait_for_completion+0x179/0x4c0 [15361.918723] wait_for_completion+0x181/0x4c0 [15361.918728] ? __pfx_wait_for_completion+0x10/0x10 [15361.918738] kthread_stop+0x152/0x470 [15361.918742] _torture_stop_kthread+0x44/0xc0 [torture 7af7f9cbba28271a10503b653f9e05d518fbc8c3] [15361.918752] rcu_torture_cleanup+0x2ac/0xe90 [rcutorture f2cb1f556ee7956270927183c4c2c7749a336529] [15361.918766] ? __pfx_rcu_torture_cleanup+0x10/0x10 [rcutorture f2cb1f556ee7956270927183c4c2c7749a336529] [15361.918777] ? __kasan_check_write+0x18/0x20 [15361.918781] ? __mutex_unlock_slowpath+0x17c/0x670 [15361.918789] ? __might_fault+0xcd/0x180 [15361.918793] ? find_module_all+0x104/0x1d0 [15361.918799] __x64_sys_delete_module+0x2a4/0x3f0 [15361.918803] ? __pfx___x64_sys_delete_module+0x10/0x10 [15361.918807] ? syscall_exit_to_user_mode+0x149/0x280 Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3f9c3766f52b..456185d9e6c0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2489,8 +2489,8 @@ static int rcu_torture_stall(void *args) preempt_disable(); pr_alert("%s start on CPU %d.\n", __func__, raw_smp_processor_id()); - while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), - stop_at)) + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && + !kthread_should_stop()) if (stall_cpu_block) { #ifdef CONFIG_PREEMPTION preempt_schedule(); -- cgit v1.2.3 From 668c0406d887467d53f8fe79261dda1d22d5b671 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 25 Mar 2024 15:52:19 +0800 Subject: rcutorture: Fix invalid context warning when enable srcu barrier testing When the torture_type is set srcu or srcud and cb_barrier is non-zero, running the rcutorture test will trigger the following warning: [ 163.910989][ C1] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [ 163.910994][ C1] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/1 [ 163.910999][ C1] preempt_count: 10001, expected: 0 [ 163.911002][ C1] RCU nest depth: 0, expected: 0 [ 163.911005][ C1] INFO: lockdep is turned off. [ 163.911007][ C1] irq event stamp: 30964 [ 163.911010][ C1] hardirqs last enabled at (30963): [] do_idle+0x362/0x500 [ 163.911018][ C1] hardirqs last disabled at (30964): [] sysvec_call_function_single+0xf/0xd0 [ 163.911025][ C1] softirqs last enabled at (0): [] copy_process+0x16ff/0x6580 [ 163.911033][ C1] softirqs last disabled at (0): [<0000000000000000>] 0x0 [ 163.911038][ C1] Preemption disabled at: [ 163.911039][ C1] [] stack_depot_save_flags+0x24b/0x6c0 [ 163.911063][ C1] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 6.8.0-rc4-rt4-yocto-preempt-rt+ #3 1e39aa9a737dd024a3275c4f835a872f673a7d3a [ 163.911071][ C1] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 163.911075][ C1] Call Trace: [ 163.911078][ C1] [ 163.911080][ C1] dump_stack_lvl+0x88/0xd0 [ 163.911089][ C1] dump_stack+0x10/0x20 [ 163.911095][ C1] __might_resched+0x36f/0x530 [ 163.911105][ C1] rt_spin_lock+0x82/0x1c0 [ 163.911112][ C1] spin_lock_irqsave_ssp_contention+0xb8/0x100 [ 163.911121][ C1] srcu_gp_start_if_needed+0x782/0xf00 [ 163.911128][ C1] ? _raw_spin_unlock_irqrestore+0x46/0x70 [ 163.911136][ C1] ? debug_object_active_state+0x336/0x470 [ 163.911148][ C1] ? __pfx_srcu_gp_start_if_needed+0x10/0x10 [ 163.911156][ C1] ? __pfx_lock_release+0x10/0x10 [ 163.911165][ C1] ? __pfx_rcu_torture_barrier_cbf+0x10/0x10 [ 163.911188][ C1] __call_srcu+0x9f/0xe0 [ 163.911196][ C1] call_srcu+0x13/0x20 [ 163.911201][ C1] srcu_torture_call+0x1b/0x30 [ 163.911224][ C1] rcu_torture_barrier1cb+0x4a/0x60 [ 163.911247][ C1] __flush_smp_call_function_queue+0x267/0xca0 [ 163.911256][ C1] ? __pfx_rcu_torture_barrier1cb+0x10/0x10 [ 163.911281][ C1] generic_smp_call_function_single_interrupt+0x13/0x20 [ 163.911288][ C1] __sysvec_call_function_single+0x7d/0x280 [ 163.911295][ C1] sysvec_call_function_single+0x93/0xd0 [ 163.911302][ C1] [ 163.911304][ C1] [ 163.911308][ C1] asm_sysvec_call_function_single+0x1b/0x20 [ 163.911313][ C1] RIP: 0010:default_idle+0x17/0x20 [ 163.911326][ C1] RSP: 0018:ffff888001997dc8 EFLAGS: 00000246 [ 163.911333][ C1] RAX: 0000000000000000 RBX: dffffc0000000000 RCX: ffffffffae618b51 [ 163.911337][ C1] RDX: 0000000000000000 RSI: ffffffffaea80920 RDI: ffffffffaec2de80 [ 163.911342][ C1] RBP: ffff888001997dc8 R08: 0000000000000001 R09: ffffed100d740cad [ 163.911346][ C1] R10: ffffed100d740cac R11: ffff88806ba06563 R12: 0000000000000001 [ 163.911350][ C1] R13: ffffffffafe460c0 R14: ffffffffafe460c0 R15: 0000000000000000 [ 163.911358][ C1] ? ct_kernel_exit.constprop.3+0x121/0x160 [ 163.911369][ C1] ? lockdep_hardirqs_on+0xc4/0x150 [ 163.911376][ C1] arch_cpu_idle+0x9/0x10 [ 163.911383][ C1] default_idle_call+0x7a/0xb0 [ 163.911390][ C1] do_idle+0x362/0x500 [ 163.911398][ C1] ? __pfx_do_idle+0x10/0x10 [ 163.911404][ C1] ? complete_with_flags+0x8b/0xb0 [ 163.911416][ C1] cpu_startup_entry+0x58/0x70 [ 163.911423][ C1] start_secondary+0x221/0x280 [ 163.911430][ C1] ? __pfx_start_secondary+0x10/0x10 [ 163.911440][ C1] secondary_startup_64_no_verify+0x17f/0x18b [ 163.911455][ C1] This commit therefore use smp_call_on_cpu() instead of smp_call_function_single(), make rcu_torture_barrier1cb() invoked happens on task-context. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 456185d9e6c0..8654e99bd4a3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3044,11 +3044,12 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) } /* IPI handler to get callback posted on desired CPU, if online. */ -static void rcu_torture_barrier1cb(void *rcu_void) +static int rcu_torture_barrier1cb(void *rcu_void) { struct rcu_head *rhp = rcu_void; cur_ops->call(rhp, rcu_torture_barrier_cbf); + return 0; } /* kthread function to register callbacks used to test RCU barriers. */ @@ -3074,11 +3075,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - if (smp_call_function_single(myid, rcu_torture_barrier1cb, - &rcu, 1)) { - // IPI failed, so use direct call from current CPU. + if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1)) cur_ops->call(&rcu, rcu_torture_barrier_cbf); - } + if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); -- cgit v1.2.3 From 1c67318b3d72e63c51646cf26aa4425ed6f34bc5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 29 Mar 2024 12:52:45 +0800 Subject: rcutorture: Use rcu_gp_slow_register/unregister() only for rcutype test The rcu_gp_slow_register/unregister() is only useful in tests where torture_type=rcu, so this commit therefore generates ->gp_slow_register() and ->gp_slow_unregister() function pointers in the rcu_torture_ops structure, and slows grace periods only when these function pointers exist. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8654e99bd4a3..807fbf6123a7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -382,6 +382,8 @@ struct rcu_torture_ops { bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); void (*get_gp_data)(int *flags, unsigned long *gp_seq); + void (*gp_slow_register)(atomic_t *rgssp); + void (*gp_slow_unregister)(atomic_t *rgssp); long cbflood_max; int irq_capable; int can_boost; @@ -570,6 +572,8 @@ static struct rcu_torture_ops rcu_ops = { .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, .get_gp_data = rcutorture_get_gp_data, + .gp_slow_register = rcu_gp_slow_register, + .gp_slow_unregister = rcu_gp_slow_unregister, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -3343,12 +3347,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } - rcu_gp_slow_unregister(NULL); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); - rcu_gp_slow_unregister(NULL); return; } @@ -3447,7 +3451,8 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); - rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3929,7 +3934,8 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); - rcu_gp_slow_register(&rcu_fwd_cb_nodelay); + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) + cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: -- cgit v1.2.3 From fc5eb4a84e4c063e75a6a6e92308e9533c0f19b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 18:20:45 +0200 Subject: btf: Avoid weak external references If the BTF code is enabled in the build configuration, the start/stop BTF markers are guaranteed to exist. Only when CONFIG_DEBUG_INFO_BTF=n, the references in btf_parse_vmlinux() will remain unsatisfied, relying on the weak linkage of the external references to avoid breaking the build. Avoid GOT based relocations to these markers in the final executable by dropping the weak attribute and instead, make btf_parse_vmlinux() return ERR_PTR(-ENOENT) directly if CONFIG_DEBUG_INFO_BTF is not enabled to begin with. The compiler will drop any subsequent references to __start_BTF and __stop_BTF in that case, allowing the link to succeed. Note that Clang will notice that taking the address of __start_BTF can no longer yield NULL, so testing for that condition becomes unnecessary. Signed-off-by: Ard Biesheuvel Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Arnd Bergmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240415162041.2491523-8-ardb+git@google.com --- kernel/bpf/btf.c | 7 +++++-- kernel/bpf/sysfs_btf.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 90c4a32d89ff..6d46cee47ae3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5642,8 +5642,8 @@ errout_free: return ERR_PTR(err); } -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -5971,6 +5971,9 @@ struct btf *btf_parse_vmlinux(void) struct btf *btf = NULL; int err; + if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) + return ERR_PTR(-ENOENT); + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index ef6911aee3bb..fedb54c94cdb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,8 +9,8 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, @@ -32,7 +32,7 @@ static int __init btf_vmlinux_init(void) { bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) + if (bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); -- cgit v1.2.3 From 1f586614f3ffa80fdf2116b2a1bebcdb5969cef8 Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Tue, 16 Apr 2024 07:53:02 -0400 Subject: bpf: Harden and/or/xor value tracking in verifier This patch addresses a latent unsoundness issue in the scalar(32)_min_max_and/or/xor functions. While it is not a bugfix, it ensures that the functions produce sound outputs for all inputs. The issue occurs in these functions when setting signed bounds. The following example illustrates the issue for scalar_min_max_and(), but it applies to the other functions. In scalar_min_max_and() the following clause is executed when ANDing positive numbers: /* ANDing two positives gives a positive, so safe to * cast result into s64. */ dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; However, if umin_value and umax_value of dst_reg cross the sign boundary (i.e., if (s64)dst_reg->umin_value > (s64)dst_reg->umax_value), then we will end up with smin_value > smax_value, which is unsound. Previous works [1, 2] have discovered and reported this issue. Our tool Agni [2, 3] consideres it a false positive. This is because, during the verification of the abstract operator scalar_min_max_and(), Agni restricts its inputs to those passing through reg_bounds_sync(). This mimics real-world verifier behavior, as reg_bounds_sync() is invariably executed at the tail of every abstract operator. Therefore, such behavior is unlikely in an actual verifier execution. However, it is still unsound for an abstract operator to set signed bounds such that smin_value > smax_value. This patch fixes it, making the abstract operator sound for all (well-formed) inputs. It is worth noting that while the previous code updated the signed bounds (using the output unsigned bounds) only when the *input signed* bounds were positive, the new code updates them whenever the *output unsigned* bounds do not cross the sign boundary. An alternative approach to fix this latent unsoundness would be to unconditionally set the signed bounds to unbounded [S64_MIN, S64_MAX], and let reg_bounds_sync() refine the signed bounds using the unsigned bounds and the tnum. We found that our approach produces more precise (tighter) bounds. For example, consider these inputs to BPF_AND: /* dst_reg */ var_off.value: 8608032320201083347 var_off.mask: 615339716653692460 smin_value: 8070450532247928832 smax_value: 8070450532247928832 umin_value: 13206380674380886586 umax_value: 13206380674380886586 s32_min_value: -2110561598 s32_max_value: -133438816 u32_min_value: 4135055354 u32_max_value: 4135055354 /* src_reg */ var_off.value: 8584102546103074815 var_off.mask: 9862641527606476800 smin_value: 2920655011908158522 smax_value: 7495731535348625717 umin_value: 7001104867969363969 umax_value: 8584102543730304042 s32_min_value: -2097116671 s32_max_value: 71704632 u32_min_value: 1047457619 u32_max_value: 4268683090 After going through tnum_and() -> scalar32_min_max_and() -> scalar_min_max_and() -> reg_bounds_sync(), our patch produces the following bounds for s32: s32_min_value: -1263875629 s32_max_value: -159911942 Whereas, setting the signed bounds to unbounded in scalar_min_max_and() produces: s32_min_value: -1263875629 s32_max_value: -1 As observed, our patch produces a tighter s32 bound. We also confirmed using Agni and SMT verification that our patch always produces signed bounds that are equal to or more precise than setting the signed bounds to unbounded in scalar_min_max_and(). [1] https://sanjit-bhat.github.io/assets/pdf/ebpf-verifier-range-analysis22.pdf [2] https://link.springer.com/chapter/10.1007/978-3-031-37709-9_12 [3] https://github.com/bpfverif/agni Co-developed-by: Matan Shachnai Signed-off-by: Matan Shachnai Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402212039.51815-1-harishankar.vishwanathan@gmail.com Link: https://lore.kernel.org/bpf/20240416115303.331688-1-harishankar.vishwanathan@gmail.com --- kernel/bpf/verifier.c | 94 ++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2aad6d90550f..68cfd6fc6ad4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13320,7 +13320,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; if (src_known && dst_known) { @@ -13333,18 +13332,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13353,7 +13350,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { @@ -13366,18 +13362,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13389,7 +13383,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; if (src_known && dst_known) { @@ -13402,18 +13395,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13422,7 +13413,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { @@ -13435,18 +13425,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13458,7 +13446,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -13469,10 +13456,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u32 result into s32. - */ + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { @@ -13486,7 +13473,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */ @@ -13498,10 +13484,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u64 result into s64. - */ + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { -- cgit v1.2.3 From 15b8b9ab5081d8dce9aa27a594ba4db2c29cefc0 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 16 Apr 2024 16:20:09 +0200 Subject: cgroup/pids: Remove superfluous zeroing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Atomic counters are in kzalloc'd struct. They are zeroed already and atomic64_t does not need special initialization (cf kernel/trace/trace_clock.c:trace_counter). Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/pids.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 7695e60bcb40..0e5ec7d59b4d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - atomic64_set(&pids->counter, 0); atomic64_set(&pids->limit, PIDS_MAX); - atomic64_set(&pids->events_limit, 0); return &pids->css; } -- cgit v1.2.3 From fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 16 Apr 2024 19:51:26 +0200 Subject: cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints This commit enhances the ability to troubleshoot the global cgroup_rstat_lock by introducing wrapper helper functions for the lock along with associated tracepoints. Although global, the cgroup_rstat_lock helper APIs and tracepoints take arguments such as cgroup pointer and cpu_in_loop variable. This adjustment is made because flushing occurs per cgroup despite the lock being global. Hence, when troubleshooting, it's important to identify the relevant cgroup. The cpu_in_loop variable is necessary because the global lock may be released within the main flushing loop that traverses CPUs. In the tracepoints, the cpu_in_loop value is set to -1 when acquiring the main lock; otherwise, it denotes the CPU number processed last. The new feature in this patchset is detecting when lock is contended. The tracepoints are implemented with production in mind. For minimum overhead attach to cgroup:cgroup_rstat_lock_contended, which only gets activated when trylock detects lock is contended. A quick production check for issues could be done via this perf commands: perf record -g -e cgroup:cgroup_rstat_lock_contended Next natural question would be asking how long time do lock contenders wait for obtaining the lock. This can be answered by measuring the time between cgroup:cgroup_rstat_lock_contended and cgroup:cgroup_rstat_locked when args->contended is set. Like this bpftrace script: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs} tracepoint:cgroup:cgroup_rstat_locked { if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}} interval:s:1 {time("%H:%M:%S "); print(@wait_ns); }' Extending with time spend holding the lock will be more expensive as this also looks at all the non-contended cases. Like this bpftrace script: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs} tracepoint:cgroup:cgroup_rstat_locked { @locked[tid]=nsecs; if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}} tracepoint:cgroup:cgroup_rstat_unlock { @locked_ns=hist(nsecs-@locked[tid]); delete(@locked[tid]);} interval:s:1 {time("%H:%M:%S "); print(@wait_ns);print(@locked_ns); }' Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 +- include/trace/events/cgroup.h | 48 +++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/rstat.c | 47 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 88 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87def..2150ca60394b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -690,7 +690,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); -void cgroup_rstat_flush_release(void); +void cgroup_rstat_flush_release(struct cgroup *cgrp); /* * Basic resource stats. diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index dd7d7c9efecd..13f375800135 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -204,6 +204,54 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, TP_ARGS(cgrp, path, val) ); +DECLARE_EVENT_CLASS(cgroup_rstat, + + TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended), + + TP_ARGS(cgrp, cpu_in_loop, contended), + + TP_STRUCT__entry( + __field( int, root ) + __field( int, level ) + __field( u64, id ) + __field( int, cpu_in_loop ) + __field( bool, contended ) + ), + + TP_fast_assign( + __entry->root = cgrp->root->hierarchy_id; + __entry->id = cgroup_id(cgrp); + __entry->level = cgrp->level; + __entry->cpu_in_loop = cpu_in_loop; + __entry->contended = contended; + ), + + TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d", + __entry->root, __entry->id, __entry->level, + __entry->cpu_in_loop, __entry->contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 07e2284bb499..ff68c904e647 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -7,6 +7,8 @@ #include #include +#include + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -222,6 +224,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); +/* + * Helper functions for locking cgroup_rstat_lock. + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @cpu_in_loop indicate lock + * was released and re-taken when collection data from the CPUs. The + * value -1 is used when obtaining the main lock else this is the CPU + * number processed last. + */ +static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) + __acquires(&cgroup_rstat_lock) +{ + bool contended; + + contended = !spin_trylock_irq(&cgroup_rstat_lock); + if (contended) { + trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); + spin_lock_irq(&cgroup_rstat_lock); + } + trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); +} + +static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) + __releases(&cgroup_rstat_lock) +{ + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); + spin_unlock_irq(&cgroup_rstat_lock); +} + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -248,10 +279,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, cpu); if (!cond_resched()) cpu_relax(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, cpu); } } } @@ -273,9 +304,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } /** @@ -291,17 +322,17 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() */ -void cgroup_rstat_flush_release(void) +void cgroup_rstat_flush_release(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } int cgroup_rstat_init(struct cgroup *cgrp) @@ -533,7 +564,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE forceidle_time = cgrp->bstat.forceidle_sum; #endif - cgroup_rstat_flush_release(); + cgroup_rstat_flush_release(cgrp); } else { root_cgroup_cputime(&bstat); usage = bstat.cputime.sum_exec_runtime; -- cgit v1.2.3 From 89d6910cc562ab34d1f1c08f3cf0a9700b8bf2c4 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 10 Apr 2024 17:09:45 +0200 Subject: sched/vtime: Get rid of generic vtime_task_switch() implementation The generic vtime_task_switch() implementation gets built only if __ARCH_HAS_VTIME_TASK_SWITCH is not defined, but requires an architecture to implement arch_vtime_task_switch() callback at the same time, which is confusing. Further, arch_vtime_task_switch() is implemented for 32-bit PowerPC architecture only and vtime_task_switch() generic variant is rather superfluous. Simplify the whole vtime_task_switch() wiring by moving the existing generic implementation to PowerPC. Signed-off-by: Alexander Gordeev Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Reviewed-by: Nicholas Piggin Acked-by: Michael Ellerman Link: https://lore.kernel.org/r/2cb6e3caada93623f6d4f78ad938ac6cd0e2fda8.1712760275.git.agordeev@linux.ibm.com --- arch/powerpc/include/asm/cputime.h | 13 ------------- arch/powerpc/kernel/time.c | 22 ++++++++++++++++++++++ kernel/sched/cputime.c | 13 ------------- 3 files changed, 22 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 4961fb38e438..aff858ca99c0 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,23 +32,10 @@ #ifdef CONFIG_PPC64 #define get_accounting(tsk) (&get_paca()->accounting) #define raw_get_accounting(tsk) (&local_paca->accounting) -static inline void arch_vtime_task_switch(struct task_struct *tsk) { } #else #define get_accounting(tsk) (&task_thread_info(tsk)->accounting) #define raw_get_accounting(tsk) get_accounting(tsk) -/* - * Called from the context switch with interrupts disabled, to charge all - * accumulated times to the current process, and to prepare accounting on - * the next process. - */ -static inline void arch_vtime_task_switch(struct task_struct *prev) -{ - struct cpu_accounting_data *acct = get_accounting(current); - struct cpu_accounting_data *acct0 = get_accounting(prev); - - acct->starttime = acct0->starttime; -} #endif /* diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index df20cf201f74..c0fdc6d94fee 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -354,6 +354,28 @@ void vtime_flush(struct task_struct *tsk) acct->hardirq_time = 0; acct->softirq_time = 0; } + +/* + * Called from the context switch with interrupts disabled, to charge all + * accumulated times to the current process, and to prepare accounting on + * the next process. + */ +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_kernel(prev); + + vtime_flush(prev); + + if (!IS_ENABLED(CONFIG_PPC64)) { + struct cpu_accounting_data *acct = get_accounting(current); + struct cpu_accounting_data *acct0 = get_accounting(prev); + + acct->starttime = acct0->starttime; + } +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ void __no_kcsan __delay(unsigned long loops) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index af7952f12e6c..aa48b2ec879d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -# ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_kernel(prev); - - vtime_flush(prev); - arch_vtime_task_switch(prev); -} -# endif - void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { unsigned int pc = irq_count() - offset; -- cgit v1.2.3 From 97a46a66ad7d912638c5eefa1f567b4a4a5b58b8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Apr 2024 12:59:46 +0200 Subject: cgroup/rstat: desc member cgrp in cgroup_rstat_flush_release Recent change to cgroup_rstat_flush_release added a parameter cgrp, which is used by tracepoint to correlate with other tracepoints that also have this cgrp. The kernel test robot detected kernel doc was missing a description of this member. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404170821.HwZGISTY-lkp@intel.com/ Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ff68c904e647..52e3b0ed1cee 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -328,6 +328,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + * @cgrp: cgroup used by tracepoint */ void cgroup_rstat_flush_release(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) -- cgit v1.2.3 From a6b8daba00e6703b728933d2ec24e6d1ee5d5ec0 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 17 Apr 2024 03:50:28 +0000 Subject: cgroup_freezer: update comment for freezer_css_online() The freezer->lock was replaced by freezer_mutex in commit e5ced8ebb10c ("cgroup_freezer: replace freezer->lock with freezer_mutex"), so the comment here is out-of-date, update it. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 66d1708042a7..d598cc6ee92e 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) * @css: css being created * * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. + * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { -- cgit v1.2.3 From 6b9391b581fddd8579239dad4de4f0393149e10a Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Tue, 12 Mar 2024 16:53:41 -0700 Subject: riscv: Include riscv_set_icache_flush_ctx prctl Support new prctl with key PR_RISCV_SET_ICACHE_FLUSH_CTX to enable optimization of cross modifying code. This prctl enables userspace code to use icache flushing instructions such as fence.i with the guarantee that the icache will continue to be clean after thread migration. Signed-off-by: Charlie Jenkins Reviewed-by: Atish Patra Reviewed-by: Alexandre Ghiti Reviewed-by: Samuel Holland Link: https://lore.kernel.org/r/20240312-fencei-v13-2-4b6bdc2bbf32@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/mmu.h | 2 + arch/riscv/include/asm/processor.h | 10 ++++ arch/riscv/include/asm/switch_to.h | 23 ++++++++ arch/riscv/mm/cacheflush.c | 113 +++++++++++++++++++++++++++++++++++++ arch/riscv/mm/context.c | 19 ++++--- include/uapi/linux/prctl.h | 6 ++ kernel/sys.c | 6 ++ 7 files changed, 171 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 355504b37f8e..60be458e94da 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -19,6 +19,8 @@ typedef struct { #ifdef CONFIG_SMP /* A local icache flush is needed before user execution can resume. */ cpumask_t icache_stale_mask; + /* Force local icache flush on all migrations. */ + bool force_icache_flush; #endif #ifdef CONFIG_BINFMT_ELF_FDPIC unsigned long exec_fdpic_loadmap; diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index a8509cc31ab2..cca62013c3c0 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -69,6 +69,7 @@ #endif #ifndef __ASSEMBLY__ +#include struct task_struct; struct pt_regs; @@ -123,6 +124,12 @@ struct thread_struct { struct __riscv_v_ext_state vstate; unsigned long align_ctl; struct __riscv_v_ext_state kernel_vstate; +#ifdef CONFIG_SMP + /* Flush the icache on migration */ + bool force_icache_flush; + /* A forced icache flush is not needed if migrating to the previous cpu. */ + unsigned int prev_cpu; +#endif }; /* Whitelist the fstate from the task_struct for hardened usercopy */ @@ -184,6 +191,9 @@ extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val); #define GET_UNALIGN_CTL(tsk, addr) get_unalign_ctl((tsk), (addr)) #define SET_UNALIGN_CTL(tsk, val) set_unalign_ctl((tsk), (val)) +#define RISCV_SET_ICACHE_FLUSH_CTX(arg1, arg2) riscv_set_icache_flush_ctx(arg1, arg2) +extern int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long per_thread); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index 7efdb0584d47..7594df37cc9f 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -72,14 +73,36 @@ static __always_inline bool has_fpu(void) { return false; } extern struct task_struct *__switch_to(struct task_struct *, struct task_struct *); +static inline bool switch_to_should_flush_icache(struct task_struct *task) +{ +#ifdef CONFIG_SMP + bool stale_mm = task->mm && task->mm->context.force_icache_flush; + bool stale_thread = task->thread.force_icache_flush; + bool thread_migrated = smp_processor_id() != task->thread.prev_cpu; + + return thread_migrated && (stale_mm || stale_thread); +#else + return false; +#endif +} + +#ifdef CONFIG_SMP +#define __set_prev_cpu(thread) ((thread).prev_cpu = smp_processor_id()) +#else +#define __set_prev_cpu(thread) +#endif + #define switch_to(prev, next, last) \ do { \ struct task_struct *__prev = (prev); \ struct task_struct *__next = (next); \ + __set_prev_cpu(__prev->thread); \ if (has_fpu()) \ __switch_to_fpu(__prev, __next); \ if (has_vector()) \ __switch_to_vector(__prev, __next); \ + if (switch_to_should_flush_icache(__next)) \ + local_flush_icache_all(); \ ((last) = __switch_to(__prev, __next)); \ } while (0) diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 55a34f2020a8..15a95578e670 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -152,3 +153,115 @@ void __init riscv_init_cbo_blocksizes(void) if (cboz_block_size) riscv_cboz_block_size = cboz_block_size; } + +#ifdef CONFIG_SMP +static void set_icache_stale_mask(void) +{ + cpumask_t *mask; + bool stale_cpu; + + /* + * Mark every other hart's icache as needing a flush for + * this MM. Maintain the previous value of the current + * cpu to handle the case when this function is called + * concurrently on different harts. + */ + mask = ¤t->mm->context.icache_stale_mask; + stale_cpu = cpumask_test_cpu(smp_processor_id(), mask); + + cpumask_setall(mask); + assign_bit(cpumask_check(smp_processor_id()), cpumask_bits(mask), stale_cpu); +} +#endif + +/** + * riscv_set_icache_flush_ctx() - Enable/disable icache flushing instructions in + * userspace. + * @ctx: Set the type of icache flushing instructions permitted/prohibited in + * userspace. Supported values described below. + * + * Supported values for ctx: + * + * * %PR_RISCV_CTX_SW_FENCEI_ON: Allow fence.i in user space. + * + * * %PR_RISCV_CTX_SW_FENCEI_OFF: Disallow fence.i in user space. All threads in + * a process will be affected when ``scope == PR_RISCV_SCOPE_PER_PROCESS``. + * Therefore, caution must be taken; use this flag only when you can guarantee + * that no thread in the process will emit fence.i from this point onward. + * + * @scope: Set scope of where icache flushing instructions are allowed to be + * emitted. Supported values described below. + * + * Supported values for scope: + * + * * %PR_RISCV_SCOPE_PER_PROCESS: Ensure the icache of any thread in this process + * is coherent with instruction storage upon + * migration. + * + * * %PR_RISCV_SCOPE_PER_THREAD: Ensure the icache of the current thread is + * coherent with instruction storage upon + * migration. + * + * When ``scope == PR_RISCV_SCOPE_PER_PROCESS``, all threads in the process are + * permitted to emit icache flushing instructions. Whenever any thread in the + * process is migrated, the corresponding hart's icache will be guaranteed to be + * consistent with instruction storage. This does not enforce any guarantees + * outside of migration. If a thread modifies an instruction that another thread + * may attempt to execute, the other thread must still emit an icache flushing + * instruction before attempting to execute the potentially modified + * instruction. This must be performed by the user-space program. + * + * In per-thread context (eg. ``scope == PR_RISCV_SCOPE_PER_THREAD``) only the + * thread calling this function is permitted to emit icache flushing + * instructions. When the thread is migrated, the corresponding hart's icache + * will be guaranteed to be consistent with instruction storage. + * + * On kernels configured without SMP, this function is a nop as migrations + * across harts will not occur. + */ +int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope) +{ +#ifdef CONFIG_SMP + switch (ctx) { + case PR_RISCV_CTX_SW_FENCEI_ON: + switch (scope) { + case PR_RISCV_SCOPE_PER_PROCESS: + current->mm->context.force_icache_flush = true; + break; + case PR_RISCV_SCOPE_PER_THREAD: + current->thread.force_icache_flush = true; + break; + default: + return -EINVAL; + } + break; + case PR_RISCV_CTX_SW_FENCEI_OFF: + switch (scope) { + case PR_RISCV_SCOPE_PER_PROCESS: + current->mm->context.force_icache_flush = false; + + set_icache_stale_mask(); + break; + case PR_RISCV_SCOPE_PER_THREAD: + current->thread.force_icache_flush = false; + + set_icache_stale_mask(); + break; + default: + return -EINVAL; + } + break; + default: + return -EINVAL; + } + return 0; +#else + switch (ctx) { + case PR_RISCV_CTX_SW_FENCEI_ON: + case PR_RISCV_CTX_SW_FENCEI_OFF: + return 0; + default: + return -EINVAL; + } +#endif +} diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 217fd4de6134..beef30f42d5c 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_MMU @@ -297,21 +298,23 @@ static inline void set_mm(struct mm_struct *prev, * * The "cpu" argument must be the current local CPU number. */ -static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu) +static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu, + struct task_struct *task) { #ifdef CONFIG_SMP - cpumask_t *mask = &mm->context.icache_stale_mask; - - if (cpumask_test_cpu(cpu, mask)) { - cpumask_clear_cpu(cpu, mask); + if (cpumask_test_and_clear_cpu(cpu, &mm->context.icache_stale_mask)) { /* * Ensure the remote hart's writes are visible to this hart. * This pairs with a barrier in flush_icache_mm. */ smp_mb(); - local_flush_icache_all(); - } + /* + * If cache will be flushed in switch_to, no need to flush here. + */ + if (!(task && switch_to_should_flush_icache(task))) + local_flush_icache_all(); + } #endif } @@ -332,5 +335,5 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, set_mm(prev, next, cpu); - flush_icache_deferred(next, cpu); + flush_icache_deferred(next, cpu, task); } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 370ed14b1ae0..524d546d697b 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -306,4 +306,10 @@ struct prctl_mm_map { # define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc # define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f +#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71 +# define PR_RISCV_CTX_SW_FENCEI_ON 0 +# define PR_RISCV_CTX_SW_FENCEI_OFF 1 +# define PR_RISCV_SCOPE_PER_PROCESS 0 +# define PR_RISCV_SCOPE_PER_THREAD 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index e219fcfa112d..69afdd8b430f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -146,6 +146,9 @@ #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif +#ifndef RISCV_SET_ICACHE_FLUSH_CTX +# define RISCV_SET_ICACHE_FLUSH_CTX(a, b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2743,6 +2746,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_V_GET_CONTROL: error = RISCV_V_GET_CONTROL(); break; + case PR_RISCV_SET_ICACHE_FLUSH_CTX: + error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 15a0b5fe1ad6fbecfa6517750718089e12ee8344 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 18 Apr 2024 02:19:30 +0000 Subject: cgroup: don't call cgroup1_pidlist_destroy_all() for v2 Currently cgroup1_pidlist_destroy_all() will be called when releasing cgroup even if the cgroup is on default hierarchy, however it doesn't make any sense for v2 to destroy pidlist of v1. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c851c..e32b6972c478 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); -- cgit v1.2.3 From f71bfbe1e281a707e7766eb0a59ba056dc0f29f9 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 18 Apr 2024 12:43:49 +0000 Subject: cgroup, legacy_freezer: update comment for freezer_css_offline() After commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), system_freezing_count was replaced by freezer_active. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index d598cc6ee92e..074653f964c1 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -132,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * - * @css is going away. Mark it dead and decrement system_freezing_count if + * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 19fc8a896565ecebb3951664fd0eeab0a80314a1 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 19 Apr 2024 08:53:16 +0000 Subject: cgroup: Avoid unnecessary looping in cgroup_no_v1() No need to continue the for_each_subsys loop after the token matches the name of subsys and cgroup_no_v1_mask is set. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..b9dbf6bf2779 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str) continue; cgroup_no_v1_mask |= 1 << i; + break; } } return 1; -- cgit v1.2.3 From bfa858f220ab8c950dd3e1310fee61950d0ecdae Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 18 Apr 2024 11:40:08 +0200 Subject: sysctl: treewide: constify ctl_table_header::ctl_table_arg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to constify instances of struct ctl_tables it is necessary to remove ways through which non-const versions are exposed from the sysctl core. One of these is the ctl_table_arg member of struct ctl_table_header. Constify this reference as a prerequisite for the full constification of struct ctl_table instances. No functional change. Signed-off-by: Thomas Weißschuh Reviewed-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 +- include/linux/sysctl.h | 2 +- ipc/ipc_sysctl.c | 2 +- ipc/mq_sysctl.c | 2 +- kernel/ucount.c | 2 +- net/ax25/sysctl_net_ax25.c | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/sysctl_net_core.c | 2 +- net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 2 +- net/ipv6/sysctl_net_ipv6.c | 6 +++--- net/ipv6/xfrm6_policy.c | 2 +- net/mpls/af_mpls.c | 4 ++-- net/mptcp/ctrl.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/sctp/sysctl.c | 2 +- net/smc/smc_sysctl.c | 2 +- net/unix/sysctl_net_unix.c | 2 +- net/xfrm/xfrm_sysctl.c | 2 +- 27 files changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index bb95ce43cd97..66f8542f3b18 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1971,7 +1971,7 @@ static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) static void vrf_netns_exit_sysctl(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); - struct ctl_table *table; + const struct ctl_table *table; table = nn_vrf->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(nn_vrf->ctl_hdr); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index ee7d33b89e9e..9413241df962 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -182,7 +182,7 @@ struct ctl_table_header { struct rcu_head rcu; }; struct completion *unregistering; - struct ctl_table *ctl_table_arg; + const struct ctl_table *ctl_table_arg; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_dir *parent; diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 45cb1dabce29..3c3755918d34 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -306,7 +306,7 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns) void retire_ipc_sysctls(struct ipc_namespace *ns) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = ns->ipc_sysctls->ctl_table_arg; unregister_sysctl_table(ns->ipc_sysctls); diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 21fba3a6edaf..69c709262f5a 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -160,7 +160,7 @@ bool setup_mq_sysctls(struct ipc_namespace *ns) void retire_mq_sysctls(struct ipc_namespace *ns) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = ns->mq_sysctls->ctl_table_arg; unregister_sysctl_table(ns->mq_sysctls); diff --git a/kernel/ucount.c b/kernel/ucount.c index 4aa6166cb856..d9e283600f5c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -119,7 +119,7 @@ bool setup_userns_sysctls(struct user_namespace *ns) void retire_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = ns->sysctls->ctl_table_arg; unregister_sysctl_table(ns->sysctls); diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index db66e11e7fe8..e0128dc9def3 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -171,7 +171,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) { struct ctl_table_header *header = ax25_dev->sysheader; - struct ctl_table *table; + const struct ctl_table *table; if (header) { ax25_dev->sysheader = NULL; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c683d8377862..7948a9e7542c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1275,7 +1275,7 @@ static int br_netfilter_sysctl_init_net(struct net *net) static void br_netfilter_sysctl_exit_net(struct net *net, struct brnf_net *brnet) { - struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; + const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(brnet->ctl_hdr); if (!net_eq(net, &init_net)) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6973dda3abda..903ab4a51c17 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -743,7 +743,7 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6dd960ec558c..2a983cf450da 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -399,7 +399,7 @@ err_alloc: static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7a437f0d4190..7592f242336b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2749,7 +2749,7 @@ err_alloc_all: static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb947d1613fe..534b98a0744a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -632,7 +632,7 @@ err_alloc: static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6133f1fd848e..2e29ab819967 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3576,7 +3576,7 @@ err_dup: static __net_exit void sysctl_route_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e4f16a7dcc1..ce5d19978a26 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1554,7 +1554,7 @@ err_alloc: static __net_exit void ipv4_sysctl_exit_net(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c33bca2c3841..1dda59e0aeab 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -186,7 +186,7 @@ err_alloc: static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv4.xfrm4_hdr) return; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3a92d002abe4..9aa0900abfa1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7238,7 +7238,7 @@ out: static void __addrconf_sysctl_unregister(struct net *net, struct ipv6_devconf *p, int ifindex) { - struct ctl_table *table; + const struct ctl_table *table; if (!p->sysctl_header) return; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d0dcbaca1994..ce8c14d8aff5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -105,7 +105,7 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index acb4f119e11f..ee95cdcc8747 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -487,7 +487,7 @@ err_alloc: static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 888676163e90..75de55f907b0 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -313,9 +313,9 @@ out_ipv6_table: static void __net_exit ipv6_sysctl_net_exit(struct net *net) { - struct ctl_table *ipv6_table; - struct ctl_table *ipv6_route_table; - struct ctl_table *ipv6_icmp_table; + const struct ctl_table *ipv6_table; + const struct ctl_table *ipv6_route_table; + const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 42fb6996b077..4891012b692f 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -218,7 +218,7 @@ err_alloc: static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0315b8deed3f..5d2012d1cf4a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1422,7 +1422,7 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct mpls_dev *mdev) { struct net *net = dev_net(dev); - struct ctl_table *table; + const struct ctl_table *table; if (!mdev->sysctl) return; @@ -2690,7 +2690,7 @@ static void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; - struct ctl_table *table; + const struct ctl_table *table; unsigned int index; table = net->mpls.ctl->ctl_table_arg; diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 13fe0748dde8..8d661156ab8c 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -198,7 +198,7 @@ err_alloc: static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { - struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee98ce5b816..bb9dea676ec1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1122,7 +1122,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 370f8231385c..efedd2f13ac7 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -514,7 +514,7 @@ err_alloc: static void netfilter_log_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index f65d6f92afcb..25bdf17c7262 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -624,7 +624,7 @@ int sctp_sysctl_net_register(struct net *net) void sctp_sysctl_net_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->sctp.sysctl_header->ctl_table_arg; unregister_net_sysctl_table(net->sctp.sysctl_header); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a5946d1b9d60..4e8baa2e7ea4 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -133,7 +133,7 @@ err_alloc: void __net_exit smc_sysctl_net_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 3e84b31c355a..44996af61999 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -52,7 +52,7 @@ err_alloc: void unix_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->unx.ctl->ctl_table_arg; unregister_net_sysctl_table(net->unx.ctl); diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c index 7fdeafc838a7..e972930c292b 100644 --- a/net/xfrm/xfrm_sysctl.c +++ b/net/xfrm/xfrm_sysctl.c @@ -76,7 +76,7 @@ out_kmemdup: void __net_exit xfrm_sysctl_fini(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->xfrm.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->xfrm.sysctl_hdr); -- cgit v1.2.3 From b7c8e1f8a7b4352c1d0b4310686385e3cf6c104a Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 18 Apr 2024 10:30:00 +0800 Subject: hrtimer: Rename __hrtimer_hres_active() to hrtimer_hres_active() The function hrtimer_hres_active() are defined in the hrtimer.c file, but not called elsewhere, so rename __hrtimer_hres_active() to hrtimer_hres_active() and remove the old hrtimer_hres_active() function. kernel/time/hrtimer.c:653:19: warning: unused function 'hrtimer_hres_active'. Fixes: 82ccdf062a64 ("hrtimer: Remove unused function") Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/20240418023000.130324-1-jiapeng.chong@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8778 --- kernel/time/hrtimer.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cae9d04b5584..492c14aac642 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -644,17 +644,12 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * Is the high resolution mode active ? */ -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } -static inline int hrtimer_hres_active(void) -{ - return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); -} - static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) @@ -678,7 +673,7 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, * set. So we'd effectively block all timers until the T2 event * fires. */ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); @@ -789,12 +784,12 @@ static void retrigger_next_event(void *arg) * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ - if (!__hrtimer_hres_active(base) && !tick_nohz_active) + if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); - if (__hrtimer_hres_active(base)) + if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); @@ -951,7 +946,7 @@ void clock_was_set(unsigned int bases) cpumask_var_t mask; int cpu; - if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) + if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -1491,7 +1486,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!__hrtimer_hres_active(cpu_base)) + if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1514,7 +1509,7 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (__hrtimer_hres_active(cpu_base)) { + if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { @@ -1886,7 +1881,7 @@ void hrtimer_run_queues(void) unsigned long flags; ktime_t now; - if (__hrtimer_hres_active(cpu_base)) + if (hrtimer_hres_active(cpu_base)) return; /* -- cgit v1.2.3 From e1a7545981e2086feaa40dcb7b0db8de0bdada19 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 14:52:26 -0300 Subject: bpf: Fix typo in function save_aux_ptr_type I found this typo in the save_aux_ptr_type function. s/allow_trust_missmatch/allow_trust_mismatch/ I did not find this anywhere else in the codebase. Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/fbe1d636-8172-4698-9a5a-5a3444b55322@smtp-relay.sendinblue.com --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68cfd6fc6ad4..7b2aa065e552 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6971,7 +6971,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch); + bool allow_trust_mismatch); static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { @@ -17530,7 +17530,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch) + bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; @@ -17548,7 +17548,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * src_reg == stack|map in some other branch. * Reject it. */ - if (allow_trust_missmatch && + if (allow_trust_mismatch && base_type(type) == PTR_TO_BTF_ID && base_type(*prev_type) == PTR_TO_BTF_ID) { /* -- cgit v1.2.3 From a7de265cb2d849f8986a197499ad58dca0a4f209 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 15:49:14 -0300 Subject: bpf: Fix typos in comments Found the following typos in comments, and fixed them: s/unpriviledged/unprivileged/ s/reponsible/responsible/ s/possiblities/possibilities/ s/Divison/Division/ s/precsion/precision/ s/havea/have a/ s/reponsible/responsible/ s/responsibile/responsible/ s/tigher/tighter/ s/respecitve/respective/ Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6af7deb4-bb24-49e8-b3f1-8dd410597337@smtp-relay.sendinblue.com --- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index bdea1a459153..976cb258a0ed 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -318,7 +318,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not - * responsibile to free the local_storage. + * responsible to free the local_storage. */ if (storage_smap) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a41718eaeefe..95c7fd093e55 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2814,7 +2814,7 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); -/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +/* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83a9a74260e9..c3e79a0b9361 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1539,7 +1539,7 @@ static void htab_map_free(struct bpf_map *map) */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it - * underneath and is reponsible for waiting for callbacks to finish + * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8cde717137bd..61126180d398 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2412,7 +2412,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer - * if the bpf program allows skb data writes. There are two possiblities + * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b2aa065e552..5a7e34e83a5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -172,7 +172,7 @@ static bool bpf_global_percpu_ma_set; /* verifier_state + insn_idx are pushed to stack when branch is encountered */ struct bpf_verifier_stack_elem { - /* verifer state is 'st' + /* verifier state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ @@ -2131,7 +2131,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tigher range. + * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound @@ -2139,7 +2139,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respecitve s64 or u64 domain, just + * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ __u64 new_umin, new_umax; @@ -14714,7 +14714,7 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state /* Adjusts the register min/max values in the case that the dst_reg and * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * check, in which case we have a fake SCALAR_VALUE representing insn->imm). * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now. */ @@ -17352,7 +17352,7 @@ hit: err = propagate_liveness(env, &sl->state, cur); /* if previous state reached the exit with precision and - * current state is equivalent to it (except precsion marks) + * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ @@ -20209,7 +20209,7 @@ patch_map_ops_generic: * divide-by-3 through multiplication, followed by further * division by 8 through 3-bit right shift. * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., - * p. 227, chapter "Unsigned Divison by 3" for details and proofs. + * p. 227, chapter "Unsigned Division by 3" for details and proofs. * * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. */ -- cgit v1.2.3 From 5b6d8ef6f056d8130168746c5459d7a3fb494859 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 28 Mar 2024 15:00:17 +0100 Subject: kdb: Use str_plural() to fix Coccinelle warning Fixes the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_plural(days) Signed-off-by: Thorsten Blum Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20240328140015.388654-3-thorsten.blum@toblux.com Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d05066cb40b2..664bae55f2c9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2517,7 +2517,7 @@ static int kdb_summary(int argc, const char **argv) if (val.uptime > (24*60*60)) { int days = val.uptime / (24*60*60); val.uptime %= (24*60*60); - kdb_printf("%d day%s ", days, days == 1 ? "" : "s"); + kdb_printf("%d day%s ", days, str_plural(days)); } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); -- cgit v1.2.3 From 8996f93fc3880d000a0a0cb40c724d5830e140fd Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 20 Apr 2024 09:46:16 +0000 Subject: cgroup/cpuset: Statically initialize more members of top_cpuset Initializing top_cpuset.relax_domain_level and setting CS_SCHED_LOAD_BALANCE to top_cpuset.flags in cpuset_init() could be completed at the time of top_cpuset definition by compiler. Signed-off-by: Xiu Jianfeng Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d8d3439eda4e..e70008a1d86a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -369,8 +369,9 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), + (1 << CS_MEM_EXCLUSIVE) | (1 < CS_SCHED_LOAD_BALANCE)), .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; @@ -4309,8 +4310,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); -- cgit v1.2.3 From bb58c1baa501e3d695d8e13a5bd955fd00e2e879 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 17 Apr 2024 16:53:56 +0800 Subject: genirq: Simplify the checks for irq_set_percpu_devid_partition() Since whether desc is NULL or desc->percpu_enabled is true, it returns -EINVAL, check them together, and assign desc->percpu_affinity using a ternary to simplify the code. Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240417085356.3785381-1-ruanjinjie@huawei.com --- kernel/irq/irqdesc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 382093196210..f0d4b98c7820 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -911,10 +911,7 @@ int irq_set_percpu_devid_partition(unsigned int irq, { struct irq_desc *desc = irq_to_desc(irq); - if (!desc) - return -EINVAL; - - if (desc->percpu_enabled) + if (!desc || desc->percpu_enabled) return -EINVAL; desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); @@ -922,10 +919,7 @@ int irq_set_percpu_devid_partition(unsigned int irq, if (!desc->percpu_enabled) return -ENOMEM; - if (affinity) - desc->percpu_affinity = affinity; - else - desc->percpu_affinity = cpu_possible_mask; + desc->percpu_affinity = affinity ? : cpu_possible_mask; irq_set_percpu_devid_flags(irq); return 0; -- cgit v1.2.3 From 118005713e35a1893c6ee47ab2926cca277737de Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:08 +0530 Subject: crash: forward memory_notify arg to arch crash hotplug handler In the event of memory hotplug or online/offline events, the crash memory hotplug notifier `crash_memhp_notifier()` receives a `memory_notify` object but doesn't forward that object to the generic and architecture-specific crash hotplug handler. The `memory_notify` object contains the starting PFN (Page Frame Number) and the number of pages in the hot-removed memory. This information is necessary for architectures like PowerPC to update/recreate the kdump image, specifically `elfcorehdr`. So update the function signature of `crash_handle_hotplug_event()` and `arch_crash_handle_hotplug_event()` to accept the `memory_notify` object as an argument from crash memory hotplug notifier. Since no such object is available in the case of CPU hotplug event, the crash CPU hotplug notifier `crash_cpuhp_online()` passes NULL to the crash hotplug handler. Signed-off-by: Sourabh Jain Acked-by: Baoquan He Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-2-sourabhjain@linux.ibm.com --- arch/x86/include/asm/kexec.h | 2 +- arch/x86/kernel/crash.c | 4 +++- include/linux/crash_core.h | 2 +- kernel/crash_core.c | 14 +++++++------- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 91ca9a9ee3a2..cb1320ebbc23 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -207,7 +207,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); extern void kdump_nmi_shootdown_cpus(void); #ifdef CONFIG_CRASH_HOTPLUG -void arch_crash_handle_hotplug_event(struct kimage *image); +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e74d0c4286c1..2a682fe86352 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -432,10 +432,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void) /** * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes * @image: a pointer to kexec_crash_image + * @arg: struct memory_notify handler for memory hotplug case and + * NULL for CPU hotplug case. * * Prepare the new elfcorehdr and replace the existing elfcorehdr. */ -void arch_crash_handle_hotplug_event(struct kimage *image) +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { void *elfbuf = NULL, *old_elfcorehdr; unsigned long nr_mem_ranges; diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index d33352c2e386..647e928efee8 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -37,7 +37,7 @@ static inline void arch_kexec_unprotect_crashkres(void) { } #ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } #endif int crash_check_update_elfcorehdr(void); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..70fa8111a9d6 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -534,7 +534,7 @@ int crash_check_update_elfcorehdr(void) * list of segments it checks (since the elfcorehdr changes and thus * would require an update to purgatory itself to update the digest). */ -static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) +static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg) { struct kimage *image; @@ -596,7 +596,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) image->hp_action = hp_action; /* Now invoke arch-specific update handler */ - arch_crash_handle_hotplug_event(image); + arch_crash_handle_hotplug_event(image, arg); /* No longer handling a hotplug event */ image->hp_action = KEXEC_CRASH_HP_NONE; @@ -612,17 +612,17 @@ out: crash_hotplug_unlock(); } -static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg) { switch (val) { case MEM_ONLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; case MEM_OFFLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; } return NOTIFY_OK; @@ -635,13 +635,13 @@ static struct notifier_block crash_memhp_nb = { static int crash_cpuhp_online(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL); return 0; } static int crash_cpuhp_offline(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL); return 0; } -- cgit v1.2.3 From 79365026f86948b52c3cb7bf099dded92c559b4c Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:09 +0530 Subject: crash: add a new kexec flag for hotplug support Commit a72bbec70da2 ("crash: hotplug support for kexec_load()") introduced a new kexec flag, `KEXEC_UPDATE_ELFCOREHDR`. Kexec tool uses this flag to indicate to the kernel that it is safe to modify the elfcorehdr of the kdump image loaded using the kexec_load system call. However, it is possible that architectures may need to update kexec segments other then elfcorehdr. For example, FDT (Flatten Device Tree) on PowerPC. Introducing a new kexec flag for every new kexec segment may not be a good solution. Hence, a generic kexec flag bit, `KEXEC_CRASH_HOTPLUG_SUPPORT`, is introduced to share the CPU/Memory hotplug support intent between the kexec tool and the kernel for the kexec_load system call. Now we have two kexec flags that enables crash hotplug support for kexec_load system call. First is KEXEC_UPDATE_ELFCOREHDR (only used in x86), and second is KEXEC_CRASH_HOTPLUG_SUPPORT (for all architectures). To simplify the process of finding and reporting the crash hotplug support the following changes are introduced. 1. Define arch specific function to process the kexec flags and determine crash hotplug support 2. Rename the @update_elfcorehdr member of struct kimage to @hotplug_support and populate it for both kexec_load and kexec_file_load syscalls, because architecture can update more than one kexec segment 3. Let generic function crash_check_hotplug_support report hotplug support for loaded kdump image based on value of @hotplug_support To bring the x86 crash hotplug support in line with the above points, the following changes have been made: - Introduce the arch_crash_hotplug_support function to process kexec flags and determine crash hotplug support - Remove the arch_crash_hotplug_[cpu|memory]_support functions Signed-off-by: Sourabh Jain Acked-by: Baoquan He Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-3-sourabhjain@linux.ibm.com --- arch/x86/include/asm/kexec.h | 11 ++--------- arch/x86/kernel/crash.c | 28 +++++++++++++++++----------- drivers/base/cpu.c | 2 +- drivers/base/memory.c | 2 +- include/linux/crash_core.h | 13 ++++++------- include/linux/kexec.h | 11 +++++++---- include/uapi/linux/kexec.h | 1 + kernel/crash_core.c | 15 ++++++--------- kernel/kexec.c | 4 ++-- kernel/kexec_file.c | 5 +++++ 10 files changed, 48 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index cb1320ebbc23..ae5482a2f0ca 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -210,15 +210,8 @@ extern void kdump_nmi_shootdown_cpus(void); void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event -#ifdef CONFIG_HOTPLUG_CPU -int arch_crash_hotplug_cpu_support(void); -#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support -#endif - -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_crash_hotplug_memory_support(void); -#define crash_hotplug_memory_support arch_crash_hotplug_memory_support -#endif +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags); +#define arch_crash_hotplug_support arch_crash_hotplug_support unsigned int arch_crash_get_elfcorehdr_size(void); #define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2a682fe86352..f06501445cd9 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -402,20 +402,26 @@ int crash_load_segments(struct kimage *image) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt -/* These functions provide the value for the sysfs crash_hotplug nodes */ -#ifdef CONFIG_HOTPLUG_CPU -int arch_crash_hotplug_cpu_support(void) +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) { - return crash_check_update_elfcorehdr(); -} -#endif -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_crash_hotplug_memory_support(void) -{ - return crash_check_update_elfcorehdr(); -} +#ifdef CONFIG_KEXEC_FILE + if (image->file_mode) + return 1; #endif + /* + * Initially, crash hotplug support for kexec_load was added + * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this + * functionality was expanded to accommodate multiple kexec + * segment updates, leading to the introduction of the + * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently, + * when the kexec tool sends either of these flags, it indicates + * that the required kexec segment (elfcorehdr) is excluded from + * the SHA calculation. + */ + return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR || + kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT); +} unsigned int arch_crash_get_elfcorehdr_size(void) { diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 56fba44ba391..c61ecb0c2ae2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -306,7 +306,7 @@ static ssize_t crash_hotplug_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sysfs_emit(buf, "%d\n", crash_hotplug_cpu_support()); + return sysfs_emit(buf, "%d\n", crash_check_hotplug_support()); } static DEVICE_ATTR_ADMIN_RO(crash_hotplug); #endif diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c0436f46cfb7..67858eeb92ed 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -535,7 +535,7 @@ static DEVICE_ATTR_RW(auto_online_blocks); static ssize_t crash_hotplug_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support()); + return sysfs_emit(buf, "%d\n", crash_check_hotplug_support()); } static DEVICE_ATTR_RO(crash_hotplug); #endif diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 647e928efee8..44305336314e 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -40,14 +40,13 @@ static inline void arch_kexec_unprotect_crashkres(void) { } static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } #endif -int crash_check_update_elfcorehdr(void); +int crash_check_hotplug_support(void); -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } +#ifndef arch_crash_hotplug_support +static inline int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) +{ + return 0; +} #endif #ifndef crash_get_elfcorehdr_size diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 060835bb82d5..5b93a5767413 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -319,8 +319,10 @@ struct kimage { /* If set, we are using file mode kexec syscall */ unsigned int file_mode:1; #ifdef CONFIG_CRASH_HOTPLUG - /* If set, allow changes to elfcorehdr of kexec_load'd image */ - unsigned int update_elfcorehdr:1; + /* If set, it is safe to update kexec segments that are + * excluded from SHA calculation. + */ + unsigned int hotplug_support:1; #endif #ifdef ARCH_HAS_KIMAGE_ARCH @@ -391,9 +393,10 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec flags */ #ifndef CONFIG_KEXEC_JUMP -#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR) +#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT) #else -#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR) +#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \ + KEXEC_CRASH_HOTPLUG_SUPPORT) #endif /* List of defined/legal kexec file flags */ diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index c17bb096ea68..5ae1741ea8ea 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,7 @@ #define KEXEC_ON_CRASH 0x00000001 #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_UPDATE_ELFCOREHDR 0x00000004 +#define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008 #define KEXEC_ARCH_MASK 0xffff0000 /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 70fa8111a9d6..394db3ebe835 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -493,10 +493,10 @@ static DEFINE_MUTEX(__crash_hotplug_lock); /* * This routine utilized when the crash_hotplug sysfs node is read. - * It reflects the kernel's ability/permission to update the crash - * elfcorehdr directly. + * It reflects the kernel's ability/permission to update the kdump + * image directly. */ -int crash_check_update_elfcorehdr(void) +int crash_check_hotplug_support(void) { int rc = 0; @@ -508,10 +508,7 @@ int crash_check_update_elfcorehdr(void) return 0; } if (kexec_crash_image) { - if (kexec_crash_image->file_mode) - rc = 1; - else - rc = kexec_crash_image->update_elfcorehdr; + rc = kexec_crash_image->hotplug_support; } /* Release lock now that update complete */ kexec_unlock(); @@ -552,8 +549,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, image = kexec_crash_image; - /* Check that updating elfcorehdr is permitted */ - if (!(image->file_mode || image->update_elfcorehdr)) + /* Check that kexec segments update is permitted */ + if (!image->hotplug_support) goto out; if (hp_action == KEXEC_CRASH_HP_ADD_CPU || diff --git a/kernel/kexec.c b/kernel/kexec.c index bab542fc1463..a6b3f96bb50c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -135,8 +135,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image->preserve_context = 1; #ifdef CONFIG_CRASH_HOTPLUG - if (flags & KEXEC_UPDATE_ELFCOREHDR) - image->update_elfcorehdr = 1; + if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; #endif ret = machine_kexec_prepare(image); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2d1db05fbf04..3d64290d24c9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -376,6 +376,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; +#ifdef CONFIG_CRASH_HOTPLUG + if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; +#endif + ret = machine_kexec_prepare(image); if (ret) goto out; -- cgit v1.2.3 From e8784765fae6edc47efb68d425c65e3633d70cd6 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 23 Apr 2024 02:44:39 +0000 Subject: cgroup/cpuset: Avoid clearing CS_SCHED_LOAD_BALANCE twice In cpuset_css_online(), CS_SCHED_LOAD_BALANCE will be cleared twice, the former one in the is_in_v2_mode() case could be removed because is_in_v2_mode() can be true for cgroup v1 if the "cpuset_v2_mode" mount option is specified, that balance flag change isn't appropriate for this particular case. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e70008a1d86a..032fcc93f2b8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4052,11 +4052,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; - /* - * Clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (!is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* -- cgit v1.2.3 From 57a01eafdcf78f6da34fad9ff075ed5dfdd9f420 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 23 Apr 2024 08:19:05 +0200 Subject: workqueue: Fix selection of wake_cpu in kick_pool() With cpu_possible_mask=0-63 and cpu_online_mask=0-7 the following kernel oops was observed: smp: Bringing up secondary CPUs ... smp: Brought up 1 node, 8 CPUs Unable to handle kernel pointer dereference in virtual kernel address space Failing address: 0000000000000000 TEID: 0000000000000803 [..] Call Trace: arch_vcpu_is_preempted+0x12/0x80 select_idle_sibling+0x42/0x560 select_task_rq_fair+0x29a/0x3b0 try_to_wake_up+0x38e/0x6e0 kick_pool+0xa4/0x198 __queue_work.part.0+0x2bc/0x3a8 call_timer_fn+0x36/0x160 __run_timers+0x1e2/0x328 __run_timer_base+0x5a/0x88 run_timer_softirq+0x40/0x78 __do_softirq+0x118/0x388 irq_exit_rcu+0xc0/0xd8 do_ext_irq+0xae/0x168 ext_int_handler+0xbe/0xf0 psw_idle_exit+0x0/0xc default_idle_call+0x3c/0x110 do_idle+0xd4/0x158 cpu_startup_entry+0x40/0x48 rest_init+0xc6/0xc8 start_kernel+0x3c4/0x5e0 startup_continue+0x3c/0x50 The crash is caused by calling arch_vcpu_is_preempted() for an offline CPU. To avoid this, select the cpu with cpumask_any_and_distribute() to mask __pod_cpumask with cpu_online_mask. In case no cpu is left in the pool, skip the assignment. tj: This doesn't fully fix the bug as CPUs can still go down between picking the target CPU and the wake call. Fixing that likely requires adding cpu_online() test to either the sched or s390 arch code. However, regardless of how that is fixed, workqueue shouldn't be picking a CPU which isn't online as that would result in unpredictable and worse behavior. Signed-off-by: Sven Schnelle Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues") Cc: stable@vger.kernel.org # v6.6+ Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0066c8f6c154..a2af0aaf026b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1277,8 +1277,12 @@ static bool kick_pool(struct worker_pool *pool) !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); - p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); - get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask, + cpu_online_mask); + if (wake_cpu < nr_cpu_ids) { + p->wake_cpu = wake_cpu; + get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + } } #endif wake_up_process(p); -- cgit v1.2.3 From 06fe8fd6808562971637c6b133c806bcf49097ad Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Tue, 23 Apr 2024 16:40:20 +0530 Subject: genirq/msi: Add MSI allocation helper and export MSI functions MSI functions for allocation and free can be directly used by the device drivers without any wrapper provided by bus drivers. So export these MSI functions. Also, add a wrapper API to allocate MSIs providing only the number of interrupts rather than range for simpler driver usage. Signed-off-by: Nipun Gupta Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423111021.1686144-1-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- include/linux/msi.h | 6 ++++++ kernel/irq/msi.c | 2 ++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 26d07e23052e..765a65581a66 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -676,6 +676,12 @@ int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nve void platform_device_msi_free_irqs_all(struct device *dev); bool msi_device_has_isolated_msi(struct device *dev); + +static inline int msi_domain_alloc_irqs(struct device *dev, unsigned int domid, int nirqs) +{ + return msi_domain_alloc_irqs_range(dev, domid, 0, nirqs - 1); +} + #else /* CONFIG_GENERIC_MSI_IRQ */ static inline bool msi_device_has_isolated_msi(struct device *dev) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index f90952ebc494..2024f89baea4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1434,6 +1434,7 @@ int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, msi_unlock_descs(dev); return ret; } +EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); /** * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain @@ -1680,6 +1681,7 @@ void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, msi_domain_free_irqs_range_locked(dev, domid, first, last); msi_unlock_descs(dev); } +EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); /** * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain -- cgit v1.2.3 From be2749beff62e0d63cf97fe63cabc79a68443139 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:01 +0200 Subject: bpf: make timer data struct more generic To be able to add workqueues and reuse most of the timer code, we need to make bpf_hrtimer more generic. There is no code change except that the new struct gets a new u64 flags attribute. We are still below 2 cache lines, so this shouldn't impact the current running codes. The ordering is also changed. Everything related to async callback is now on top of bpf_hrtimer. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-1-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 71 ++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 61126180d398..7c7f31dd87ba 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +struct bpf_async_cb { + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; + struct rcu_head rcu; + u64 flags; +}; + /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. - * That space is used to keep 'struct bpf_timer_kern'. + * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. @@ -1096,16 +1105,12 @@ const struct bpf_func_proto bpf_snprintf_proto = { * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { + struct bpf_async_cb cb; struct hrtimer timer; - struct bpf_map *map; - struct bpf_prog *prog; - void __rcu *callback_fn; - void *value; - struct rcu_head rcu; }; /* the actual struct hidden inside uapi struct bpf_timer */ -struct bpf_timer_kern { +struct bpf_async_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer @@ -1119,14 +1124,14 @@ static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); - struct bpf_map *map = t->map; - void *value = t->value; + struct bpf_map *map = t->cb.map; + void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); - callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; @@ -1155,7 +1160,7 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clockid_t clockid = flags & (MAX_CLOCKS - 1); @@ -1163,8 +1168,8 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map int ret = 0; BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (in_nmi()) return -EOPNOTSUPP; @@ -1187,10 +1192,10 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->record->timer_off; - t->map = map; - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + t->cb.value = (void *)timer - map->record->timer_off; + t->cb.map = map; + t->cb.prog = NULL; + rcu_assign_pointer(t->cb.callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; WRITE_ONCE(timer->timer, t); @@ -1222,7 +1227,7 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { struct bpf_prog *prev, *prog = aux->prog; @@ -1237,7 +1242,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EINVAL; goto out; } - if (!atomic64_read(&t->map->usercnt)) { + if (!atomic64_read(&t->cb.map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1246,7 +1251,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->prog; + prev = t->cb.prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1259,9 +1264,9 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->prog = prog; + t->cb.prog = prog; } - rcu_assign_pointer(t->callback_fn, callback_fn); + rcu_assign_pointer(t->cb.callback_fn, callback_fn); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1275,7 +1280,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; @@ -1287,7 +1292,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; - if (!t || !t->prog) { + if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } @@ -1315,18 +1320,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_hrtimer *t) +static void drop_prog_refcnt(struct bpf_async_cb *async) { - struct bpf_prog *prog = t->prog; + struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + async->prog = NULL; + rcu_assign_pointer(async->callback_fn, NULL); } } -BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t; int ret = 0; @@ -1348,7 +1353,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) ret = -EDEADLK; goto out; } - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1371,7 +1376,7 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_timer_kern *timer = val; + struct bpf_async_kern *timer = val; struct bpf_hrtimer *t; /* Performance optimization: read timer->timer without lock first. */ @@ -1383,7 +1388,7 @@ void bpf_timer_cancel_and_free(void *val) t = timer->timer; if (!t) goto out; - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ @@ -1410,7 +1415,7 @@ out: */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree_rcu(t, rcu); + kfree_rcu(t, cb.rcu); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) -- cgit v1.2.3 From 56b4a177ae6322173360a93ea828ad18570a5a14 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:02 +0200 Subject: bpf: replace bpf_timer_init with a generic helper No code change except for the new flags argument being stored in the local data struct. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-2-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 91 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7c7f31dd87ba..0f0201171576 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1111,7 +1111,10 @@ struct bpf_hrtimer { /* the actual struct hidden inside uapi struct bpf_timer */ struct bpf_async_kern { - struct bpf_hrtimer *timer; + union { + struct bpf_async_cb *cb; + struct bpf_hrtimer *timer; + }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. @@ -1119,6 +1122,10 @@ struct bpf_async_kern { struct bpf_spin_lock lock; } __attribute__((aligned(8))); +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, +}; + static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) @@ -1160,46 +1167,55 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, - u64, flags) +static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, + enum bpf_async_type type) { - clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_async_cb *cb; struct bpf_hrtimer *t; + clockid_t clockid; + size_t size; int ret = 0; - BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); - if (in_nmi()) return -EOPNOTSUPP; - if (flags >= MAX_CLOCKS || - /* similar to timerfd except _ALARM variants are not supported */ - (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME && - clockid != CLOCK_BOOTTIME)) + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + size = sizeof(struct bpf_hrtimer); + break; + default: return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; + } + + __bpf_spin_lock_irqsave(&async->lock); + t = async->timer; if (t) { ret = -EBUSY; goto out; } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ - t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); - if (!t) { + cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + if (!cb) { ret = -ENOMEM; goto out; } - t->cb.value = (void *)timer - map->record->timer_off; - t->cb.map = map; - t->cb.prog = NULL; - rcu_assign_pointer(t->cb.callback_fn, NULL); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; - WRITE_ONCE(timer->timer, t); - /* Guarantee the order between timer->timer and map->usercnt. So + + if (type == BPF_ASYNC_TYPE_TIMER) { + clockid = flags & (MAX_CLOCKS - 1); + t = (struct bpf_hrtimer *)cb; + + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + cb->value = (void *)async - map->record->timer_off; + } + cb->map = map; + cb->prog = NULL; + cb->flags = flags; + rcu_assign_pointer(cb->callback_fn, NULL); + + WRITE_ONCE(async->cb, cb); + /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. @@ -1209,15 +1225,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map /* maps with timers must be either held by user space * or pinned in bpffs. */ - WRITE_ONCE(timer->timer, NULL); - kfree(t); + WRITE_ONCE(async->cb, NULL); + kfree(cb); ret = -EPERM; } out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clock_t clockid = flags & (MAX_CLOCKS - 1); + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + + return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, -- cgit v1.2.3 From 073f11b0264310b85754b6a0946afee753790c66 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:03 +0200 Subject: bpf: replace bpf_timer_set_callback with a generic helper In the same way we have a generic __bpf_async_init(), we also need to share code between timer and workqueue for the set_callback call. We just add an unused flags parameter, as it will be used for workqueues. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-3-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0f0201171576..1cbbf28e9398 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1262,22 +1262,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, - struct bpf_prog_aux *, aux) +static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, + struct bpf_prog_aux *aux, unsigned int flags, + enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { + __bpf_spin_lock_irqsave(&async->lock); + cb = async->cb; + if (!cb) { ret = -EINVAL; goto out; } - if (!atomic64_read(&t->cb.map->usercnt)) { + if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1286,7 +1287,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->cb.prog; + prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1299,14 +1300,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->cb.prog = prog; + cb->prog = prog; } - rcu_assign_pointer(t->cb.callback_fn, callback_fn); + rcu_assign_pointer(cb->callback_fn, callback_fn); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, -- cgit v1.2.3 From fc22d9495f0b32d75b5d25a17b300b7aad05c55d Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:04 +0200 Subject: bpf: replace bpf_timer_cancel_and_free with a generic helper Same reason than most bpf_timer* functions, we need almost the same for workqueues. So extract the generic part out of it so bpf_wq_cancel_and_free can reuse it. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-4-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1cbbf28e9398..0b8ba6c81994 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1413,36 +1413,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) { - struct bpf_async_kern *timer = val; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; - /* Performance optimization: read timer->timer without lock first. */ - if (!READ_ONCE(timer->timer)) - return; + /* Performance optimization: read async->cb without lock first. */ + if (!READ_ONCE(async->cb)) + return NULL; - __bpf_spin_lock_irqsave(&timer->lock); + __bpf_spin_lock_irqsave(&async->lock); /* re-read it under lock */ - t = timer->timer; - if (!t) + cb = async->cb; + if (!cb) goto out; - drop_prog_refcnt(&t->cb); + drop_prog_refcnt(cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ - WRITE_ONCE(timer->timer, NULL); + WRITE_ONCE(async->cb, NULL); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); + return cb; +} + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) return; /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call kfree(t) * right after for both preallocated and non-preallocated maps. - * The timer->timer = NULL was already done and no code path can + * The async->cb = NULL was already done and no code path can * see address 't' anymore. * * Check that bpf_map_delete/update_elem() wasn't called from timer @@ -1451,7 +1459,7 @@ out: * return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since timer->timer = NULL was already done. The timer will be + * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. */ -- cgit v1.2.3 From d56b63cf0c0f71e1b2e04dd8220b408f049e67ff Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:05 +0200 Subject: bpf: add support for bpf_wq user type Mostly a copy/paste from the bpf_timer API, without the initialization and free, as they will be done in a separate patch. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-5-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++++++++- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/btf.c | 17 +++++++++++++++++ kernel/bpf/syscall.c | 6 +++++- kernel/bpf/verifier.c | 9 +++++++++ 5 files changed, 45 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5034c1b4ded7..c7dcfd395555 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -185,7 +185,7 @@ struct bpf_map_ops { enum { /* Support at most 10 fields in a BTF type */ - BTF_FIELDS_MAX = 10, + BTF_FIELDS_MAX = 11, }; enum btf_field_type { @@ -202,6 +202,7 @@ enum btf_field_type { BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), + BPF_WORKQUEUE = (1 << 10), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -238,6 +239,7 @@ struct btf_record { u32 field_mask; int spin_lock_off; int timer_off; + int wq_off; int refcount_off; struct btf_field fields[]; }; @@ -312,6 +314,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_spin_lock"; case BPF_TIMER: return "bpf_timer"; + case BPF_WORKQUEUE: + return "bpf_wq"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; @@ -340,6 +344,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); + case BPF_WORKQUEUE: + return sizeof(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -367,6 +373,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); + case BPF_WORKQUEUE: + return __alignof__(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -406,6 +414,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cee0a7915c08..e4ae83550fb3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7306,6 +7306,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6d46cee47ae3..8291fbfd27b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3464,6 +3464,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_WORKQUEUE) { + if (!strcmp(name, "bpf_wq")) { + if (*seen_mask & BPF_WORKQUEUE) + return -E2BIG; + *seen_mask |= BPF_WORKQUEUE; + type = BPF_WORKQUEUE; + goto end; + } + } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); @@ -3515,6 +3524,7 @@ static int btf_find_struct_field(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3582,6 +3592,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3816,6 +3827,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; + rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); @@ -3846,6 +3858,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; + case BPF_WORKQUEUE: + WARN_ON_ONCE(rec->wq_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->wq_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d392ec83655..0848e4141b00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to release */ break; default: @@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: @@ -679,6 +681,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; + case BPF_WORKQUEUE: + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1085,7 +1089,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5a7e34e83a5b..89490a95b120 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1838,6 +1838,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) */ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; + if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -18141,6 +18143,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_wq yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); -- cgit v1.2.3 From ad2c03e691be3268eefc75ff1d892db3f0e79f62 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:07 +0200 Subject: bpf: verifier: bail out if the argument is not a map When a kfunc is declared with a KF_ARG_PTR_TO_MAP, we should have reg->map_ptr set to a non NULL value, otherwise, that means that the underlying type is not a map. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-7-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89490a95b120..4adf7fc33e5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11720,6 +11720,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: + if (!reg->map_ptr) { + verbose(env, "pointer in R%d isn't map pointer\n", regno); + return -EINVAL; + } + fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) -- cgit v1.2.3 From d940c9b94d7e6d9cff288623e3e8bf5fdea98b79 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:08 +0200 Subject: bpf: add support for KF_ARG_PTR_TO_WORKQUEUE Introduce support for KF_ARG_PTR_TO_WORKQUEUE. The kfuncs will use bpf_wq as argument and that will be recognized as workqueue argument by verifier. bpf_wq_kern casting can happen inside kfunc, but using bpf_wq in argument makes life easier for users who work with non-kern type in BPF progs. Duplicate process_timer_func into process_wq_func. meta argument is only needed to ensure bpf_wq_init's workqueue and map arguments are coming from the same map (map_uid logic is necessary for correct inner-map handling), so also amend check_kfunc_args() to match what helpers functions check is doing. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-8-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4adf7fc33e5a..cc20fda907fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -332,6 +332,10 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; + struct { + struct bpf_map *ptr; + int uid; + } map; u64 mem_size; }; @@ -7598,6 +7602,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_wq_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (map->record->wq_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", + val + reg->off, map->record->wq_off); + return -EINVAL; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10843,6 +10864,7 @@ enum { KF_ARG_LIST_NODE_ID, KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, + KF_ARG_WORKQUEUE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -10851,6 +10873,7 @@ BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) +BTF_ID(struct, bpf_wq) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -10894,6 +10917,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -10963,6 +10991,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_WORKQUEUE, }; enum special_kfunc_type { @@ -11119,6 +11148,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP; + if (is_kfunc_arg_wq(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_WORKQUEUE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11724,6 +11756,29 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } + if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * wq = bpf_map_lookup_elem(inner_map1); + * if (wq) + * // mismatch would have been allowed + * bpf_wq_init(wq, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. + */ + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { + verbose(env, + "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } + } + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: @@ -11757,6 +11812,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: + case KF_ARG_PTR_TO_WORKQUEUE: /* Trusted by default */ break; default: @@ -12043,6 +12099,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret) return ret; break; + case KF_ARG_PTR_TO_WORKQUEUE: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_wq_func(env, regno, meta); + if (ret < 0) + return ret; + break; } } -- cgit v1.2.3 From 246331e3f1eac905170a923f0ec76725c2558232 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:09 +0200 Subject: bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps Currently bpf_wq_cancel_and_free() is just a placeholder as there is no memory allocation for bpf_wq just yet. Again, duplication of the bpf_timer approach Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-9-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 18 ++++++++++------- kernel/bpf/hashtab.c | 55 ++++++++++++++++++++++++++++++++++++++++----------- kernel/bpf/helpers.c | 8 ++++++++ kernel/bpf/syscall.c | 9 +++++++++ 5 files changed, 73 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c7dcfd395555..64bf0cf3ee95 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -534,6 +534,7 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_wq_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2204,6 +2205,7 @@ void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8c1e6d7654bb..580d07b15471 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -428,17 +428,21 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers(struct bpf_map *map) +static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!btf_record_has_field(map->record, BPF_TIMER)) - return; + /* We don't reset or free fields other than timer and workqueue + * on uref dropping to zero. + */ + if (btf_record_has_field(map->record, BPF_TIMER)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -782,7 +786,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers, + .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c3e79a0b9361..0179183c543a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_wq(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -1495,7 +1515,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers(struct bpf_htab *htab) +static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) { int i; @@ -1507,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); + if (is_timer) + bpf_obj_free_timer(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); + else + bpf_obj_free_workqueue(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers(struct bpf_map *map) +static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer on uref dropping to zero */ - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers(htab); - else - htab_free_prealloced_timers(htab); + /* We only free timer and workqueue on uref dropping to zero */ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, true); + else + htab_free_prealloced_timers(htab); + } + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, false); + else + htab_free_prealloced_wq(htab); + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -2260,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2281,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b8ba6c81994..fe827f6e5234 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1468,6 +1468,14 @@ void bpf_timer_cancel_and_free(void *val) kfree_rcu(t, cb.rcu); } +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_wq_cancel_and_free(void *val) +{ + BTF_TYPE_EMIT(struct bpf_wq); +} + BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) { unsigned long *kptr = map_value; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0848e4141b00..63e368337483 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -661,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) + return; + bpf_wq_cancel_and_free(obj + rec->wq_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -682,6 +689,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(field_ptr); break; case BPF_WORKQUEUE: + bpf_wq_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); @@ -1119,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, } break; case BPF_TIMER: + case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { -- cgit v1.2.3 From eb48f6cd41a0f7803770a76bbffb6bd5b1b2ae2f Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:11 +0200 Subject: bpf: wq: add bpf_wq_init We need to teach the verifier about the second argument which is declared as void * but which is of type KF_ARG_PTR_TO_MAP. We could have dropped this extra case if we declared the second argument as struct bpf_map *, but that means users will have to do extra casting to have their program compile. We also need to duplicate the timer code for the checking if the map argument is matching the provided workqueue. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-11-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fe827f6e5234..1fc002cfe2b1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1109,11 +1109,18 @@ struct bpf_hrtimer { struct hrtimer timer; }; -/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_work { + struct bpf_async_cb cb; + struct work_struct work; + struct work_struct delete_work; +}; + +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; + struct bpf_work *work; }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer @@ -1124,6 +1131,7 @@ struct bpf_async_kern { enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, }; static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); @@ -1167,11 +1175,64 @@ out: return HRTIMER_NORESTART; } +static void bpf_wq_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, work); + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + struct bpf_async_cb *cb = &w->cb; + struct bpf_prog *prog = cb->prog; + struct bpf_map *map = cb->map; + bpf_callback_t callback_fn; + void *value = cb->value; + void *key; + u32 idx; + + BTF_TYPE_EMIT(struct bpf_wq); + + callback_fn = READ_ONCE(cb->callback_fn); + if (!callback_fn || !prog) + return; + + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + run_ctx.bpf_cookie = 0; + + if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { + /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); + return; + } + + callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); + /* The verifier checked that return value is zero. */ + + __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, + &run_ctx); +} + +static void bpf_wq_delete_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + + cancel_work_sync(&w->work); + + kfree_rcu(w, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb; struct bpf_hrtimer *t; + struct bpf_work *w; clockid_t clockid; size_t size; int ret = 0; @@ -1183,6 +1244,9 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; + case BPF_ASYNC_TYPE_WQ: + size = sizeof(struct bpf_work); + break; default: return -EINVAL; } @@ -1201,13 +1265,22 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - if (type == BPF_ASYNC_TYPE_TIMER) { + switch (type) { + case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; + break; + case BPF_ASYNC_TYPE_WQ: + w = (struct bpf_work *)cb; + + INIT_WORK(&w->work, bpf_wq_work); + INIT_WORK(&w->delete_work, bpf_wq_delete_work); + cb->value = (void *)async - map->record->wq_off; + break; } cb->map = map; cb->prog = NULL; @@ -1473,7 +1546,19 @@ void bpf_timer_cancel_and_free(void *val) */ void bpf_wq_cancel_and_free(void *val) { + struct bpf_work *work; + BTF_TYPE_EMIT(struct bpf_wq); + + work = (struct bpf_work *)__bpf_async_cancel_and_free(val); + if (!work) + return; + /* Trigger cancel of the sleepable work, but *do not* wait for + * it to finish if it was running as we might not be in a + * sleepable context. + * kfree will be called once the work has finished. + */ + schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) @@ -2612,6 +2697,20 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN(1, "A call to BPF exception callback should never return\n"); } +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_map *map = p__map; + + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); + + if (flags) + return -EINVAL; + + return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2689,6 +2788,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) +BTF_ID_FLAGS(func, bpf_wq_init) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 81f1d7a583fa1fa14f0c4e6140d34b5e3d08d227 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:13 +0200 Subject: bpf: wq: add bpf_wq_set_callback_impl To support sleepable async callbacks, we need to tell push_async_cb() whether the cb is sleepable or not. The verifier now detects that we are in bpf_wq_set_callback_impl and can allow a sleepable callback to happen. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-13-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/helpers.c | 15 +++++++++++ kernel/bpf/verifier.c | 60 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 36d19cd32eb5..9db35530c878 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -426,6 +426,7 @@ struct bpf_verifier_state { * while they are still in use. */ bool used_as_loop_entry; + bool in_sleepable; /* first and last insn idx of this verifier state */ u32 first_insn_idx; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1fc002cfe2b1..dd2eaaf77c93 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2711,6 +2711,20 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags, + void *aux__ign) +{ + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + + if (flags) + return -EINVAL; + + return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2789,6 +2803,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) +BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cc20fda907fb..9715c88cc025 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) } static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_async_callback_calling_kfunc(u32 btf_id); +static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); + static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn) static bool is_async_callback_calling_insn(struct bpf_insn *insn) { - return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); + return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } static bool is_may_goto_insn(struct bpf_insn *insn) @@ -1429,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; dst_state->active_lock.id = src->active_lock.id; @@ -2404,7 +2410,7 @@ static void init_func_state(struct bpf_verifier_env *env, /* Similar to push_stack(), but for async callbacks */ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, - int subprog) + int subprog, bool is_sleepable) { struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; @@ -2431,6 +2437,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * Initialize it similar to do_check_common(). */ elem->st.branches = 1; + elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -5278,7 +5285,8 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable; + return env->prog->sleepable || + (env->cur_state && env->cur_state->in_sleepable); } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -9513,7 +9521,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins */ env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && - !is_sync_callback_calling_kfunc(insn->imm)) { + !is_callback_calling_kfunc(insn->imm)) { verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", func_id_name(insn->imm), insn->imm); return -EFAULT; @@ -9527,10 +9535,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; - /* there is no real recursion here. timer callbacks are async */ + /* there is no real recursion here. timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - insn_idx, subprog); + insn_idx, subprog, + is_bpf_wq_set_callback_impl_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -11017,6 +11026,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, + KF_bpf_wq_set_callback_impl, KF_bpf_iter_css_task_new, }; @@ -11041,6 +11051,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif @@ -11069,6 +11080,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11402,12 +11414,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_async_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + static bool is_bpf_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return is_sync_callback_calling_kfunc(btf_id) || + is_async_callback_calling_kfunc(btf_id); +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -12219,6 +12247,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); @@ -16968,6 +17006,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->in_sleepable != cur->in_sleepable) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -19639,6 +19680,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + *cnt = 3; } return 0; } -- cgit v1.2.3 From 8e83da9732d91c60fdc651b2486c8e5935eb0ca2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:15 +0200 Subject: bpf: add bpf_wq_start again, copy/paste from bpf_timer_start(). Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-15-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dd2eaaf77c93..047a21b7e4ba 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2711,6 +2711,23 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_work *w; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + w = READ_ONCE(async->work); + if (!w || !READ_ONCE(w->cb.prog)) + return -EINVAL; + + schedule_work(&w->work); + return 0; +} + __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, struct bpf_wq *wq), unsigned int flags, @@ -2804,6 +2821,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_start) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 04d63da4da53e6064683f4970d34ce997f9daee3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 23 Apr 2024 21:00:20 -0400 Subject: cgroup/cpuset: Fix incorrect top_cpuset flags Commit 8996f93fc388 ("cgroup/cpuset: Statically initialize more members of top_cpuset") uses an incorrect "<" relational operator for the CS_SCHED_LOAD_BALANCE bit when initializing the top_cpuset. This results in load_balancing turned off by default in the top cpuset which is bad for performance. Fix this by using the BIT() helper macro to set the desired top_cpuset flags and avoid similar mistake from being made in the future. Fixes: 8996f93fc388 ("cgroup/cpuset: Statically initialize more members of top_cpuset") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 032fcc93f2b8..f5443c039619 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -368,8 +368,8 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE) | (1 < CS_SCHED_LOAD_BALANCE)), + .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), -- cgit v1.2.3 From d40f92020c7a225b77e68599e4b099a4a0823408 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Apr 2024 14:43:48 -1000 Subject: workqueue: The default node_nr_active should have its max set to max_active The default nna (node_nr_active) is used when the pool isn't tied to a specific NUMA node. This can happen in the following cases: 1. On NUMA, if per-node pwq init failure and the fallback pwq is used. 2. On NUMA, if a pool is configured to span multiple nodes. 3. On single node setups. 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues") set the default nna->max to min_active because only #1 was being considered. For #2 and #3, using min_active means that the max concurrency in normal operation is pushed down to min_active which is currently 8, which can obviously lead to performance issues. exact value nna->max is set to doesn't really matter. #2 can only happen if the workqueue is intentionally configured to ignore NUMA boundaries and there's no good way to distribute max_active in this case. #3 is the default behavior on single node machines. Let's set it the default nna->max to max_active. This fixes the artificially lowered concurrency problem on single node machines and shouldn't hurt anything for other cases. Signed-off-by: Tejun Heo Reported-by: Shinichiro Kawasaki Fixes: 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues") Link: https://lore.kernel.org/dm-devel/20240410084531.2134621-1-shinichiro.kawasaki@wdc.com/ Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a2af0aaf026b..5f536c63a48d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1610,7 +1610,7 @@ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) min_active, max_active); } - wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active; + wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; } /** -- cgit v1.2.3 From 11a921909fea230cf7afcd6842a9452f3720b61b Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:19 +0200 Subject: kernel misc: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove the sentinel from ctl_table arrays. Reduce by one the values used to compare the size of the adjusted arrays. Signed-off-by: Joel Granados --- kernel/acct.c | 1 - kernel/exit.c | 1 - kernel/hung_task.c | 1 - kernel/kexec_core.c | 1 - kernel/latencytop.c | 1 - kernel/panic.c | 1 - kernel/pid_namespace.c | 1 - kernel/pid_sysctl.h | 1 - kernel/reboot.c | 1 - kernel/signal.c | 1 - kernel/stackleak.c | 1 - kernel/sysctl.c | 2 -- kernel/ucount.c | 3 +-- kernel/utsname_sysctl.c | 1 - kernel/watchdog.c | 2 -- 15 files changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 986c8214dabf..179848ad33e9 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,7 +84,6 @@ static struct ctl_table kern_acct_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __init int kernel_acct_sysctls_init(void) diff --git a/kernel/exit.c b/kernel/exit.c index 41a12630cbbc..cd3aa9042f1a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -94,7 +94,6 @@ static struct ctl_table kern_exit_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_exit_sysctls_init(void) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2fc2727d654..1d92016b0b3c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -314,7 +314,6 @@ static struct ctl_table hung_task_sysctls[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, }, - {} }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 0e96f6b24344..9112d69d68b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -948,7 +948,6 @@ static struct ctl_table kexec_core_sysctls[] = { .mode = 0644, .proc_handler = kexec_limit_handler, }, - { } }; static int __init kexec_core_sysctl_init(void) diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 781249098cb6..84c53285f499 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -85,7 +85,6 @@ static struct ctl_table latencytop_sysctl[] = { .mode = 0644, .proc_handler = sysctl_latencytop, }, - {} }; #endif diff --git a/kernel/panic.c b/kernel/panic.c index 747c3f3d289a..8bff183d6180 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,6 @@ static struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_panic_sysctls_init(void) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7ade20e95232..dc48fecfa1dc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -307,7 +307,6 @@ static struct ctl_table pid_ns_ctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, - { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index 2ee41a3a1dfd..fe9fb991dc42 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -41,7 +41,6 @@ static struct ctl_table pid_ns_ctl_table_vm[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static inline void register_pid_ns_sysctl_table_vm(void) { diff --git a/kernel/reboot.c b/kernel/reboot.c index 22c16e2564cc..f05dbde2c93f 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1295,7 +1295,6 @@ static struct ctl_table kern_reboot_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static void __init kernel_reboot_sysctls_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 7bdbcf1b78d0..01c4c46a51a8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4840,7 +4840,6 @@ static struct ctl_table signal_debug_table[] = { .proc_handler = proc_dointvec }, #endif - { } }; static int __init init_signal_sysctls(void) diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 34c9d81eea94..d099f3affcf1 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -54,7 +54,6 @@ static struct ctl_table stackleak_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init stackleak_sysctls_init(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81cc974913bb..e0b917328cf9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2034,7 +2034,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_INT_MAX, }, #endif - { } }; static struct ctl_table vm_table[] = { @@ -2240,7 +2239,6 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif - { } }; int __init sysctl_init_bases(void) diff --git a/kernel/ucount.c b/kernel/ucount.c index 4aa6166cb856..e196da0204dc 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -87,7 +87,6 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif - { } }; #endif /* CONFIG_SYSCTL */ @@ -96,7 +95,7 @@ bool setup_userns_sysctls(struct user_namespace *ns) #ifdef CONFIG_SYSCTL struct ctl_table *tbl; - BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); + BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 019e3a1566cf..76a772072557 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -120,7 +120,6 @@ static struct ctl_table uts_kern_table[] = { .proc_handler = proc_do_uts_string, .poll = &domainname_poll, }, - {} }; #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d7b2125503af..4e472d416525 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -950,7 +950,6 @@ static struct ctl_table watchdog_sysctls[] = { }, #endif /* CONFIG_SMP */ #endif - {} }; static struct ctl_table watchdog_hardlockup_sysctl[] = { @@ -963,7 +962,6 @@ static struct ctl_table watchdog_hardlockup_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init watchdog_sysctl_init(void) -- cgit v1.2.3 From 7fd9c63f8777094514b46834c561647040f12fbb Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:24 +0200 Subject: umh: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from usermodehelper_table Signed-off-by: Joel Granados --- kernel/umh.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 1b13c5d34624..598b3ffe1522 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -560,7 +560,6 @@ static struct ctl_table usermodehelper_table[] = { .mode = 0600, .proc_handler = proc_cap_handler, }, - { } }; static int __init init_umh_sysctls(void) -- cgit v1.2.3 From 66f20b11d3a31bcfe2f64e6a91238bd71b7eb72f Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:23 +0200 Subject: ftrace: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel elements from ftrace_sysctls and user_event_sysctls Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Joel Granados --- kernel/trace/ftrace.c | 1 - kernel/trace/trace_events_user.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da1710499698..6cec53aa45a6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8270,7 +8270,6 @@ static struct ctl_table ftrace_sysctls[] = { .mode = 0644, .proc_handler = ftrace_enable_sysctl, }, - {} }; static int __init ftrace_sysctl_init(void) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 70d428c394b6..304ceed9fd7d 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2833,7 +2833,6 @@ static struct ctl_table user_event_sysctls[] = { .mode = 0644, .proc_handler = set_max_user_events_sysctl, }, - {} }; static int __init trace_events_user_init(void) -- cgit v1.2.3 From fe6fc8e11b5a118a299c0d8f06407794173e6aa9 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:23 +0200 Subject: timekeeping: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from time_sysctl Signed-off-by: Joel Granados --- kernel/time/timer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index dee29f1f5b75..9d107f4b506c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -312,7 +312,6 @@ static struct ctl_table timer_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init timer_sysctl_init(void) -- cgit v1.2.3 From e822582effc6bc00da9b28cf814935a6be070504 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:23 +0200 Subject: seccomp: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from seccomp_sysctl_table. Acked-by: Kees Cook Signed-off-by: Joel Granados --- kernel/seccomp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index aca7b437882e..7ed72723fb8a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2445,7 +2445,6 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0644, .proc_handler = seccomp_actions_logged_handler, }, - { } }; static int __init seccomp_sysctl_init(void) -- cgit v1.2.3 From f532376e881fb370bc2901a9e1cf0b7e461638ad Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:21 +0200 Subject: scheduler: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) rm sentinel element from ctl_table arrays Acked-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Reviewed-by: Valentin Schneider Signed-off-by: Joel Granados --- kernel/sched/autogroup.c | 1 - kernel/sched/core.c | 1 - kernel/sched/deadline.c | 1 - kernel/sched/fair.c | 1 - kernel/sched/rt.c | 1 - kernel/sched/topology.c | 1 - 6 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 991fc9002535..db68a964e34e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -19,7 +19,6 @@ static struct ctl_table sched_autogroup_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init sched_autogroup_sysctl_init(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7019a40457a6..7ce76620a308 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4741,7 +4741,6 @@ static struct ctl_table sched_core_sysctls[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_core_sysctl_init(void) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc..c75d1307d86d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = { .proc_handler = proc_douintvec_minmax, .extra2 = (void *)&sysctl_sched_dl_period_max, }, - {} }; static int __init sched_dl_sysctl_init(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03be0d1330a6..4ac2cf7a918e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -157,7 +157,6 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_fair_sysctl_init(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b67e..aa4c1c874fa4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -56,7 +56,6 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, - {} }; static int __init sched_rt_sysctl_init(void) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..42c22648d124 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -322,7 +322,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sched_energy_aware_sysctl_init(void) -- cgit v1.2.3 From f842d9a96e6993763090960d1ab0f763374a99c0 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:21 +0200 Subject: printk: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) rm sentinel element from printk_sysctls Reviewed-by: Petr Mladek Signed-off-by: Joel Granados --- kernel/printk/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index c228343eeb97..3e47dedce9e5 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -76,7 +76,6 @@ static struct ctl_table printk_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - {} }; void __init printk_sysctl_init(void) -- cgit v1.2.3 From f884cd38625c403077a724417efc963487893559 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:20 +0200 Subject: kprobes: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from kprobe_sysclts Acked-by: Masami Hiramatsu (Google) Signed-off-by: Joel Granados --- kernel/kprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9d9095e81792..85af0e05a38f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -968,7 +968,6 @@ static struct ctl_table kprobe_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init kprobe_sysctls_init(void) -- cgit v1.2.3 From f15843f725a55d7232aa9b699be52c2c3da06982 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:19 +0200 Subject: delayacct: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from kern_delayacct_table Signed-off-by: Joel Granados --- kernel/delayacct.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 6f0c358e73d8..e039b0f99a0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -74,7 +74,6 @@ static struct ctl_table kern_delayacct_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; static __init int kernel_delayacct_sysctls_init(void) -- cgit v1.2.3 From 1adb825af946fa6b2104c1713ad4999283e09fbc Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:19 +0200 Subject: bpf: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from bpf_syscall_table. Acked-by: Andrii Nakryiko Signed-off-by: Joel Granados --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c7e805087b06 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5979,7 +5979,6 @@ static struct ctl_table bpf_syscall_table[] = { .mode = 0644, .proc_handler = bpf_stats_handler, }, - { } }; static int __init bpf_syscall_sysctl_init(void) -- cgit v1.2.3 From 795f90c6f13c30484ff10355a6775979f57f78cb Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 15 Mar 2024 19:11:31 +0100 Subject: sysctl: treewide: constify argument ctl_table_root::permissions(table) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The permissions callback should not modify the ctl_table. Enforce this expectation via the typesystem. This is a step to put "struct ctl_table" into .rodata throughout the kernel. The patch was created with the following coccinelle script: @@ identifier func, head, ctl; @@ int func( struct ctl_table_header *head, - struct ctl_table *ctl) + const struct ctl_table *ctl) { ... } (insert_entry() from fs/proc/proc_sysctl.c is a false-positive) No additional occurrences of '.permissions =' were found after a tree-wide search for places missed by the conccinelle script. Reviewed-by: Joel Granados Signed-off-by: Thomas Weißschuh Signed-off-by: Joel Granados --- include/linux/sysctl.h | 2 +- ipc/ipc_sysctl.c | 2 +- ipc/mq_sysctl.c | 2 +- kernel/ucount.c | 2 +- net/sysctl_net.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 60333a6b9370..f9214de0490c 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -206,7 +206,7 @@ struct ctl_table_root { struct ctl_table_set *(*lookup)(struct ctl_table_root *root); void (*set_ownership)(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid); - int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); + int (*permissions)(struct ctl_table_header *head, const struct ctl_table *table); }; #define register_sysctl(path, table) \ diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 1a5085e5b178..19b2a67aef40 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -204,7 +204,7 @@ static void ipc_set_ownership(struct ctl_table_header *head, *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; } -static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table) +static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table) { int mode = table->mode; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 6bb1c5397c69..43c0825da9e8 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -90,7 +90,7 @@ static void mq_set_ownership(struct ctl_table_header *head, *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; } -static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table) +static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table) { int mode = table->mode; kuid_t ns_root_uid; diff --git a/kernel/ucount.c b/kernel/ucount.c index e196da0204dc..4d5b9c12c014 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -38,7 +38,7 @@ static int set_is_seen(struct ctl_table_set *set) } static int set_permissions(struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct user_namespace *user_ns = container_of(head->set, struct user_namespace, set); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index a0a7a79991f9..f5017012a049 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -40,7 +40,7 @@ static int is_seen(struct ctl_table_set *set) /* Return standard mode bits for table entry. */ static int net_ctl_permissions(struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); -- cgit v1.2.3 From cd18bec668bb6221a54f03d0b645b7aed841f825 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 4 Apr 2024 17:57:38 +0200 Subject: sched/fair: Fix update of rd->sg_overutilized sg_overloaded is used instead of sg_overutilized to update rd->sg_overutilized. Fixes: 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240404155738.2866102-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1dd37168da50..bb1ae4ed0d91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10661,7 +10661,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized(env->dst_rq->rd, sg_overloaded); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } else if (sg_overutilized) { set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } -- cgit v1.2.3 From f1f8d0a224227e4f19a8a8881dc6355bdfc51230 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Mar 2024 10:16:13 +0100 Subject: sched/cpufreq: Take cpufreq feedback into account Aggregate the different pressures applied on the capacity of CPUs and create a new function that returns the actual capacity of the CPU: get_actual_cpu_capacity(). Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20240326091616.3696851-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb1ae4ed0d91..19199c119829 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4965,13 +4965,22 @@ done: trace_sched_util_est_se_tp(&p->se); } +static inline unsigned long get_actual_cpu_capacity(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + static inline int util_fits_cpu(unsigned long util, unsigned long uclamp_min, unsigned long uclamp_max, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; bool fits, uclamp_max_fits; /* @@ -5003,7 +5012,6 @@ static inline int util_fits_cpu(unsigned long util, * goal is to cap the task. So it's okay if it's getting less. */ capacity_orig = arch_scale_cpu_capacity(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -5078,7 +5086,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) return -1; return fits; @@ -7494,7 +7503,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = get_actual_cpu_capacity(cpu); /* * First, select CPU which fits better (-1 being better than 0). @@ -7987,8 +7996,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; int prev_fits = -1, best_fits = -1; - unsigned long best_thermal_cap = 0; - unsigned long prev_thermal_cap = 0; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -8018,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; - unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util; long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; unsigned long cur_delta, base_energy; @@ -8030,18 +8039,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpumask_empty(cpus)) continue; - /* Account thermal pressure for the energy estimation */ + /* Account external pressure for the energy estimation */ cpu = cpumask_first(cpus); - cpu_thermal_cap = arch_scale_cpu_capacity(cpu); - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + cpu_actual_cap = get_actual_cpu_capacity(cpu); - eenv.cpu_cap = cpu_thermal_cap; + eenv.cpu_cap = cpu_actual_cap; eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { struct rq *rq = cpu_rq(cpu); - eenv.pd_cap += cpu_thermal_cap; + eenv.pd_cap += cpu_actual_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; @@ -8112,7 +8120,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; - prev_thermal_cap = cpu_thermal_cap; + prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); } @@ -8127,7 +8135,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * but best energy cpu has better capacity. */ if ((max_fits < 0) && - (cpu_thermal_cap <= best_thermal_cap)) + (cpu_actual_cap <= best_actual_cap)) continue; cur_delta = compute_energy(&eenv, pd, cpus, p, @@ -8148,14 +8156,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; best_fits = max_fits; - best_thermal_cap = cpu_thermal_cap; + best_actual_cap = cpu_actual_cap; } } rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) target = best_energy_cpu; return target; @@ -9560,8 +9568,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -9573,12 +9581,9 @@ static unsigned long scale_rt_capacity(int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. - * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ used = cpu_util_rt(rq); used += cpu_util_dl(rq); - used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; -- cgit v1.2.3 From d4dbc991714eefcbd8d54a3204bd77a0a52bd32d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Mar 2024 10:16:15 +0100 Subject: sched/cpufreq: Rename arch_update_thermal_pressure() => arch_update_hw_pressure() Now that cpufreq provides a pressure value to the scheduler, rename arch_update_thermal_pressure into HW pressure to reflect that it returns a pressure applied by HW (i.e. with a high frequency change) and not always related to thermal mitigation but also generated by max current limitation as an example. Such high frequency signal needs filtering to be smoothed and provide an value that reflects the average available capacity into the scheduler time scale. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Qais Yousef Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20240326091616.3696851-5-vincent.guittot@linaro.org --- arch/arm/include/asm/topology.h | 6 +++--- arch/arm64/include/asm/topology.h | 6 +++--- drivers/base/arch_topology.c | 26 +++++++++++++------------- drivers/cpufreq/qcom-cpufreq-hw.c | 4 ++-- include/linux/arch_topology.h | 8 ++++---- include/linux/sched/topology.h | 8 ++++---- include/trace/events/hw_pressure.h | 29 +++++++++++++++++++++++++++++ include/trace/events/sched.h | 2 +- include/trace/events/thermal_pressure.h | 29 ----------------------------- init/Kconfig | 12 ++++++------ kernel/sched/core.c | 8 ++++---- kernel/sched/fair.c | 16 ++++++++-------- kernel/sched/pelt.c | 18 +++++++++--------- kernel/sched/pelt.h | 16 ++++++++-------- kernel/sched/sched.h | 10 +++++----- 15 files changed, 99 insertions(+), 99 deletions(-) create mode 100644 include/trace/events/hw_pressure.h delete mode 100644 include/trace/events/thermal_pressure.h (limited to 'kernel') diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 853c4f81ba4a..ad36b6570067 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -22,9 +22,9 @@ /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure API */ -#define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_update_thermal_pressure topology_update_thermal_pressure +/* Replace task scheduler's default HW pressure API */ +#define arch_scale_hw_pressure topology_get_hw_pressure +#define arch_update_hw_pressure topology_update_hw_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a323b109b9c4..0f6ef432fb84 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -35,9 +35,9 @@ void update_freq_counters_refs(void); /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure API */ -#define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_update_thermal_pressure topology_update_thermal_pressure +/* Replace task scheduler's default HW pressure API */ +#define arch_scale_hw_pressure topology_get_hw_pressure +#define arch_update_hw_pressure topology_update_hw_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 024b78a0cfc1..0248912ff687 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,7 +22,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) per_cpu(cpu_scale, cpu) = capacity; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); +DEFINE_PER_CPU(unsigned long, hw_pressure); /** - * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * topology_update_hw_pressure() - Update HW pressure for CPUs * @cpus : The related CPUs for which capacity has been reduced * @capped_freq : The maximum allowed frequency that CPUs can run at * - * Update the value of thermal pressure for all @cpus in the mask. The + * Update the value of HW pressure for all @cpus in the mask. The * cpumask should include all (online+offline) affected CPUs, to avoid * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to - * thermal capping. It might be also a boost frequency value, which is bigger + * HW capping. It might be also a boost frequency value, which is bigger * than the internal 'capacity_freq_ref' max frequency. In such case the * pressure value should simply be removed, since this is an indication that - * there is no thermal throttling. The @capped_freq must be provided in kHz. + * there is no HW throttling. The @capped_freq must be provided in kHz. */ -void topology_update_thermal_pressure(const struct cpumask *cpus, +void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq) { - unsigned long max_capacity, capacity, th_pressure; + unsigned long max_capacity, capacity, hw_pressure; u32 max_freq; int cpu; @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, /* * Handle properly the boost frequencies, which should simply clean - * the thermal pressure value. + * the HW pressure value. */ if (max_freq <= capped_freq) capacity = max_capacity; else capacity = mult_frac(max_capacity, capped_freq, max_freq); - th_pressure = max_capacity - capacity; + hw_pressure = max_capacity - capacity; - trace_thermal_pressure_update(cpu, th_pressure); + trace_hw_pressure_update(cpu, hw_pressure); for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); } -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); +EXPORT_SYMBOL_GPL(topology_update_hw_pressure); static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 70b0f21968a0..ec8df5496a0c 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) throttled_freq = freq_hz / HZ_PER_KHZ; - /* Update thermal pressure (the boost frequencies are accepted) */ - arch_update_thermal_pressure(policy->related_cpus, throttled_freq); + /* Update HW pressure (the boost frequencies are accepted) */ + arch_update_hw_pressure(policy->related_cpus, throttled_freq); /* * In the unlikely case policy is unregistered do not enable diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a63d61ca55af..b721f360d759 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -60,14 +60,14 @@ void topology_scale_freq_tick(void); void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); -DECLARE_PER_CPU(unsigned long, thermal_pressure); +DECLARE_PER_CPU(unsigned long, hw_pressure); -static inline unsigned long topology_get_thermal_pressure(int cpu) +static inline unsigned long topology_get_hw_pressure(int cpu) { - return per_cpu(thermal_pressure, cpu); + return per_cpu(hw_pressure, cpu); } -void topology_update_thermal_pressure(const struct cpumask *cpus, +void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq); struct cpu_topology { diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c8fe9bab981b..4237daa5ac7a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -270,17 +270,17 @@ unsigned long arch_scale_cpu_capacity(int cpu) } #endif -#ifndef arch_scale_thermal_pressure +#ifndef arch_scale_hw_pressure static __always_inline -unsigned long arch_scale_thermal_pressure(int cpu) +unsigned long arch_scale_hw_pressure(int cpu) { return 0; } #endif -#ifndef arch_update_thermal_pressure +#ifndef arch_update_hw_pressure static __always_inline -void arch_update_thermal_pressure(const struct cpumask *cpus, +void arch_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_frequency) { } #endif diff --git a/include/trace/events/hw_pressure.h b/include/trace/events/hw_pressure.h new file mode 100644 index 000000000000..b9cd68854128 --- /dev/null +++ b/include/trace/events/hw_pressure.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hw_pressure + +#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_THERMAL_PRESSURE_H + +#include + +TRACE_EVENT(hw_pressure_update, + TP_PROTO(int cpu, unsigned long hw_pressure), + TP_ARGS(cpu, hw_pressure), + + TP_STRUCT__entry( + __field(unsigned long, hw_pressure) + __field(int, cpu) + ), + + TP_fast_assign( + __entry->hw_pressure = hw_pressure; + __entry->cpu = cpu; + ), + + TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure) +); +#endif /* _TRACE_THERMAL_PRESSURE_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index dbb01b4b7451..d115d64c4011 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_thermal_tp, +DECLARE_TRACE(pelt_hw_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/thermal_pressure.h deleted file mode 100644 index b68680201360..000000000000 --- a/include/trace/events/thermal_pressure.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM thermal_pressure - -#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_THERMAL_PRESSURE_H - -#include - -TRACE_EVENT(thermal_pressure_update, - TP_PROTO(int cpu, unsigned long thermal_pressure), - TP_ARGS(cpu, thermal_pressure), - - TP_STRUCT__entry( - __field(unsigned long, thermal_pressure) - __field(int, cpu) - ), - - TP_fast_assign( - __entry->thermal_pressure = thermal_pressure; - __entry->cpu = cpu; - ), - - TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure) -); -#endif /* _TRACE_THERMAL_PRESSURE_H */ - -/* This part must be outside protection */ -#include diff --git a/init/Kconfig b/init/Kconfig index aa02aec6aa7d..f0c9117962ec 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -547,24 +547,24 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP -config SCHED_THERMAL_PRESSURE +config SCHED_HW_PRESSURE bool default y if ARM && ARM_CPU_TOPOLOGY default y if ARM64 depends on SMP depends on CPU_FREQ_THERMAL help - Select this option to enable thermal pressure accounting in the - scheduler. Thermal pressure is the value conveyed to the scheduler + Select this option to enable HW pressure accounting in the + scheduler. HW pressure is the value conveyed to the scheduler that reflects the reduction in CPU compute capacity resulted from - thermal throttling. Thermal throttling occurs when the performance of - a CPU is capped due to high operating temperatures. + HW throttling. HW throttling occurs when the performance of + a CPU is capped due to high operating temperatures as an example. If selected, the scheduler will be able to balance tasks accordingly, i.e. put less load on throttled CPUs than on non/less throttled ones. This requires the architecture to implement - arch_update_thermal_pressure() and arch_scale_thermal_pressure(). + arch_update_hw_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT bool "BSD Process Accounting" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0621e4ee31de..67a8302c3131 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -5668,7 +5668,7 @@ void sched_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; - unsigned long thermal_pressure; + unsigned long hw_pressure; u64 resched_latency; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) @@ -5679,8 +5679,8 @@ void sched_tick(void) rq_lock(rq, &rf); update_rq_clock(rq); - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19199c119829..eef39ae3efcf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,7 +78,7 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; +int sched_hw_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { int _shift = 0; @@ -86,7 +86,7 @@ static int __init setup_sched_thermal_decay_shift(char *str) if (kstrtoint(str, 0, &_shift)) pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - sched_thermal_decay_shift = clamp(_shift, 0, 10); + sched_hw_decay_shift = clamp(_shift, 0, 10); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -4969,7 +4969,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); return capacity; } @@ -5002,7 +5002,7 @@ static inline int util_fits_cpu(unsigned long util, * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * - * Only exception is for thermal pressure since it has a direct impact + * Only exception is for HW or cpufreq pressure since it has a direct impact * on available OPP of the system. * * We honour it for uclamp_min only as a drop in performance level @@ -9324,7 +9324,7 @@ static inline bool others_have_blocked(struct rq *rq) if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; if (cpu_util_irq(rq)) @@ -9354,7 +9354,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; + unsigned long hw_pressure; bool decayed; /* @@ -9363,11 +9363,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 3a96da25b67c..ef00382de595 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#ifdef CONFIG_SCHED_THERMAL_PRESSURE +#ifdef CONFIG_SCHED_HW_PRESSURE /* - * thermal: + * hardware: * * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked * * util_avg and runnable_load_avg are not supported and meaningless. * * Unlike rt/dl utilization tracking that track time spent by a cpu - * running a rt/dl task through util_avg, the average thermal pressure is - * tracked through load_avg. This is because thermal pressure signal is + * running a rt/dl task through util_avg, the average HW pressure is + * tracked through load_avg. This is because HW pressure signal is * time weighted "delta" capacity unlike util_avg which is binary. * "delta capacity" = actual capacity - - * capped capacity a cpu due to a thermal event. + * capped capacity a cpu due to a HW event. */ -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { - if (___update_load_sum(now, &rq->avg_thermal, + if (___update_load_sum(now, &rq->avg_hw, capacity, capacity, capacity)) { - ___update_load_avg(&rq->avg_thermal, 1); - trace_pelt_thermal_tp(rq); + ___update_load_avg(&rq->avg_hw, 1); + trace_pelt_hw_tp(rq); return 1; } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9e1083465fbc..2150062949d4 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#ifdef CONFIG_SCHED_THERMAL_PRESSURE -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +#ifdef CONFIG_SCHED_HW_PRESSURE +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { - return READ_ONCE(rq->avg_thermal.load_avg); + return READ_ONCE(rq->avg_hw.load_avg); } #else static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7c39dbf31f75..993edb02fb0d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1108,8 +1108,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; @@ -1561,11 +1561,11 @@ static inline u64 rq_clock_task(struct rq *rq) * 3 256 * 4 512 */ -extern int sched_thermal_decay_shift; +extern int sched_hw_decay_shift; -static inline u64 rq_clock_thermal(struct rq *rq) +static inline u64 rq_clock_hw(struct rq *rq) { - return rq_clock_task(rq) >> sched_thermal_decay_shift; + return rq_clock_task(rq) >> sched_hw_decay_shift; } static inline void rq_clock_skip_update(struct rq *rq) -- cgit v1.2.3 From 97450eb909658573dcacc1063b06d3d08642c0c1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Mar 2024 10:16:16 +0100 Subject: sched/pelt: Remove shift of thermal clock The optional shift of the clock used by thermal/hw load avg has been introduced to handle case where the signal was not always a high frequency hw signal. Now that cpufreq provides a signal for firmware and SW pressure, we can remove this exception and always keep this PELT signal aligned with other signals. Mark sysctl_sched_migration_cost boot parameter as deprecated Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Qais Yousef Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20240326091616.3696851-6-vincent.guittot@linaro.org --- Documentation/admin-guide/kernel-parameters.txt | 1 + kernel/sched/core.c | 2 +- kernel/sched/fair.c | 10 ++-------- kernel/sched/sched.h | 18 ------------------ 4 files changed, 4 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..3f390cc5f77e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5807,6 +5807,7 @@ but is useful for debugging and performance tuning. sched_thermal_decay_shift= + [Deprecated] [KNL, SMP] Set a decay shift for scheduler thermal pressure signal. Thermal pressure signal follows the default decay period of other scheduler pelt diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67a8302c3131..1a914388144a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5680,7 +5680,7 @@ void sched_tick(void) update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); - update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eef39ae3efcf..9eb63573110c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_hw_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_hw_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -9367,7 +9361,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) | + update_hw_load_avg(now, rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 993edb02fb0d..cb3792c04eea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1550,24 +1550,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. - * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_hw_decay_shift; - -static inline u64 rq_clock_hw(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_hw_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_rq_held(rq); -- cgit v1.2.3 From dc92febf7b93da5049fe177804e6b1961fcc6bd7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 09:00:23 -0700 Subject: bpf: Don't check for recursion in bpf_wq_work. __bpf_prog_enter_sleepable_recur does recursion check which is not applicable to wq callback. The callback function is part of bpf program and bpf prog might be running on the same cpu. So recursion check would incorrectly prevent callback from running. The code can call __bpf_prog_enter_sleepable(), but run_ctx would be fake, hence use explicit rcu_read_lock_trace(); migrate_disable(); to address this problem. Another reason to open code is __bpf_prog_enter* are not available in !JIT configs. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404241719.IIGdpAku-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202404241811.FFV4Bku3-lkp@intel.com/ Fixes: eb48f6cd41a0 ("bpf: wq: add bpf_wq_init") Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 047a21b7e4ba..5f61ffa7505a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1178,9 +1178,7 @@ out: static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); - struct bpf_tramp_run_ctx __maybe_unused run_ctx; struct bpf_async_cb *cb = &w->cb; - struct bpf_prog *prog = cb->prog; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; @@ -1190,7 +1188,7 @@ static void bpf_wq_work(struct work_struct *work) BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); - if (!callback_fn || !prog) + if (!callback_fn) return; if (map->map_type == BPF_MAP_TYPE_ARRAY) { @@ -1203,19 +1201,13 @@ static void bpf_wq_work(struct work_struct *work) key = value - round_up(map->key_size, 8); } - run_ctx.bpf_cookie = 0; - - if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { - /* recursion detected */ - __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); - return; - } + rcu_read_lock_trace(); + migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); - /* The verifier checked that return value is zero. */ - __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, - &run_ctx); + migrate_enable(); + rcu_read_unlock_trace(); } static void bpf_wq_delete_work(struct work_struct *work) -- cgit v1.2.3 From fc7566ad0a826cdc8886c5dbbb39ce72a0dc6333 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 24 Apr 2024 03:13:14 +0000 Subject: bpf: Introduce bpf_preempt_[disable,enable] kfuncs Introduce two new BPF kfuncs, bpf_preempt_disable and bpf_preempt_enable. These kfuncs allow disabling preemption in BPF programs. Nesting is allowed, since the intended use cases includes building native BPF spin locks without kernel helper involvement. Apart from that, this can be used to per-CPU data structures for cases where programs (or userspace) may preempt one or the other. Currently, while per-CPU access is stable, whether it will be consistent is not guaranteed, as only migration is disabled for BPF programs. Global functions are disallowed from being called, but support for them will be added as a follow up not just preempt kfuncs, but rcu_read_lock kfuncs as well. Static subprog calls are permitted. Sleepable helpers and kfuncs are disallowed in non-preemptible regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424031315.2757363-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/helpers.c | 12 ++++++++ kernel/bpf/verifier.c | 71 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 82 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9db35530c878..50aa87f8d77f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -421,6 +421,7 @@ struct bpf_verifier_state { struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; + u32 active_preempt_lock; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5f61ffa7505a..14c401b78e12 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2734,6 +2734,16 @@ __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc void bpf_preempt_disable(void) +{ + preempt_disable(); +} + +__bpf_kfunc void bpf_preempt_enable(void) +{ + preempt_enable(); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2814,6 +2824,8 @@ BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) +BTF_ID_FLAGS(func, bpf_preempt_disable) +BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9715c88cc025..8312c7a0ee19 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1434,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; @@ -9599,6 +9600,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + /* Only global subprogs cannot be called with preemption disabled. */ + if (env->cur_state->active_preempt_lock) { + verbose(env, "global function calls are not allowed with preemption disabled,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -10285,6 +10293,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + if (env->cur_state->active_preempt_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in non-preemptible region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (in_sleepable(env) && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11027,6 +11046,8 @@ enum special_kfunc_type { KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, KF_bpf_wq_set_callback_impl, + KF_bpf_preempt_disable, + KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, }; @@ -11081,6 +11102,8 @@ BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_preempt_disable) +BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11107,6 +11130,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; } +static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable]; +} + +static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -12195,11 +12228,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { - const struct btf_type *t, *ptr_type; + bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; + const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; @@ -12260,6 +12293,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + preempt_disable = is_kfunc_bpf_preempt_disable(&meta); + preempt_enable = is_kfunc_bpf_preempt_enable(&meta); + if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -12292,6 +12328,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + env->cur_state->active_preempt_lock--; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); + return -EACCES; + } + } else if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -15439,6 +15491,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -17006,6 +17063,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->active_preempt_lock != cur->active_preempt_lock) + return false; + if (old->in_sleepable != cur->in_sleepable) return false; @@ -17957,6 +18017,13 @@ process_bpf_exit_full: return -EINVAL; } + if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { + verbose(env, "%d bpf_preempt_enable%s missing\n", + env->cur_state->active_preempt_lock, + env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback -- cgit v1.2.3 From 91f098704c25106d88706fc9f8bcfce01fdb97df Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Apr 2024 21:51:54 +0800 Subject: workqueue: Fix divide error in wq_update_node_max_active() Yue Sun and xingwei lee reported a divide error bug in wq_update_node_max_active(): divide error: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 PID: 21 Comm: cpuhp/1 Not tainted 6.9.0-rc5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:wq_update_node_max_active+0x369/0x6b0 kernel/workqueue.c:1605 Code: 24 bf 00 00 00 80 44 89 fe e8 83 27 33 00 41 83 fc ff 75 0d 41 81 ff 00 00 00 80 0f 84 68 01 00 00 e8 fb 22 33 00 44 89 f8 99 <41> f7 fc 89 c5 89 c7 44 89 ee e8 a8 24 33 00 89 ef 8b 5c 24 04 89 RSP: 0018:ffffc9000018fbb0 EFLAGS: 00010293 RAX: 00000000000000ff RBX: 0000000000000001 RCX: ffff888100ada500 RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000080000000 RBP: 0000000000000001 R08: ffffffff815b1fcd R09: 1ffff1100364ad72 R10: dffffc0000000000 R11: ffffed100364ad73 R12: 0000000000000000 R13: 0000000000000100 R14: 0000000000000000 R15: 00000000000000ff FS: 0000000000000000(0000) GS:ffff888135c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb8c06ca6f8 CR3: 000000010d6c6000 CR4: 0000000000750ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: workqueue_offline_cpu+0x56f/0x600 kernel/workqueue.c:6525 cpuhp_invoke_callback+0x4e1/0x870 kernel/cpu.c:194 cpuhp_thread_fun+0x411/0x7d0 kernel/cpu.c:1092 smpboot_thread_fn+0x544/0xa10 kernel/smpboot.c:164 kthread+0x2ed/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 Modules linked in: ---[ end trace 0000000000000000 ]--- After analysis, it happens when all of the CPUs in a workqueue's affinity get offine. The problem can be easily reproduced by: # echo 8 > /sys/devices/virtual/workqueue//cpumask # echo 0 > /sys/devices/system/cpu/cpu3/online Use the default max_actives for nodes when all of the CPUs in the workqueue's affinity get offline to fix the problem. Reported-by: Yue Sun Reported-by: xingwei lee Link: https://lore.kernel.org/lkml/CAEkJfYPGS1_4JqvpSo0=FM0S1ytB8CEbyreLTtWpR900dUZymw@mail.gmail.com/ Fixes: 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues") Cc: stable@vger.kernel.org Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f536c63a48d..d2dbe099286b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1598,6 +1598,15 @@ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) if (off_cpu >= 0) total_cpus--; + /* If all CPUs of the wq get offline, use the default values */ + if (unlikely(!total_cpus)) { + for_each_node(node) + wq_node_nr_active(wq, node)->max = min_active; + + wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; + return; + } + for_each_node(node) { int node_cpus; -- cgit v1.2.3 From a60dd06af674d3bb76b40da5d722e4a0ecefe650 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 24 Apr 2024 18:03:41 +0900 Subject: genirq/cpuhotplug: Skip suspended interrupts when restoring affinity irq_restore_affinity_of_irq() restarts managed interrupts unconditionally when the first CPU in the affinity mask comes online. That's correct during normal hotplug operations, but not when resuming from S3 because the drivers are not resumed yet and interrupt delivery is not expected by them. Skip the startup of suspended interrupts and let resume_device_irqs() deal with restoring them. This ensures that irqs are not delivered to drivers during the noirq phase of resuming from S3, after non-boot CPUs are brought back online. Signed-off-by: David Stevens Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240424090341.72236-1-stevensd@chromium.org --- kernel/irq/cpuhotplug.c | 11 ++++++++--- kernel/irq/manage.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 1ed2b1739363..43340e0b6df0 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -195,10 +195,15 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - if (irqd_is_managed_and_shutdown(data)) { - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + /* + * Don't restore suspended interrupts here when a system comes back + * from S3. They are reenabled via resume_device_irqs(). + */ + if (desc->istate & IRQS_SUSPENDED) return; - } + + if (irqd_is_managed_and_shutdown(data)) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); /* * If the interrupt can only be directed to a single target diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ad3eaf2ab959..29c378d096aa 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -800,10 +800,14 @@ void __enable_irq(struct irq_desc *desc) irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the - * interrupt might be marked NOAUTOEN. So irq_startup() - * needs to be invoked when it gets enabled the first - * time. If it was already started up, then irq_startup() - * will invoke irq_enable() under the hood. + * interrupt might be marked NOAUTOEN so irq_startup() + * needs to be invoked when it gets enabled the first time. + * This is also required when __enable_irq() is invoked for + * a managed and shutdown interrupt from the S3 resume + * path. + * + * If it was already started up, then irq_startup() will + * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; -- cgit v1.2.3 From 88d724e2301a69c1ab805cd74fc27aa36ae529e0 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 23 Apr 2024 00:34:13 -0700 Subject: genirq/cpuhotplug: Retry with cpu_online_mask when migration fails When a CPU goes offline, the interrupts affine to that CPU are re-configured. Managed interrupts undergo either migration to other CPUs or shutdown if all CPUs listed in the affinity are offline. The migration of managed interrupts is guaranteed on x86 because there are interrupt vectors reserved. Regular interrupts are migrated to a still online CPU in the affinity mask or if there is no online CPU to any online CPU. This works as long as the still online CPUs in the affinity mask have interrupt vectors available, but in case that none of those CPUs has a vector available the migration fails and the device interrupt becomes stale. This is not any different from the case where the affinity mask does not contain any online CPU, but there is no fallback operation for this. Instead of giving up, retry the migration attempt with the online CPU mask if the interrupt is not managed, as managed interrupts cannot be affected by this problem. Signed-off-by: Dongli Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423073413.79625-1-dongli.zhang@oracle.com --- kernel/irq/cpuhotplug.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 43340e0b6df0..75cadbc3c232 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -130,6 +130,22 @@ static bool migrate_one_irq(struct irq_desc *desc) * CPU. */ err = irq_do_set_affinity(d, affinity, false); + + /* + * If there are online CPUs in the affinity mask, but they have no + * vectors left to make the migration work, try to break the + * affinity by migrating to any online CPU. + */ + if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) { + pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n", + d->irq, cpumask_pr_args(affinity)); + + affinity = cpu_online_mask; + brokeaff = true; + + err = irq_do_set_affinity(d, affinity, false); + } + if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.2.3 From 6678ae1918ff554f7438ff3f1a3be22d6d01f2fb Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Tue, 23 Apr 2024 02:40:37 +0000 Subject: genirq: Reuse irq_is_nmi() Move irq_is_nmi() to the internal header file and reuse it all over the place. Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423024037.3331215-1-ruanjinjie@huawei.com --- kernel/irq/internals.h | 5 +++++ kernel/irq/irqdesc.c | 5 ----- kernel/irq/manage.c | 16 ++++++++-------- kernel/irq/resend.c | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6c43ef3e7308..ed28059e9849 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -280,6 +280,11 @@ static inline int irq_desc_is_chained(struct irq_desc *desc) return (desc->action && desc->action == &chained_action); } +static inline bool irq_is_nmi(struct irq_desc *desc) +{ + return desc->istate & IRQS_NMI; +} + #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f0d4b98c7820..88ac3652fcf2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -965,11 +965,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -static bool irq_is_nmi(struct irq_desc *desc) -{ - return desc->istate & IRQS_NMI; -} - unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 29c378d096aa..61bdb45de3e0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -564,7 +564,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) /* The release function is promised process context */ might_sleep(); - if (!desc || desc->istate & IRQS_NMI) + if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ @@ -902,7 +902,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) return -EINVAL; /* Don't use NMIs as wake up interrupts please */ - if (desc->istate & IRQS_NMI) { + if (irq_is_nmi(desc)) { ret = -EINVAL; goto out_unlock; } @@ -1628,7 +1628,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ unsigned int oldtype; - if (desc->istate & IRQS_NMI) { + if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; @@ -2081,7 +2081,7 @@ const void *free_nmi(unsigned int irq, void *dev_id) unsigned long flags; const void *devname; - if (!desc || WARN_ON(!(desc->istate & IRQS_NMI))) + if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) @@ -2547,7 +2547,7 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; - if (WARN_ON(!(desc->istate & IRQS_NMI))) + if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); @@ -2683,7 +2683,7 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, return -EINVAL; /* The line cannot already be NMI */ - if (desc->istate & IRQS_NMI) + if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); @@ -2744,7 +2744,7 @@ int prepare_percpu_nmi(unsigned int irq) if (!desc) return -EINVAL; - if (WARN(!(desc->istate & IRQS_NMI), + if (WARN(!irq_is_nmi(desc), KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) { ret = -EINVAL; @@ -2786,7 +2786,7 @@ void teardown_percpu_nmi(unsigned int irq) if (!desc) return; - if (WARN_ON(!(desc->istate & IRQS_NMI))) + if (WARN_ON(!irq_is_nmi(desc))) goto out; irq_nmi_teardown(desc); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 5f2c66860ac6..b07a2d732ffb 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -190,7 +190,7 @@ int irq_inject_interrupt(unsigned int irq) * - not NMI type * - activated */ - if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data)) + if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data)) err = -EINVAL; else err = check_irq_resend(desc, true); -- cgit v1.2.3 From 3e1c6f35409f9e447bf37f64840f5b65576bfb78 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:21 -0700 Subject: bpf: make common crypto API for TC/XDP programs Add crypto API support to BPF to be able to decrypt or encrypt packets in TC/XDP BPF programs. Special care should be taken for initialization part of crypto algo because crypto alloc) doesn't work with preemtion disabled, it can be run only in sleepable BPF program. Also async crypto is not supported because of the very same issue - TC/XDP BPF programs are not sleepable. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-2-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + include/linux/bpf_crypto.h | 24 +++ kernel/bpf/Makefile | 3 + kernel/bpf/crypto.c | 385 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 1 + 6 files changed, 415 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_crypto.h create mode 100644 kernel/bpf/crypto.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 978200f6d925..364563b74db6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1275,6 +1275,7 @@ int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); diff --git a/include/linux/bpf_crypto.h b/include/linux/bpf_crypto.h new file mode 100644 index 000000000000..a41e71d4e2d9 --- /dev/null +++ b/include/linux/bpf_crypto.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_CRYPTO_H +#define _BPF_CRYPTO_H + +struct bpf_crypto_type { + void *(*alloc_tfm)(const char *algo); + void (*free_tfm)(void *tfm); + int (*has_algo)(const char *algo); + int (*setkey)(void *tfm, const u8 *key, unsigned int keylen); + int (*setauthsize)(void *tfm, unsigned int authsize); + int (*encrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv); + int (*decrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv); + unsigned int (*ivsize)(void *tfm); + unsigned int (*statesize)(void *tfm); + u32 (*get_flags)(void *tfm); + struct module *owner; + char name[14]; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type); +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type); + +#endif /* _BPF_CRYPTO_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 368c5d86b5b7..736bd22e5ce0 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +ifeq ($(CONFIG_CRYPTO),y) +obj-$(CONFIG_BPF_SYSCALL) += crypto.o +endif obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c new file mode 100644 index 000000000000..2bee4af91e38 --- /dev/null +++ b/kernel/bpf/crypto.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_crypto_type_list { + const struct bpf_crypto_type *type; + struct list_head list; +}; + +/* BPF crypto initialization parameters struct */ +/** + * struct bpf_crypto_params - BPF crypto initialization parameters structure + * @type: The string of crypto operation type. + * @reserved: Reserved member, will be reused for more options in future + * Values: + * 0 + * @algo: The string of algorithm to initialize. + * @key: The cipher key used to init crypto algorithm. + * @key_len: The length of cipher key. + * @authsize: The length of authentication tag used by algorithm. + */ +struct bpf_crypto_params { + char type[14]; + u8 reserved[2]; + char algo[128]; + u8 key[256]; + u32 key_len; + u32 authsize; +}; + +static LIST_HEAD(bpf_crypto_types); +static DECLARE_RWSEM(bpf_crypto_types_sem); + +/** + * struct bpf_crypto_ctx - refcounted BPF crypto context structure + * @type: The pointer to bpf crypto type + * @tfm: The pointer to instance of crypto API struct. + * @siv_len: Size of IV and state storage for cipher + * @rcu: The RCU head used to free the crypto context with RCU safety. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. + */ +struct bpf_crypto_ctx { + const struct bpf_crypto_type *type; + void *tfm; + u32 siv_len; + struct rcu_head rcu; + refcount_t usage; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -EEXIST; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (!strcmp(node->type->name, type->name)) + goto unlock; + } + + node = kmalloc(sizeof(*node), GFP_KERNEL); + err = -ENOMEM; + if (!node) + goto unlock; + + node->type = type; + list_add(&node->list, &bpf_crypto_types); + err = 0; + +unlock: + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_register_type); + +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -ENOENT; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, type->name)) + continue; + + list_del(&node->list); + kfree(node); + err = 0; + break; + } + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type); + +static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name) +{ + const struct bpf_crypto_type *type = ERR_PTR(-ENOENT); + struct bpf_crypto_type_list *node; + + down_read(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, name)) + continue; + + if (try_module_get(node->type->owner)) + type = node->type; + break; + } + up_read(&bpf_crypto_types_sem); + + return type; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_crypto_ctx_create() - Create a mutable BPF crypto context. + * + * Allocates a crypto context that can be used, acquired, and released by + * a BPF program. The crypto context returned by this function must either + * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release(). + * As crypto API functions use GFP_KERNEL allocations, this function can + * only be used in sleepable BPF programs. + * + * bpf_crypto_ctx_create() allocates memory for crypto context. + * It may return NULL if no memory is available. + * @params: pointer to struct bpf_crypto_params which contains all the + * details needed to initialise crypto context. + * @params__sz: size of steuct bpf_crypto_params usef by bpf program + * @err: integer to store error code when NULL is returned. + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz, + int *err) +{ + const struct bpf_crypto_type *type; + struct bpf_crypto_ctx *ctx; + + if (!params || params->reserved[0] || params->reserved[1] || + params__sz != sizeof(struct bpf_crypto_params)) { + *err = -EINVAL; + return NULL; + } + + type = bpf_crypto_get_type(params->type); + if (IS_ERR(type)) { + *err = PTR_ERR(type); + return NULL; + } + + if (!type->has_algo(params->algo)) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!!params->authsize ^ !!type->setauthsize) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!params->key_len || params->key_len > sizeof(params->key)) { + *err = -EINVAL; + goto err_module_put; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + *err = -ENOMEM; + goto err_module_put; + } + + ctx->type = type; + ctx->tfm = type->alloc_tfm(params->algo); + if (IS_ERR(ctx->tfm)) { + *err = PTR_ERR(ctx->tfm); + goto err_free_ctx; + } + + if (params->authsize) { + *err = type->setauthsize(ctx->tfm, params->authsize); + if (*err) + goto err_free_tfm; + } + + *err = type->setkey(ctx->tfm, params->key, params->key_len); + if (*err) + goto err_free_tfm; + + if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) { + *err = -EINVAL; + goto err_free_tfm; + } + + ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm); + + refcount_set(&ctx->usage, 1); + + return ctx; + +err_free_tfm: + type->free_tfm(ctx->tfm); +err_free_ctx: + kfree(ctx); +err_module_put: + module_put(type->owner); + + return NULL; +} + +static void crypto_free_cb(struct rcu_head *head) +{ + struct bpf_crypto_ctx *ctx; + + ctx = container_of(head, struct bpf_crypto_ctx, rcu); + ctx->type->free_tfm(ctx->tfm); + module_put(ctx->type->owner); + kfree(ctx); +} + +/** + * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context. + * @ctx: The BPF crypto context being acquired. The ctx must be a trusted + * pointer. + * + * Acquires a reference to a BPF crypto context. The context returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_crypto_ctx_release(). + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) +{ + if (!refcount_inc_not_zero(&ctx->usage)) + return NULL; + return ctx; +} + +/** + * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context. + * @ctx: The crypto context being released. + * + * Releases a previously acquired reference to a BPF crypto context. When the final + * reference of the BPF crypto context has been released, its memory + * will be released. + */ +__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->usage)) + call_rcu(&ctx->rcu, crypto_free_cb); +} + +static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv, + bool decrypt) +{ + u32 src_len, dst_len, siv_len; + const u8 *psrc; + u8 *pdst, *piv; + int err; + + if (__bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + siv_len = __bpf_dynptr_size(siv); + src_len = __bpf_dynptr_size(src); + dst_len = __bpf_dynptr_size(dst); + if (!src_len || !dst_len) + return -EINVAL; + + if (siv_len != ctx->siv_len) + return -EINVAL; + + psrc = __bpf_dynptr_data(src, src_len); + if (!psrc) + return -EINVAL; + pdst = __bpf_dynptr_data_rw(dst, dst_len); + if (!pdst) + return -EINVAL; + + piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL; + if (siv_len && !piv) + return -EINVAL; + + err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) + : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); + + return err; +} + +/** + * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, true); +} + +/** + * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, false); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(crypt_init_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_KFUNCS_END(crypt_init_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_init_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_init_kfunc_btf_ids, +}; + +BTF_KFUNCS_START(crypt_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU) +BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU) +BTF_KFUNCS_END(crypt_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_kfunc_btf_ids, +}; + +BTF_ID_LIST(bpf_crypto_dtor_ids) +BTF_ID(struct, bpf_crypto_ctx) +BTF_ID(func, bpf_crypto_ctx_release) + +static int __init crypto_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = { + { + .btf_id = bpf_crypto_dtor_ids[0], + .kfunc_btf_id = bpf_crypto_dtor_ids[1] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &crypt_init_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors, + ARRAY_SIZE(bpf_crypto_dtors), + THIS_MODULE); +} + +late_initcall(crypto_kfunc_init); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14c401b78e12..2a69a9a36c0f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1583,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8312c7a0ee19..4e474ef44e9c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5310,6 +5310,7 @@ BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) +BTF_ID(struct, bpf_crypto_ctx) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) -- cgit v1.2.3 From b7d56d953a67c839a514e382596d5af29b8d6d87 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 25 Apr 2024 09:30:16 +0000 Subject: cgroup/cpuset: Remove outdated comment in sched_partition_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment here is outdated and can cause confusion, from the code perspective, there’s also no need for new comment, so just remove it. Signed-off-by: Xiu Jianfeng Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f5443c039619..a10e4bd0c0c1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3774,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. - */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) -- cgit v1.2.3 From 91b71e78b8e478e1a9af36b15ff1d38810572866 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 16 Mar 2024 01:58:03 +0000 Subject: mm: memcg: add NULL check to obj_cgroup_put() 9 out of 16 callers perform a NULL check before calling obj_cgroup_put(). Move the NULL check in the function, similar to mem_cgroup_put(). The unlikely() NULL check in current_objcg_update() was left alone to avoid dropping the unlikey() annotation as this a fast path. Link: https://lkml.kernel.org/r/20240316015803.2777252-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 3 ++- kernel/bpf/memalloc.c | 6 ++---- mm/memcontrol.c | 18 ++++++------------ mm/zswap.c | 3 +-- 4 files changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 394fd0a887ae..b6264796815d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -818,7 +818,8 @@ static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - percpu_ref_put(&objcg->refcnt); + if (objcg) + percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 550f02e2cb13..a546aba46d5d 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -759,8 +759,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - if (ma->objcg) - obj_cgroup_put(ma->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -776,8 +775,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (ma->objcg) - obj_cgroup_put(ma->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fabce2b50c69..786c0fae6e30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2369,8 +2369,7 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } /* @@ -3145,8 +3144,7 @@ static struct obj_cgroup *current_objcg_update(void) if (old) { old = (struct obj_cgroup *) ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); old = NULL; } @@ -3418,8 +3416,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, mod_objcg_mlstate(objcg, pgdat, idx, nr); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3546,8 +3543,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -5468,8 +5464,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - if (memcg->orig_objcg) - obj_cgroup_put(memcg->orig_objcg); + obj_cgroup_put(memcg->orig_objcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); @@ -6620,8 +6615,7 @@ static void mem_cgroup_exit(struct task_struct *task) objcg = (struct obj_cgroup *) ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); /* * Some kernel allocations can happen after this point, diff --git a/mm/zswap.c b/mm/zswap.c index 6f8850c44b61..2817e2791c13 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1618,8 +1618,7 @@ put_pool: freepage: zswap_entry_cache_free(entry); reject: - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate the -- cgit v1.2.3 From a473573964e51dcb6efc182f773cd3924be4a184 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:33 -0700 Subject: lib: code tagging module support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for code tagging from dynamically loaded modules. Link: https://lkml.kernel.org/r/20240321163705.3067592-12-surenb@google.com Signed-off-by: Suren Baghdasaryan Co-developed-by: Kent Overstreet Signed-off-by: Kent Overstreet Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/codetag.h | 12 ++++++++++ kernel/module/main.c | 4 ++++ lib/codetag.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 7734269cdb63..c44f5b83f24d 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -33,6 +33,10 @@ union codetag_ref { struct codetag_type_desc { const char *section; size_t tag_size; + void (*module_load)(struct codetag_type *cttype, + struct codetag_module *cmod); + void (*module_unload)(struct codetag_type *cttype, + struct codetag_module *cmod); }; struct codetag_iterator { @@ -65,4 +69,12 @@ void codetag_to_text(struct seq_buf *out, struct codetag *ct); struct codetag_type * codetag_register_type(const struct codetag_type_desc *desc); +#if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) +void codetag_load_module(struct module *mod); +void codetag_unload_module(struct module *mod); +#else +static inline void codetag_load_module(struct module *mod) {} +static inline void codetag_unload_module(struct module *mod) {} +#endif + #endif /* _LINUX_CODETAG_H */ diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..ffa6b3e9cb43 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1242,6 +1243,7 @@ static void free_module(struct module *mod) { trace_module_free(mod); + codetag_unload_module(mod); mod_sysfs_teardown(mod); /* @@ -2995,6 +2997,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Get rid of temporary copy. */ free_copy(info, flags); + codetag_load_module(mod); + /* Done! */ trace_module_load(mod); diff --git a/lib/codetag.c b/lib/codetag.c index 8b5b89ad508d..54d2828eba25 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -124,15 +124,20 @@ static void *get_symbol(struct module *mod, const char *prefix, const char *name { DECLARE_SEQ_BUF(sb, KSYM_NAME_LEN); const char *buf; + void *ret; seq_buf_printf(&sb, "%s%s", prefix, name); if (seq_buf_has_overflowed(&sb)) return NULL; buf = seq_buf_str(&sb); - return mod ? + preempt_disable(); + ret = mod ? (void *)find_kallsyms_symbol_value(mod, buf) : (void *)kallsyms_lookup_name(buf); + preempt_enable(); + + return ret; } static struct codetag_range get_section_range(struct module *mod, @@ -173,8 +178,11 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) down_write(&cttype->mod_lock); err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); - if (err >= 0) + if (err >= 0) { cttype->count += range_size(cttype, &range); + if (cttype->desc.module_load) + cttype->desc.module_load(cttype, cmod); + } up_write(&cttype->mod_lock); if (err < 0) { @@ -185,6 +193,52 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) return 0; } +void codetag_load_module(struct module *mod) +{ + struct codetag_type *cttype; + + if (!mod) + return; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) + codetag_module_init(cttype, mod); + mutex_unlock(&codetag_lock); +} + +void codetag_unload_module(struct module *mod) +{ + struct codetag_type *cttype; + + if (!mod) + return; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + struct codetag_module *found = NULL; + struct codetag_module *cmod; + unsigned long mod_id, tmp; + + down_write(&cttype->mod_lock); + idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) { + if (cmod->mod && cmod->mod == mod) { + found = cmod; + break; + } + } + if (found) { + if (cttype->desc.module_unload) + cttype->desc.module_unload(cttype, cmod); + + cttype->count -= range_size(cttype, &cmod->range); + idr_remove(&cttype->mod_idr, mod_id); + kfree(cmod); + } + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + #else /* CONFIG_MODULES */ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { return 0; } #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 47a92dfbe01f41bcbf359250ccb3caa589763abf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:34 -0700 Subject: lib: prevent module unloading if memory is not freed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip freeing module's data section if there are non-zero allocation tags because otherwise, once these allocations are freed, the access to their code tag would cause UAF. Link: https://lkml.kernel.org/r/20240321163705.3067592-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Kent Overstreet Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/codetag.h | 6 +++--- kernel/module/main.c | 27 +++++++++++++++++++-------- lib/codetag.c | 11 ++++++++--- 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/codetag.h b/include/linux/codetag.h index c44f5b83f24d..bfd0ba5c4185 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -35,7 +35,7 @@ struct codetag_type_desc { size_t tag_size; void (*module_load)(struct codetag_type *cttype, struct codetag_module *cmod); - void (*module_unload)(struct codetag_type *cttype, + bool (*module_unload)(struct codetag_type *cttype, struct codetag_module *cmod); }; @@ -71,10 +71,10 @@ codetag_register_type(const struct codetag_type_desc *desc); #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) void codetag_load_module(struct module *mod); -void codetag_unload_module(struct module *mod); +bool codetag_unload_module(struct module *mod); #else static inline void codetag_load_module(struct module *mod) {} -static inline void codetag_unload_module(struct module *mod) {} +static inline bool codetag_unload_module(struct module *mod) { return true; } #endif #endif /* _LINUX_CODETAG_H */ diff --git a/kernel/module/main.c b/kernel/module/main.c index ffa6b3e9cb43..2d25eebc549d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1211,15 +1211,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) return module_alloc(size); } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(void *ptr, enum mod_mem_type type, + bool unload_codetags) { + if (!unload_codetags && mod_mem_type_is_core_data(type)) + return; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else module_memfree(ptr); } -static void free_mod_mem(struct module *mod) +static void free_mod_mem(struct module *mod, bool unload_codetags) { for_each_mod_mem_type(type) { struct module_memory *mod_mem = &mod->mem[type]; @@ -1230,20 +1234,27 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod_mem->base, type, + unload_codetags); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags); } /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { + bool unload_codetags; + trace_module_free(mod); - codetag_unload_module(mod); + unload_codetags = codetag_unload_module(mod); + if (!unload_codetags) + pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n", + mod->name); + mod_sysfs_teardown(mod); /* @@ -1285,7 +1296,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - free_mod_mem(mod); + free_mod_mem(mod, unload_codetags); } void *__symbol_get(const char *symbol) @@ -2298,7 +2309,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod->mem[t].base, t, true); return ret; } @@ -2428,7 +2439,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) percpu_modfree(mod); module_arch_freeing_init(mod); - free_mod_mem(mod); + free_mod_mem(mod, true); } int __weak module_finalize(const Elf_Ehdr *hdr, diff --git a/lib/codetag.c b/lib/codetag.c index 54d2828eba25..408062f722ce 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -5,6 +5,7 @@ #include #include #include +#include struct codetag_type { struct list_head link; @@ -206,12 +207,13 @@ void codetag_load_module(struct module *mod) mutex_unlock(&codetag_lock); } -void codetag_unload_module(struct module *mod) +bool codetag_unload_module(struct module *mod) { struct codetag_type *cttype; + bool unload_ok = true; if (!mod) - return; + return true; mutex_lock(&codetag_lock); list_for_each_entry(cttype, &codetag_types, link) { @@ -228,7 +230,8 @@ void codetag_unload_module(struct module *mod) } if (found) { if (cttype->desc.module_unload) - cttype->desc.module_unload(cttype, cmod); + if (!cttype->desc.module_unload(cttype, cmod)) + unload_ok = false; cttype->count -= range_size(cttype, &cmod->range); idr_remove(&cttype->mod_idr, mod_id); @@ -237,6 +240,8 @@ void codetag_unload_module(struct module *mod) up_write(&cttype->mod_lock); } mutex_unlock(&codetag_lock); + + return unload_ok; } #else /* CONFIG_MODULES */ -- cgit v1.2.3 From 8a2f11878771da65b8ac135c73b47dae13afbd62 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:39 -0700 Subject: change alloc_pages name in dma_map_ops to avoid name conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After redefining alloc_pages, all uses of that name are being replaced. Change the conflicting names to prevent preprocessor from replacing them when it's not intended. Link: https://lkml.kernel.org/r/20240321163705.3067592-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Kent Overstreet Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- arch/alpha/kernel/pci_iommu.c | 2 +- arch/mips/jazz/jazzdma.c | 2 +- arch/powerpc/kernel/dma-iommu.c | 2 +- arch/powerpc/platforms/ps3/system-bus.c | 4 ++-- arch/powerpc/platforms/pseries/vio.c | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- drivers/iommu/dma-iommu.c | 2 +- drivers/parisc/ccio-dma.c | 2 +- drivers/parisc/sba_iommu.c | 2 +- drivers/xen/grant-dma-ops.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/dma-map-ops.h | 2 +- kernel/dma/mapping.c | 4 ++-- 13 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index c81183935e97..7fcf3e9b7103 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -929,7 +929,7 @@ const struct dma_map_ops alpha_pci_ops = { .dma_supported = alpha_pci_supported, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(alpha_pci_ops); diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index eabddb89d221..c97b089b9902 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -617,7 +617,7 @@ const struct dma_map_ops jazz_dma_ops = { .sync_sg_for_device = jazz_dma_sync_sg_for_device, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(jazz_dma_ops); diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 8920862ffd79..f0ae39e77e37 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -216,6 +216,6 @@ const struct dma_map_ops dma_iommu_ops = { .get_required_mask = dma_iommu_get_required_mask, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index d6b5f5ecd515..56dc6b29a3e7 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -695,7 +695,7 @@ static const struct dma_map_ops ps3_sb_dma_ops = { .unmap_page = ps3_unmap_page, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; @@ -709,7 +709,7 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = { .unmap_page = ps3_unmap_page, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 90ff85c879bf..477c1d5e1737 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -611,7 +611,7 @@ static const struct dma_map_ops vio_dma_mapping_ops = { .get_required_mask = dma_iommu_get_required_mask, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 2ae98f754e59..c884deca839b 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, - .alloc_pages = dma_direct_alloc_pages, + .alloc_pages_op = dma_direct_alloc_pages, .free_pages = dma_direct_free_pages, }; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e4cb26f6a943..07d087eecc17 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1723,7 +1723,7 @@ static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, .free = iommu_dma_free, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, .free_noncontiguous = iommu_dma_free_noncontiguous, diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 9ce0d20a6c58..feef537257d0 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1022,7 +1022,7 @@ static const struct dma_map_ops ccio_ops = { .map_sg = ccio_map_sg, .unmap_sg = ccio_unmap_sg, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 784037837f65..fc3863c09f83 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1090,7 +1090,7 @@ static const struct dma_map_ops sba_ops = { .map_sg = sba_map_sg, .unmap_sg = sba_unmap_sg, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 76f6f26265a3..29257d2639db 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask) static const struct dma_map_ops xen_grant_dma_ops = { .alloc = xen_grant_dma_alloc, .free = xen_grant_dma_free, - .alloc_pages = xen_grant_dma_alloc_pages, + .alloc_pages_op = xen_grant_dma_alloc_pages, .free_pages = xen_grant_dma_free_pages, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 0e6c6c25d154..1c4ef5111651 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -403,7 +403,7 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { .dma_supported = xen_swiotlb_dma_supported, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, .max_mapping_size = swiotlb_max_mapping_size, }; diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 4abc60f04209..9ee319851b5f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -29,7 +29,7 @@ struct dma_map_ops { unsigned long attrs); void (*free)(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); - struct page *(*alloc_pages)(struct device *dev, size_t size, + struct page *(*alloc_pages_op)(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 58db8fd70471..5e2d51e1cdf6 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - if (!ops->alloc_pages) + if (!ops->alloc_pages_op) return NULL; - return ops->alloc_pages(dev, size, dma_handle, dir, gfp); + return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); } struct page *dma_alloc_pages(struct device *dev, size_t size, -- cgit v1.2.3 From 88ae5fb755b0d45f790b392f578924d1ca5693a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2024 09:36:52 -0700 Subject: mm: vmalloc: enable memory allocation profiling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This wrapps all external vmalloc allocation functions with the alloc_hooks() wrapper, and switches internal allocations to _noprof variants where appropriate, for the new memory allocation profiling feature. [surenb@google.com: arch/um: fix forward declaration for vmalloc] Link: https://lkml.kernel.org/r/20240326073750.726636-1-surenb@google.com [surenb@google.com: undo _noprof additions in the documentation] Link: https://lkml.kernel.org/r/20240326231453.1206227-5-surenb@google.com Link: https://lkml.kernel.org/r/20240321163705.3067592-31-surenb@google.com Signed-off-by: Kent Overstreet Signed-off-by: Suren Baghdasaryan Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- arch/um/include/shared/um_malloc.h | 3 +- drivers/staging/media/atomisp/pci/hmm/hmm.c | 2 +- include/linux/vmalloc.h | 60 +++++++++++++++++------ kernel/kallsyms_selftest.c | 2 +- mm/nommu.c | 56 +++++++++++----------- mm/util.c | 18 +++---- mm/vmalloc.c | 74 ++++++++++++++--------------- 7 files changed, 123 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/arch/um/include/shared/um_malloc.h b/arch/um/include/shared/um_malloc.h index 13da93284c2c..bf503658f08e 100644 --- a/arch/um/include/shared/um_malloc.h +++ b/arch/um/include/shared/um_malloc.h @@ -11,7 +11,8 @@ extern void *uml_kmalloc(int size, int flags); extern void kfree(const void *ptr); -extern void *vmalloc(unsigned long size); +extern void *vmalloc_noprof(unsigned long size); +#define vmalloc(...) vmalloc_noprof(__VA_ARGS__) extern void vfree(void *ptr); #endif /* __UM_MALLOC_H__ */ diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm.c b/drivers/staging/media/atomisp/pci/hmm/hmm.c index bb12644fd033..3e2899ad8517 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm.c @@ -205,7 +205,7 @@ static ia_css_ptr __hmm_alloc(size_t bytes, enum hmm_bo_type type, } dev_dbg(atomisp_dev, "pages: 0x%08x (%zu bytes), type: %d, vmalloc %p\n", - bo->start, bytes, type, vmalloc); + bo->start, bytes, type, vmalloc_noprof); return bo->start; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 98ea90e90439..e4a631ec430b 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -2,6 +2,8 @@ #ifndef _LINUX_VMALLOC_H #define _LINUX_VMALLOC_H +#include +#include #include #include #include @@ -138,26 +140,54 @@ extern unsigned long vmalloc_nr_pages(void); static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -extern void *vmalloc(unsigned long size) __alloc_size(1); -extern void *vzalloc(unsigned long size) __alloc_size(1); -extern void *vmalloc_user(unsigned long size) __alloc_size(1); -extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); -extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); -extern void *vmalloc_32(unsigned long size) __alloc_size(1); -extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -extern void *__vmalloc_node_range(unsigned long size, unsigned long align, +extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); +#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) + +extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); +#define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) + +extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); +#define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) + +extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); +#define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) + +extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); +#define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) + +extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); +#define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) + +extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); +#define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) + +extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +#define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) + +extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); -void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, +#define __vmalloc_node_range(...) alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) + +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +#define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) + +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) + +extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); +#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) + +extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); +#define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) + +extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); +#define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) -extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2); -extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2); +extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); +#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 8a689b4ff4f9..2f84896a7bcb 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -82,7 +82,7 @@ static struct test_item test_items[] = { ITEM_FUNC(kallsyms_test_func_static), ITEM_FUNC(kallsyms_test_func), ITEM_FUNC(kallsyms_test_func_weak), - ITEM_FUNC(vmalloc), + ITEM_FUNC(vmalloc_noprof), ITEM_FUNC(vfree), #ifdef CONFIG_KALLSYMS_ALL ITEM_DATA(kallsyms_test_var_bss_static), diff --git a/mm/nommu.c b/mm/nommu.c index 5ec8f44e7ce9..88b646ea6d30 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -137,28 +137,28 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() * returns only a logical address. */ - return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); + return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { - return __vmalloc(size, gfp_mask); + return __vmalloc_noprof(size, gfp_mask); } -void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc(size, gfp_mask); + return __vmalloc_noprof(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) @@ -179,11 +179,11 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) return ret; } -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); struct page *vmalloc_to_page(const void *addr) { @@ -217,13 +217,13 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc); +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -237,11 +237,11 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc) * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO); + return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -254,11 +254,11 @@ EXPORT_SYMBOL(vzalloc); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return vmalloc(size); + return vmalloc_noprof(size); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -272,11 +272,11 @@ EXPORT_SYMBOL(vmalloc_node); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return vzalloc(size); + return vzalloc_noprof(size); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) @@ -285,11 +285,11 @@ EXPORT_SYMBOL(vzalloc_node); * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. */ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -301,15 +301,15 @@ EXPORT_SYMBOL(vmalloc_32); * VM_USERMAP is set on the corresponding VMA so that subsequent calls to * remap_vmalloc_range() are permissible. */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { /* * We'll have to sort out the ZONE_DMA bits for 64-bit, * but for now this can simply use vmalloc_user() directly. */ - return vmalloc_user(size); + return vmalloc_user_noprof(size); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { diff --git a/mm/util.c b/mm/util.c index cee3563d2071..a9e911b22b99 100644 --- a/mm/util.c +++ b/mm/util.c @@ -656,7 +656,7 @@ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } @@ -720,7 +720,7 @@ EXPORT_SYMBOL(kvrealloc_noprof); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vmalloc_array(size_t n, size_t size, gfp_t flags) +void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -728,18 +728,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) return NULL; return __vmalloc(bytes, flags); } -EXPORT_SYMBOL(__vmalloc_array); +EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vmalloc_array(size_t n, size_t size) +void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_array); +EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. @@ -747,22 +747,22 @@ EXPORT_SYMBOL(vmalloc_array); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vcalloc(size_t n, size_t size, gfp_t flags) +void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array(n, size, flags | __GFP_ZERO); } -EXPORT_SYMBOL(__vcalloc); +EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vcalloc(size_t n, size_t size) +void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vcalloc); +EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(struct folio *folio) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 68fa001648cc..61af431bb6b8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3523,12 +3523,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3558,9 +3558,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, break; if (nid == NUMA_NO_NODE) - page = alloc_pages(alloc_gfp, order); + page = alloc_pages_noprof(alloc_gfp, order); else - page = alloc_pages_node(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, alloc_gfp, order); if (unlikely(!page)) { if (!nofail) break; @@ -3617,10 +3617,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { - area->pages = kmalloc_node(array_size, nested_gfp, node); + area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { @@ -3730,7 +3730,7 @@ fail: * * Return: the address of the area or %NULL on failure */ -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) @@ -3877,10 +3877,10 @@ fail: * * Return: pointer to the allocated memory or %NULL on error */ -void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* @@ -3889,15 +3889,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node); +EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory @@ -3911,12 +3911,12 @@ EXPORT_SYMBOL(__vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages @@ -3930,13 +3930,13 @@ EXPORT_SYMBOL(vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge); +EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill @@ -3951,12 +3951,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace @@ -3967,14 +3967,14 @@ EXPORT_SYMBOL(vzalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -3989,12 +3989,12 @@ EXPORT_SYMBOL(vmalloc_user); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -4007,12 +4007,12 @@ EXPORT_SYMBOL(vmalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) @@ -4035,12 +4035,12 @@ EXPORT_SYMBOL(vzalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -4051,14 +4051,14 @@ EXPORT_SYMBOL(vmalloc_32); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. -- cgit v1.2.3 From 46df8e73a4a3f1445f2a8429111e72ede1f4d291 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:45 +0000 Subject: mm: free up PG_slab Reclaim the Slab page flag by using a spare bit in PageType. We are perennially short of page flags for various purposes, and now that the original SLAB allocator has been retired, SLUB does not use the mapcount/page_type field. This lets us remove a number of special cases for ignoring mapcount on Slab pages. [willy@infradead.org: update vmcoreinfo] Link: https://lkml.kernel.org/r/ZgGV-O8WYQ_83kxp@casper.infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 21 +++++++++++++++++---- include/trace/events/mmflags.h | 2 +- kernel/vmcore_info.c | 3 ++- mm/memory-failure.c | 9 --------- mm/slab.h | 2 +- 5 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 94eb8a11a321..73e0b17c7728 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -109,7 +109,6 @@ enum pageflags { PG_active, PG_workingset, PG_error, - PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, PG_reserved, @@ -524,7 +523,6 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) -__PAGEFLAG(Slab, slab, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ @@ -931,7 +929,7 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* - * For pages that are never mapped to userspace (and aren't PageSlab), + * For pages that are never mapped to userspace, * page_type may be used. Because it is initialised to -1, we invert the * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and @@ -947,6 +945,7 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #define PG_table 0x00000200 #define PG_guard 0x00000400 #define PG_hugetlb 0x00000800 +#define PG_slab 0x00001000 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -1041,6 +1040,20 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) +FOLIO_TYPE_OPS(slab, slab) + +/** + * PageSlab - Determine if the page belongs to the slab allocator + * @page: The page to test. + * + * Context: Any context. + * Return: True for slab pages, false for any other kind of page. + */ +static inline bool PageSlab(const struct page *page) +{ + return folio_test_slab(page_folio(page)); +} + #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else @@ -1121,7 +1134,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ - 1UL << PG_slab | 1UL << PG_active | \ + 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index d55e53ac91bd..e46d6e82765e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -107,7 +107,6 @@ DEF_PAGEFLAG_NAME(lru), \ DEF_PAGEFLAG_NAME(active), \ DEF_PAGEFLAG_NAME(workingset), \ - DEF_PAGEFLAG_NAME(slab), \ DEF_PAGEFLAG_NAME(owner_priv_1), \ DEF_PAGEFLAG_NAME(arch_1), \ DEF_PAGEFLAG_NAME(reserved), \ @@ -135,6 +134,7 @@ IF_HAVE_PG_ARCH_X(arch_3) #define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) } #define __def_pagetype_names \ + DEF_PAGETYPE_NAME(slab), \ DEF_PAGETYPE_NAME(hugetlb), \ DEF_PAGETYPE_NAME(offline), \ DEF_PAGETYPE_NAME(guard), \ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 23c125c2e243..1d5eadd9dd61 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -198,7 +198,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); +#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab) + VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9e62a00b46dd..0a7a8a4ba421 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1251,7 +1251,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) #define mlock (1UL << PG_mlocked) #define lru (1UL << PG_lru) #define head (1UL << PG_head) -#define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) static struct page_state error_states[] = { @@ -1261,13 +1260,6 @@ static struct page_state error_states[] = { * PG_buddy pages only make a small fraction of all free pages. */ - /* - * Could in theory check if slab page is free or if we can drop - * currently unused objects without touching them. But just - * treat it as standard kernel for now. - */ - { slab, slab, MF_MSG_SLAB, me_kernel }, - { head, head, MF_MSG_HUGE, me_huge_page }, { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, @@ -1294,7 +1286,6 @@ static struct page_state error_states[] = { #undef mlock #undef lru #undef head -#undef slab #undef reserved static void update_per_node_mf_stats(unsigned long pfn, diff --git a/mm/slab.h b/mm/slab.h index 65db525e93af..1343bfa12cee 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -84,8 +84,8 @@ struct slab { }; struct rcu_head rcu_head; }; - unsigned int __unused; + unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_SLAB_OBJ_EXT unsigned long obj_exts; -- cgit v1.2.3 From 632230ff193954b514c908fdb45320f64ac9dc72 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 Mar 2024 20:28:28 +0000 Subject: mm: rename mm_put_huge_zero_page to mm_put_huge_zero_folio Also remove mm_get_huge_zero_page() now it has no users. Link: https://lkml.kernel.org/r/20240326202833.523759-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 ++------- kernel/fork.c | 2 +- mm/huge_memory.c | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7ba59ba36354..9d11e886aa9a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -367,12 +367,7 @@ static inline bool is_huge_zero_pud(pud_t pud) } struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); -void mm_put_huge_zero_page(struct mm_struct *mm); - -static inline struct page *mm_get_huge_zero_page(struct mm_struct *mm) -{ - return &mm_get_huge_zero_folio(mm)->page; -} +void mm_put_huge_zero_folio(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) @@ -500,7 +495,7 @@ static inline bool is_huge_zero_pud(pud_t pud) return false; } -static inline void mm_put_huge_zero_page(struct mm_struct *mm) +static inline void mm_put_huge_zero_folio(struct mm_struct *mm) { return; } diff --git a/kernel/fork.c b/kernel/fork.c index aebb3e6c96dc..99076dbe27d8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,7 +1344,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); - mm_put_huge_zero_page(mm); + mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 098b8ad4bcc7..c4820cd4749e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -241,7 +241,7 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) return READ_ONCE(huge_zero_folio); } -void mm_put_huge_zero_page(struct mm_struct *mm) +void mm_put_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); -- cgit v1.2.3 From 529ce23a764f25d172198b4c6ba90f1e2ad17f93 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:44 -0700 Subject: mm: switch mm->get_unmapped_area() to a flag The mm_struct contains a function pointer *get_unmapped_area(), which is set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() during the initialization of the mm. Since the function pointer only ever points to two functions that are named the same across all arch's, a function pointer is not really required. In addition future changes will want to add versions of the functions that take additional arguments. So to save a pointers worth of bytes in mm_struct, and prevent adding additional function pointers to mm_struct in future changes, remove it and keep the information about which get_unmapped_area() to use in a flag. Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by mmf_init_flags(). Most MM flags get clobbered on fork. In the pre-existing behavior mm->get_unmapped_area() would get copied to the new mm in dup_mm(), so not clobbering the flag preserves the existing behavior around inheriting the topdown-ness. Introduce a helper, mm_get_unmapped_area(), to easily convert code that refers to the old function pointer to instead select and call either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the flag. Then drop the mm->get_unmapped_area() function pointer. Leave the get_unmapped_area() pointer in struct file_operations alone. The main purpose of this change is to reorganize in preparation for future changes, but it also converts the calls of mm->get_unmapped_area() from indirect branches into a direct ones. The stress-ng bigheap benchmark calls realloc a lot, which calls through get_unmapped_area() in the kernel. On x86, the change yielded a ~1% improvement there on a retpoline config. In testing a few x86 configs, removing the pointer unfortunately didn't result in any actual size reductions in the compiled layout of mm_struct. But depending on compiler or arch alignment requirements, the change could shrink the size of mm_struct. Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Acked-by: Dave Hansen Acked-by: Liam R. Howlett Reviewed-by: Kirill A. Shutemov Acked-by: Alexei Starovoitov Cc: Dan Williams Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov (AMD) Cc: Christophe Leroy Cc: Deepak Gupta Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Kees Cook Cc: Mark Brown Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/s390/mm/hugetlbpage.c | 2 +- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 15 ++++++--------- arch/sparc/mm/hugetlbpage.c | 2 +- arch/x86/kernel/cpu/sgx/driver.c | 2 +- arch/x86/mm/hugetlbpage.c | 2 +- arch/x86/mm/mmap.c | 4 ++-- drivers/char/mem.c | 2 +- drivers/dax/device.c | 6 +++--- fs/hugetlbfs/inode.c | 4 ++-- fs/proc/inode.c | 3 ++- fs/ramfs/file-mmu.c | 2 +- include/linux/mm_types.h | 6 +----- include/linux/sched/coredump.h | 5 ++++- include/linux/sched/mm.h | 5 +++++ io_uring/io_uring.c | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/debug.c | 6 ------ mm/huge_memory.c | 9 ++++----- mm/mmap.c | 21 ++++++++++++++++++--- mm/shmem.c | 11 +++++------ mm/util.c | 6 +++--- 23 files changed, 66 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ca43b6fce71c..7d948e243f4b 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -318,7 +318,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, goto check_asce_limit; } - if (mm->get_unmapped_area == arch_get_unmapped_area) + if (!test_bit(MMF_TOPDOWN, &mm->flags)) addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index b14fc0887654..6b2e4436ad4a 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -185,10 +185,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 1e9a9e016237..1dbf7211666e 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -218,14 +218,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long align_goal, addr = -ENOMEM; - unsigned long (*get_area)(struct file *, unsigned long, - unsigned long, unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return get_area(NULL, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -238,7 +234,8 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u align_goal = (64UL * 1024); do { - addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); + addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, + len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); break; @@ -256,7 +253,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u * be obtained. */ if (addr & ~PAGE_MASK) - addr = get_area(NULL, orig_addr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); return addr; } @@ -292,7 +289,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap == RLIM_INFINITY || sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -303,7 +300,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap = (task_size / 6 * 5); mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 8ed5bdf95d25..c23012e3a353 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -123,7 +123,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, (!vma || addr + len <= vm_start_gap(vma))) return addr; } - if (mm->get_unmapped_area == arch_get_unmapped_area) + if (!test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 262f5fb18d74..22b65a5f5ec6 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index dab6db288e5b..06ca9a60bac2 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -115,7 +115,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } get_unmapped_area: - if (mm->get_unmapped_area == arch_get_unmapped_area) + if (!test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c90c20904a60..a2cabb1c81e1 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -129,9 +129,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); else - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, arch_rnd(mmap64_rnd_bits), task_size_64bit(0), diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 3c6670cf905f..9b80e622ae80 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -544,7 +544,7 @@ static unsigned long get_unmapped_area_zero(struct file *file, } /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */ - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); #else return -ENOSYS; #endif diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 93ebedc5ec8c..47c126d37b59 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -329,14 +329,14 @@ static unsigned long dax_get_unmapped_area(struct file *filp, if ((off + len_align) < off) goto out; - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); + addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align, + pgoff, flags); if (!IS_ERR_VALUE(addr_align)) { addr_align += (off - addr_align) & (align - 1); return addr_align; } out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } static const struct address_space_operations dev_dax_aops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6502c7e776d1..3dee18bf47ed 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -249,11 +249,11 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } /* - * Use mm->get_unmapped_area value as a hint to use topdown routine. + * Use MMF_TOPDOWN flag as a hint to use topdown routine. * If architectures have special needs, they should define their own * version of hugetlb_get_unmapped_area. */ - if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) + if (test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); return hugetlb_get_unmapped_area_bottomup(file, addr, len, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 75396a24fd8c..d19434e2a58e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -455,8 +455,9 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); #endif + return orig_addr; } diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index c7a1aa3c882b..b45c7edc3225 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fee6561feb7e..fa0d6995706f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,11 +777,7 @@ struct mm_struct { } ____cacheline_aligned_in_smp; struct maple_tree mm_mt; -#ifdef CONFIG_MMU - unsigned long (*get_unmapped_area) (struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); -#endif + unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 02f5090ffea2..e62ff805cfc9 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -92,9 +92,12 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_VM_MERGE_ANY 30 #define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_TOPDOWN 31 /* mm searches top down by default */ +#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) + #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ - MMF_VM_MERGE_ANY_MASK) + MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) static inline unsigned long mmf_init_flags(unsigned long flags) { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index b6543f9d78d6..ed1caa26c8be 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* * Routines for handling mm_structs @@ -186,6 +187,10 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); + unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c170a2b8d2cf..d5edfb8444d7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3525,7 +3525,7 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, #else addr = 0UL; #endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..16dbf4f6b77f 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -314,7 +314,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..9cb89e875f0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -980,7 +980,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); #else return addr; #endif diff --git a/mm/debug.c b/mm/debug.c index e8a96b8b7197..b71186f1fb0b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -177,9 +177,6 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { pr_emerg("mm %px task_size %lu\n" -#ifdef CONFIG_MMU - "get_unmapped_area %px\n" -#endif "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" @@ -205,9 +202,6 @@ void dump_mm(const struct mm_struct *mm) "def_flags: %#lx(%pGv)\n", mm, mm->task_size, -#ifdef CONFIG_MMU - mm->get_unmapped_area, -#endif mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 157cee64850c..7fdb2275a1e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -816,8 +816,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = current->mm->get_unmapped_area(filp, addr, len_pad, - off >> PAGE_SHIFT, flags); + ret = mm_get_unmapped_area(current->mm, filp, addr, len_pad, + off >> PAGE_SHIFT, flags); /* * The failure might be due to length padding. The caller will retry @@ -835,8 +835,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && - !off_sub) + if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) return ret + size; ret += off_sub; @@ -853,7 +852,7 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, if (ret) return ret; - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); diff --git a/mm/mmap.c b/mm/mmap.c index 77a625e13ec1..dfb8a518e8c9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1812,7 +1812,8 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long (*get_area)(struct file *, unsigned long, - unsigned long, unsigned long, unsigned long); + unsigned long, unsigned long, unsigned long) + = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) @@ -1822,7 +1823,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1841,7 +1841,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (!file) pgoff = 0; - addr = get_area(file, addr, len, pgoff, flags); + if (get_area) + addr = get_area(file, addr, len, pgoff, flags); + else + addr = mm_get_unmapped_area(current->mm, file, addr, len, + pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; @@ -1856,6 +1860,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +mm_get_unmapped_area(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + if (test_bit(MMF_TOPDOWN, &mm->flags)) + return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags); + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} +EXPORT_SYMBOL(mm_get_unmapped_area); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. diff --git a/mm/shmem.c b/mm/shmem.c index 98985179f495..fa2a0ed97507 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2267,8 +2267,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; @@ -2278,8 +2276,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, + flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2336,7 +2334,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, + inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -4801,7 +4800,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif diff --git a/mm/util.c b/mm/util.c index a9e911b22b99..c9e519e6811f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -469,17 +469,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } #endif -- cgit v1.2.3 From 25176ad09ca395fcc83b1fc78adf25c8eb1bd964 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 2 Apr 2024 14:55:15 +0200 Subject: mm/treewide: rename CONFIG_HAVE_FAST_GUP to CONFIG_HAVE_GUP_FAST Nowadays, we call it "GUP-fast", the external interface includes functions like "get_user_pages_fast()", and we renamed all internal functions to reflect that as well. Let's make the config option reflect that. Link: https://lkml.kernel.org/r/20240402125516.223131-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Peter Xu Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/loongarch/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/x86/Kconfig | 2 +- include/linux/rmap.h | 8 ++++---- kernel/events/core.c | 4 ++-- mm/Kconfig | 2 +- mm/gup.c | 10 +++++----- mm/internal.h | 2 +- 14 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b14aed3a17ab..817918f6635a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -99,7 +99,7 @@ config ARM select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD - select HAVE_FAST_GUP if ARM_LPAE + select HAVE_GUP_FAST if ARM_LPAE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b11c98b3e84..de076a191e9f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -205,7 +205,7 @@ config ARM64 select HAVE_SAMPLE_FTRACE_DIRECT select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_ERROR_INJECTION diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index a5f300ec6f28..cd642eefd9e5 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -119,7 +119,7 @@ config LOONGARCH select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN select HAVE_EXIT_THREAD - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 516dc7022bd7..f1aa1bf11166 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -68,7 +68,7 @@ config MIPS select HAVE_DYNAMIC_FTRACE select HAVE_EBPF_JIT if !CPU_MICROMIPS select HAVE_EXIT_THREAD - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..e42cc8cd415f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -236,7 +236,7 @@ config PPC select HAVE_DYNAMIC_FTRACE_WITH_REGS if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index be09c8836d56..3ee60ddef93e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -132,7 +132,7 @@ config RISCV select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION select HAVE_EBPF_JIT if MMU - select HAVE_FAST_GUP if MMU + select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8f01ada6845e..d9aed4c93ee6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -174,7 +174,7 @@ config S390 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2ad3e29f0ebe..7292542f75e8 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -38,7 +38,7 @@ config SUPERH select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DYNAMIC_FTRACE - select HAVE_FAST_GUP if MMU + select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4474bf32d0a4..d95de5eab213 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -221,7 +221,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA select HAVE_EXIT_THREAD - select HAVE_FAST_GUP + select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b7944a833668..9bf9324214fc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -284,7 +284,7 @@ static inline int hugetlb_try_share_anon_rmap(struct folio *folio) VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio); /* Paired with the memory barrier in try_grab_folio(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb(); if (unlikely(folio_maybe_dma_pinned(folio))) @@ -295,7 +295,7 @@ static inline int hugetlb_try_share_anon_rmap(struct folio *folio) * This is conceptually a smp_wmb() paired with the smp_rmb() in * gup_must_unshare(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb__after_atomic(); return 0; } @@ -541,7 +541,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, */ /* Paired with the memory barrier in try_grab_folio(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb(); if (unlikely(folio_maybe_dma_pinned(folio))) @@ -552,7 +552,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, * This is conceptually a smp_wmb() paired with the smp_rmb() in * gup_must_unshare(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb__after_atomic(); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 724e6d7e128f..c5a0dc1f135f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7539,7 +7539,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; -#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; @@ -7587,7 +7587,7 @@ again: if (pte_present(pte)) size = pte_leaf_size(pte); pte_unmap(ptep); -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ return size; } diff --git a/mm/Kconfig b/mm/Kconfig index f0ed3168db00..50df323eaece 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -473,7 +473,7 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_FAST_GUP +config HAVE_GUP_FAST depends on MMU bool diff --git a/mm/gup.c b/mm/gup.c index 7609efc55f1c..8dcbeae714e2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -501,7 +501,7 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) #ifdef CONFIG_MMU -#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_FAST_GUP) +#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST) static int record_subpages(struct page *page, unsigned long sz, unsigned long addr, unsigned long end, struct page **pages) @@ -515,7 +515,7 @@ static int record_subpages(struct page *page, unsigned long sz, return nr; } -#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */ #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, @@ -2782,7 +2782,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_HAVE_GUP_FAST /* * Used in the GUP-fast path to determine whether GUP is permitted to work on @@ -3361,7 +3361,7 @@ static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ #ifndef gup_fast_permitted /* @@ -3381,7 +3381,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, int nr_pinned = 0; unsigned seq; - if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) || !gup_fast_permitted(start, end)) return 0; diff --git a/mm/internal.h b/mm/internal.h index 640298a98a3c..8fd41f889a95 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1265,7 +1265,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* -- cgit v1.2.3 From 6b839b3b76cf17296ebd4a893841f32cae08229c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 5 Feb 2024 09:26:30 -0800 Subject: regset: use kvzalloc() for regset_get_alloc() While browsing through ChromeOS crash reports, I found one with an allocation failure that looked like this: chrome: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=urgent,mems_allowed=0 CPU: 7 PID: 3295 Comm: chrome Not tainted 5.15.133-20574-g8044615ac35c #1 (HASH:1162 1) Hardware name: Google Lazor (rev3 - 8) with KB Backlight (DT) Call trace: ... warn_alloc+0x104/0x174 __alloc_pages+0x5f0/0x6e4 kmalloc_order+0x44/0x98 kmalloc_order_trace+0x34/0x124 __kmalloc+0x228/0x36c __regset_get+0x68/0xcc regset_get_alloc+0x1c/0x28 elf_core_dump+0x3d8/0xd8c do_coredump+0xeb8/0x1378 get_signal+0x14c/0x804 ... An order 7 allocation is (1 << 7) contiguous pages, or 512K. It's not a surprise that this allocation failed on a system that's been running for a while. More digging showed that it was fairly easy to see the order 7 allocation by just sending a SIGQUIT to chrome (or other processes) to generate a core dump. The actual amount being allocated was 279,584 bytes and it was for "core_note_type" NT_ARM_SVE. There was quite a bit of discussion [1] on the mailing lists in response to my v1 patch attempting to switch to vmalloc. The overall conclusion was that we could likely reduce the 279,584 byte allocation by quite a bit and Mark Brown has sent a patch to that effect [2]. However even with the 279,584 byte allocation gone there are still 65,552 byte allocations. These are just barely more than the 65,536 bytes and thus would require an order 5 allocation. An order 5 allocation is still something to avoid unless necessary and nothing needs the memory here to be contiguous. Change the allocation to kvzalloc() which should still be efficient for small allocations but doesn't force the memory subsystem to work hard (and maybe fail) at getting a large contiguous chunk. [1] https://lore.kernel.org/r/20240201171159.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid [2] https://lore.kernel.org/r/20240203-arm64-sve-ptrace-regset-size-v1-1-2c3ba1386b9e@kernel.org Link: https://lkml.kernel.org/r/20240205092626.v2.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid Signed-off-by: Douglas Anderson Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Christian Brauner Cc: Dave Martin Cc: Eric Biederman Cc: Jan Kara Cc: Kees Cook Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 2 +- kernel/regset.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..ac178ad38823 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1928,7 +1928,7 @@ static void free_note_info(struct elf_note_info *info) threads = t->next; WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); for (i = 1; i < info->thread_notes; ++i) - kfree(t->notes[i].data); + kvfree(t->notes[i].data); kfree(t); } kfree(info->psinfo.data); diff --git a/kernel/regset.c b/kernel/regset.c index 586823786f39..b2871fa68b2a 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -16,14 +16,14 @@ static int __regset_get(struct task_struct *target, if (size > regset->n * regset->size) size = regset->n * regset->size; if (!p) { - to_free = p = kzalloc(size, GFP_KERNEL); + to_free = p = kvzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; } res = regset->regset_get(target, regset, (struct membuf){.p = p, .left = size}); if (res < 0) { - kfree(to_free); + kvfree(to_free); return res; } *data = p; @@ -71,6 +71,6 @@ int copy_regset_to_user(struct task_struct *target, ret = regset_get_alloc(target, regset, size, &buf); if (ret > 0) ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; - kfree(buf); + kvfree(buf); return ret; } -- cgit v1.2.3 From 56fd61628b7a5c1ff474619580796f11d5c8973a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Mar 2024 15:30:42 +0100 Subject: kcov: avoid clang out-of-range warning The area_size is never larger than the maximum on 64-bit architectutes: kernel/kcov.c:634:29: error: result of comparison of constant 1152921504606846975 with expression of type '__u32' (aka 'unsigned int') is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) ~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The compiler can correctly optimize the check away and the code appears correct to me, so just add a cast to avoid the warning. Link: https://lkml.kernel.org/r/20240328143051.1069575-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Justin Stitt Cc: Andrey Konovalov Cc: Bill Wendling Cc: Dmitry Vyukov Cc: Nathan Chancellor Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- kernel/kcov.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index f9ac2e9e460f..c3124f6d5536 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -627,7 +627,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, mode = kcov_get_mode(remote_arg->trace_mode); if (mode < 0) return mode; - if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) + if ((unsigned long)remote_arg->area_size > + LONG_MAX / sizeof(unsigned long)) return -EINVAL; kcov->mode = mode; t->kcov = kcov; -- cgit v1.2.3 From 051e7503070179f2278c0d4fa7dee441adb558ca Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Apr 2024 16:00:57 +0200 Subject: blktrace: convert strncpy() to strscpy_pad() gcc-9 warns about a possibly non-terminated string copy: kernel/trace/blktrace.c: In function 'do_blk_trace_setup': kernel/trace/blktrace.c:527:2: error: 'strncpy' specified bound 32 equals destination size [-Werror=stringop-truncation] Newer versions are fine here because they see the following explicit nul-termination. Using strscpy_pad() avoids the warning and simplifies the code a little. The padding helps give a clean buffer to userspace. Link: https://lkml.kernel.org/r/20240409140059.3806717-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Justin Stitt Cc: Alexey Starikovskiy Cc: Bob Moore Cc: Jens Axboe Cc: Len Brown Cc: Lin Ming Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Rafael J. Wysocki Cc: "Richard Russon (FlatCap)" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/trace/blktrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d5d94510afd3..8fd292d34d89 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); /* * some device names have larger paths - convert the slashes -- cgit v1.2.3 From e9730744bf3af04cda23799029342aa3cddbc454 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:34 +0100 Subject: kdb: Fix buffer overflow during tab-complete Currently, when the user attempts symbol completion with the Tab key, kdb will use strncpy() to insert the completed symbol into the command buffer. Unfortunately it passes the size of the source buffer rather than the destination to strncpy() with predictably horrible results. Most obviously if the command buffer is already full but cp, the cursor position, is in the middle of the buffer, then we will write past the end of the supplied buffer. Fix this by replacing the dubious strncpy() calls with memmove()/memcpy() calls plus explicit boundary checks to make sure we have enough space before we start moving characters around. Reported-by: Justin Stitt Closes: https://lore.kernel.org/all/CAFhGd8qESuuifuHsNjFPR-Va3P80bxrw+LqvC8deA8GziUJLpw@mail.gmail.com/ Cc: stable@vger.kernel.org Reviewed-by: Douglas Anderson Reviewed-by: Justin Stitt Tested-by: Justin Stitt Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-1-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9443bc63c5a2..06dfbccb1033 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -367,14 +367,19 @@ poll_again: kdb_printf(kdb_prompt_str); kdb_printf("%s", buffer); } else if (tab != 2 && count > 0) { - len_tmp = strlen(p_tmp); - strncpy(p_tmp+len_tmp, cp, lastchar-cp+1); - len_tmp = strlen(p_tmp); - strncpy(cp, p_tmp+len, len_tmp-len + 1); - len = len_tmp - len; - kdb_printf("%s", cp); - cp += len; - lastchar += len; + /* How many new characters do we want from tmpbuffer? */ + len_tmp = strlen(p_tmp) - len; + if (lastchar + len_tmp >= bufend) + len_tmp = bufend - lastchar; + + if (len_tmp) { + /* + 1 ensures the '\0' is memmove'd */ + memmove(cp+len_tmp, cp, (lastchar-cp) + 1); + memcpy(cp, p_tmp+len, len_tmp); + kdb_printf("%s", cp); + cp += len_tmp; + lastchar += len_tmp; + } } kdb_nextline = 1; /* reset output line number */ break; -- cgit v1.2.3 From 09b35989421dfd5573f0b4683c7700a7483c71f9 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:35 +0100 Subject: kdb: Use format-strings rather than '\0' injection in kdb_read() Currently when kdb_read() needs to reposition the cursor it uses copy and paste code that works by injecting an '\0' at the cursor position before delivering a carriage-return and reprinting the line (which stops at the '\0'). Tidy up the code by hoisting the copy and paste code into an appropriately named function. Additionally let's replace the '\0' injection with a proper field width parameter so that the string will be abridged during formatting instead. Cc: stable@vger.kernel.org # Not a bug fix but it is needed for later bug fixes Tested-by: Justin Stitt Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-2-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 55 +++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 06dfbccb1033..50789c99b3ba 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -184,6 +184,33 @@ char kdb_getchar(void) unreachable(); } +/** + * kdb_position_cursor() - Place cursor in the correct horizontal position + * @prompt: Nil-terminated string containing the prompt string + * @buffer: Nil-terminated string containing the entire command line + * @cp: Cursor position, pointer the character in buffer where the cursor + * should be positioned. + * + * The cursor is positioned by sending a carriage-return and then printing + * the content of the line until we reach the correct cursor position. + * + * There is some additional fine detail here. + * + * Firstly, even though kdb_printf() will correctly format zero-width fields + * we want the second call to kdb_printf() to be conditional. That keeps things + * a little cleaner when LOGGING=1. + * + * Secondly, we can't combine everything into one call to kdb_printf() since + * that renders into a fixed length buffer and the combined print could result + * in unwanted truncation. + */ +static void kdb_position_cursor(char *prompt, char *buffer, char *cp) +{ + kdb_printf("\r%s", kdb_prompt_str); + if (cp > buffer) + kdb_printf("%.*s", (int)(cp - buffer), buffer); +} + /* * kdb_read * @@ -212,7 +239,6 @@ static char *kdb_read(char *buffer, size_t bufsize) * and null byte */ char *lastchar; char *p_tmp; - char tmp; static char tmpbuffer[CMD_BUFLEN]; int len = strlen(buffer); int len_tmp; @@ -249,12 +275,8 @@ poll_again: } *(--lastchar) = '\0'; --cp; - kdb_printf("\b%s \r", cp); - tmp = *cp; - *cp = '\0'; - kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - *cp = tmp; + kdb_printf("\b%s ", cp); + kdb_position_cursor(kdb_prompt_str, buffer, cp); } break; case 10: /* linefeed */ @@ -272,19 +294,14 @@ poll_again: memcpy(tmpbuffer, cp+1, lastchar - cp - 1); memcpy(cp, tmpbuffer, lastchar - cp - 1); *(--lastchar) = '\0'; - kdb_printf("%s \r", cp); - tmp = *cp; - *cp = '\0'; - kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - *cp = tmp; + kdb_printf("%s ", cp); + kdb_position_cursor(kdb_prompt_str, buffer, cp); } break; case 1: /* Home */ if (cp > buffer) { - kdb_printf("\r"); - kdb_printf(kdb_prompt_str); cp = buffer; + kdb_position_cursor(kdb_prompt_str, buffer, cp); } break; case 5: /* End */ @@ -390,13 +407,9 @@ poll_again: memcpy(cp+1, tmpbuffer, lastchar - cp); *++lastchar = '\0'; *cp = key; - kdb_printf("%s\r", cp); + kdb_printf("%s", cp); ++cp; - tmp = *cp; - *cp = '\0'; - kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - *cp = tmp; + kdb_position_cursor(kdb_prompt_str, buffer, cp); } else { *++lastchar = '\0'; *cp++ = key; -- cgit v1.2.3 From db2f9c7dc29114f531df4a425d0867d01e1f1e28 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:36 +0100 Subject: kdb: Fix console handling when editing and tab-completing commands Currently, if the cursor position is not at the end of the command buffer and the user uses the Tab-complete functions, then the console does not leave the cursor in the correct position. For example consider the following buffer with the cursor positioned at the ^: md kdb_pro 10 ^ Pressing tab should result in: md kdb_prompt_str 10 ^ However this does not happen. Instead the cursor is placed at the end (after then 10) and further cursor movement redraws incorrectly. The same problem exists when we double-Tab but in a different part of the code. Fix this by sending a carriage return and then redisplaying the text to the left of the cursor. Cc: stable@vger.kernel.org Reviewed-by: Douglas Anderson Tested-by: Justin Stitt Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-3-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 50789c99b3ba..5fccb46f399e 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -383,6 +383,8 @@ poll_again: kdb_printf("\n"); kdb_printf(kdb_prompt_str); kdb_printf("%s", buffer); + if (cp != lastchar) + kdb_position_cursor(kdb_prompt_str, buffer, cp); } else if (tab != 2 && count > 0) { /* How many new characters do we want from tmpbuffer? */ len_tmp = strlen(p_tmp) - len; @@ -396,6 +398,9 @@ poll_again: kdb_printf("%s", cp); cp += len_tmp; lastchar += len_tmp; + if (cp != lastchar) + kdb_position_cursor(kdb_prompt_str, + buffer, cp); } } kdb_nextline = 1; /* reset output line number */ -- cgit v1.2.3 From 6244917f377bf64719551b58592a02a0336a7439 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:37 +0100 Subject: kdb: Merge identical case statements in kdb_read() The code that handles case 14 (down) and case 16 (up) has been copy and pasted despite being byte-for-byte identical. Combine them. Cc: stable@vger.kernel.org # Not a bug fix but it is needed for later bug fixes Reviewed-by: Douglas Anderson Tested-by: Justin Stitt Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-4-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 5fccb46f399e..a73779529803 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -317,6 +317,7 @@ poll_again: } break; case 14: /* Down */ + case 16: /* Up */ memset(tmpbuffer, ' ', strlen(kdb_prompt_str) + (lastchar-buffer)); *(tmpbuffer+strlen(kdb_prompt_str) + @@ -331,15 +332,6 @@ poll_again: ++cp; } break; - case 16: /* Up */ - memset(tmpbuffer, ' ', - strlen(kdb_prompt_str) + (lastchar-buffer)); - *(tmpbuffer+strlen(kdb_prompt_str) + - (lastchar-buffer)) = '\0'; - kdb_printf("\r%s\r", tmpbuffer); - *lastchar = (char)key; - *(lastchar+1) = '\0'; - return lastchar; case 9: /* Tab */ if (tab < 2) ++tab; -- cgit v1.2.3 From c9b51ddb66b1d96e4d364c088da0f1dfb004c574 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:38 +0100 Subject: kdb: Use format-specifiers rather than memset() for padding in kdb_read() Currently when the current line should be removed from the display kdb_read() uses memset() to fill a temporary buffer with spaces. The problem is not that this could be trivially implemented using a format string rather than open coding it. The real problem is that it is possible, on systems with a long kdb_prompt_str, to write past the end of the tmpbuffer. Happily, as mentioned above, this can be trivially implemented using a format string. Make it so! Cc: stable@vger.kernel.org Reviewed-by: Douglas Anderson Tested-by: Justin Stitt Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-5-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index a73779529803..2aeaf9765b24 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -318,11 +318,9 @@ poll_again: break; case 14: /* Down */ case 16: /* Up */ - memset(tmpbuffer, ' ', - strlen(kdb_prompt_str) + (lastchar-buffer)); - *(tmpbuffer+strlen(kdb_prompt_str) + - (lastchar-buffer)) = '\0'; - kdb_printf("\r%s\r", tmpbuffer); + kdb_printf("\r%*c\r", + (int)(strlen(kdb_prompt_str) + (lastchar - buffer)), + ' '); *lastchar = (char)key; *(lastchar+1) = '\0'; return lastchar; -- cgit v1.2.3 From 80bd73c154e3063c4f9293163daf3262335f9f86 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:39 +0100 Subject: kdb: Replace double memcpy() with memmove() in kdb_read() At several points in kdb_read() there are variants of the following code pattern (with offsets slightly altered): memcpy(tmpbuffer, cp, lastchar - cp); memcpy(cp-1, tmpbuffer, lastchar - cp); *(--lastchar) = '\0'; There is no need to use tmpbuffer here, since we can use memmove() instead so refactor in the obvious way. Additionally the strings that are being copied are already properly terminated so let's also change the code so that the library calls also move the terminator. Changing how the terminators are managed has no functional effect for now but might allow us to retire lastchar at a later point. lastchar, although stored as a pointer, is functionally equivalent to caching strlen(buffer). Reviewed-by: Douglas Anderson Tested-by: Justin Stitt Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-6-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 2aeaf9765b24..40617f36a6db 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -269,12 +269,9 @@ poll_again: switch (key) { case 8: /* backspace */ if (cp > buffer) { - if (cp < lastchar) { - memcpy(tmpbuffer, cp, lastchar - cp); - memcpy(cp-1, tmpbuffer, lastchar - cp); - } - *(--lastchar) = '\0'; - --cp; + memmove(cp-1, cp, lastchar - cp + 1); + lastchar--; + cp--; kdb_printf("\b%s ", cp); kdb_position_cursor(kdb_prompt_str, buffer, cp); } @@ -291,9 +288,8 @@ poll_again: return buffer; case 4: /* Del */ if (cp < lastchar) { - memcpy(tmpbuffer, cp+1, lastchar - cp - 1); - memcpy(cp, tmpbuffer, lastchar - cp - 1); - *(--lastchar) = '\0'; + memmove(cp, cp+1, lastchar - cp); + lastchar--; kdb_printf("%s ", cp); kdb_position_cursor(kdb_prompt_str, buffer, cp); } @@ -398,9 +394,8 @@ poll_again: default: if (key >= 32 && lastchar < bufend) { if (cp < lastchar) { - memcpy(tmpbuffer, cp, lastchar - cp); - memcpy(cp+1, tmpbuffer, lastchar - cp); - *++lastchar = '\0'; + memmove(cp+1, cp, lastchar - cp + 1); + lastchar++; *cp = key; kdb_printf("%s", cp); ++cp; -- cgit v1.2.3 From 64d504cfcd514743aaed3a5b79c060f0143149e9 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 24 Apr 2024 15:03:40 +0100 Subject: kdb: Simplify management of tmpbuffer in kdb_read() The current approach to filling tmpbuffer with completion candidates is confusing, with the buffer management being especially hard to reason about. That's because it doesn't copy the completion canidate into tmpbuffer, instead of copies a whole bunch of other nonsense and then runs the completion search from the middle of tmpbuffer! Change this to copy nothing but the completion candidate into tmpbuffer. Pretty much everything else in this patch is renaming to reflect the above change: s/p_tmp/tmpbuffer/ s/buf_size/sizeof(tmpbuffer)/ Reviewed-by: Douglas Anderson Tested-by: Justin Stitt Link: https://lore.kernel.org/r/20240424-kgdb_read_refactor-v3-7-f236dbe9828d@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 40617f36a6db..3131334d7a81 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -239,6 +239,7 @@ static char *kdb_read(char *buffer, size_t bufsize) * and null byte */ char *lastchar; char *p_tmp; + char tmp; static char tmpbuffer[CMD_BUFLEN]; int len = strlen(buffer); int len_tmp; @@ -246,8 +247,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int count; int i; int diag, dtab_count; - int key, buf_size, ret; - + int key, ret; diag = kdbgetintenv("DTABCOUNT", &dtab_count); if (diag) @@ -329,21 +329,16 @@ poll_again: case 9: /* Tab */ if (tab < 2) ++tab; - p_tmp = buffer; - while (*p_tmp == ' ') - p_tmp++; - if (p_tmp > cp) - break; - memcpy(tmpbuffer, p_tmp, cp-p_tmp); - *(tmpbuffer + (cp-p_tmp)) = '\0'; - p_tmp = strrchr(tmpbuffer, ' '); - if (p_tmp) - ++p_tmp; - else - p_tmp = tmpbuffer; - len = strlen(p_tmp); - buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer); - count = kallsyms_symbol_complete(p_tmp, buf_size); + + tmp = *cp; + *cp = '\0'; + p_tmp = strrchr(buffer, ' '); + p_tmp = (p_tmp ? p_tmp + 1 : buffer); + strscpy(tmpbuffer, p_tmp, sizeof(tmpbuffer)); + *cp = tmp; + + len = strlen(tmpbuffer); + count = kallsyms_symbol_complete(tmpbuffer, sizeof(tmpbuffer)); if (tab == 2 && count > 0) { kdb_printf("\n%d symbols are found.", count); if (count > dtab_count) { @@ -355,14 +350,14 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - ret = kallsyms_symbol_next(p_tmp, i, buf_size); + ret = kallsyms_symbol_next(tmpbuffer, i, sizeof(tmpbuffer)); if (WARN_ON(!ret)) break; if (ret != -E2BIG) - kdb_printf("%s ", p_tmp); + kdb_printf("%s ", tmpbuffer); else - kdb_printf("%s... ", p_tmp); - *(p_tmp + len) = '\0'; + kdb_printf("%s... ", tmpbuffer); + tmpbuffer[len] = '\0'; } if (i >= dtab_count) kdb_printf("..."); @@ -373,14 +368,14 @@ poll_again: kdb_position_cursor(kdb_prompt_str, buffer, cp); } else if (tab != 2 && count > 0) { /* How many new characters do we want from tmpbuffer? */ - len_tmp = strlen(p_tmp) - len; + len_tmp = strlen(tmpbuffer) - len; if (lastchar + len_tmp >= bufend) len_tmp = bufend - lastchar; if (len_tmp) { /* + 1 ensures the '\0' is memmove'd */ memmove(cp+len_tmp, cp, (lastchar-cp) + 1); - memcpy(cp, p_tmp+len, len_tmp); + memcpy(cp, tmpbuffer+len, len_tmp); kdb_printf("%s", cp); cp += len_tmp; lastchar += len_tmp; -- cgit v1.2.3 From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:08 +0000 Subject: bpf: verifier: prevent userspace memory access With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To thwart invalid memory accesses, the JITs add an exception table entry for all such accesses. But in case the src_reg + offset is a userspace address, the BPF program might read that memory if the user has mapped it. Make the verifier add guard instructions around such memory accesses and skip the load if the address falls into the userspace region. The JITs need to implement bpf_arch_uaddress_limit() to define where the userspace addresses end for that architecture or TASK_SIZE is taken as default. The implementation is as follows: REG_AX = SRC_REG if(offset) REG_AX += offset; REG_AX >>= 32; if (REG_AX <= (uaddress_limit >> 32)) DST_REG = 0; else DST_REG = *(size *)(SRC_REG + offset); Comparing just the upper 32 bits of the load address with the upper 32 bits of uaddress_limit implies that the values are being aligned down to a 4GB boundary before comparison. The above means that all loads with address <= uaddress_limit + 4GB are skipped. This is acceptable because there is a large hole (much larger than 4GB) between userspace and kernel space memory, therefore a correctly functioning BPF program should not access this 4GB memory above the userspace. Let's analyze what this patch does to the following fentry program dereferencing an untrusted pointer: SEC("fentry/tcp_v4_connect") int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) { *(volatile long *)sk; return 0; } BPF Program before | BPF Program after ------------------ | ----------------- 0: (79) r1 = *(u64 *)(r1 +0) 0: (79) r1 = *(u64 *)(r1 +0) ----------------------------------------------------------------------- 1: (79) r1 = *(u64 *)(r1 +0) --\ 1: (bf) r11 = r1 ----------------------------\ \ 2: (77) r11 >>= 32 2: (b7) r0 = 0 \ \ 3: (b5) if r11 <= 0x8000 goto pc+2 3: (95) exit \ \-> 4: (79) r1 = *(u64 *)(r1 +0) \ 5: (05) goto pc+1 \ 6: (b7) r1 = 0 \-------------------------------------- 7: (b7) r0 = 0 8: (95) exit As you can see from above, in the best case (off=0), 5 extra instructions are emitted. Now, we analyze the same program after it has gone through the JITs of ARM64 and RISC-V architectures. We follow the single load instruction that has the untrusted pointer and see what instrumentation has been added around it. x86-64 JIT ========== JIT's Instrumentation (upstream) --------------------- 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: mov 0x0(%rdi),%rdi --------------------------------- f: movabs $0x800000000000,%r11 19: cmp %r11,%rdi 1c: jb 0x000000000000002a 1e: mov %rdi,%r11 21: add $0x0,%r11 28: jae 0x000000000000002e 2a: xor %edi,%edi 2c: jmp 0x0000000000000032 2e: mov 0x0(%rdi),%rdi --------------------------------- 32: xor %eax,%eax 34: leave 35: ret The x86-64 JIT already emits some instructions to protect against user memory access. This patch doesn't make any changes for the x86-64 JIT. ARM64 JIT ========= No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: add x9, x30, #0x0 0: add x9, x30, #0x0 4: nop 4: nop 8: paciasp 8: paciasp c: stp x29, x30, [sp, #-16]! c: stp x29, x30, [sp, #-16]! 10: mov x29, sp 10: mov x29, sp 14: stp x19, x20, [sp, #-16]! 14: stp x19, x20, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 24: mov x25, sp 24: mov x25, sp 28: mov x26, #0x0 28: mov x26, #0x0 2c: sub x27, x25, #0x0 2c: sub x27, x25, #0x0 30: sub sp, sp, #0x0 30: sub sp, sp, #0x0 34: ldr x0, [x0] 34: ldr x0, [x0] -------------------------------------------------------------------------------- 38: ldr x0, [x0] ----------\ 38: add x9, x0, #0x0 -----------------------------------\\ 3c: lsr x9, x9, #32 3c: mov x7, #0x0 \\ 40: cmp x9, #0x10, lsl #12 40: mov sp, sp \\ 44: b.ls 0x0000000000000050 44: ldp x27, x28, [sp], #16 \\--> 48: ldr x0, [x0] 48: ldp x25, x26, [sp], #16 \ 4c: b 0x0000000000000054 4c: ldp x21, x22, [sp], #16 \ 50: mov x0, #0x0 50: ldp x19, x20, [sp], #16 \--------------------------------------- 54: ldp x29, x30, [sp], #16 54: mov x7, #0x0 58: add x0, x7, #0x0 58: mov sp, sp 5c: autiasp 5c: ldp x27, x28, [sp], #16 60: ret 60: ldp x25, x26, [sp], #16 64: nop 64: ldp x21, x22, [sp], #16 68: ldr x10, 0x0000000000000070 68: ldp x19, x20, [sp], #16 6c: br x10 6c: ldp x29, x30, [sp], #16 70: add x0, x7, #0x0 74: autiasp 78: ret 7c: nop 80: ldr x10, 0x0000000000000088 84: br x10 There are 6 extra instructions added in ARM64 in the best case. This will become 7 in the worst case (off != 0). RISC-V JIT (RISCV_ISA_C Disabled) ========== No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: nop 0: nop 4: nop 4: nop 8: li a6, 33 8: li a6, 33 c: addi sp, sp, -16 c: addi sp, sp, -16 10: sd s0, 8(sp) 10: sd s0, 8(sp) 14: addi s0, sp, 16 14: addi s0, sp, 16 18: ld a0, 0(a0) 18: ld a0, 0(a0) --------------------------------------------------------------- 1c: ld a0, 0(a0) --\ 1c: mv t0, a0 --------------------------\ \ 20: srli t0, t0, 32 20: li a5, 0 \ \ 24: lui t1, 4096 24: ld s0, 8(sp) \ \ 28: sext.w t1, t1 28: addi sp, sp, 16 \ \ 2c: bgeu t1, t0, 12 2c: sext.w a0, a5 \ \--> 30: ld a0, 0(a0) 30: ret \ 34: j 8 \ 38: li a0, 0 \------------------------------ 3c: li a5, 0 40: ld s0, 8(sp) 44: addi sp, sp, 16 48: sext.w a0, a5 4c: ret There are 7 extra instructions added in RISC-V. Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables") Reported-by: Breno Leitao Suggested-by: Alexei Starovoitov Acked-by: Ilya Leoshkevich Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 6 ++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index df5fac428408..b520b66ad14b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3473,3 +3473,9 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +/* x86-64 JIT emits its own code to filter user addresses so return 0 here */ +u64 bpf_arch_uaddress_limit(void) +{ + return 0; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index c99bc3df2d28..219ee7a76874 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -963,6 +963,7 @@ bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de8e8..1ea5ce5bb599 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void) return false; } +u64 __weak bpf_arch_uaddress_limit(void) +{ +#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) + return TASK_SIZE; +#else + return 0; +#endif +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa24beb65393..cb7ad1f795e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Make it impossible to de-reference a userspace address */ + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { + struct bpf_insn *patch = &insn_buf[0]; + u64 uaddress_limit = bpf_arch_uaddress_limit(); + + if (!uaddress_limit) + goto next_insn; + + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + if (insn->off) + *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); + *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); + *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); + *patch++ = *insn; + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || -- cgit v1.2.3 From 1dd1eff161bd55968d3d46bc36def62d71fb4785 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 27 Apr 2024 18:28:08 +0800 Subject: softirq: Fix suspicious RCU usage in __do_softirq() Currently, the condition "__this_cpu_read(ksoftirqd) == current" is used to invoke rcu_softirq_qs() in ksoftirqd tasks context for non-RT kernels. This works correctly as long as the context is actually task context but this condition is wrong when: - the current task is ksoftirqd - the task is interrupted in a RCU read side critical section - __do_softirq() is invoked on return from interrupt Syzkaller triggered the following scenario: -> finish_task_switch() -> put_task_struct_rcu_user() -> call_rcu(&task->rcu, delayed_put_task_struct) -> __kasan_record_aux_stack() -> pfn_valid() -> rcu_read_lock_sched() __irq_exit_rcu() -> __do_softirq)() -> if (!IS_ENABLED(CONFIG_PREEMPT_RT) && __this_cpu_read(ksoftirqd) == current) -> rcu_softirq_qs() -> RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map)) The rcu quiescent state is reported in the rcu-read critical section, so the lockdep warning is triggered. Fix this by splitting out the inner working of __do_softirq() into a helper function which takes an argument to distinguish between ksoftirqd task context and interrupted context and invoke it from the relevant call sites with the proper context information and use that for the conditional invocation of rcu_softirq_qs(). Reported-by: syzbot+dce04ed6d1438ad69656@syzkaller.appspotmail.com Suggested-by: Thomas Gleixner Signed-off-by: Zqiang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240427102808.29356-1-qiang.zhang1211@gmail.com Link: https://lore.kernel.org/lkml/8f281a10-b85a-4586-9586-5bbc12dc784f@paulmck-laptop/T/#mea8aba4abfcb97bbf499d169ce7f30c4cff1b0e3 --- kernel/softirq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b315b21fb28c..02582017759a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -508,7 +508,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __softirq_entry __do_softirq(void) +static void handle_softirqs(bool ksirqd) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; @@ -563,8 +563,7 @@ restart: pending >>= softirq_bit; } - if (!IS_ENABLED(CONFIG_PREEMPT_RT) && - __this_cpu_read(ksoftirqd) == current) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ksirqd) rcu_softirq_qs(); local_irq_disable(); @@ -584,6 +583,11 @@ restart: current_restore_flags(old_flags, PF_MEMALLOC); } +asmlinkage __visible void __softirq_entry __do_softirq(void) +{ + handle_softirqs(false); +} + /** * irq_enter_rcu - Enter an interrupt context with RCU watching */ @@ -921,7 +925,7 @@ static void run_ksoftirqd(unsigned int cpu) * We can safely run softirq on inline stack, as we are not deep * in the task stack here. */ - __do_softirq(); + handle_softirqs(true); ksoftirqd_run_end(); cond_resched(); return; -- cgit v1.2.3 From dce3696271af7765f04428ec31b1b87dc7d016c6 Mon Sep 17 00:00:00 2001 From: LuMingYin Date: Sat, 27 Apr 2024 08:23:47 +0100 Subject: tracing/probes: Fix memory leak in traceprobe_parse_probe_arg_body() If traceprobe_parse_probe_arg_body() failed to allocate 'parg->fmt', it jumps to the label 'out' instead of 'fail' by mistake.In the result, the buffer 'tmp' is not freed in this case and leaks its memory. Thus jump to the label 'fail' in that error case. Link: https://lore.kernel.org/all/20240427072347.1421053-1-lumingyindetect@126.com/ Fixes: 032330abd08b ("tracing/probes: Cleanup probe argument parser") Signed-off-by: LuMingYin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dfe3ee6035ec..42bc0f362226 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1466,7 +1466,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, parg->fmt = kmalloc(len, GFP_KERNEL); if (!parg->fmt) { ret = -ENOMEM; - goto out; + goto fail; } snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, parg->count); -- cgit v1.2.3 From 5af385f5f4cddf908f663974847a4083b2ff2c79 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 29 Apr 2024 15:47:51 +0100 Subject: bounds: Use the right number of bits for power-of-two CONFIG_NR_CPUS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bits_per() rounds up to the next power of two when passed a power of two. This causes crashes on some machines and configurations. Reported-by: Михаил Новоселов Tested-by: Ильфат Гаптрахманов Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3347 Link: https://lore.kernel.org/all/1c978cf1-2934-4e66-e4b3-e81b04cb3571@rosalinux.ru/ Fixes: f2d5dcb48f7b (bounds: support non-power-of-two CONFIG_NR_CPUS) Cc: Signed-off-by: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index c5a9fcd2d622..29b2cd00df2c 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, order_base_2(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN -- cgit v1.2.3 From 0db63c0b86e981a1e97d2596d64ceceba1a5470e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Apr 2024 17:25:44 -0700 Subject: bpf: Fix verifier assumptions about socket->sk The verifier assumes that 'sk' field in 'struct socket' is valid and non-NULL when 'socket' pointer itself is trusted and non-NULL. That may not be the case when socket was just created and passed to LSM socket_accept hook. Fix this verifier assumption and adjust tests. Reported-by: Liam Wisehart Acked-by: Kumar Kartikeya Dwivedi Fixes: 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.") Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20240427002544.68803-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 23 +++++++++++++++++----- .../bpf/progs/bench_local_storage_create.c | 5 +++-- tools/testing/selftests/bpf/progs/local_storage.c | 20 ++++++++++--------- tools/testing/selftests/bpf/progs/lsm_cgroup.c | 8 ++++++-- 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87ff414899cf..5d42db05315e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2368,6 +2368,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -5400,8 +5402,6 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); - /* For mark_ptr_or_null_reg */ - val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -5719,7 +5719,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* Types listed in the reg2btf_ids are always trusted */ - if (reg2btf_ids[base_type(reg->type)]) + if (reg2btf_ids[base_type(reg->type)] && + !bpf_type_has_unsafe_modifiers(reg->type)) return true; /* If a register is not referenced, it is trusted if it has the @@ -6339,6 +6340,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) #define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) +#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null) /* * Allow list few fields as RCU trusted or full trusted. @@ -6402,7 +6404,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) { struct inode *d_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct socket) { +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; @@ -6437,11 +6439,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } +static bool type_is_trusted_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, + "__safe_trusted_or_null"); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -6550,6 +6561,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; + } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) { + flag |= PTR_TRUSTED | PTR_MAYBE_NULL; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c index e4bfbba6c193..c8ec0d0368e4 100644 --- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c @@ -61,14 +61,15 @@ SEC("lsm.s/socket_post_create") int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) { + struct sock *sk = sock->sk; struct storage *stg; __u32 pid; pid = bpf_get_current_pid_tgid() >> 32; - if (pid != bench_pid) + if (pid != bench_pid || !sk) return 0; - stg = bpf_sk_storage_get(&sk_storage_map, sock->sk, NULL, + stg = bpf_sk_storage_get(&sk_storage_map, sk, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE); if (stg) diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index e5e3a8b8dd07..637e75df2e14 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -140,11 +140,12 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, 0); + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0); if (!storage) return 0; @@ -155,24 +156,24 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, /* This tests that we can associate multiple elements * with the local storage. */ - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map2, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map2, sk)) return 0; - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map, sk)) return 0; /* Ensure that the sk_storage_map is disconnected from the storage. */ - if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap) + if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap) return 0; sk_storage_result = 0; @@ -185,11 +186,12 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index 02c11d16b692..d7598538aa2d 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -103,11 +103,15 @@ static __always_inline int real_bind(struct socket *sock, int addrlen) { struct sockaddr_ll sa = {}; + struct sock *sk = sock->sk; - if (sock->sk->__sk_common.skc_family != AF_PACKET) + if (!sk) + return 1; + + if (sk->__sk_common.skc_family != AF_PACKET) return 1; - if (sock->sk->sk_kern_sock) + if (sk->sk_kern_sock) return 1; bpf_probe_read_kernel(&sa, sizeof(sa), address); -- cgit v1.2.3 From 54db412e618e9c43e5167f809a901f554e8c43e2 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sun, 28 Apr 2024 12:21:43 +0200 Subject: clocksource: Make the int help prompt unit readable in ncurses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When doing make menuconfig and searching for the CLOCKSOURCE_WATCHDOG_MAX_SKEW_US config item, the help says: │ Symbol: CLOCKSOURCE_WATCHDOG_MAX_SKEW_US [=125] │ Type : integer │ Range : [50 1000] │ Defined at kernel/time/Kconfig:204 │ Prompt: Clocksource watchdog maximum allowable skew (in s) ^^^ │ Depends on: GENERIC_CLOCKEVENTS [=y] && CLOCKSOURCE_WATCHDOG [=y] because on some terminals, it cannot display the 'μ' char, unicode number 0x3bc. So simply write it out so that there's no trouble. Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Thomas Gleixner Acked-by: Paul E. McKenney Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20240428102143.26764-1-bp@kernel.org --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fc3b1a06c981..8ebb6d5a106b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -202,7 +202,7 @@ config HIGH_RES_TIMERS the size of the kernel image. config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US - int "Clocksource watchdog maximum allowable skew (in μs)" + int "Clocksource watchdog maximum allowable skew (in microseconds)" depends on CLOCKSOURCE_WATCHDOG range 50 1000 default 125 -- cgit v1.2.3 From cb01621b6d91567ac74c8b95e4db731febdbdec3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2024 15:13:22 +0300 Subject: bpf: Use struct_size() Use struct_size() instead of hand writing it. This is less verbose and more robust. Signed-off-by: Andy Shevchenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240429121323.3818497-1-andriy.shevchenko@linux.intel.com --- kernel/bpf/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95c7fd093e55..a8ca6dd6e614 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -2455,13 +2456,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { + struct bpf_prog_array *p; + if (prog_cnt) - return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog_array_item) * - (prog_cnt + 1), - flags); + p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); + else + p = &bpf_empty_prog_array.hdr; - return &bpf_empty_prog_array.hdr; + return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) -- cgit v1.2.3 From a3034872cd90a6881ad4e10ca6d30e1215a99ada Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2024 15:00:05 +0300 Subject: bpf: Switch to krealloc_array() Let the krealloc_array() copy the original data and check for a multiplication overflow. Signed-off-by: Andy Shevchenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240429120005.3539116-1-andriy.shevchenko@linux.intel.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a8ca6dd6e614..99b8b1c9a248 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -850,7 +850,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return -EINVAL; } - tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; -- cgit v1.2.3 From 7b831bd3cf322fdacd07f321d6d7297914ed79bc Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 29 Apr 2024 20:50:30 +0000 Subject: PM: hibernate: replace deprecated strncpy() with strscpy() strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. This kernel config option is simply assigned with the resume_file buffer. It should be NUL-terminated but not necessarily NUL-padded as per its further usage with other string apis: | static int __init find_resume_device(void) | { | if (!strlen(resume_file)) | return -ENOENT; | | pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); Use strscpy() [2] as it guarantees NUL-termination on the destination buffer. Specifically, use the new 2-argument version of strscpy() introduced in Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Signed-off-by: Justin Stitt Reviewed-by: Kees Cook Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 43b1a82e800c..0a213f69a9e4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1361,7 +1361,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy(resume_file, str, 255); + strscpy(resume_file, str); return 1; } -- cgit v1.2.3 From b98a5c68ccaa94e93b9e898091fe2cf21c1500e6 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:24 +0200 Subject: bpf: Do not walk twice the map on free If someone stores both a timer and a workqueue in a map, on free we would walk it twice. Add a check in array_map_free_timers_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-1-27afe7f3b17c@kernel.org --- kernel/bpf/arraymap.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 580d07b15471..feabc0193852 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -436,13 +436,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER)) - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + for (i = 0; i < array->map.max_entries; i++) { + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + } + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ -- cgit v1.2.3 From a891711d0166133ec5120615fcf365d9745d82b2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:25 +0200 Subject: bpf: Do not walk twice the hash map on free If someone stores both a timer and a workqueue in a hash map, on free, we would walk it twice. Add a check in htab_free_malloced_timers_or_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-2-27afe7f3b17c@kernel.org --- kernel/bpf/hashtab.c | 49 +++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0179183c543a..06115f8728e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab); } -static void htab_free_prealloced_timers(struct bpf_htab *htab) +static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -235,27 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); - cond_resched(); - } -} - -static void htab_free_prealloced_wq(struct bpf_htab *htab) -{ - u32 num_entries = htab->map.max_entries; - int i; - - if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - return; - if (htab_has_extra_elems(htab)) - num_entries += num_possible_cpus(); - - for (i = 0; i < num_entries; i++) { - struct htab_elem *elem; - - elem = get_htab_elem(htab, i); - bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -1515,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) +static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; @@ -1527,10 +1510,10 @@ static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - if (is_timer) + if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); - else + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } @@ -1544,17 +1527,11 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, true); - else - htab_free_prealloced_timers(htab); - } - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, false); + htab_free_malloced_timers_and_wq(htab); else - htab_free_prealloced_wq(htab); + htab_free_prealloced_timers_and_wq(htab); } } -- cgit v1.2.3 From 535a3692ba7245792e6f23654507865d4293c850 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:24 +0200 Subject: bpf: Add support for kprobe session attach Adding support to attach bpf program for entry and return probe of the same function. This is common use case which at the moment requires to create two kprobe multi links. Adding new BPF_TRACE_KPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the bpf program on return probe simply by returning zero or non zero from the entry bpf program execution to execute or not the bpf program on return probe respectively. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-2-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 7 ++++++- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++-------- tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94a72593ead..90706a47f6ff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f655adf42e39..13ad74ecf2cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4016,11 +4016,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_KPROBE_SESSION) + return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; @@ -5281,7 +5285,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) ret = bpf_uprobe_multi_link_attach(attr, prog); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ba722b57af3..06a9671834b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1631,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static bool is_kprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + +static inline bool is_kprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1646,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_attach_cookie_proto_umulti; @@ -2834,10 +2845,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, void *data) { struct bpf_kprobe_multi_link *link; + int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); - return 0; + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + return is_kprobe_session(link->link.prog) ? err : 0; } static void @@ -2981,7 +2993,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + if (!is_kprobe_multi(prog)) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; @@ -3062,10 +3074,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error; - if (flags & BPF_F_KPROBE_MULTI_RETURN) - link->fp.exit_handler = kprobe_multi_link_exit_handler; - else + if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; + if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) + link->fp.exit_handler = kprobe_multi_link_exit_handler; link->addrs = addrs; link->cookies = cookies; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d94a72593ead..90706a47f6ff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From adf46d88ae4b2557f7e2e02547a25fb866935711 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:25 +0200 Subject: bpf: Add support for kprobe session context Adding struct bpf_session_run_ctx object to hold session related data, which is atm is_return bool and data pointer coming in following changes. Placing bpf_session_run_ctx layer in between bpf_run_ctx and bpf_kprobe_multi_run_ctx so the session data can be retrieved regardless of if it's kprobe_multi or uprobe_multi link, which support is coming in future. This way both kprobe_multi and uprobe_multi can use same kfuncs to access the session data. Adding bpf_session_is_return kfunc that returns true if the bpf program is executed from the exit probe of the kprobe multi link attached in wrapper mode. It returns false otherwise. Adding new kprobe hook for kprobe program type. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-3-jolsa@kernel.org --- kernel/bpf/btf.c | 3 +++ kernel/trace/bpf_trace.c | 67 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8291fbfd27b1..821063660d9f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -218,6 +218,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, + BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; @@ -8157,6 +8158,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; + case BPF_PROG_TYPE_KPROBE: + return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 06a9671834b6..3e88212bbf84 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2596,6 +2596,11 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ +struct bpf_session_run_ctx { + struct bpf_run_ctx run_ctx; + bool is_return; +}; + #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; @@ -2609,7 +2614,7 @@ struct bpf_kprobe_multi_link { }; struct bpf_kprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; @@ -2788,7 +2793,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) if (WARN_ON_ONCE(!ctx)) return 0; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; @@ -2805,15 +2811,20 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs) + unsigned long entry_ip, struct pt_regs *regs, + bool is_return) { struct bpf_kprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + }, .link = link, .entry_ip = entry_ip, }; @@ -2828,7 +2839,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, migrate_disable(); rcu_read_lock(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); @@ -2848,7 +2859,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false); return is_kprobe_session(link->link.prog) ? err : 0; } @@ -2860,7 +2871,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -3503,3 +3514,45 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return 0; } #endif /* CONFIG_UPROBES */ + +#ifdef CONFIG_FPROBE +__bpf_kfunc_start_defs(); + +__bpf_kfunc bool bpf_session_is_return(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->is_return; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_session_is_return) +BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) + +static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + return 0; + + if (!is_kprobe_session(prog)) + return -EACCES; + + return 0; +} + +static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { + .owner = THIS_MODULE, + .set = &kprobe_multi_kfunc_set_ids, + .filter = bpf_kprobe_multi_filter, +}; + +static int __init bpf_kprobe_multi_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); +} + +late_initcall(bpf_kprobe_multi_kfuncs_init); +#endif -- cgit v1.2.3 From 5c919acef85147886eb2abf86fb147f94680a8b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:26 +0200 Subject: bpf: Add support for kprobe session cookie Adding support for cookie within the session of kprobe multi entry and return program. The session cookie is u64 value and can be retrieved be new kfunc bpf_session_cookie, which returns pointer to the cookie value. The bpf program can use the pointer to store (on entry) and load (on return) the value. The cookie value is implemented via fprobe feature that allows to share values between entry and return ftrace fprobe callbacks. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-4-jolsa@kernel.org --- kernel/bpf/verifier.c | 7 +++++++ kernel/trace/bpf_trace.c | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d42db05315e..7360f04f9ec7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11063,6 +11063,7 @@ enum special_kfunc_type { KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, + KF_bpf_session_cookie, }; BTF_SET_START(special_kfunc_set) @@ -11123,6 +11124,7 @@ BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif +BTF_ID(func, bpf_session_cookie) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12294,6 +12296,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) { + meta.r0_size = sizeof(u64); + meta.r0_rdonly = false; + } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e88212bbf84..f5154c051d2c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2599,6 +2599,7 @@ fs_initcall(bpf_event_init); struct bpf_session_run_ctx { struct bpf_run_ctx run_ctx; bool is_return; + void *data; }; #ifdef CONFIG_FPROBE @@ -2819,11 +2820,12 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct pt_regs *regs, - bool is_return) + bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, + .data = data, }, .link = link, .entry_ip = entry_ip, @@ -2859,7 +2861,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false); + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data); return is_kprobe_session(link->link.prog) ? err : 0; } @@ -2871,7 +2873,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -3089,6 +3091,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->fp.entry_handler = kprobe_multi_link_handler; if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) link->fp.exit_handler = kprobe_multi_link_exit_handler; + if (is_kprobe_session(prog)) + link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; @@ -3526,10 +3530,19 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } +__bpf_kfunc __u64 *bpf_session_cookie(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->data; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) +BTF_ID_FLAGS(func, bpf_session_cookie) BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) -- cgit v1.2.3 From 543576ec15b17c0c93301ac8297333c7b6e84ac7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 26 Apr 2024 16:16:18 -0700 Subject: bpf: Add BPF_PROG_TYPE_CGROUP_SKB attach type enforcement in BPF_LINK_CREATE bpf_prog_attach uses attach_type_to_prog_type to enforce proper attach type for BPF_PROG_TYPE_CGROUP_SKB. link_create uses bpf_prog_get and relies on bpf_prog_attach_check_attach_type to properly verify prog_type <> attach_type association. Add missing attach_type enforcement for the link_create case. Otherwise, it's currently possible to attach cgroup_skb prog types to other cgroup hooks. Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Link: https://lore.kernel.org/bpf/0000000000004792a90615a1dde0@google.com/ Reported-by: syzbot+838346b979830606c854@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240426231621.2716876-2-sdf@google.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..cb61d8880dbe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3985,6 +3985,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, * check permissions at attach time. */ return -EPERM; + + ptype = attach_type_to_prog_type(attach_type); + if (prog->type != ptype) + return -EINVAL; + return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; -- cgit v1.2.3 From 3eaea21b4d27cff0017c20549aeb53034c58fc23 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Mar 2024 11:17:26 -0700 Subject: uprobes: encapsulate preparation of uprobe args buffer Move the logic of fetching temporary per-CPU uprobe buffer and storing uprobes args into it to a new helper function. Store data size as part of this buffer, simplifying interfaces a bit, as now we only pass single uprobe_cpu_buffer reference around, instead of pointer + dsize. This logic was duplicated across uprobe_dispatcher and uretprobe_dispatcher, and now will be centralized. All this is also in preparation to make this uprobe_cpu_buffer handling logic optional in the next patch. Link: https://lore.kernel.org/all/20240318181728.2795838-2-andrii@kernel.org/ [Masami: update for v6.9-rc3 kernel] Signed-off-by: Andrii Nakryiko Reviewed-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 78 ++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9e461362450a..155cba8e6e7e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = { struct uprobe_cpu_buffer { struct mutex mutex; void *buf; + int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; @@ -943,9 +944,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) mutex_unlock(&ucb->mutex); } +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, + struct pt_regs *regs) +{ + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + dsize = __get_data_size(&tu->tp, regs, NULL); + + ucb = uprobe_buffer_get(); + ucb->dsize = tu->tp.size + dsize; + + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + + return ucb; +} + static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize, + struct uprobe_cpu_buffer *ucb, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; @@ -956,14 +974,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; @@ -977,14 +995,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { struct event_file_link *link; @@ -993,7 +1011,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, 0, regs, ucb, link->file); rcu_read_unlock(); return 0; @@ -1001,13 +1019,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, func, regs, ucb, link->file); rcu_read_unlock(); } @@ -1335,7 +1353,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; @@ -1356,7 +1374,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -1379,13 +1397,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); - if (size - esize > tu->tp.size + dsize) { - int len = tu->tp.size + dsize; - - memset(data + len, 0, size - esize - len); - } + if (size - esize > ucb->dsize) + memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1395,21 +1410,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb, dsize); + __uprobe_perf_func(tu, 0, regs, ucb); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { - __uprobe_perf_func(tu, func, regs, ucb, dsize); + __uprobe_perf_func(tu, func, regs, ucb); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1475,10 +1490,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb; - int dsize, esize; int ret = 0; - tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1490,18 +1503,14 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + ucb = prepare_uprobe_buffer(tu, regs); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb, dsize); + ret |= uprobe_trace_func(tu, regs, ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb, dsize); + ret |= uprobe_perf_func(tu, regs, ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1513,7 +1522,6 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb; - int dsize, esize; tu = container_of(con, struct trace_uprobe, consumer); @@ -1525,18 +1533,14 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + ucb = prepare_uprobe_buffer(tu, regs); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb, dsize); + uretprobe_trace_func(tu, func, regs, ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb, dsize); + uretprobe_perf_func(tu, func, regs, ucb); #endif uprobe_buffer_put(ucb); return 0; -- cgit v1.2.3 From 1b8f85defbc82e2eb8f27c5f6060ea507ad4d5a3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Mar 2024 11:17:27 -0700 Subject: uprobes: prepare uprobe args buffer lazily MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uprobe_cpu_buffer and corresponding logic to store uprobe args into it are used for uprobes/uretprobes that are created through tracefs or perf events. BPF is yet another user of uprobe/uretprobe infrastructure, but doesn't need uprobe_cpu_buffer and associated data. For BPF-only use cases this buffer handling and preparation is a pure overhead. At the same time, BPF-only uprobe/uretprobe usage is very common in practice. Also, for a lot of cases applications are very senstivie to performance overheads, as they might be tracing a very high frequency functions like malloc()/free(), so every bit of performance improvement matters. All that is to say that this uprobe_cpu_buffer preparation is an unnecessary overhead that each BPF user of uprobes/uretprobe has to pay. This patch is changing this by making uprobe_cpu_buffer preparation optional. It will happen only if either tracefs-based or perf event-based uprobe/uretprobe consumer is registered for given uprobe/uretprobe. For BPF-only use cases this step will be skipped. We used uprobe/uretprobe benchmark which is part of BPF selftests (see [0]) to estimate the improvements. We have 3 uprobe and 3 uretprobe scenarios, which vary an instruction that is replaced by uprobe: nop (fastest uprobe case), `push rbp` (typical case), and non-simulated `ret` instruction (slowest case). Benchmark thread is constantly calling user space function in a tight loop. User space function has attached BPF uprobe or uretprobe program doing nothing but atomic counter increments to count number of triggering calls. Benchmark emits throughput in millions of executions per second. BEFORE these changes ==================== uprobe-nop : 2.657 ± 0.024M/s uprobe-push : 2.499 ± 0.018M/s uprobe-ret : 1.100 ± 0.006M/s uretprobe-nop : 1.356 ± 0.004M/s uretprobe-push : 1.317 ± 0.019M/s uretprobe-ret : 0.785 ± 0.007M/s AFTER these changes =================== uprobe-nop : 2.732 ± 0.022M/s (+2.8%) uprobe-push : 2.621 ± 0.016M/s (+4.9%) uprobe-ret : 1.105 ± 0.007M/s (+0.5%) uretprobe-nop : 1.396 ± 0.007M/s (+2.9%) uretprobe-push : 1.347 ± 0.008M/s (+2.3%) uretprobe-ret : 0.800 ± 0.006M/s (+1.9) So the improvements on this particular machine seems to be between 2% and 5%. [0] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/all/20240318181728.2795838-3-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 49 ++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 155cba8e6e7e..cbab487e8522 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -941,15 +941,21 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void) static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { + if (!ucb) + return; mutex_unlock(&ucb->mutex); } static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer **ucbp) { struct uprobe_cpu_buffer *ucb; int dsize, esize; + if (*ucbp) + return *ucbp; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); dsize = __get_data_size(&tu->tp, regs, NULL); @@ -958,22 +964,25 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + *ucbp = ucb; return ucb; } static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, + struct uprobe_cpu_buffer **ucbp, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; + struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); + ucb = prepare_uprobe_buffer(tu, regs, ucbp); if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; @@ -1002,7 +1011,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; @@ -1011,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, link->file); + __uprobe_trace_func(tu, 0, regs, ucbp, link->file); rcu_read_unlock(); return 0; @@ -1019,13 +1028,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, link->file); + __uprobe_trace_func(tu, func, regs, ucbp, link->file); rcu_read_unlock(); } @@ -1353,10 +1362,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; + struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; @@ -1374,6 +1384,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + ucb = prepare_uprobe_buffer(tu, regs, ucbp); size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) @@ -1410,21 +1421,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb); + __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { - __uprobe_perf_func(tu, func, regs, ucb); + __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1489,7 +1500,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; + struct uprobe_cpu_buffer *ucb = NULL; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); @@ -1503,14 +1514,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - ucb = prepare_uprobe_buffer(tu, regs); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb); + ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb); + ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1521,7 +1530,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; + struct uprobe_cpu_buffer *ucb = NULL; tu = container_of(con, struct trace_uprobe, consumer); @@ -1533,14 +1542,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - ucb = prepare_uprobe_buffer(tu, regs); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb); + uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb); + uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; -- cgit v1.2.3 From cdf355cc60e388d992bdd205b8ee70dc4d533461 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Mar 2024 11:17:28 -0700 Subject: uprobes: add speculative lockless system-wide uprobe filter check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's very common with BPF-based uprobe/uretprobe use cases to have a system-wide (not PID specific) probes used. In this case uprobe's trace_uprobe_filter->nr_systemwide counter is bumped at registration time, and actual filtering is short circuited at the time when uprobe/uretprobe is triggered. This is a great optimization, and the only issue with it is that to even get to checking this counter uprobe subsystem is taking read-side trace_uprobe_filter->rwlock. This is actually noticeable in profiles and is just another point of contention when uprobe is triggered on multiple CPUs simultaneously. This patch moves this nr_systemwide check outside of filter list's rwlock scope, as rwlock is meant to protect list modification, while nr_systemwide-based check is speculative and racy already, despite the lock (as discussed in [0]). trace_uprobe_filter_remove() and trace_uprobe_filter_add() already check for filter->nr_systewide explicitly outside of __uprobe_perf_filter, so no modifications are required there. Confirming with BPF selftests's based benchmarks. BEFORE (based on changes in previous patch) =========================================== uprobe-nop : 2.732 ± 0.022M/s uprobe-push : 2.621 ± 0.016M/s uprobe-ret : 1.105 ± 0.007M/s uretprobe-nop : 1.396 ± 0.007M/s uretprobe-push : 1.347 ± 0.008M/s uretprobe-ret : 0.800 ± 0.006M/s AFTER ===== uprobe-nop : 2.878 ± 0.017M/s (+5.5%, total +8.3%) uprobe-push : 2.753 ± 0.013M/s (+5.3%, total +10.2%) uprobe-ret : 1.142 ± 0.010M/s (+3.8%, total +3.8%) uretprobe-nop : 1.444 ± 0.008M/s (+3.5%, total +6.5%) uretprobe-push : 1.410 ± 0.010M/s (+4.8%, total +7.1%) uretprobe-ret : 0.816 ± 0.002M/s (+2.0%, total +3.9%) In the above, first percentage value is based on top of previous patch (lazy uprobe buffer optimization), while the "total" percentage is based on kernel without any of the changes in this patch set. As can be seen, we get about 4% - 10% speed up, in total, with both lazy uprobe buffer and speculative filter check optimizations. [0] https://lore.kernel.org/bpf/20240313131926.GA19986@redhat.com/ Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/all/20240318181728.2795838-4-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cbab487e8522..8541fa1494ae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1226,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; - if (filter->nr_systemwide) - return true; - list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; @@ -1353,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; + /* + * speculative short-circuiting check to avoid unnecessarily taking + * filter->rwlock below, if the uprobe has system-wide consumer + */ + if (READ_ONCE(filter->nr_systemwide)) + return true; + read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); -- cgit v1.2.3 From d9b15224dd8ff83b2aef87e4cd5ad10c875ef7d6 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:04 +0800 Subject: tracing/probes: support '%pd' type for print struct dentry's name During fault locating, the file name needs to be printed based on the dentry address. The offset needs to be calculated each time, which is troublesome. Similar to printk, kprobe support print type '%pd' for print dentry's name. For example "name=$arg1:%pd" casts the `$arg1` as (struct dentry *), dereferences the "d_name.name" field and stores it to "name" argument as a kernel string. Here is an example: [tracing]# echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events [tracing]# echo 1 > events/kprobes/testprobe/enable [tracing]# grep -q "1" events/kprobes/testprobe/enable [tracing]# echo 0 > events/kprobes/testprobe/enable [tracing]# cat trace | grep "enable" bash-14844 [002] ..... 16912.889543: testprobe: (dput+0x4/0x30) name="enable" grep-15389 [003] ..... 16922.834182: testprobe: (dput+0x4/0x30) name="enable" grep-15389 [003] ..... 16922.836103: testprobe: (dput+0x4/0x30) name="enable" bash-14844 [001] ..... 16931.820909: testprobe: (dput+0x4/0x30) name="enable" Note that this expects the given argument (e.g. $arg1) is an address of struct dentry. User must ensure it. Link: https://lore.kernel.org/all/20240322064308.284457-2-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_fprobe.c | 6 ++++++ kernel/trace/trace_kprobe.c | 6 ++++++ kernel/trace/trace_probe.c | 50 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_probe.h | 2 ++ 5 files changed, 65 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 233d1af39fff..869978d84672 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5540,7 +5540,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b@/, ustring,\n" - "\t symstr, \\[\\]\n" + "\t symstr, %pd, \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: ;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 4f4280815522..62e6a8f4aae9 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc, is_return); @@ -1154,6 +1159,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14099cc17fc9..c68d4e830fbe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -782,6 +782,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -933,6 +934,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); @@ -979,6 +984,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dfe3ee6035ec..f7ee97e45d1f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1739,6 +1739,56 @@ error: return ERR_PTR(ret); } +/* @buf: *buf must be equal to NULL. Caller must to free *buf */ +int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) +{ + int i, used, ret; + const int bufsize = MAX_DENTRY_ARGS_LEN; + char *tmpbuf = NULL; + + if (*buf) + return -EINVAL; + + used = 0; + for (i = 0; i < argc; i++) { + if (glob_match("*:%pd", argv[i])) { + char *tmp; + char *equal; + + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + tmp[strlen(argv[i]) - 4] = '\0'; + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; + } + } + + *buf = tmpbuf; + return 0; +nomem: + kfree(tmpbuf); + return -ENOMEM; +} + void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) { clear_btf_context(ctx); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cef3a50628a3..5803e6a41570 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,6 +34,7 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX #define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) @@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); -- cgit v1.2.3 From 20fe4d07bde67ec26716835b61be2c4eeca1c6bc Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:05 +0800 Subject: tracing/probes: support '%pD' type for print struct file's name As like '%pd' type, this patch supports print type '%pD' for print file's name. For example "name=$arg1:%pD" casts the `$arg1` as (struct file*), dereferences the "file.f_path.dentry.d_name.name" field and stores it to "name" argument as a kernel string. Here is an example: [tracing]# echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_event [tracing]# echo 1 > events/kprobes/testprobe/enable [tracing]# grep -q "1" events/kprobes/testprobe/enable [tracing]# echo 0 > events/kprobes/testprobe/enable [tracing]# grep "vfs_read" trace | grep "enable" grep-15108 [003] ..... 5228.328609: testprobe: (vfs_read+0x4/0xbb0) name="enable" Note that this expects the given argument (e.g. $arg1) is an address of struct file. User must ensure it. Link: https://lore.kernel.org/all/20240322064308.284457-3-yebin10@huawei.com/ [Masami: replaced "previous patch" with '%pd' type] Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_probe.c | 57 ++++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 869978d84672..7fe4b539a423 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5540,7 +5540,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b@/, ustring,\n" - "\t symstr, %pd, \\[\\]\n" + "\t symstr, %pd/%pD, \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: ;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index f7ee97e45d1f..d4a887dac821 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include +#include #include "trace_btf.h" #include "trace_probe.h" @@ -1751,35 +1752,47 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) used = 0; for (i = 0; i < argc; i++) { - if (glob_match("*:%pd", argv[i])) { - char *tmp; - char *equal; - - if (!tmpbuf) { - tmpbuf = kmalloc(bufsize, GFP_KERNEL); - if (!tmpbuf) - return -ENOMEM; - } + char *tmp; + char *equal; + size_t arg_len; - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) - goto nomem; + if (!glob_match("*:%p[dD]", argv[i])) + continue; - equal = strchr(tmp, '='); - if (equal) - *equal = '\0'; - tmp[strlen(argv[i]) - 4] = '\0'; + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + arg_len = strlen(argv[i]); + tmp[arg_len - 4] = '\0'; + if (argv[i][arg_len - 1] == 'd') ret = snprintf(tmpbuf + used, bufsize - used, "%s%s+0x0(+0x%zx(%s)):string", equal ? tmp : "", equal ? "=" : "", offsetof(struct dentry, d_name.name), equal ? equal + 1 : tmp); - kfree(tmp); - if (ret >= bufsize - used) - goto nomem; - argv[i] = tmpbuf + used; - used += ret + 1; - } + else + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; } *buf = tmpbuf; -- cgit v1.2.3 From 5120d167e21c674afd0630c65e7f6a00fa0667f1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 8 Apr 2024 10:51:40 -0700 Subject: rethook: Remove warning messages printed for finding return address of a frame. The function rethook_find_ret_addr() prints a warning message and returns 0 when the target task is running and is not the "current" task in order to prevent the incorrect return address, although it still may return an incorrect address. However, the warning message turns into noise when BPF profiling programs call bpf_get_task_stack() on running tasks in a firm with a large number of hosts. The callers should be aware and willing to take the risk of receiving an incorrect return address from a task that is currently running other than the "current" one. A warning is not needed here as the callers are intent on it. Link: https://lore.kernel.org/all/20240408175140.60223-1-thinker.li@gmail.com/ Acked-by: Andrii Nakryiko Acked-by: John Fastabend Signed-off-by: Kui-Feng Lee Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/rethook.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index fa03094e9e69..4297a132a7ae 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -248,7 +248,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame if (WARN_ON_ONCE(!cur)) return 0; - if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + if (tsk != current && task_is_running(tsk)) return 0; do { -- cgit v1.2.3 From 0dc715295d4143d1659879f7f50ad4e9a6f6a99c Mon Sep 17 00:00:00 2001 From: Jonathan Haslam Date: Mon, 22 Apr 2024 03:23:05 -0700 Subject: uprobes: reduce contention on uprobes_tree access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Active uprobes are stored in an RB tree and accesses to this tree are dominated by read operations. Currently these accesses are serialized by a spinlock but this leads to enormous contention when large numbers of threads are executing active probes. This patch converts the spinlock used to serialize access to the uprobes_tree RB tree into a reader-writer spinlock. This lock type aligns naturally with the overwhelmingly read-only nature of the tree usage here. Although the addition of reader-writer spinlocks are discouraged [0], this fix is proposed as an interim solution while an RCU based approach is implemented (that work is in a nascent form). This fix also has the benefit of being trivial, self contained and therefore simple to backport. We have used a uprobe benchmark from the BPF selftests [1] to estimate the improvements. Each block of results below show 1 line per execution of the benchmark ("the "Summary" line) and each line is a run with one more thread added - a thread is a "producer". The lines are edited to remove extraneous output. The tests were executed with this driver script: for num_threads in {1..20} do sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary done SPINLOCK (BEFORE) ================== Summary: hits 1.396 ± 0.007M/s ( 1.396M/prod) Summary: hits 1.656 ± 0.016M/s ( 0.828M/prod) Summary: hits 2.246 ± 0.008M/s ( 0.749M/prod) Summary: hits 2.114 ± 0.010M/s ( 0.529M/prod) Summary: hits 2.013 ± 0.009M/s ( 0.403M/prod) Summary: hits 1.753 ± 0.008M/s ( 0.292M/prod) Summary: hits 1.847 ± 0.001M/s ( 0.264M/prod) Summary: hits 1.889 ± 0.001M/s ( 0.236M/prod) Summary: hits 1.833 ± 0.006M/s ( 0.204M/prod) Summary: hits 1.900 ± 0.003M/s ( 0.190M/prod) Summary: hits 1.918 ± 0.006M/s ( 0.174M/prod) Summary: hits 1.925 ± 0.002M/s ( 0.160M/prod) Summary: hits 1.837 ± 0.001M/s ( 0.141M/prod) Summary: hits 1.898 ± 0.001M/s ( 0.136M/prod) Summary: hits 1.799 ± 0.016M/s ( 0.120M/prod) Summary: hits 1.850 ± 0.005M/s ( 0.109M/prod) Summary: hits 1.816 ± 0.002M/s ( 0.101M/prod) Summary: hits 1.787 ± 0.001M/s ( 0.094M/prod) Summary: hits 1.764 ± 0.002M/s ( 0.088M/prod) RW SPINLOCK (AFTER) =================== Summary: hits 1.444 ± 0.020M/s ( 1.444M/prod) Summary: hits 2.279 ± 0.011M/s ( 1.139M/prod) Summary: hits 3.422 ± 0.014M/s ( 1.141M/prod) Summary: hits 3.565 ± 0.017M/s ( 0.891M/prod) Summary: hits 2.671 ± 0.013M/s ( 0.534M/prod) Summary: hits 2.409 ± 0.005M/s ( 0.401M/prod) Summary: hits 2.485 ± 0.008M/s ( 0.355M/prod) Summary: hits 2.496 ± 0.003M/s ( 0.312M/prod) Summary: hits 2.585 ± 0.002M/s ( 0.287M/prod) Summary: hits 2.908 ± 0.011M/s ( 0.291M/prod) Summary: hits 2.346 ± 0.016M/s ( 0.213M/prod) Summary: hits 2.804 ± 0.004M/s ( 0.234M/prod) Summary: hits 2.556 ± 0.001M/s ( 0.197M/prod) Summary: hits 2.754 ± 0.004M/s ( 0.197M/prod) Summary: hits 2.482 ± 0.002M/s ( 0.165M/prod) Summary: hits 2.412 ± 0.005M/s ( 0.151M/prod) Summary: hits 2.710 ± 0.003M/s ( 0.159M/prod) Summary: hits 2.826 ± 0.005M/s ( 0.157M/prod) Summary: hits 2.718 ± 0.001M/s ( 0.143M/prod) Summary: hits 2.844 ± 0.006M/s ( 0.142M/prod) The numbers in parenthesis give averaged throughput per thread which is of greatest interest here as a measure of scalability. Improvements are in the order of 22 - 68% with this particular benchmark (mean = 43%). V2: - Updated commit message to include benchmark results. [0] https://docs.kernel.org/locking/spinlocks.html [1] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Link: https://lore.kernel.org/all/20240422102306.6026-1-jonathan.haslam@gmail.com/ Signed-off-by: Jonathan Haslam Acked-by: Jiri Olsa Signed-off-by: Masami Hiramatsu (Google) --- kernel/events/uprobes.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..8ae0eefc3a34 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); return u; } @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe) if (WARN_ON(!uprobe_is_active(uprobe))) return; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ put_uprobe(uprobe); } @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode, get_uprobe(u); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } -- cgit v1.2.3 From b0e28a4b5becea84ae6fca5cbd8a6b80a134e223 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Apr 2024 12:09:08 -0700 Subject: ftrace: make extra rcu_is_watching() validation check optional Introduce CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING config option to control whether ftrace low-level code performs additional rcu_is_watching()-based validation logic in an attempt to catch noinstr violations. This check is expected to never be true and is mostly useful for low-level validation of ftrace subsystem invariants. For most users it should probably be kept disabled to eliminate unnecessary runtime overhead. This improves BPF multi-kretprobe (relying on ftrace and rethook infrastructure) runtime throughput by 2%, according to BPF benchmarks ([0]). [0] https://lore.kernel.org/bpf/CAEf4BzauQ2WKMjZdc9s0rBWa01BYbgwHN6aNDXQSHYia47pQ-w@mail.gmail.com/ Link: https://lore.kernel.org/all/20240418190909.704286-1-andrii@kernel.org/ Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Paul E. McKenney Acked-by: Masami Hiramatsu (Google) Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- include/linux/trace_recursion.h | 2 +- kernel/trace/Kconfig | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index d48cd92d2364..24ea8ac049b4 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -135,7 +135,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif -#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#ifdef CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING # define trace_warn_on_no_rcu(ip) \ ({ \ bool __ret = !rcu_is_watching(); \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 47345bf1d4a9..d093e16d01c1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config FTRACE_VALIDATE_RCU_IS_WATCHING + bool "Validate RCU is on during ftrace execution" + depends on FUNCTION_TRACER + depends on ARCH_WANTS_NO_INSTR + help + All callbacks that attach to the function tracing have some sort of + protection against recursion. This option is only to verify that + ftrace (and other users of ftrace_test_recursion_trylock()) are not + called outside of RCU, as if they are, it can cause a race. But it + also has a noticeable overhead when enabled. + + If unsure, say N + config RING_BUFFER_RECORD_RECURSION bool "Record functions that recurse in the ring buffer" depends on FTRACE_RECORD_RECURSION -- cgit v1.2.3 From e03c05ac9813410d15c9c39ccf02c84efe563533 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Apr 2024 12:09:09 -0700 Subject: rethook: honor CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING in rethook_try_get() Take into account CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING when validating that RCU is watching when trying to setup rethooko on a function entry. One notable exception when we force rcu_is_watching() check is CONFIG_KPROBE_EVENTS_ON_NOTRACE=y case, in which case kretprobes will use old-style int3-based workflow instead of relying on ftrace, making RCU watching check important to validate. This further (in addition to improvements in the previous patch) improves BPF multi-kretprobe (which rely on rethook) runtime throughput by 2.3%, according to BPF benchmarks ([0]). [0] https://lore.kernel.org/bpf/CAEf4BzauQ2WKMjZdc9s0rBWa01BYbgwHN6aNDXQSHYia47pQ-w@mail.gmail.com/ Link: https://lore.kernel.org/all/20240418190909.704286-2-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Paul E. McKenney Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/rethook.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 4297a132a7ae..30d224946881 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; +#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) /* * This expects the caller will set up a rethook on a function entry. * When the function returns, the rethook will eventually be reclaimed @@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) */ if (unlikely(!rcu_is_watching())) return NULL; +#endif return (struct rethook_node *)objpool_pop(&rh->pool); } -- cgit v1.2.3 From a284e43852380ab71eeb996389e01992d74a8dde Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 1 May 2024 12:37:12 -0700 Subject: hardening: Enable KCFI and some other options Add some stuff that got missed along the way: - CONFIG_UNWIND_PATCH_PAC_INTO_SCS=y so SCS vs PAC is hardware selectable. - CONFIG_X86_KERNEL_IBT=y while a default, just be sure. - CONFIG_CFI_CLANG=y globally. - CONFIG_PAGE_TABLE_CHECK=y for userspace mapping sanity. Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240501193709.make.982-kees@kernel.org Signed-off-by: Kees Cook --- arch/arm64/configs/hardening.config | 1 + arch/x86/configs/hardening.config | 3 +++ kernel/configs/hardening.config | 8 ++++++++ 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/arch/arm64/configs/hardening.config b/arch/arm64/configs/hardening.config index b0e795208998..24179722927e 100644 --- a/arch/arm64/configs/hardening.config +++ b/arch/arm64/configs/hardening.config @@ -5,6 +5,7 @@ CONFIG_ARM64_SW_TTBR0_PAN=y # Software Shadow Stack or PAC CONFIG_SHADOW_CALL_STACK=y +CONFIG_UNWIND_PATCH_PAC_INTO_SCS=y # Pointer authentication (ARMv8.3 and later). If hardware actually supports # it, one can turn off CONFIG_STACKPROTECTOR_STRONG with this enabled. diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config index 7b497f3b7bc3..de319852a1e9 100644 --- a/arch/x86/configs/hardening.config +++ b/arch/x86/configs/hardening.config @@ -10,5 +10,8 @@ CONFIG_INTEL_IOMMU_DEFAULT_ON=y CONFIG_INTEL_IOMMU_SVM=y CONFIG_AMD_IOMMU=y +# Enforce CET Indirect Branch Tracking in the kernel. +CONFIG_X86_KERNEL_IBT=y + # Enable CET Shadow Stack for userspace. CONFIG_X86_USER_SHADOW_STACK=y diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 7a5bbfc024b7..47e6564129c3 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -23,6 +23,10 @@ CONFIG_SLAB_FREELIST_HARDENED=y CONFIG_SHUFFLE_PAGE_ALLOCATOR=y CONFIG_RANDOM_KMALLOC_CACHES=y +# Sanity check userspace page table mappings. +CONFIG_PAGE_TABLE_CHECK=y +CONFIG_PAGE_TABLE_CHECK_ENFORCED=y + # Randomize kernel stack offset on syscall entry. CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y @@ -81,6 +85,10 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y +# Enable Kernel Control Flow Integrity (currently Clang only). +CONFIG_CFI_CLANG=y +# CONFIG_CFI_PERMISSIVE is not set + # Attack surface reduction: do not autoload TTY line disciplines. # CONFIG_LDISC_AUTOLOAD is not set -- cgit v1.2.3 From ac2f438c2a85acd07e0ac7dc2f69d45bda1bb498 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 1 May 2024 10:01:30 -0700 Subject: bpf: crypto: fix build when CONFIG_CRYPTO=m Crypto subsytem can be build as a module. In this case we still have to build BPF crypto framework otherwise the build will fail. Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405011634.4JK40epY-lkp@intel.com/ Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240501170130.1682309-1-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 85786fd97d2a..7eb9ad3a3ae6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,7 +44,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif -ifeq ($(CONFIG_CRYPTO),y) +ifneq ($(CONFIG_CRYPTO),) obj-$(CONFIG_BPF_SYSCALL) += crypto.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ -- cgit v1.2.3 From 951bcae6c5a0bfaa55b27c5f16178204988f0379 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 18:20:43 +0200 Subject: kallsyms: Avoid weak references for kallsyms symbols kallsyms is a directory of all the symbols in the vmlinux binary, and so creating it is somewhat of a chicken-and-egg problem, as its non-zero size affects the layout of the binary, and therefore the values of the symbols. For this reason, the kernel is linked more than once, and the first pass does not include any kallsyms data at all. For the linker to accept this, the symbol declarations describing the kallsyms metadata are emitted as having weak linkage, so they can remain unsatisfied. During the subsequent passes, the weak references are satisfied by the kallsyms metadata that was constructed based on information gathered from the preceding passes. Weak references lead to somewhat worse codegen, because taking their address may need to produce NULL (if the reference was unsatisfied), and this is not usually supported by RIP or PC relative symbol references. Given that these references are ultimately always satisfied in the final link, let's drop the weak annotation, and instead, provide fallback definitions in the linker script that are only emitted if an unsatisfied reference exists. While at it, drop the FRV specific annotation that these symbols reside in .rodata - FRV is long gone. Tested-by: Nick Desaulniers # Boot Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Acked-by: Arnd Bergmann Link: https://lkml.kernel.org/r/20230504174320.3930345-1-ardb%40kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada --- include/asm-generic/vmlinux.lds.h | 19 +++++++++++++++++++ kernel/kallsyms.c | 6 ------ kernel/kallsyms_internal.h | 30 ++++++++++++------------------ 3 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f7749d0f2562..e8449be62058 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -448,11 +448,30 @@ #endif #endif +/* + * Some symbol definitions will not exist yet during the first pass of the + * link, but are guaranteed to exist in the final link. Provide preliminary + * definitions that will be superseded in the final link to avoid having to + * rely on weak external linkage, which requires a GOT when used in position + * independent code. + */ +#define PRELIMINARY_SYMBOL_DEFINITIONS \ + PROVIDE(kallsyms_addresses = .); \ + PROVIDE(kallsyms_offsets = .); \ + PROVIDE(kallsyms_names = .); \ + PROVIDE(kallsyms_num_syms = .); \ + PROVIDE(kallsyms_relative_base = .); \ + PROVIDE(kallsyms_token_table = .); \ + PROVIDE(kallsyms_token_index = .); \ + PROVIDE(kallsyms_markers = .); \ + PROVIDE(kallsyms_seqs_of_names = .); + /* * Read only Data */ #define RO_DATA(align) \ . = ALIGN((align)); \ + PRELIMINARY_SYMBOL_DEFINITIONS \ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ __start_rodata = .; \ *(.rodata) *(.rodata.*) \ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 18edd57b5fe8..22ea19a36e6e 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -325,12 +325,6 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; - /* This kernel should never had been booted. */ - if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) - BUG_ON(!kallsyms_addresses); - else - BUG_ON(!kallsyms_offsets); - /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; high = kallsyms_num_syms; diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 27fabdcc40f5..85480274fc8f 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -5,27 +5,21 @@ #include /* - * These will be re-linked against their real values - * during the second link stage. + * These will be re-linked against their real values during the second link + * stage. Preliminary values must be provided in the linker script using the + * PROVIDE() directive so that the first link stage can complete successfully. */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; +extern const unsigned long kallsyms_addresses[]; +extern const int kallsyms_offsets[]; +extern const u8 kallsyms_names[]; -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); +extern const unsigned int kallsyms_num_syms; +extern const unsigned long kallsyms_relative_base; -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; +extern const char kallsyms_token_table[]; +extern const u16 kallsyms_token_index[]; -extern const unsigned int kallsyms_markers[] __weak; -extern const u8 kallsyms_seqs_of_names[] __weak; +extern const unsigned int kallsyms_markers[]; +extern const u8 kallsyms_seqs_of_names[]; #endif // LINUX_KALLSYMS_INTERNAL_H_ -- cgit v1.2.3 From 377d9095117c084b835e38c020faf5a78e386f01 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 18:20:44 +0200 Subject: vmlinux: Avoid weak reference to notes section Weak references are references that are permitted to remain unsatisfied in the final link. This means they cannot be implemented using place relative relocations, resulting in GOT entries when using position independent code generation. The notes section should always exist, so the weak annotations can be omitted. Acked-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada --- kernel/ksysfs.c | 4 ++-- lib/buildid.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 495b69a71a5d..07fb5987b42b 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -228,8 +228,8 @@ KERNEL_ATTR_RW(rcu_normal); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ -extern const void __start_notes __weak; -extern const void __stop_notes __weak; +extern const void __start_notes; +extern const void __stop_notes; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/lib/buildid.c b/lib/buildid.c index 898301b49eb6..7954dd92e36c 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -182,8 +182,8 @@ unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init; */ void __init init_vmlinux_build_id(void) { - extern const void __start_notes __weak; - extern const void __stop_notes __weak; + extern const void __start_notes; + extern const void __stop_notes; unsigned int size = &__stop_notes - &__start_notes; build_id_parse_buf(&__start_notes, vmlinux_build_id, size); -- cgit v1.2.3 From 75961ffb5cb3e5196f19cae7683f35cc88b50800 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 2 May 2024 10:37:23 +0100 Subject: swiotlb: initialise restricted pool list_head when SWIOTLB_DYNAMIC=y Using restricted DMA pools (CONFIG_DMA_RESTRICTED_POOL=y) in conjunction with dynamic SWIOTLB (CONFIG_SWIOTLB_DYNAMIC=y) leads to the following crash when initialising the restricted pools at boot-time: | Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 | Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP | pc : rmem_swiotlb_device_init+0xfc/0x1ec | lr : rmem_swiotlb_device_init+0xf0/0x1ec | Call trace: | rmem_swiotlb_device_init+0xfc/0x1ec | of_reserved_mem_device_init_by_idx+0x18c/0x238 | of_dma_configure_id+0x31c/0x33c | platform_dma_configure+0x34/0x80 faddr2line reveals that the crash is in the list validation code: include/linux/list.h:83 include/linux/rculist.h:79 include/linux/rculist.h:106 kernel/dma/swiotlb.c:306 kernel/dma/swiotlb.c:1695 because add_mem_pool() is trying to list_add_rcu() to a NULL 'mem->pools'. Fix the crash by initialising the 'mem->pools' list_head in rmem_swiotlb_device_init() before calling add_mem_pool(). Reported-by: Nikita Ioffe Tested-by: Nikita Ioffe Fixes: 1aaa736815eb ("swiotlb: allocate a new memory pool when existing pools are full") Signed-off-by: Will Deacon Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index a5e0dfc44d24..0de66f0ff43a 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1798,6 +1798,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, mem->for_alloc = true; #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock_init(&mem->lock); + INIT_LIST_HEAD_RCU(&mem->pools); #endif add_mem_pool(mem, pool); -- cgit v1.2.3 From b1439b179d351977641a1df9745a24d08693f9d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Apr 2024 18:38:38 -0400 Subject: swsusp: don't bother with setting block size same as with the swap... Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- kernel/power/swap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5bc04bfe2db1..d9abb7ab031d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -368,11 +368,7 @@ static int swsusp_swap_check(void) if (IS_ERR(hib_resume_bdev_file)) return PTR_ERR(hib_resume_bdev_file); - res = set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE); - if (res < 0) - fput(hib_resume_bdev_file); - - return res; + return 0; } /** @@ -1574,7 +1570,6 @@ int swsusp_check(bool exclusive) hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev_file)) { - set_blocksize(file_bdev(hib_resume_bdev_file), PAGE_SIZE); clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); -- cgit v1.2.3 From 0e148d3cca0dc1a7c6063939f6cb9ba4866c39a7 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 3 May 2024 15:44:09 +0200 Subject: stackleak: Use a copy of the ctl_table argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sysctl handlers are not supposed to modify the ctl_table passed to them. Adapt the logic to work with a temporary variable, similar to how it is done in other parts of the kernel. This is also a prerequisite to enforce the immutability of the argument through the callbacks. Reviewed-by: Luis Chamberlain Signed-off-by: Thomas Weißschuh Reviewed-by: Tycho Andersen Link: https://lore.kernel.org/r/20240503-sysctl-const-stackleak-v1-1-603fecb19170@weissschuh.net Signed-off-by: Kees Cook --- kernel/stackleak.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 34c9d81eea94..59cdfaf5118e 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -27,10 +27,10 @@ static int stack_erasing_sysctl(struct ctl_table *table, int write, int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); int prev_state = state; + struct ctl_table table_copy = *table; - table->data = &state; - table->maxlen = sizeof(int); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + table_copy.data = &state; + ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); state = !!state; if (ret || !write || state == prev_state) return ret; -- cgit v1.2.3 From b63db58e2fa5d6963db9c45df88e60060f0ff35f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 2 May 2024 09:03:15 -0400 Subject: eventfs/tracing: Add callback for release of an eventfs_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synthetic events create and destroy tracefs files when they are created and removed. The tracing subsystem has its own file descriptor representing the state of the events attached to the tracefs files. There's a race between the eventfs files and this file descriptor of the tracing system where the following can cause an issue: With two scripts 'A' and 'B' doing: Script 'A': echo "hello int aaa" > /sys/kernel/tracing/synthetic_events while : do echo 0 > /sys/kernel/tracing/events/synthetic/hello/enable done Script 'B': echo > /sys/kernel/tracing/synthetic_events Script 'A' creates a synthetic event "hello" and then just writes zero into its enable file. Script 'B' removes all synthetic events (including the newly created "hello" event). What happens is that the opening of the "enable" file has: { struct trace_event_file *file = inode->i_private; int ret; ret = tracing_check_open_get_tr(file->tr); [..] But deleting the events frees the "file" descriptor, and a "use after free" happens with the dereference at "file->tr". The file descriptor does have a reference counter, but there needs to be a way to decrement it from the eventfs when the eventfs_inode is removed that represents this file descriptor. Add an optional "release" callback to the eventfs_entry array structure, that gets called when the eventfs file is about to be removed. This allows for the creating on the eventfs file to increment the tracing file descriptor ref counter. When the eventfs file is deleted, it can call the release function that will call the put function for the tracing file descriptor. This will protect the tracing file from being freed while a eventfs file that references it is being opened. Link: https://lore.kernel.org/linux-trace-kernel/20240426073410.17154-1-Tze-nan.Wu@mediatek.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240502090315.448cba46@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Tze-nan wu Tested-by: Tze-nan Wu (吳澤南) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 23 +++++++++++++++++++++-- include/linux/tracefs.h | 3 +++ kernel/trace/trace_events.c | 12 ++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 894c6ca1e500..f5510e26f0f6 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -84,10 +84,17 @@ enum { static void release_ei(struct kref *ref) { struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + const struct eventfs_entry *entry; struct eventfs_root_inode *rei; WARN_ON_ONCE(!ei->is_freed); + for (int i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (entry->release) + entry->release(entry->name, ei->data); + } + kfree(ei->entry_attrs); kfree_const(ei->name); if (ei->is_events) { @@ -112,6 +119,18 @@ static inline void free_ei(struct eventfs_inode *ei) } } +/* + * Called when creation of an ei fails, do not call release() functions. + */ +static inline void cleanup_ei(struct eventfs_inode *ei) +{ + if (ei) { + /* Set nr_entries to 0 to prevent release() function being called */ + ei->nr_entries = 0; + free_ei(ei); + } +} + static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) { if (ei) @@ -734,7 +753,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode /* Was the parent freed? */ if (list_empty(&ei->list)) { - free_ei(ei); + cleanup_ei(ei); ei = NULL; } return ei; @@ -835,7 +854,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ei; fail: - free_ei(ei); + cleanup_ei(ei); tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 7a5fe17b6bf9..d03f74658716 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -62,6 +62,8 @@ struct eventfs_file; typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); +typedef void (*eventfs_release)(const char *name, void *data); + /** * struct eventfs_entry - dynamically created eventfs file call back handler * @name: Then name of the dynamic file in an eventfs directory @@ -72,6 +74,7 @@ typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, struct eventfs_entry { const char *name; eventfs_callback callback; + eventfs_release release; }; struct eventfs_inode; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 52f75c36bbca..6ef29eba90ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2552,6 +2552,14 @@ static int event_callback(const char *name, umode_t *mode, void **data, return 0; } +/* The file is incremented on creation and freeing the enable file decrements it */ +static void event_release(const char *name, void *data) +{ + struct trace_event_file *file = data; + + event_file_put(file); +} + static int event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { @@ -2566,6 +2574,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { .name = "enable", .callback = event_callback, + .release = event_release, }, { .name = "filter", @@ -2634,6 +2643,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) return ret; } + /* Gets decremented on freeing of the "enable" file */ + event_file_get(file); + return 0; } -- cgit v1.2.3 From 628d701f2de5b9a16d1dd82bea68fd895f56f1a1 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:20 +1000 Subject: powerpc/dexcr: Add DEXCR prctl interface Now that we track a DEXCR on a per-task basis, individual tasks are free to configure it as they like. The interface is a pair of getter/setter prctl's that work on a single aspect at a time (multiple aspects at once is more difficult if there are different rules applied for each aspect, now or in future). The getter shows the current state of the process config, and the setter allows setting/clearing the aspect. Signed-off-by: Benjamin Gray [mpe: Account for PR_RISCV_SET_ICACHE_FLUSH_CTX, shrink some longs lines] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-5-bgray@linux.ibm.com --- arch/powerpc/include/asm/processor.h | 10 ++++ arch/powerpc/kernel/dexcr.c | 101 +++++++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 16 ++++++ kernel/sys.c | 16 ++++++ 4 files changed, 143 insertions(+) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index aad85a24134a..e44cac0da346 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -334,6 +334,16 @@ extern int set_endian(struct task_struct *tsk, unsigned int val); extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr); extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val); +#ifdef CONFIG_PPC_BOOK3S_64 + +#define PPC_GET_DEXCR_ASPECT(tsk, asp) get_dexcr_prctl((tsk), (asp)) +#define PPC_SET_DEXCR_ASPECT(tsk, asp, val) set_dexcr_prctl((tsk), (asp), (val)) + +int get_dexcr_prctl(struct task_struct *tsk, unsigned long asp); +int set_dexcr_prctl(struct task_struct *tsk, unsigned long asp, unsigned long val); + +#endif + extern void load_fp_state(struct thread_fp_state *fp); extern void store_fp_state(struct thread_fp_state *fp); extern void load_vr_state(struct thread_vr_state *vr); diff --git a/arch/powerpc/kernel/dexcr.c b/arch/powerpc/kernel/dexcr.c index d5cd77421088..3a0358e91c60 100644 --- a/arch/powerpc/kernel/dexcr.c +++ b/arch/powerpc/kernel/dexcr.c @@ -21,3 +21,104 @@ static int __init init_task_dexcr(void) return 0; } early_initcall(init_task_dexcr) + +/* Allow thread local configuration of these by default */ +#define DEXCR_PRCTL_EDITABLE ( \ + DEXCR_PR_IBRTPD | \ + DEXCR_PR_SRAPD | \ + DEXCR_PR_NPHIE) + +static int prctl_to_aspect(unsigned long which, unsigned int *aspect) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + *aspect = DEXCR_PR_SBHE; + break; + case PR_PPC_DEXCR_IBRTPD: + *aspect = DEXCR_PR_IBRTPD; + break; + case PR_PPC_DEXCR_SRAPD: + *aspect = DEXCR_PR_SRAPD; + break; + case PR_PPC_DEXCR_NPHIE: + *aspect = DEXCR_PR_NPHIE; + break; + default: + return -ENODEV; + } + + return 0; +} + +int get_dexcr_prctl(struct task_struct *task, unsigned long which) +{ + unsigned int aspect; + int ret; + + ret = prctl_to_aspect(which, &aspect); + if (ret) + return ret; + + if (aspect & DEXCR_PRCTL_EDITABLE) + ret |= PR_PPC_DEXCR_CTRL_EDITABLE; + + if (aspect & mfspr(SPRN_DEXCR)) + ret |= PR_PPC_DEXCR_CTRL_SET; + else + ret |= PR_PPC_DEXCR_CTRL_CLEAR; + + if (aspect & task->thread.dexcr_onexec) + ret |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + else + ret |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + + return ret; +} + +int set_dexcr_prctl(struct task_struct *task, unsigned long which, unsigned long ctrl) +{ + unsigned long dexcr; + unsigned int aspect; + int err = 0; + + err = prctl_to_aspect(which, &aspect); + if (err) + return err; + + if (!(aspect & DEXCR_PRCTL_EDITABLE)) + return -EPERM; + + if (ctrl & ~PR_PPC_DEXCR_CTRL_MASK) + return -EINVAL; + + if (ctrl & PR_PPC_DEXCR_CTRL_SET && ctrl & PR_PPC_DEXCR_CTRL_CLEAR) + return -EINVAL; + + if (ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC && ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC) + return -EINVAL; + + /* + * We do not want an unprivileged process being able to disable + * a setuid process's hash check instructions + */ + if (aspect == DEXCR_PR_NPHIE && + ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + dexcr = mfspr(SPRN_DEXCR); + + if (ctrl & PR_PPC_DEXCR_CTRL_SET) + dexcr |= aspect; + else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) + dexcr &= ~aspect; + + if (ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC) + task->thread.dexcr_onexec |= aspect; + else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC) + task->thread.dexcr_onexec &= ~aspect; + + mtspr(SPRN_DEXCR, dexcr); + + return 0; +} diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 370ed14b1ae0..713d28788df7 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -306,4 +306,20 @@ struct prctl_mm_map { # define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc # define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f +/* PowerPC Dynamic Execution Control Register (DEXCR) controls */ +#define PR_PPC_GET_DEXCR 72 +#define PR_PPC_SET_DEXCR 73 +/* DEXCR aspect to act on */ +# define PR_PPC_DEXCR_SBHE 0 /* Speculative branch hint enable */ +# define PR_PPC_DEXCR_IBRTPD 1 /* Indirect branch recurrent target prediction disable */ +# define PR_PPC_DEXCR_SRAPD 2 /* Subroutine return address prediction disable */ +# define PR_PPC_DEXCR_NPHIE 3 /* Non-privileged hash instruction enable */ +/* Action to apply / return */ +# define PR_PPC_DEXCR_CTRL_EDITABLE 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */ +# define PR_PPC_DEXCR_CTRL_SET 0x2 /* Set the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_CLEAR 0x4 /* Clear the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_MASK 0x1f + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 8bb106a56b3a..f9c95410278c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -146,6 +146,12 @@ #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif +#ifndef PPC_GET_DEXCR_ASPECT +# define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) +#endif +#ifndef PPC_SET_DEXCR_ASPECT +# define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2726,6 +2732,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; + case PR_PPC_GET_DEXCR: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = PPC_GET_DEXCR_ASPECT(me, arg2); + break; + case PR_PPC_SET_DEXCR: + if (arg4 || arg5) + return -EINVAL; + error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; -- cgit v1.2.3 From b3e90f375b3c7ab85aef631ebb0ad8ce66cbf3fd Mon Sep 17 00:00:00 2001 From: Yoann Congal Date: Sun, 5 May 2024 10:03:42 +0200 Subject: printk: Change type of CONFIG_BASE_SMALL to bool CONFIG_BASE_SMALL is currently a type int but is only used as a boolean. So, change its type to bool and adapt all usages: CONFIG_BASE_SMALL == 0 becomes !IS_ENABLED(CONFIG_BASE_SMALL) and CONFIG_BASE_SMALL != 0 becomes IS_ENABLED(CONFIG_BASE_SMALL). Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Reviewed-by: Masahiro Yamada Signed-off-by: Yoann Congal Link: https://lore.kernel.org/r/20240505080343.1471198-3-yoann.congal@smile.fr Signed-off-by: Petr Mladek --- arch/x86/include/asm/mpspec.h | 6 +++--- drivers/tty/vt/vc_screen.c | 2 +- include/linux/threads.h | 4 ++-- include/linux/udp.h | 2 +- include/linux/xarray.h | 2 +- init/Kconfig | 6 ++---- kernel/futex/core.c | 2 +- kernel/user.c | 2 +- 8 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c72c7ff78fcd..d593e52e6635 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -16,10 +16,10 @@ extern int pic_mode; * Summit or generic (i.e. installer) kernels need lots of bus entries. * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */ -#if CONFIG_BASE_SMALL == 0 -# define MAX_MP_BUSSES 260 -#else +#ifdef CONFIG_BASE_SMALL # define MAX_MP_BUSSES 32 +#else +# define MAX_MP_BUSSES 260 #endif #define MAX_IRQ_SOURCES 256 diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index 67e2cb7c96ee..da33c6c4691c 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -51,7 +51,7 @@ #include #define HEADER_SIZE 4u -#define CON_BUF_SIZE (CONFIG_BASE_SMALL ? 256 : PAGE_SIZE) +#define CON_BUF_SIZE (IS_ENABLED(CONFIG_BASE_SMALL) ? 256 : PAGE_SIZE) /* * Our minor space: diff --git a/include/linux/threads.h b/include/linux/threads.h index c34173e6c5f1..1674a471b0b4 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -25,13 +25,13 @@ /* * This controls the default maximum pid allocated to a process */ -#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000) +#define PID_MAX_DEFAULT (IS_ENABLED(CONFIG_BASE_SMALL) ? 0x1000 : 0x8000) /* * A maximum of 4 million PIDs should be enough for a while. * [NOTE: PID/TIDs are limited to 2^30 ~= 1 billion, see FUTEX_TID_MASK.] */ -#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \ +#define PID_MAX_LIMIT (IS_ENABLED(CONFIG_BASE_SMALL) ? PAGE_SIZE * 8 : \ (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT)) /* diff --git a/include/linux/udp.h b/include/linux/udp.h index 3748e82b627b..2d21cdb41ad8 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -24,7 +24,7 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) } #define UDP_HTABLE_SIZE_MIN_PERNET 128 -#define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) +#define UDP_HTABLE_SIZE_MIN (IS_ENABLED(CONFIG_BASE_SMALL) ? 128 : 256) #define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index cb571dfcf4b1..3f81ee5f9fb9 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1141,7 +1141,7 @@ static inline void xa_release(struct xarray *xa, unsigned long index) * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT -#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) diff --git a/init/Kconfig b/init/Kconfig index ff5b607c3218..4b517523d566 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -743,7 +743,7 @@ config LOG_CPU_MAX_BUF_SHIFT int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" depends on SMP range 0 21 - default 0 if BASE_SMALL != 0 + default 0 if BASE_SMALL default 12 depends on PRINTK help @@ -1945,9 +1945,7 @@ config RT_MUTEXES default y if PREEMPT_RT config BASE_SMALL - int - default 0 if BASE_FULL - default 1 if !BASE_FULL + def_bool !BASE_FULL config MODULE_SIG_FORMAT def_bool n diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1e78ef24321e..06a1f091be81 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1150,7 +1150,7 @@ static int __init futex_init(void) unsigned int futex_shift; unsigned long i; -#if CONFIG_BASE_SMALL +#ifdef CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); diff --git a/kernel/user.c b/kernel/user.c index 03cedc366dc9..aa1162deafe4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); * when changing user ID's (ie setuid() and friends). */ -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) +#define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -- cgit v1.2.3 From 3a35c13007dea132a65f07de05c26b87837fadc2 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Mon, 8 Apr 2024 09:46:22 +0200 Subject: kunit: Handle test faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, when a kernel test thread crashed (e.g. NULL pointer dereference, general protection fault), the KUnit test hanged for 30 seconds and exited with a timeout error. Fix this issue by waiting on task_struct->vfork_done instead of the custom kunit_try_catch.try_completion, and track the execution state by initially setting try_result with -EINTR and only setting it to 0 if the test passed. Fix kunit_generic_run_threadfn_adapter() signature by returning 0 instead of calling kthread_complete_and_exit(). Because thread's exit code is never checked, always set it to 0 to make it clear. To make this explicit, export kthread_exit() for KUnit tests built as module. Fix the -EINTR error message, which couldn't be reached until now. This is tested with a following patch. Cc: Brendan Higgins Cc: Eric W. Biederman Cc: Shuah Khan Reviewed-by: Kees Cook Reviewed-by: David Gow Tested-by: Rae Moar Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20240408074625.65017-5-mic@digikod.net Signed-off-by: Shuah Khan --- include/kunit/try-catch.h | 3 --- kernel/kthread.c | 1 + lib/kunit/try-catch.c | 19 ++++++++++++------- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/kunit/try-catch.h b/include/kunit/try-catch.h index c507dd43119d..7c966a1adbd3 100644 --- a/include/kunit/try-catch.h +++ b/include/kunit/try-catch.h @@ -14,13 +14,11 @@ typedef void (*kunit_try_catch_func_t)(void *); -struct completion; struct kunit; /** * struct kunit_try_catch - provides a generic way to run code which might fail. * @test: The test case that is currently being executed. - * @try_completion: Completion that the control thread waits on while test runs. * @try_result: Contains any errno obtained while running test case. * @try: The function, the test case, to attempt to run. * @catch: The function called if @try bails out. @@ -46,7 +44,6 @@ struct kunit; struct kunit_try_catch { /* private: internal use only. */ struct kunit *test; - struct completion *try_completion; int try_result; kunit_try_catch_func_t try; kunit_try_catch_func_t catch; diff --git a/kernel/kthread.c b/kernel/kthread.c index c5e40830c1f2..f7be976ff88a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -315,6 +315,7 @@ void __noreturn kthread_exit(long result) kthread->result = result; do_exit(0); } +EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c index cab8b24b5d5a..7a3910dd78a6 100644 --- a/lib/kunit/try-catch.c +++ b/lib/kunit/try-catch.c @@ -18,7 +18,7 @@ void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch) { try_catch->try_result = -EFAULT; - kthread_complete_and_exit(try_catch->try_completion, -EFAULT); + kthread_exit(0); } EXPORT_SYMBOL_GPL(kunit_try_catch_throw); @@ -26,9 +26,12 @@ static int kunit_generic_run_threadfn_adapter(void *data) { struct kunit_try_catch *try_catch = data; + try_catch->try_result = -EINTR; try_catch->try(try_catch->context); + if (try_catch->try_result == -EINTR) + try_catch->try_result = 0; - kthread_complete_and_exit(try_catch->try_completion, 0); + return 0; } static unsigned long kunit_test_timeout(void) @@ -58,13 +61,11 @@ static unsigned long kunit_test_timeout(void) void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context) { - DECLARE_COMPLETION_ONSTACK(try_completion); struct kunit *test = try_catch->test; struct task_struct *task_struct; int exit_code, time_remaining; try_catch->context = context; - try_catch->try_completion = &try_completion; try_catch->try_result = 0; task_struct = kthread_create(kunit_generic_run_threadfn_adapter, try_catch, "kunit_try_catch_thread"); @@ -75,8 +76,12 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context) } get_task_struct(task_struct); wake_up_process(task_struct); - - time_remaining = wait_for_completion_timeout(&try_completion, + /* + * As for a vfork(2), task_struct->vfork_done (pointing to the + * underlying kthread->exited) can be used to wait for the end of a + * kernel thread. + */ + time_remaining = wait_for_completion_timeout(task_struct->vfork_done, kunit_test_timeout()); if (time_remaining == 0) { try_catch->try_result = -ETIMEDOUT; @@ -92,7 +97,7 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context) if (exit_code == -EFAULT) try_catch->try_result = 0; else if (exit_code == -EINTR) - kunit_err(test, "wake_up_process() was never called\n"); + kunit_err(test, "try faulted\n"); else if (exit_code == -ETIMEDOUT) kunit_err(test, "try timed out\n"); else if (exit_code) -- cgit v1.2.3 From d786957ebd3fb4cfd9147dbcccd1e8f3871b45ce Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:44 +0100 Subject: bpf/verifier: replace calls to mark_reg_unknown. In order to further simplify the code in adjust_scalar_min_max_vals all the calls to mark_reg_unknown are replaced by __mark_reg_unknown. static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { ... mark all regs not init ... return; } __mark_reg_unknown(env, regs + regno); } The 'regno >= MAX_BPF_REG' does not apply to adjust_scalar_min_max_vals(), because it is only called from the following stack: - check_alu_op - adjust_reg_min_max_vals - adjust_scalar_min_max_vals The check_alu_op() does check_reg_arg() which verifies that both src and dst register numbers are within bounds. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7360f04f9ec7..41c66cc6db80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13887,7 +13887,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known; s64 smin_val, smax_val; @@ -13994,7 +13993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14007,7 +14006,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14020,7 +14019,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14029,7 +14028,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_arsh(dst_reg, &src_reg); break; default: - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } -- cgit v1.2.3 From 0922c78f592c60e5a8fe6ab968479def124d4ff3 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:45 +0100 Subject: bpf/verifier: refactor checks for range computation Split range computation checks in its own function, isolating pessimitic range set for dst_reg and failing return to a single point. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko bpf/verifier: improve code after range computation recent changes. Link: https://lore.kernel.org/r/20240506141849.185293-3-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 109 +++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41c66cc6db80..bdaf0413bf06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13878,6 +13878,50 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, + const struct bpf_reg_state *src_reg) +{ + bool src_is_const = false; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + + if (insn_bitness == 32) { + if (tnum_subreg_is_const(src_reg->var_off) + && src_reg->s32_min_value == src_reg->s32_max_value + && src_reg->u32_min_value == src_reg->u32_max_value) + src_is_const = true; + } else { + if (tnum_is_const(src_reg->var_off) + && src_reg->smin_value == src_reg->smax_value + && src_reg->umin_value == src_reg->umax_value) + src_is_const = true; + } + + switch (BPF_OP(insn->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + return true; + + /* Compute range for the following only if the src_reg is const. + */ + case BPF_XOR: + case BPF_OR: + case BPF_MUL: + return src_is_const; + + /* Shift operators range is only computable if shift dimension operand + * is a constant. Shifts greater than 31 or 63 are undefined. This + * includes shifts by a negative number. + */ + case BPF_LSH: + case BPF_RSH: + case BPF_ARSH: + return (src_is_const && src_reg->umax_value < insn_bitness); + default: + return false; + } +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -13888,51 +13932,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); - bool src_known; - s64 smin_val, smax_val; - u64 umin_val, umax_val; - s32 s32_min_val, s32_max_val; - u32 u32_min_val, u32_max_val; - u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; - smin_val = src_reg.smin_value; - smax_val = src_reg.smax_value; - umin_val = src_reg.umin_value; - umax_val = src_reg.umax_value; - - s32_min_val = src_reg.s32_min_value; - s32_max_val = src_reg.s32_max_value; - u32_min_val = src_reg.u32_min_value; - u32_max_val = src_reg.u32_max_value; - - if (alu32) { - src_known = tnum_subreg_is_const(src_reg.var_off); - if ((src_known && - (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || - s32_min_val > s32_max_val || u32_min_val > u32_max_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } else { - src_known = tnum_is_const(src_reg.var_off); - if ((src_known && - (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } - - if (!src_known && - opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) { __mark_reg_unknown(env, dst_reg); return 0; } @@ -13989,46 +13992,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_xor(dst_reg, &src_reg); break; case BPF_LSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_lsh(dst_reg, &src_reg); else scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_rsh(dst_reg, &src_reg); else scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_arsh(dst_reg, &src_reg); else scalar_min_max_arsh(dst_reg, &src_reg); break; default: - __mark_reg_unknown(env, dst_reg); break; } -- cgit v1.2.3 From 138cc42c05d11fd5ee82ee1606d2c9823373a926 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:46 +0100 Subject: bpf/verifier: improve XOR and OR range computation Range for XOR and OR operators would not be attempted unless src_reg would resolve to a single value, i.e. a known constant value. This condition is unnecessary, and the following XOR/OR operator handling could compute a possible better range. Acked-by: Eduard Zingerman Signed-off-by: Cupertino Miranda Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-4-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bdaf0413bf06..1f6deb3e44c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13900,12 +13900,12 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_ADD: case BPF_SUB: case BPF_AND: + case BPF_XOR: + case BPF_OR: return true; /* Compute range for the following only if the src_reg is const. */ - case BPF_XOR: - case BPF_OR: case BPF_MUL: return src_is_const; -- cgit v1.2.3 From 41d047a871062f1a4d1871a1908d380c14e75428 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:48 +0100 Subject: bpf/verifier: relax MUL range computation check MUL instruction required that src_reg would be a known value (i.e. src_reg would be a const value). The condition in this case can be relaxed, since the range computation algorithm used in current code already supports a proper range computation for any valid range value on its operands. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20240506141849.185293-6-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1f6deb3e44c5..9e3aba08984e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13902,12 +13902,8 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_AND: case BPF_XOR: case BPF_OR: - return true; - - /* Compute range for the following only if the src_reg is const. - */ case BPF_MUL: - return src_is_const; + return true; /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This -- cgit v1.2.3 From e0550222e03bae3fd629641e246ef7f47803d795 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 29 Apr 2024 23:06:54 +0000 Subject: printk: cleanup deprecated uses of strncpy/strcpy Cleanup some deprecated uses of strncpy() and strcpy() [1]. There doesn't seem to be any bugs with the current code but the readability of this code could benefit from a quick makeover while removing some deprecated stuff as a benefit. The most interesting replacement made in this patch involves concatenating "ttyS" with a digit-led user-supplied string. Instead of doing two distinct string copies with carefully managed offsets and lengths, let's use the more robust and self-explanatory scnprintf(). scnprintf will 1) respect the bounds of @buf, 2) null-terminate @buf, 3) do the concatenation. This allows us to drop the manual NUL-byte assignment. Also, since isdigit() is used about a dozen lines after the open-coded version we'll replace it for uniformity's sake. All the strcpy() --> strscpy() replacements are trivial as the source strings are literals and much smaller than the destination size. No behavioral change here. Use the new 2-argument version of strscpy() introduced in Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). However, to make this work fully (since the size must be known at compile time), also update the extern-qualified declaration to have the proper size information. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://github.com/KSPP/linux/issues/90 [2] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [3] Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240429-strncpy-kernel-printk-printk-c-v1-1-4da7926d7b69@google.com [pmladek@suse.com: Removed obsolete brackets and added empty lines.] Signed-off-by: Petr Mladek --- include/linux/printk.h | 2 +- kernel/printk/printk.c | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 2fde40cc9677..dbbd202b1cb3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -71,7 +71,7 @@ extern void console_verbose(void); /* strlen("ratelimit") + 1 */ #define DEVKMSG_STR_MAX_SIZE 10 -extern char devkmsg_log_str[]; +extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE]; struct ctl_table; extern int suppress_printk; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b06f63e276c1..360182dfe4cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -178,9 +178,9 @@ static int __init control_devkmsg(char *str) * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) - strcpy(devkmsg_log_str, "on"); + strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) - strcpy(devkmsg_log_str, "off"); + strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* @@ -209,7 +209,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return -EINVAL; old = devkmsg_log; - strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); + strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); @@ -227,7 +227,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* ... and restore old setting. */ devkmsg_log = old; - strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); + strscpy(devkmsg_log_str, old_str); return -EINVAL; } @@ -2500,22 +2500,22 @@ static int __init console_setup(char *str) /* * Decode str into name, index, options. */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(buf, "ttyS"); - strncpy(buf + 4, str, sizeof(buf) - 5); - } else { - strncpy(buf, str, sizeof(buf) - 1); - } - buf[sizeof(buf) - 1] = 0; + if (isdigit(str[0])) + scnprintf(buf, sizeof(buf), "ttyS%s", str); + else + strscpy(buf, str); + options = strchr(str, ','); if (options) *(options++) = 0; + #ifdef __sparc__ if (!strcmp(str, "ttya")) - strcpy(buf, "ttyS0"); + strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) - strcpy(buf, "ttyS1"); + strscpy(buf, "ttyS1"); #endif + for (s = buf; *s; s++) if (isdigit(*s) || *s == ',') break; -- cgit v1.2.3 From 327e2c97c46a4d971c5450a9d05b4a673f46c4da Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 7 Apr 2024 21:11:41 -0700 Subject: swiotlb: remove alloc_size argument to swiotlb_tbl_map_single() Currently swiotlb_tbl_map_single() takes alloc_align_mask and alloc_size arguments to specify an swiotlb allocation that is larger than mapping_size. This larger allocation is used solely by iommu_dma_map_single() to handle untrusted devices that should not have DMA visibility to memory pages that are partially used for unrelated kernel data. Having two arguments to specify the allocation is redundant. While alloc_align_mask naturally specifies the alignment of the starting address of the allocation, it can also implicitly specify the size by rounding up the mapping_size to that alignment. Additionally, the current approach has an edge case bug. iommu_dma_map_page() already does the rounding up to compute the alloc_size argument. But swiotlb_tbl_map_single() then calculates the alignment offset based on the DMA min_align_mask, and adds that offset to alloc_size. If the offset is non-zero, the addition may result in a value that is larger than the max the swiotlb can allocate. If the rounding up is done _after_ the alignment offset is added to the mapping_size (and the original mapping_size conforms to the value returned by swiotlb_max_mapping_size), then the max that the swiotlb can allocate will not be exceeded. In view of these issues, simplify the swiotlb_tbl_map_single() interface by removing the alloc_size argument. Most call sites pass the same value for mapping_size and alloc_size, and they pass alloc_align_mask as zero. Just remove the redundant argument from these callers, as they will see no functional change. For iommu_dma_map_page() also remove the alloc_size argument, and have swiotlb_tbl_map_single() compute the alloc_size by rounding up mapping_size after adding the offset based on min_align_mask. This has the side effect of fixing the edge case bug but with no other functional change. Also add a sanity test on the alloc_align_mask. While IOMMU code currently ensures the granule is not larger than PAGE_SIZE, if that guarantee were to be removed in the future, the downstream effect on the swiotlb might go unnoticed until strange allocation failures occurred. Tested on an ARM64 system with 16K page size and some kernel test-only hackery to allow modifying the DMA min_align_mask and the granule size that becomes the alloc_align_mask. Tested these combinations with a variety of original memory addresses and sizes, including those that reproduce the edge case bug: * 4K granule and 0 min_align_mask * 4K granule and 0xFFF min_align_mask (4K - 1) * 16K granule and 0xFFF min_align_mask * 64K granule and 0xFFF min_align_mask * 64K granule and 0x3FFF min_align_mask (16K - 1) With the changes, all combinations pass. Signed-off-by: Michael Kelley Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- drivers/iommu/dma-iommu.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 2 +- kernel/dma/swiotlb.c | 56 +++++++++++++++++++++++++++++++++++------------ 4 files changed, 45 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e4cb26f6a943..7a9e814fc6b3 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1165,7 +1165,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, trace_swiotlb_bounced(dev, phys, size); aligned_size = iova_align(iovad, size); - phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size, + phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir, attrs); if (phys == DMA_MAPPING_ERROR) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 0e6c6c25d154..d4f1f8d1ebd8 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -216,7 +216,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size); - map = swiotlb_tbl_map_single(dev, phys, size, size, 0, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ea23097e351f..14bc10c1bb23 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -43,7 +43,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, - size_t mapping_size, size_t alloc_size, + size_t mapping_size, unsigned int alloc_aligned_mask, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index a5e0dfc44d24..046da973a7e2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1340,15 +1340,40 @@ static unsigned long mem_used(struct io_tlb_mem *mem) #endif /* CONFIG_DEBUG_FS */ +/** + * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area + * @dev: Device which maps the buffer. + * @orig_addr: Original (non-bounced) physical IO buffer address + * @mapping_size: Requested size of the actual bounce buffer, excluding + * any pre- or post-padding for alignment + * @alloc_align_mask: Required start and end alignment of the allocated buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Find and allocate a suitable sequence of IO TLB slots for the request. + * The allocated space starts at an alignment specified by alloc_align_mask, + * and the size of the allocated space is rounded up so that the total amount + * of allocated space is a multiple of (alloc_align_mask + 1). If + * alloc_align_mask is zero, the allocated space may be at any alignment and + * the size is not rounded up. + * + * The returned address is within the allocated space and matches the bits + * of orig_addr that are specified in the DMA min_align_mask for the device. As + * such, this returned address may be offset from the beginning of the allocated + * space. The bounce buffer space starting at the returned address for + * mapping_size bytes is initialized to the contents of the original IO buffer + * area. Any pre-padding (due to an offset) and any post-padding (due to + * rounding-up the size) is not initialized. + */ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, - size_t mapping_size, size_t alloc_size, - unsigned int alloc_align_mask, enum dma_data_direction dir, - unsigned long attrs) + size_t mapping_size, unsigned int alloc_align_mask, + enum dma_data_direction dir, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset; struct io_tlb_pool *pool; unsigned int i; + size_t size; int index; phys_addr_t tlb_addr; unsigned short pad_slots; @@ -1362,20 +1387,24 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); - if (mapping_size > alloc_size) { - dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", - mapping_size, alloc_size); - return (phys_addr_t)DMA_MAPPING_ERROR; - } + /* + * The default swiotlb memory pool is allocated with PAGE_SIZE + * alignment. If a mapping is requested with larger alignment, + * the mapping may be unable to use the initial slot(s) in all + * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request + * of or near the maximum mapping size would always fail. + */ + dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK, + "Alloc alignment may prevent fulfilling requests with max mapping_size\n"); offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); - index = swiotlb_find_slots(dev, orig_addr, - alloc_size + offset, alloc_align_mask, &pool); + size = ALIGN(mapping_size + offset, alloc_align_mask + 1); + index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, mem->nslabs, mem_used(mem)); + size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -1388,7 +1417,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, offset &= (IO_TLB_SIZE - 1); index += pad_slots; pool->slots[index].pad_slots = pad_slots; - for (i = 0; i < nr_slots(alloc_size + offset); i++) + for (i = 0; i < (nr_slots(size) - pad_slots); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* @@ -1543,8 +1572,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); - swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, - attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From fe7514b149e0a8a6f3031d286e52d40163b0b11a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 7 May 2024 13:20:20 +0200 Subject: dma: compile-out DMA sync op calls when not used Some platforms do have DMA, but DMA there is always direct and coherent. Currently, even on such platforms DMA sync operations are compiled and called. Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when either sync operations are needed or there is DMA ops or swiotlb or DMA debug is enabled. Compile global dma_sync_*() and dma_need_sync() only when it's set, otherwise provide empty inline stubs. The change allows for future optimizations of DMA sync calls depending on runtime conditions. Signed-off-by: Alexander Lobakin Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 62 ++++++++++++++++++++++++--------------------- kernel/dma/Kconfig | 5 ++++ kernel/dma/mapping.c | 22 ++++++++-------- 3 files changed, 50 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4a658de44ee9..a569b56b25e2 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -117,14 +117,6 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir); -void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir); -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir); -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, @@ -147,7 +139,6 @@ u64 dma_get_required_mask(struct device *dev); bool dma_addressing_limited(struct device *dev); size_t dma_max_mapping_size(struct device *dev); size_t dma_opt_mapping_size(struct device *dev); -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); @@ -195,22 +186,6 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sg, int nelems, enum dma_data_direction dir) -{ -} -static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, enum dma_data_direction dir) -{ -} static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return -ENOMEM; @@ -277,10 +252,6 @@ static inline size_t dma_opt_mapping_size(struct device *dev) { return 0; } -static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) -{ - return false; -} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; @@ -310,6 +281,39 @@ static inline int dma_mmap_noncontiguous(struct device *dev, } #endif /* CONFIG_HAS_DMA */ +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir); +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir); +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); +#else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, int nelems, enum dma_data_direction dir) +{ +} +static inline void dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, enum dma_data_direction dir) +{ +} +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return false; +} +#endif /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ + struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index d62f5957f36b..c06e56be0ca1 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -107,6 +107,11 @@ config DMA_BOUNCE_UNALIGNED_KMALLOC bool depends on SWIOTLB +config DMA_NEED_SYNC + def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \ + ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || DMA_OPS || \ + SWIOTLB + config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" depends on OF && OF_RESERVED_MEM && SWIOTLB diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 58db8fd70471..c78b78e95a26 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -329,6 +329,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, } EXPORT_SYMBOL(dma_unmap_resource); +#ifdef CONFIG_DMA_NEED_SYNC void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { @@ -385,6 +386,17 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL(dma_sync_sg_for_device); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); +#endif /* CONFIG_DMA_NEED_SYNC */ + /* * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems * that the intention is to allow exporting memory allocated via the @@ -841,16 +853,6 @@ size_t dma_opt_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_opt_mapping_size); -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (dma_map_direct(dev, ops)) - return dma_direct_need_sync(dev, dma_addr); - return ops->sync_single_for_cpu || ops->sync_single_for_device; -} -EXPORT_SYMBOL_GPL(dma_need_sync); - unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From f406c8e4b770ca3b0df84a17349e13f2b6b07d10 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 7 May 2024 13:20:21 +0200 Subject: dma: avoid redundant calls for sync operations Quite often, devices do not need dma_sync operations on x86_64 at least. Indeed, when dev_is_dma_coherent(dev) is true and dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu() and friends do nothing. However, indirectly calling them when CONFIG_RETPOLINE=y consumes about 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate. Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%. Add dev->need_dma_sync boolean and turn it off during the device initialization (dma_set_mask()) depending on the setup: dev_is_dma_coherent() for the direct DMA, !(sync_single_for_device || sync_single_for_cpu) or the new dma_map_ops flag, %DMA_F_CAN_SKIP_SYNC, advertised for non-NULL DMA ops. Then later, if/when swiotlb is used for the first time, the flag is reset back to on, from swiotlb_tbl_map_single(). On iavf, the UDP trafficgen with XDP_DROP in skb mode test shows +3-5% increase for direct DMA. Suggested-by: Christoph Hellwig # direct DMA shortcut Co-developed-by: Eric Dumazet Signed-off-by: Eric Dumazet Signed-off-by: Alexander Lobakin Signed-off-by: Christoph Hellwig --- include/linux/device.h | 4 ++++ include/linux/dma-map-ops.h | 12 ++++++++++ include/linux/dma-mapping.h | 53 ++++++++++++++++++++++++++++++++++++++----- kernel/dma/mapping.c | 55 +++++++++++++++++++++++++++++++++++---------- kernel/dma/swiotlb.c | 6 +++++ 5 files changed, 113 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/device.h b/include/linux/device.h index b9f5464f44ed..ed95b829f05b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -691,6 +691,7 @@ struct device_physical_location { * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. + * @dma_need_sync: The device needs performing DMA sync operations. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -803,6 +804,9 @@ struct device { #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif +#ifdef CONFIG_DMA_NEED_SYNC + bool dma_need_sync:1; +#endif }; /** diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 4abc60f04209..4893cb89cb52 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -18,8 +18,11 @@ struct iommu_ops; * * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can * handle PCI P2PDMA pages in the map_sg/unmap_sg operation. + * DMA_F_CAN_SKIP_SYNC: DMA sync operations can be skipped if the device is + * coherent and it's not an SWIOTLB buffer. */ #define DMA_F_PCI_P2PDMA_SUPPORTED (1 << 0) +#define DMA_F_CAN_SKIP_SYNC (1 << 1) struct dma_map_ops { unsigned int flags; @@ -273,6 +276,15 @@ static inline bool dev_is_dma_coherent(struct device *dev) } #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */ +static inline void dma_reset_need_sync(struct device *dev) +{ +#ifdef CONFIG_DMA_NEED_SYNC + /* Reset it only once so that the function can be called on hotpath */ + if (unlikely(!dev->dma_need_sync)) + dev->dma_need_sync = true; +#endif +} + /* * Check whether potential kmalloc() buffers are safe for non-coherent DMA. */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index a569b56b25e2..eb4e15893b6c 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -282,16 +282,59 @@ static inline int dma_mmap_noncontiguous(struct device *dev, #endif /* CONFIG_HAS_DMA */ #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, +void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, +void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, +void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, +void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr); + +static inline bool dma_dev_need_sync(const struct device *dev) +{ + /* Always call DMA sync operations when debugging is enabled */ + return dev->dma_need_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG); +} + +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + if (dma_dev_need_sync(dev)) + __dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void dma_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + if (dma_dev_need_sync(dev)) + __dma_sync_single_for_device(dev, addr, size, dir); +} + +static inline void dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, int nelems, enum dma_data_direction dir) +{ + if (dma_dev_need_sync(dev)) + __dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} + +static inline void dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, enum dma_data_direction dir) +{ + if (dma_dev_need_sync(dev)) + __dma_sync_sg_for_device(dev, sg, nelems, dir); +} + +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return dma_dev_need_sync(dev) ? __dma_need_sync(dev, dma_addr) : false; +} #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ +static inline bool dma_dev_need_sync(const struct device *dev) +{ + return false; +} static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c78b78e95a26..3524bc92c37f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -330,7 +330,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, EXPORT_SYMBOL(dma_unmap_resource); #ifdef CONFIG_DMA_NEED_SYNC -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, +void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -342,9 +342,9 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, ops->sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); } -EXPORT_SYMBOL(dma_sync_single_for_cpu); +EXPORT_SYMBOL(__dma_sync_single_for_cpu); -void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, +void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -356,9 +356,9 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, ops->sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); } -EXPORT_SYMBOL(dma_sync_single_for_device); +EXPORT_SYMBOL(__dma_sync_single_for_device); -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, +void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -370,9 +370,9 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, ops->sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); } -EXPORT_SYMBOL(dma_sync_sg_for_cpu); +EXPORT_SYMBOL(__dma_sync_sg_for_cpu); -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, +void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -384,18 +384,47 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, ops->sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); } -EXPORT_SYMBOL(dma_sync_sg_for_device); +EXPORT_SYMBOL(__dma_sync_sg_for_device); -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_map_direct(dev, ops)) + /* + * dma_need_sync could've been reset on first SWIOTLB buffer + * mapping, but @dma_addr is not necessary an SWIOTLB buffer. + * In this case, fall back to more granular check. + */ return dma_direct_need_sync(dev, dma_addr); - return ops->sync_single_for_cpu || ops->sync_single_for_device; + return true; } -EXPORT_SYMBOL_GPL(dma_need_sync); -#endif /* CONFIG_DMA_NEED_SYNC */ +EXPORT_SYMBOL_GPL(__dma_need_sync); + +static void dma_setup_need_sync(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC)) + /* + * dma_need_sync will be reset to %true on first SWIOTLB buffer + * mapping, if any. During the device initialization, it's + * enough to check only for the DMA coherence. + */ + dev->dma_need_sync = !dev_is_dma_coherent(dev); + else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu && + !ops->sync_sg_for_device && !ops->sync_sg_for_cpu) + /* + * Synchronization is not possible when none of DMA sync ops + * is set. + */ + dev->dma_need_sync = false; + else + dev->dma_need_sync = true; +} +#else /* !CONFIG_DMA_NEED_SYNC */ +static inline void dma_setup_need_sync(struct device *dev) { } +#endif /* !CONFIG_DMA_NEED_SYNC */ /* * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems @@ -785,6 +814,8 @@ int dma_set_mask(struct device *dev, u64 mask) arch_dma_set_mask(dev, mask); *dev->dma_mask = mask; + dma_setup_need_sync(dev); + return 0; } EXPORT_SYMBOL(dma_set_mask); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 046da973a7e2..ae3e593eaadb 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1408,6 +1408,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } + /* + * If dma_need_sync wasn't set, reset it on first SWIOTLB buffer + * mapping to always sync SWIOTLB buffers. + */ + dma_reset_need_sync(dev); + /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if -- cgit v1.2.3 From 31f605a308e627f06e4e6ab77254473f1c90f0bf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 2 May 2024 16:12:17 +0200 Subject: kcsan, compiler_types: Introduce __data_racy type qualifier Based on the discussion at [1], it would be helpful to mark certain variables as explicitly "data racy", which would result in KCSAN not reporting data races involving any accesses on such variables. To do that, introduce the __data_racy type qualifier: struct foo { ... int __data_racy bar; ... }; In KCSAN-kernels, __data_racy turns into volatile, which KCSAN already treats specially by considering them "marked". In non-KCSAN kernels the type qualifier turns into no-op. The generated code between KCSAN-instrumented kernels and non-KCSAN kernels is already huge (inserted calls into runtime for every memory access), so the extra generated code (if any) due to volatile for few such __data_racy variables are unlikely to have measurable impact on performance. Link: https://lore.kernel.org/all/CAHk-=wi3iondeh_9V2g3Qz5oHTRjLsOpoy83hb58MVh=nRZe0A@mail.gmail.com/ [1] Suggested-by: Linus Torvalds Signed-off-by: Marco Elver Cc: Paul E. McKenney Cc: Tetsuo Handa Signed-off-by: Paul E. McKenney --- Documentation/dev-tools/kcsan.rst | 10 ++++++++++ include/linux/compiler_types.h | 7 +++++++ kernel/kcsan/kcsan_test.c | 17 +++++++++++++++++ 3 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index 94b6802ab0ab..02143f060b22 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -91,6 +91,16 @@ the below options are available: behaviour when encountering a data race is deemed safe. Please see `"Marking Shared-Memory Accesses" in the LKMM`_ for more information. +* Similar to ``data_race(...)``, the type qualifier ``__data_racy`` can be used + to document that all data races due to accesses to a variable are intended + and should be ignored by KCSAN:: + + struct foo { + ... + int __data_racy stats_counter; + ... + }; + * Disabling data race detection for entire functions can be accomplished by using the function attribute ``__no_kcsan``:: diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 2abaa3a825a9..a38162a8590d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -273,9 +273,16 @@ struct ftrace_likely_data { * disable all instrumentation. See Kconfig.kcsan where this is mandatory. */ # define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation +/* + * Type qualifier to mark variables where all data-racy accesses should be + * ignored by KCSAN. Note, the implementation simply marks these variables as + * volatile, since KCSAN will treat such accesses as "marked". + */ +# define __data_racy volatile # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused #else # define __no_kcsan +# define __data_racy #endif #ifndef __no_sanitize_or_inline diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 015586217875..0c17b4c83e1c 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -304,6 +304,7 @@ static long test_array[3 * PAGE_SIZE / sizeof(long)]; static struct { long val[8]; } test_struct; +static long __data_racy test_data_racy; static DEFINE_SEQLOCK(test_seqlock); static DEFINE_SPINLOCK(test_spinlock); static DEFINE_MUTEX(test_mutex); @@ -358,6 +359,8 @@ static noinline void test_kernel_write_uninstrumented(void) { test_var++; } static noinline void test_kernel_data_race(void) { data_race(test_var++); } +static noinline void test_kernel_data_racy_qualifier(void) { test_data_racy++; } + static noinline void test_kernel_assert_writer(void) { ASSERT_EXCLUSIVE_WRITER(test_var); @@ -1009,6 +1012,19 @@ static void test_data_race(struct kunit *test) KUNIT_EXPECT_FALSE(test, match_never); } +/* Test the __data_racy type qualifier. */ +__no_kcsan +static void test_data_racy_qualifier(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_racy_qualifier, test_kernel_data_racy_qualifier); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + __no_kcsan static void test_assert_exclusive_writer(struct kunit *test) { @@ -1424,6 +1440,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw), KCSAN_KUNIT_CASE(test_zero_size_access), KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_data_racy_qualifier), KCSAN_KUNIT_CASE(test_assert_exclusive_writer), KCSAN_KUNIT_CASE(test_assert_exclusive_access), KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), -- cgit v1.2.3 From 75b0fbf15d8466be618a997cae774eef445c0c7d Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 7 May 2024 14:33:39 +0800 Subject: bpf: Remove redundant page mask of vmf->address As the comment described in "struct vm_fault": ".address" : 'Faulting virtual address - masked' ".real_address" : 'Faulting virtual address - unmasked' The link [1] said: "Whatever the routes, all architectures end up to the invocation of handle_mm_fault() which, in turn, (likely) ends up calling __handle_mm_fault() to carry out the actual work of allocating the page tables." __handle_mm_fault() does address assignment: .address = address & PAGE_MASK, .real_address = address, This is debug dump by running `./test_progs -a "*arena*"`: [ 69.767494] arena fault: vmf->address = 10000001d000, vmf->real_address = 10000001d008 [ 69.767496] arena fault: vmf->address = 10000001c000, vmf->real_address = 10000001c008 [ 69.767499] arena fault: vmf->address = 10000001b000, vmf->real_address = 10000001b008 [ 69.767501] arena fault: vmf->address = 10000001a000, vmf->real_address = 10000001a008 [ 69.767504] arena fault: vmf->address = 100000019000, vmf->real_address = 100000019008 [ 69.769388] arena fault: vmf->address = 10000001e000, vmf->real_address = 10000001e1e8 So we can use the value of 'vmf->address' to do BPF arena kernel address space cast directly. [1] https://docs.kernel.org/mm/page_tables.html Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240507063358.8048-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 6c81630c5293..f5953f1a95cd 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -251,7 +251,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) int ret; kbase = bpf_arena_get_kern_vm_start(arena); - kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + kaddr = kbase + (u32)(vmf->address); guard(mutex)(&arena->lock); page = vmalloc_to_page((void *)kaddr); -- cgit v1.2.3 From d7ad05c86e2191bd66e5b62fca8da53c4a53484f Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Mon, 6 May 2024 05:10:59 +0100 Subject: timers/migration: Prevent out of bounds access on failure When tmigr_setup_groups() fails the level 0 group allocation, then the cleanup derefences index -1 of the local stack array. Prevent this by checking the loop condition first. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Levi Yun Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/20240506041059.86877-1-ppbuk5246@gmail.com --- kernel/time/timer_migration.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index ccba875d2234..84413114db5c 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1596,7 +1596,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) } while (i < tmigr_hierarchy_levels); - do { + while (i > 0) { group = stack[--i]; if (err < 0) { @@ -1645,7 +1645,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) tmigr_connect_child_parent(child, group); } } - } while (i > 0); + } kfree(stack); -- cgit v1.2.3 From 4707c13de3e42a47f0d99fe5fb58fa9dd23b455e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 18 Apr 2024 11:58:43 +0800 Subject: crash: add prefix for crash dumping messages Add pr_fmt() to kernel/crash_core.c to add the module name to debugging message printed as prefix. And also add prefix 'crashkernel:' to two lines of message printing code in kernel/crash_reserve.c. In kernel/crash_reserve.c, almost all debugging messages have 'crashkernel:' prefix or there's keyword crashkernel at the beginning or in the middle, adding pr_fmt() makes it redundant. Link: https://lkml.kernel.org/r/20240418035843.1562887-1-bhe@redhat.com Signed-off-by: Baoquan He Cc: Dave Young Cc: Jiri Slaby Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 ++ kernel/crash_reserve.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..1e7ac977f7c0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,6 +4,8 @@ * Copyright (C) 2002-2004 Eric Biederman */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 066668799f75..5b2722a93a48 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -109,7 +109,7 @@ static int __init parse_crashkernel_mem(char *cmdline, size = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; @@ -132,7 +132,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); + pr_warn("crahskernel: Memory value expected after '@'\n"); return -EINVAL; } } -- cgit v1.2.3 From 602ba77361160d99c77e3dadf7d891364f8c71b3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Apr 2024 23:02:35 -0700 Subject: watchdog: handle comma separated nmi_watchdog command line Per the document, the kernel can accept comma separated command line like nmi_watchdog=nopanic,0. However, the code doesn't really handle it. Fix the kernel to handle it properly. Link: https://lkml.kernel.org/r/20240430060236.1878002-1-song@kernel.org Signed-off-by: Song Liu Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/watchdog.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d7b2125503af..7f54484de16f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -71,6 +71,7 @@ void __init hardlockup_detector_disable(void) static int __init hardlockup_panic_setup(char *str) { +next: if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "nopanic", 7)) @@ -79,6 +80,12 @@ static int __init hardlockup_panic_setup(char *str) watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; + while (*(str++)) { + if (*str == ',') { + str++; + goto next; + } + } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); -- cgit v1.2.3 From 393fb313a2e150b768e4850658679e2afff431e9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Apr 2024 23:02:36 -0700 Subject: watchdog: allow nmi watchdog to use raw perf event NMI watchdog permanently consumes one hardware counters per CPU on the system. For systems that use many hardware counters, this causes more aggressive time multiplexing of perf events. OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely used. Add kernel cmdline arg nmi_watchdog=rNNN to configure the watchdog to use raw event. For example, on Intel CPUs, we can use "r300" to configure the watchdog to use ref-cycles event. If the raw event does not work, fall back to use "cycles". [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20240430060236.1878002-2-song@kernel.org Signed-off-by: Song Liu Cc: Peter Zijlstra Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 5 +-- include/linux/nmi.h | 2 ++ kernel/watchdog.c | 2 ++ kernel/watchdog_perf.c | 46 +++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..1fa79a3d0d1a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3773,10 +3773,12 @@ Format: [state][,regs][,debounce][,die] nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels - Format: [panic,][nopanic,][num] + Format: [panic,][nopanic,][rNNN,][num] Valid num: 0 or 1 0 - turn hardlockup detector in nmi_watchdog off 1 - turn hardlockup detector in nmi_watchdog on + rNNN - configure the watchdog with raw perf event 0xNNN + When panic is specified, panic when an NMI watchdog timeout occurs (or 'nopanic' to not panic on an NMI watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set) @@ -7464,4 +7466,3 @@ memory, and other data can't be written using xmon commands. off xmon is disabled. - diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f53438eae815..a8dfb38c9bb6 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -105,10 +105,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_cleanup(void); +extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } +static inline void hardlockup_config_perf_event(const char *str) { } #endif void watchdog_hardlockup_stop(void); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f54484de16f..ab0129b15f25 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -80,6 +80,8 @@ next: watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; + else if (!strncmp(str, "r", 1)) + hardlockup_config_perf_event(str + 1); while (*(str++)) { if (*str == ',') { str++; diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 8ea00c4a24b2..5f7d1f0d4268 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -90,6 +90,14 @@ static struct perf_event_attr wd_hw_attr = { .disabled = 1, }; +static struct perf_event_attr fallback_wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, struct perf_sample_data *data, @@ -122,6 +130,13 @@ static int hardlockup_detector_event_create(void) /* Try to register using hardware perf events */ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + wd_attr = &fallback_wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + } + if (IS_ERR(evt)) { pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, PTR_ERR(evt)); @@ -259,3 +274,34 @@ int __init watchdog_hardlockup_probe(void) } return ret; } + +/** + * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. + * + * @str: number which identifies the raw perf event to use + */ +void __init hardlockup_config_perf_event(const char *str) +{ + u64 config; + char buf[24]; + char *comma = strchr(str, ','); + + if (!comma) { + if (kstrtoull(str, 16, &config)) + return; + } else { + unsigned int len = comma - str; + + if (len >= sizeof(buf)) + return; + + if (strscpy(buf, str, sizeof(buf)) < 0) + return; + buf[len] = 0; + if (kstrtoull(buf, 16, &config)) + return; + } + + wd_hw_attr.type = PERF_TYPE_RAW; + wd_hw_attr.config = config; +} -- cgit v1.2.3 From 8fcb916cac8985188a3f825966ffa99507a90cb1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 May 2024 16:41:07 -0700 Subject: kernel/watchdog_perf.c: tidy up kerneldoc It is unconventional to have a blank line between name-of-function and description-of-args. Cc: Peter Zijlstra Cc: Song Liu Cc: "Matthew Wilcox (Oracle)" Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- kernel/watchdog_perf.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 5f7d1f0d4268..d577c4a8321e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -148,7 +148,6 @@ static int hardlockup_detector_event_create(void) /** * watchdog_hardlockup_enable - Enable the local event - * * @cpu: The CPU to enable hard lockup on. */ void watchdog_hardlockup_enable(unsigned int cpu) @@ -167,7 +166,6 @@ void watchdog_hardlockup_enable(unsigned int cpu) /** * watchdog_hardlockup_disable - Disable the local event - * * @cpu: The CPU to enable hard lockup on. */ void watchdog_hardlockup_disable(unsigned int cpu) @@ -277,7 +275,6 @@ int __init watchdog_hardlockup_probe(void) /** * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. - * * @str: number which identifies the raw perf event to use */ void __init hardlockup_config_perf_event(const char *str) -- cgit v1.2.3 From e406737b11103752838cf50fd197ec8e9352bbf7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 May 2024 10:13:41 -0700 Subject: seccomp: Constify sysctl subhelpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The read_actions_logged() and write_actions_logged() helpers called by the sysctl proc handler seccomp_actions_logged_handler() are already expecting their sysctl table argument to be read-only. Actually mark the argument as const in preparation[1] for global constification of the sysctl tables. Suggested-by: Thomas Weißschuh Link: https://lore.kernel.org/lkml/20240423-sysctl-const-handler-v3-11-e0beccb836e2@weissschuh.net/ [1] Reviewed-by: Luis Chamberlain Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20240508171337.work.861-kees@kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index aca7b437882e..f70e031e06a8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2334,7 +2334,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) return true; } -static int read_actions_logged(struct ctl_table *ro_table, void *buffer, +static int read_actions_logged(const struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos) { char names[sizeof(seccomp_actions_avail)]; @@ -2352,7 +2352,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void *buffer, return proc_dostring(&table, 0, buffer, lenp, ppos); } -static int write_actions_logged(struct ctl_table *ro_table, void *buffer, +static int write_actions_logged(const struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; -- cgit v1.2.3 From d927752f287fe10965612541593468ffcfa9231f Mon Sep 17 00:00:00 2001 From: Wardenjohn Date: Tue, 7 May 2024 13:01:11 +0800 Subject: livepatch: Rename KLP_* to KLP_TRANSITION_* The original macros of KLP_* is about the state of the transition. Rename macros of KLP_* to KLP_TRANSITION_* to fix the confusing description of klp transition state. Signed-off-by: Wardenjohn Reviewed-by: Petr Mladek Tested-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Link: https://lore.kernel.org/r/20240507050111.38195-2-zhangwarden@gmail.com Signed-off-by: Petr Mladek --- include/linux/livepatch.h | 6 ++--- init/init_task.c | 2 +- kernel/livepatch/core.c | 4 ++-- kernel/livepatch/patch.c | 4 ++-- kernel/livepatch/transition.c | 54 +++++++++++++++++++++---------------------- 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 9b9b38e89563..51a258c24ff5 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -18,9 +18,9 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) /* task patch states */ -#define KLP_UNDEFINED -1 -#define KLP_UNPATCHED 0 -#define KLP_PATCHED 1 +#define KLP_TRANSITION_IDLE -1 +#define KLP_TRANSITION_UNPATCHED 0 +#define KLP_TRANSITION_PATCHED 1 /** * struct klp_func - function structure for live patching diff --git a/init/init_task.c b/init/init_task.c index 5727d42149c3..9b41f00d30e2 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -202,7 +202,7 @@ struct task_struct init_task .trace_recursion = 0, #endif #ifdef CONFIG_LIVEPATCH - .patch_state = KLP_UNDEFINED, + .patch_state = KLP_TRANSITION_IDLE, #endif #ifdef CONFIG_SECURITY .security = NULL, diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ecbc9b6aba3a..52426665eecc 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -973,7 +973,7 @@ static int __klp_disable_patch(struct klp_patch *patch) if (klp_transition_patch) return -EBUSY; - klp_init_transition(patch, KLP_UNPATCHED); + klp_init_transition(patch, KLP_TRANSITION_UNPATCHED); klp_for_each_object(patch, obj) if (obj->patched) @@ -1008,7 +1008,7 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); - klp_init_transition(patch, KLP_PATCHED); + klp_init_transition(patch, KLP_TRANSITION_PATCHED); /* * Enforce the order of the func->transition writes in diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 4152c71507e2..90408500e5a3 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -95,9 +95,9 @@ static void notrace klp_ftrace_handler(unsigned long ip, patch_state = current->patch_state; - WARN_ON_ONCE(patch_state == KLP_UNDEFINED); + WARN_ON_ONCE(patch_state == KLP_TRANSITION_IDLE); - if (patch_state == KLP_UNPATCHED) { + if (patch_state == KLP_TRANSITION_UNPATCHED) { /* * Use the previously patched version of the function. * If no previous patches exist, continue with the diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index e54c3d60a904..ba069459c101 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -23,7 +23,7 @@ static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); struct klp_patch *klp_transition_patch; -static int klp_target_state = KLP_UNDEFINED; +static int klp_target_state = KLP_TRANSITION_IDLE; static unsigned int klp_signals_cnt; @@ -96,16 +96,16 @@ static void klp_complete_transition(void) pr_debug("'%s': completing %s transition\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); - if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + if (klp_transition_patch->replace && klp_target_state == KLP_TRANSITION_PATCHED) { klp_unpatch_replaced_patches(klp_transition_patch); klp_discard_nops(klp_transition_patch); } - if (klp_target_state == KLP_UNPATCHED) { + if (klp_target_state == KLP_TRANSITION_UNPATCHED) { /* - * All tasks have transitioned to KLP_UNPATCHED so we can now + * All tasks have transitioned to KLP_TRANSITION_UNPATCHED so we can now * remove the new functions from the func_stack. */ klp_unpatch_objects(klp_transition_patch); @@ -123,36 +123,36 @@ static void klp_complete_transition(void) klp_for_each_func(obj, func) func->transition = false; - /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ - if (klp_target_state == KLP_PATCHED) + /* Prevent klp_ftrace_handler() from seeing KLP_TRANSITION_IDLE state */ + if (klp_target_state == KLP_TRANSITION_PATCHED) klp_synchronize_transition(); read_lock(&tasklist_lock); for_each_process_thread(g, task) { WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); - task->patch_state = KLP_UNDEFINED; + task->patch_state = KLP_TRANSITION_IDLE; } read_unlock(&tasklist_lock); for_each_possible_cpu(cpu) { task = idle_task(cpu); WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); - task->patch_state = KLP_UNDEFINED; + task->patch_state = KLP_TRANSITION_IDLE; } klp_for_each_object(klp_transition_patch, obj) { if (!klp_is_object_loaded(obj)) continue; - if (klp_target_state == KLP_PATCHED) + if (klp_target_state == KLP_TRANSITION_PATCHED) klp_post_patch_callback(obj); - else if (klp_target_state == KLP_UNPATCHED) + else if (klp_target_state == KLP_TRANSITION_UNPATCHED) klp_post_unpatch_callback(obj); } pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); - klp_target_state = KLP_UNDEFINED; + klp_target_state = KLP_TRANSITION_IDLE; klp_transition_patch = NULL; } @@ -164,13 +164,13 @@ static void klp_complete_transition(void) */ void klp_cancel_transition(void) { - if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) + if (WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_PATCHED)) return; pr_debug("'%s': canceling patching transition, going to unpatch\n", klp_transition_patch->mod->name); - klp_target_state = KLP_UNPATCHED; + klp_target_state = KLP_TRANSITION_UNPATCHED; klp_complete_transition(); } @@ -218,7 +218,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, struct klp_ops *ops; int i; - if (klp_target_state == KLP_UNPATCHED) { + if (klp_target_state == KLP_TRANSITION_UNPATCHED) { /* * Check for the to-be-unpatched function * (the func itself). @@ -455,7 +455,7 @@ void klp_try_complete_transition(void) struct klp_patch *patch; bool complete = true; - WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE); /* * Try to switch the tasks to the target patch state by walking their @@ -532,11 +532,11 @@ void klp_start_transition(void) struct task_struct *g, *task; unsigned int cpu; - WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE); pr_notice("'%s': starting %s transition\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); /* * Mark all normal tasks as needing a patch state update. They'll @@ -578,7 +578,7 @@ void klp_init_transition(struct klp_patch *patch, int state) struct klp_func *func; int initial_state = !state; - WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_IDLE); klp_transition_patch = patch; @@ -589,7 +589,7 @@ void klp_init_transition(struct klp_patch *patch, int state) klp_target_state = state; pr_debug("'%s': initializing %s transition\n", patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); /* * Initialize all tasks to the initial patch state to prepare them for @@ -597,7 +597,7 @@ void klp_init_transition(struct klp_patch *patch, int state) */ read_lock(&tasklist_lock); for_each_process_thread(g, task) { - WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE); task->patch_state = initial_state; } read_unlock(&tasklist_lock); @@ -607,19 +607,19 @@ void klp_init_transition(struct klp_patch *patch, int state) */ for_each_possible_cpu(cpu) { task = idle_task(cpu); - WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE); task->patch_state = initial_state; } /* * Enforce the order of the task->patch_state initializations and the * func->transition updates to ensure that klp_ftrace_handler() doesn't - * see a func in transition with a task->patch_state of KLP_UNDEFINED. + * see a func in transition with a task->patch_state of KLP_TRANSITION_IDLE. * * Also enforce the order of the klp_target_state write and future * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and * __klp_sched_try_switch() don't set a task->patch_state to - * KLP_UNDEFINED. + * KLP_TRANSITION_IDLE. */ smp_wmb(); @@ -652,7 +652,7 @@ void klp_reverse_transition(void) pr_debug("'%s': reversing transition from %s\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching to unpatching" : + klp_target_state == KLP_TRANSITION_PATCHED ? "patching to unpatching" : "unpatching to patching"); /* @@ -741,7 +741,7 @@ void klp_force_transition(void) klp_update_patch_state(idle_task(cpu)); /* Set forced flag for patches being removed. */ - if (klp_target_state == KLP_UNPATCHED) + if (klp_target_state == KLP_TRANSITION_UNPATCHED) klp_transition_patch->forced = true; else if (klp_transition_patch->replace) { klp_for_each_patch(patch) { -- cgit v1.2.3 From 05037e5f0f17935a86861f9610f941ebf346a95e Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Wed, 10 Apr 2024 16:33:11 -0500 Subject: sched/topology: Optimize topology_span_sane() Optimize topology_span_sane() by removing duplicate comparisons. Since topology_span_sane() is called inside of for_each_cpu(), each previous CPU has already been compared against every other CPU. The current CPU only needs to be compared against higher-numbered CPUs. The total number of comparisons is reduced from N * (N - 1) to N * (N - 1) / 2 on each non-NUMA scheduling domain level. Signed-off-by: Kyle Meyer Reviewed-by: Yury Norov Acked-by: Vincent Guittot Signed-off-by: Yury Norov --- kernel/sched/topology.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..b6bcafc09969 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2347,7 +2347,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve static bool topology_span_sane(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, int cpu) { - int i; + int i = cpu + 1; /* NUMA levels are allowed to overlap */ if (tl->flags & SDTL_OVERLAP) @@ -2359,9 +2359,7 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl, * breaking the sched_group lists - i.e. a later get_group() pass * breaks the linking done for an earlier span. */ - for_each_cpu(i, cpu_map) { - if (i == cpu) - continue; + for_each_cpu_from(i, cpu_map) { /* * We should 'and' all those masks with 'cpu_map' to exactly * match the topology we're about to build, but that can only -- cgit v1.2.3 From a6016aac5252da9d22a4dc0b98121b0acdf6d2f5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 9 May 2024 16:46:16 +0200 Subject: dma: fix DMA sync for drivers not calling dma_set_mask*() There are several reports that the DMA sync shortcut broke non-coherent devices. dev->dma_need_sync is false after the &device allocation and if a driver didn't call dma_set_mask*(), it will still be false even if the device is not DMA-coherent and thus needs synchronizing. Due to historical reasons, there's still a lot of drivers not calling it. Invert the boolean, so that the sync will be performed by default and the shortcut will be enabled only when calling dma_set_mask*(). Reported-by: Steven Price Closes: https://lore.kernel.org/lkml/010686f5-3049-46a1-8230-7752a1b433ff@arm.com Reported-by: Marek Szyprowski Closes: https://lore.kernel.org/lkml/46160534-5003-4809-a408-6b3a3f4921e9@samsung.com Fixes: f406c8e4b770. ("dma: avoid redundant calls for sync operations") Signed-off-by: Alexander Lobakin Signed-off-by: Christoph Hellwig Tested-by: Steven Price Tested-by: Marek Szyprowski --- include/linux/device.h | 4 ++-- include/linux/dma-map-ops.h | 4 ++-- include/linux/dma-mapping.h | 2 +- kernel/dma/mapping.c | 10 +++++----- kernel/dma/swiotlb.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/device.h b/include/linux/device.h index ed95b829f05b..d4b50accff26 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -691,7 +691,7 @@ struct device_physical_location { * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. - * @dma_need_sync: The device needs performing DMA sync operations. + * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -805,7 +805,7 @@ struct device { bool dma_ops_bypass : 1; #endif #ifdef CONFIG_DMA_NEED_SYNC - bool dma_need_sync:1; + bool dma_skip_sync:1; #endif }; diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 4893cb89cb52..5217b922d29f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -280,8 +280,8 @@ static inline void dma_reset_need_sync(struct device *dev) { #ifdef CONFIG_DMA_NEED_SYNC /* Reset it only once so that the function can be called on hotpath */ - if (unlikely(!dev->dma_need_sync)) - dev->dma_need_sync = true; + if (unlikely(dev->dma_skip_sync)) + dev->dma_skip_sync = false; #endif } diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index eb4e15893b6c..f693aafe221f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -295,7 +295,7 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr); static inline bool dma_dev_need_sync(const struct device *dev) { /* Always call DMA sync operations when debugging is enabled */ - return dev->dma_need_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG); + return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG); } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 3524bc92c37f..3f77c3f8d16d 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -392,7 +392,7 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) if (dma_map_direct(dev, ops)) /* - * dma_need_sync could've been reset on first SWIOTLB buffer + * dma_skip_sync could've been reset on first SWIOTLB buffer * mapping, but @dma_addr is not necessary an SWIOTLB buffer. * In this case, fall back to more granular check. */ @@ -407,20 +407,20 @@ static void dma_setup_need_sync(struct device *dev) if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC)) /* - * dma_need_sync will be reset to %true on first SWIOTLB buffer + * dma_skip_sync will be reset to %false on first SWIOTLB buffer * mapping, if any. During the device initialization, it's * enough to check only for the DMA coherence. */ - dev->dma_need_sync = !dev_is_dma_coherent(dev); + dev->dma_skip_sync = dev_is_dma_coherent(dev); else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu && !ops->sync_sg_for_device && !ops->sync_sg_for_cpu) /* * Synchronization is not possible when none of DMA sync ops * is set. */ - dev->dma_need_sync = false; + dev->dma_skip_sync = true; else - dev->dma_need_sync = true; + dev->dma_skip_sync = false; } #else /* !CONFIG_DMA_NEED_SYNC */ static inline void dma_setup_need_sync(struct device *dev) { } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ae3e593eaadb..068134697cf1 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1409,7 +1409,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, } /* - * If dma_need_sync wasn't set, reset it on first SWIOTLB buffer + * If dma_skip_sync was set, reset it on first SWIOTLB buffer * mapping to always sync SWIOTLB buffers. */ dma_reset_need_sync(dev); -- cgit v1.2.3 From 2ddec2c80b4402c293c7e6e0881cecaaf77e8cec Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 2 May 2024 15:18:52 +0000 Subject: riscv, bpf: inline bpf_get_smp_processor_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline the calls to bpf_get_smp_processor_id() in the riscv bpf jit. RISCV saves the pointer to the CPU's task_struct in the TP (thread pointer) register. This makes it trivial to get the CPU's processor id. As thread_info is the first member of task_struct, we can read the processor id from TP + offsetof(struct thread_info, cpu). RISCV64 JIT output for `call bpf_get_smp_processor_id` ====================================================== Before After -------- ------- auipc t1,0x848c ld a5,32(tp) jalr 604(t1) mv a5,a0 Benchmark using [1] on Qemu. ./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc +---------------+------------------+------------------+--------------+ | Name | Before | After | % change | |---------------+------------------+------------------+--------------| | glob-arr-inc | 1.077 ± 0.006M/s | 1.336 ± 0.010M/s | + 24.04% | | arr-inc | 1.078 ± 0.002M/s | 1.332 ± 0.015M/s | + 23.56% | | hash-inc | 0.494 ± 0.004M/s | 0.653 ± 0.001M/s | + 32.18% | +---------------+------------------+------------------+--------------+ NOTE: This benchmark includes changes from this patch and the previous patch that implemented the per-cpu insn. [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20240502151854.9810-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 26 ++++++++++++++++++++++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 11 +++++++++++ kernel/bpf/verifier.c | 4 ++++ 4 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 1f0159963b3e..a46ec7fb4489 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1493,6 +1493,22 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, bool fixed_addr; u64 addr; + /* Inline calls to bpf_get_smp_processor_id() + * + * RV_REG_TP holds the address of the current CPU's task_struct and thread_info is + * at offset 0 in task_struct. + * Load cpu from thread_info: + * Set R0 to ((struct thread_info *)(RV_REG_TP))->cpu + * + * This replicates the implementation of raw_smp_processor_id() on RISCV + */ + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { + /* Load current CPU number in R0 */ + emit_ld(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu), + RV_REG_TP, ctx); + break; + } + mark_call(ctx); ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr, &fixed_addr); @@ -2062,3 +2078,13 @@ bool bpf_jit_supports_percpu_insn(void) { return true; } + +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_smp_processor_id: + return true; + default: + return false; + } +} diff --git a/include/linux/filter.h b/include/linux/filter.h index 7a27f19bf44d..3e19bb62ed1a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -993,6 +993,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); +bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 99b8b1c9a248..aa59af9f9bd9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2941,6 +2941,17 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return true if the JIT inlines the call to the helper corresponding to + * the imm. + * + * The verifier will not patch the insn->imm for the call to the helper if + * this returns true. + */ +bool __weak bpf_jit_inlines_helper_call(s32 imm) +{ + return false; +} + /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e3aba08984e..1658ca4136a3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19996,6 +19996,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Skip inlining the helper call if the JIT does it. */ + if (bpf_jit_inlines_helper_call(insn->imm)) + goto next_insn; + if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) -- cgit v1.2.3 From bd125a084091396f3e796bb3dc009940d9771811 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 23 Apr 2024 16:23:37 +0000 Subject: tracing/user_events: Fix non-spaced field matching When the ABI was updated to prevent same name w/different args, it missed an important corner case when fields don't end with a space. Typically, space is used for fields to help separate them, like "u8 field1; u8 field2". If no spaces are used, like "u8 field1;u8 field2", then the parsing works for the first time. However, the match check fails on a subsequent register, leading to confusion. This is because the match check uses argv_split() and assumes that all fields will be split upon the space. When spaces are used, we get back { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. This causes a mismatch, and the user program gets back -EADDRINUSE. Add a method to detect this case before calling argv_split(). If found force a space after the field separator character ';'. This ensures all cases work properly for matching. With this fix, the following are all treated as matching: u8 field1;u8 field2 u8 field1; u8 field2 u8 field1;\tu8 field2 u8 field1;\nu8 field2 Link: https://lore.kernel.org/linux-trace-kernel/20240423162338.292-2-beaub@linux.microsoft.com Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different args event") Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 76 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 70d428c394b6..82b191f33a28 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1989,6 +1989,80 @@ static int user_event_set_tp_name(struct user_event *user) return 0; } +/* + * Counts how many ';' without a trailing space are in the args. + */ +static int count_semis_no_space(char *args) +{ + int count = 0; + + while ((args = strchr(args, ';'))) { + args++; + + if (!isspace(*args)) + count++; + } + + return count; +} + +/* + * Copies the arguments while ensuring all ';' have a trailing space. + */ +static char *insert_space_after_semis(char *args, int count) +{ + char *fixed, *pos; + int len; + + len = strlen(args) + count; + fixed = kmalloc(len + 1, GFP_KERNEL); + + if (!fixed) + return NULL; + + pos = fixed; + + /* Insert a space after ';' if there is no trailing space. */ + while (*args) { + *pos = *args++; + + if (*pos++ == ';' && !isspace(*args)) + *pos++ = ' '; + } + + *pos = '\0'; + + return fixed; +} + +static char **user_event_argv_split(char *args, int *argc) +{ + char **split; + char *fixed; + int count; + + /* Count how many ';' without a trailing space */ + count = count_semis_no_space(args); + + /* No fixup is required */ + if (!count) + return argv_split(GFP_KERNEL, args, argc); + + /* We must fixup 'field;field' to 'field; field' */ + fixed = insert_space_after_semis(args, count); + + if (!fixed) + return NULL; + + /* We do a normal split afterwards */ + split = argv_split(GFP_KERNEL, fixed, argc); + + /* We can free since argv_split makes a copy */ + kfree(fixed); + + return split; +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -2012,7 +2086,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return -EPERM; if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); + argv = user_event_argv_split(args, &argc); if (!argv) return -ENOMEM; -- cgit v1.2.3 From c09d4167b550f91ecf5c3db883eea314edc7f532 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 10 May 2024 15:04:30 +0100 Subject: ring-buffer: Allocate sub-buffers with __GFP_COMP In preparation for the ring-buffer memory mapping, allocate compound pages for the ring-buffer sub-buffers to enable us to map them to user-space with vm_insert_pages(). Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-2-vdonnefort@google.com Acked-by: David Hildenbrand Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6511dc3a00da..a1e011733f6a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1524,7 +1524,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_ZERO, + mflags | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; @@ -1609,7 +1609,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; @@ -5579,7 +5579,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) goto out; page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO, + GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); -- cgit v1.2.3 From 117c39200d9d760cbd5944bb89efb7b9c51965aa Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 10 May 2024 15:04:31 +0100 Subject: ring-buffer: Introducing ring-buffer mapping functions In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-3-vdonnefort@google.com CC: Signed-off-by: Vincent Donnefort Acked-by: David Hildenbrand Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 6 + include/uapi/linux/trace_mmap.h | 46 +++++ kernel/trace/ring_buffer.c | 414 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 463 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/trace_mmap.h (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include #include +#include + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepare NULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index 000000000000..b682e9925539 --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size: Size of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events: Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Internal use only. + * @Reserved2: Internal use only. + */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1e011733f6a..cb303052bff2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id; /* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned int mapped; + struct mutex mapping_lock; + unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct trace_buffer_meta *meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) return buffer->time_stamp_abs; } -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + + meta->reader.read = cpu_buffer->reader_page->read; + meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.lost_events = cpu_buffer->lost_events; + + meta->entries = local_read(&cpu_buffer->entries); + meta->overrun = local_read(&cpu_buffer->overrun); + meta->read = cpu_buffer->read; + + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; + if (cpu_buffer->mapped) + rb_update_meta_page(cpu_buffer); + rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; } @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + /* It's up to the callers to not try to swap mapped buffers */ + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { + ret = -EBUSY; + goto out; + } + /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { + cpu_buffer->reader_page == cpu_buffer->commit_page || + cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer = buffer->buffers[cpu]; + if (cpu_buffer->mapped) { + err = -EBUSY; + goto error; + } + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); @@ -6057,6 +6096,375 @@ error: } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct page *page; + + if (cpu_buffer->meta_page) + return 0; + + page = alloc_page(GFP_USER | __GFP_ZERO); + if (!page) + return -ENOMEM; + + cpu_buffer->meta_page = page_to_virt(page); + + return 0; +} + +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long addr = (unsigned long)cpu_buffer->meta_page; + + free_page(addr); + cpu_buffer->meta_page = NULL; +} + +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *subbuf_ids) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; + struct buffer_page *first_subbuf, *subbuf; + int id = 0; + + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; + cpu_buffer->reader_page->id = id++; + + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); + do { + if (WARN_ON(id >= nr_subbufs)) + break; + + subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf->id = id; + + rb_inc_page(&subbuf); + id++; + } while (subbuf != first_subbuf); + + /* install subbuf ID to kern VA translation */ + cpu_buffer->subbuf_ids = subbuf_ids; + + meta->meta_page_size = PAGE_SIZE; + meta->meta_struct_len = sizeof(*meta); + meta->nr_subbufs = nr_subbufs; + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + + rb_update_meta_page(cpu_buffer); +} + +static struct ring_buffer_per_cpu * +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-EINVAL); + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + mutex_unlock(&cpu_buffer->mapping_lock); + return ERR_PTR(-ENODEV); + } + + return cpu_buffer; +} + +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + mutex_unlock(&cpu_buffer->mapping_lock); +} + +/* + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need + * to be set-up or torn-down. + */ +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, + bool inc) +{ + unsigned long flags; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + if (inc && cpu_buffer->mapped == UINT_MAX) + return -EBUSY; + + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) + return -EINVAL; + + mutex_lock(&cpu_buffer->buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (inc) + cpu_buffer->mapped++; + else + cpu_buffer->mapped--; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + mutex_unlock(&cpu_buffer->buffer->mutex); + + return 0; +} + +/* + * +--------------+ pgoff == 0 + * | meta page | + * +--------------+ pgoff == 1 + * | subbuffer 0 | + * | | + * +--------------+ pgoff == (1 + (1 << subbuf_order)) + * | subbuffer 1 | + * | | + * ... + */ +#ifdef CONFIG_MMU +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; + unsigned int subbuf_pages, subbuf_order; + struct page **pages; + int p = 0, s = 0; + int err; + + /* Refuse MP_PRIVATE or writable mappings */ + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || + !(vma->vm_flags & VM_MAYSHARE)) + return -EPERM; + + /* + * Make sure the mapping cannot become writable later. Also tell the VM + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). + */ + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, + VM_MAYWRITE); + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (!vma_pages || vma_pages > nr_pages) + return -EINVAL; + + nr_pages = vma_pages; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (!pgoff) { + pages[p++] = virt_to_page(cpu_buffer->meta_page); + + /* + * TODO: Align sub-buffers on their size, once + * vm_insert_pages() supports the zero-page. + */ + } else { + /* Skip the meta-page */ + pgoff--; + + if (pgoff % subbuf_pages) { + err = -EINVAL; + goto out; + } + + s += pgoff / subbuf_pages; + } + + while (p < nr_pages) { + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]); + int off = 0; + + if (WARN_ON_ONCE(s >= nr_subbufs)) { + err = -EINVAL; + goto out; + } + + for (; off < (1 << (subbuf_order)); off++, page++) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } + + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + +out: + kfree(pages); + + return err; +} +#else +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + return -EOPNOTSUPP; +} +#endif + +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags, *subbuf_ids; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (cpu_buffer->mapped) { + err = __rb_map_vma(cpu_buffer, vma); + if (!err) + err = __rb_inc_dec_mapped(cpu_buffer, true); + mutex_unlock(&cpu_buffer->mapping_lock); + return err; + } + + /* prevent another thread from changing buffer/sub-buffer sizes */ + mutex_lock(&buffer->mutex); + + err = rb_alloc_meta_page(cpu_buffer); + if (err) + goto unlock; + + /* subbuf_ids include the reader while nr_pages does not */ + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); + if (!subbuf_ids) { + rb_free_meta_page(cpu_buffer); + err = -ENOMEM; + goto unlock; + } + + atomic_inc(&cpu_buffer->resize_disabled); + + /* + * Lock all readers to block any subbuf swap until the subbuf IDs are + * assigned. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + err = __rb_map_vma(cpu_buffer, vma); + if (!err) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + cpu_buffer->mapped = 1; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } else { + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + } + +unlock: + mutex_unlock(&buffer->mutex); + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + err = -ENODEV; + goto out; + } else if (cpu_buffer->mapped > 1) { + __rb_inc_dec_mapped(cpu_buffer, false); + goto out; + } + + mutex_lock(&buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + cpu_buffer->mapped = 0; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); + +out: + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long reader_size; + unsigned long flags; + + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); + if (IS_ERR(cpu_buffer)) + return (int)PTR_ERR(cpu_buffer); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + +consume: + if (rb_per_cpu_empty(cpu_buffer)) + goto out; + + reader_size = rb_page_size(cpu_buffer->reader_page); + + /* + * There are data to be read on the current reader page, we can + * return to the caller. But before that, we assume the latter will read + * everything. Let's update the kernel reader accordingly. + */ + if (cpu_buffer->reader_page->read < reader_size) { + while (cpu_buffer->reader_page->read < reader_size) + rb_advance_reader(cpu_buffer); + goto out; + } + + if (WARN_ON(!rb_get_reader_page(cpu_buffer))) + goto out; + + goto consume; + +out: + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + + rb_update_meta_page(cpu_buffer); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + rb_put_mapped_buffer(cpu_buffer); + + return 0; +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in -- cgit v1.2.3 From cf9f0f7c4c5bb45e7bb270e48bab6f7837825a64 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 10 May 2024 15:04:32 +0100 Subject: tracing: Allow user-space mapping of the ring-buffer Currently, user-space extracts data from the ring-buffer via splice, which is handy for storage or network sharing. However, due to splice limitations, it is imposible to do real-time analysis without a copy. A solution for that problem is to let the user-space map the ring-buffer directly. The mapping is exposed via the per-CPU file trace_pipe_raw. The first element of the mapping is the meta-page. It is followed by each subbuffer constituting the ring-buffer, ordered by their unique page ID: * Meta-page -- include/uapi/linux/trace_mmap.h for a description * Subbuf ID 0 * Subbuf ID 1 ... It is therefore easy to translate a subbuf ID into an offset in the mapping: reader_id = meta->reader->id; reader_offset = meta->meta_page_size + reader_id * meta->subbuf_size; When new data is available, the mapper must call a newly introduced ioctl: TRACE_MMAP_IOCTL_GET_READER. This will update the Meta-page reader ID to point to the next reader containing unread data. Mapping will prevent snapshot and buffer size modifications. Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-4-vdonnefort@google.com CC: Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/trace_mmap.h | 2 + kernel/trace/trace.c | 104 ++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 1 + 3 files changed, 102 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h index b682e9925539..bd1066754220 100644 --- a/include/uapi/linux/trace_mmap.h +++ b/include/uapi/linux/trace_mmap.h @@ -43,4 +43,6 @@ struct trace_buffer_meta { __u64 Reserved2; }; +#define TRACE_MMAP_IOCTL_GET_READER _IO('T', 0x1) + #endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 233d1af39fff..a35e7f598233 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, return; } + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); @@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) lockdep_assert_held(&trace_types_lock); spin_lock(&tr->snapshot_trigger_lock); - if (tr->snapshot == UINT_MAX) { + if (tr->snapshot == UINT_MAX || tr->mapped) { spin_unlock(&tr->snapshot_trigger_lock); return -EBUSY; } @@ -6068,7 +6074,7 @@ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; - + tr->current_trace->enabled--; if (tr->current_trace->reset) @@ -8194,15 +8200,32 @@ out: return ret; } -/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; + int err; + + if (cmd == TRACE_MMAP_IOCTL_GET_READER) { + if (!(file->f_flags & O_NONBLOCK)) { + err = ring_buffer_wait(iter->array_buffer->buffer, + iter->cpu_file, + iter->tr->buffer_percent, + NULL, NULL); + if (err) + return err; + } - if (cmd) - return -ENOIOCTLCMD; + return ring_buffer_map_get_reader(iter->array_buffer->buffer, + iter->cpu_file); + } else if (cmd) { + return -ENOTTY; + } + /* + * An ioctl call with cmd 0 to the ring buffer file will wake up all + * waiters + */ mutex_lock(&trace_types_lock); /* Make sure the waiters see the new wait_index */ @@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } +#ifdef CONFIG_TRACER_MAX_TRACE +static int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. + */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} +static void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} +#else +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +#endif + +static void tracing_buffers_mmap_close(struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + + WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); + put_snapshot_map(iter->tr); +} + +static const struct vm_operations_struct tracing_buffers_vmops = { + .close = tracing_buffers_mmap_close, +}; + +static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + int ret = 0; + + ret = get_snapshot_map(iter->tr); + if (ret) + return ret; + + ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); + if (ret) + put_snapshot_map(iter->tr); + + vma->vm_ops = &tracing_buffers_vmops; + + return ret; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, @@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = { .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, + .mmap = tracing_buffers_mmap, }; static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64450615ca0c..749a182dab48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -336,6 +336,7 @@ struct trace_array { bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; + unsigned int mapped; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; -- cgit v1.2.3 From c5963a0990d1da70d1bd399e6811887ff2231b0d Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Mon, 20 Nov 2023 05:46:13 +0530 Subject: ftrace: Replaces simple_strtoul in ftrace The function simple_strtoul performs no error checking in scenarios where the input value overflows the intended output variable. This results in this function successfully returning, even when the output does not match the input string (aka the function returns successfully even when the result is wrong). Or as it was mentioned [1], "...simple_strtol(), simple_strtoll(), simple_strtoul(), and simple_strtoull() functions explicitly ignore overflows, which may lead to unexpected results in callers." Hence, the use of those functions is discouraged. This patch replaces all uses of the simple_strtoul with the safer alternatives kstrtoul and kstruint. Callers affected: - add_rec_by_index - set_graph_max_depth_function Side effects of this patch: - Since `fgraph_max_depth` is an `unsigned int`, this patch uses kstrtouint instead of kstrtoul to avoid any compiler warnings that could originate from calling the latter. - This patch ensures that the callers of kstrtou* return accordingly when kstrtoul and kstruint fail for some reason. In this case, both callers this patch is addressing return 0 on error. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#simple-strtol-simple-strtoll-simple-strtoul-simple-strtoull Link: https://lore.kernel.org/linux-trace-kernel/GV1PR10MB656333529A8D7B8AFB28D238E8B4A@GV1PR10MB6563.EURPRD10.PROD.OUTLOOK.COM Signed-off-by: Yuran Pereira Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da1710499698..3951c0d0ec63 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4202,12 +4202,12 @@ static int add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, int clear_filter) { - long index = simple_strtoul(func_g->search, NULL, 0); + long index; struct ftrace_page *pg; struct dyn_ftrace *rec; /* The index starts at 1 */ - if (--index < 0) + if (kstrtoul(func_g->search, 0, &index) || --index < 0) return 0; do_for_each_ftrace_rec(pg, rec) { @@ -5817,9 +5817,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function); static int __init set_graph_max_depth_function(char *str) { - if (!str) + if (!str || kstrtouint(str, 0, &fgraph_max_depth)) return 0; - fgraph_max_depth = simple_strtoul(str, NULL, 0); return 1; } __setup("ftrace_graph_max_depth=", set_graph_max_depth_function); -- cgit v1.2.3 From 33f137143e651321f10eb67ae6404a13bfbf69f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 May 2024 16:12:37 -0700 Subject: ftrace: Use asynchronous grace period for register_ftrace_direct() When running heavy test workloads with KASAN enabled, RCU Tasks grace periods can extend for many tens of seconds, significantly slowing trace registration. Therefore, make the registration-side RCU Tasks grace period be asynchronous via call_rcu_tasks(). Link: https://lore.kernel.org/linux-trace-kernel/ac05be77-2972-475b-9b57-56bef15aa00a@paulmck-laptop Reported-by: Jakub Kicinski Reported-by: Alexei Starovoitov Reported-by: Chris Mason Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3951c0d0ec63..f9223513414a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5366,6 +5366,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long } } +static void register_ftrace_direct_cb(struct rcu_head *rhp) +{ + struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu); + + free_ftrace_hash(fhp); +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5464,10 +5471,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) out_unlock: mutex_unlock(&direct_mutex); - if (free_hash && free_hash != EMPTY_HASH) { - synchronize_rcu_tasks(); - free_ftrace_hash(free_hash); - } + if (free_hash && free_hash != EMPTY_HASH) + call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb); if (new_hash) free_ftrace_hash(new_hash); -- cgit v1.2.3 From fe832be05a8eee5f1488cbcc2c562dd82d079fd6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Mar 2024 17:54:05 -0400 Subject: ring-buffer: Have mmapped ring buffer keep track of missed events While testing libtracefs on the mmapped ring buffer, the test that checks if missed events are accounted for failed when using the mapped buffer. This is because the mapped page does not update the missed events that were dropped because the writer filled up the ring buffer before the reader could catch it. Add the missed events to the reader page/sub-buffer when the IOCTL is done and a new reader page is acquired. Note that all accesses to the reader_page via rb_page_commit() had to be switched to rb_page_size(), and rb_page_size() which was just a copy of rb_page_commit() but now it masks out the RB_MISSED bits. This is needed as the mapped reader page is still active in the ring buffer code and where it reads the commit field of the bpage for the size, it now must mask it otherwise the missed bits that are now set will corrupt the size returned. Link: https://lore.kernel.org/linux-trace-kernel/20240312175405.12fb6726@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 53 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cb303052bff2..a02c7a52a0f5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -314,6 +314,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_MASK (3 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -2326,7 +2328,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { - return rb_page_commit(bpage); + return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned @@ -3953,7 +3955,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) return true; /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_commit(reader)) + if (reader->read != rb_page_size(reader)) return false; /* @@ -4424,7 +4426,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && - iter->head == rb_page_commit(cpu_buffer->reader_page))); + iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -5753,7 +5755,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, event = rb_reader_event(cpu_buffer); read = reader->read; - commit = rb_page_commit(reader); + commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; @@ -5830,7 +5832,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += rb_page_commit(reader); + cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); @@ -6422,6 +6424,8 @@ out: int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + unsigned long missed_events; unsigned long reader_size; unsigned long flags; @@ -6448,9 +6452,46 @@ consume: goto out; } - if (WARN_ON(!rb_get_reader_page(cpu_buffer))) + reader = rb_get_reader_page(cpu_buffer); + if (WARN_ON(!reader)) goto out; + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { + if (missed_events) { + struct buffer_data_page *bpage = reader->page; + unsigned int commit; + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + /* + * If there is room at the end of the page to save the + * missed events, then record it there. + */ + commit = rb_page_size(reader); + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + } else { + /* + * There really shouldn't be any missed events if the commit + * is on the reader page. + */ + WARN_ON_ONCE(missed_events); + } + + cpu_buffer->lost_events = 0; + goto consume; out: -- cgit v1.2.3 From 347bd7f072ea8c36e4becf32c76ee7e96bc7b1c3 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 29 Mar 2024 17:02:30 +0100 Subject: tracing: Improve benchmark test performance by using do_div() Partially revert commit d6cb38e10810 ("tracing: Use div64_u64() instead of do_div()") and use do_div() again to utilize its faster 64-by-32 division compared to the 64-by-64 division done by div64_u64(). Explicitly cast the divisor bm_cnt to u32 to prevent a Coccinelle warning reported by do_div.cocci. The warning was removed with commit d6cb38e10810 ("tracing: Use div64_u64() instead of do_div()"). Using the faster 64-by-32 division and casting bm_cnt to u32 is safe because we return early from trace_do_benchmark() if bm_cnt > UINT_MAX. This approach is already used twice in trace_do_benchmark() when calculating the standard deviation: do_div(stddev, (u32)bm_cnt); do_div(stddev, (u32)bm_cnt - 1); Link: https://lore.kernel.org/linux-trace-kernel/20240329160229.4874-2-thorsten.blum@toblux.com Signed-off-by: Thorsten Blum Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 811b08439406..e19c32f2a938 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -104,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - delta = div64_u64(delta, bm_cnt); + do_div(delta, (u32)bm_cnt); avg = delta; if (stddev > 0) { -- cgit v1.2.3 From c9d5b7b8264ba2ead8984a3a4a0c3233911342a4 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 4 May 2024 14:23:03 +0100 Subject: ftrace: Remove unused list 'ftrace_direct_funcs' Commit 8788ca164eb4b ("ftrace: Remove the legacy _ftrace_direct API") stopped using 'ftrace_direct_funcs' (and the associated struct ftrace_direct_func). Remove them. Build tested only (on x86-64 with FTRACE and DYNAMIC_FTRACE enabled) Link: https://lore.kernel.org/linux-trace-kernel/20240504132303.67538-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 54d53f345d14..b01cca36147f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -83,7 +83,6 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; -struct ftrace_direct_func; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9223513414a..4613bf67ef2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5318,14 +5318,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -struct ftrace_direct_func { - struct list_head next; - unsigned long addr; - int count; -}; - -static LIST_HEAD(ftrace_direct_funcs); - static int register_ftrace_function_nolock(struct ftrace_ops *ops); /* -- cgit v1.2.3 From d2cc859cc8885e98907cffd47b990491c5321a80 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 7 May 2024 00:33:05 +0100 Subject: ftrace: Remove unused global 'ftrace_direct_func_count' Commit 8788ca164eb4b ("ftrace: Remove the legacy _ftrace_direct API") stopped setting the 'ftrace_direct_func_count' variable, but left it around. Clean it up. Link: https://lore.kernel.org/linux-trace-kernel/20240506233305.215735-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 -- kernel/trace/fgraph.c | 11 ----------- kernel/trace/ftrace.c | 1 - 3 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b01cca36147f..e3a83ebd1b33 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -413,7 +413,6 @@ struct ftrace_func_entry { }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -extern int ftrace_direct_func_count; unsigned long ftrace_find_rec_direct(unsigned long ip); int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, @@ -425,7 +424,6 @@ void ftrace_stub_direct_tramp(void); #else struct ftrace_ops; -# define ftrace_direct_func_count 0 static inline unsigned long ftrace_find_rec_direct(unsigned long ip) { return 0; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c83c005e654e..a130b2d898f7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -125,17 +125,6 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - /* - * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses are unpredictable anyway. - * Ex: BPF trampoline may call original function and may skip frame - * depending on type of BPF programs attached. - */ - if (ftrace_direct_func_count && - ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) - return -EBUSY; -#endif trace.func = func; trace.depth = ++current->curr_ret_depth; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4613bf67ef2c..5a01d72f66db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2538,7 +2538,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); -int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer -- cgit v1.2.3 From 8d0b728840fdcfd0f0bc814c8ac9ef7c677839da Mon Sep 17 00:00:00 2001 From: Yifan Hong Date: Wed, 10 Apr 2024 19:48:02 +0000 Subject: module: allow UNUSED_KSYMS_WHITELIST to be relative against objtree. If UNUSED_KSYMS_WHITELIST is a file generated before Kbuild runs, and the source tree is in a read-only filesystem, the developer must put the file somewhere and specify an absolute path to UNUSED_KSYMS_WHITELIST. This worked, but if IKCONFIG=y, an absolute path is embedded into .config and eventually into vmlinux, causing the build to be less reproducible when building on a different machine. This patch makes the handling of UNUSED_KSYMS_WHITELIST to be similar to MODULE_SIG_KEY. First, check if UNUSED_KSYMS_WHITELIST is an absolute path, just as before this patch. If so, use the path as is. If it is a relative path, use wildcard to check the existence of the file below objtree first. If it does not exist, fall back to the original behavior of adding $(srctree)/ before the value. After this patch, the developer can put the generated file in objtree, then use a relative path against objtree in .config, eradicating any absolute paths that may be evaluated differently on different machines. Signed-off-by: Yifan Hong Reviewed-by: Elliot Berman Signed-off-by: Luis Chamberlain --- kernel/module/Kconfig | 2 +- scripts/Makefile.modpost | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index f3e0329337f6..cb8377a18927 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -392,7 +392,7 @@ config UNUSED_KSYMS_WHITELIST exported at all times, even in absence of in-tree users. The value to set here is the path to a text file containing the list of symbols, one per line. The path can be absolute, or relative to the kernel - source tree. + source or obj tree. config MODULES_TREE_LOOKUP def_bool y diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 739402f45509..36952638bbc6 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -94,7 +94,7 @@ targets += .vmlinux.objs ifdef CONFIG_TRIM_UNUSED_KSYMS ksym-wl := $(CONFIG_UNUSED_KSYMS_WHITELIST) -ksym-wl := $(if $(filter-out /%, $(ksym-wl)),$(srctree)/)$(ksym-wl) +ksym-wl := $(if $(filter-out /%, $(ksym-wl)),$(if $(wildcard $(ksym-wl)),,$(srctree)/))$(ksym-wl) modpost-args += -t $(addprefix -u , $(ksym-wl)) modpost-deps += $(ksym-wl) endif -- cgit v1.2.3 From 086437d94aa3591b459e64bffed657b88dcc46a7 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 12 Apr 2024 18:53:47 +0000 Subject: kallsyms: replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. The goal is to remove its use completely [2]. namebuf is eventually cleaned of any trailing llvm suffixes using strstr(). This hints that namebuf should be NUL-terminated. static void cleanup_symbol_name(char *s) { char *res; ... res = strstr(s, ".llvm."); ... } Due to this, use strscpy() over strncpy() as it guarantees NUL-termination on the destination buffer. Drop the -1 from the length calculation as it is no longer needed to ensure NUL-termination. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html Link: https://github.com/KSPP/linux/issues/90 [2] Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Signed-off-by: Luis Chamberlain --- kernel/module/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index ef73ae7c8909..62fb57bb9f16 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -348,7 +348,7 @@ const char *module_address_lookup(unsigned long addr, } /* Make a copy in here where it's safe */ if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + strscpy(namebuf, ret, KSYM_NAME_LEN); ret = namebuf; } preempt_enable(); -- cgit v1.2.3 From bc6b94d3ea062454ca889884db99e145efffcb93 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:17 +0300 Subject: module: make module_memory_{alloc,free} more self-contained MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the logic related to the memory allocation and freeing into module_memory_alloc() and module_memory_free(). Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Masami Hiramatsu (Google) Acked-by: Song Liu Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 64 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..5b82b069e0d3 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type) mod_mem_type_is_core_data(type); } -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + void *ptr; + + mod->mem[type].size = size; + if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + ptr = vmalloc(size); + else + ptr = module_alloc(size); + + if (!ptr) + return -ENOMEM; + + /* + * The pointer to these blocks of memory are stored on the module + * structure and we keep that around so long as the module is + * around. We only free that memory when we unload the module. + * Just mark them as not being a leak then. The .init* ELF + * sections *do* get freed after boot so we *could* treat them + * slightly differently with kmemleak_ignore() and only grey + * them out as they work as typical memory allocations which + * *do* eventually get freed, but let's just keep things simple + * and avoid *any* false positives. + */ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { + void *ptr = mod->mem[type].base; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else @@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ @@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. - */ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t); return ret; } -- cgit v1.2.3 From 12af2b83d0b17ec8b379b721dd4a8fbcd5d791f3 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:18 +0300 Subject: mm: introduce execmem_alloc() and execmem_free() module_alloc() is used everywhere as a mean to allocate memory for code. Beside being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. Start splitting code allocation from modules by introducing execmem_alloc() and execmem_free() APIs. Initially, execmem_alloc() is a wrapper for module_alloc() and execmem_free() is a replacement of module_memfree() to allow updating all call sites to use the new APIs. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem_alloc() takes a type argument, that will be used to identify the calling subsystem and to allow architectures define parameters for ranges suitable for that subsystem. No functional changes. Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) Acked-by: Song Liu Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- arch/powerpc/kernel/kprobes.c | 6 ++--- arch/s390/kernel/ftrace.c | 4 +-- arch/s390/kernel/kprobes.c | 4 +-- arch/s390/kernel/module.c | 5 ++-- arch/sparc/net/bpf_jit_comp_32.c | 8 +++--- arch/x86/kernel/ftrace.c | 6 ++--- arch/x86/kernel/kprobes/core.c | 4 +-- include/linux/execmem.h | 57 ++++++++++++++++++++++++++++++++++++++++ include/linux/moduleloader.h | 3 --- kernel/bpf/core.c | 6 ++--- kernel/kprobes.c | 8 +++--- kernel/module/Kconfig | 1 + kernel/module/main.c | 25 ++++++------------ mm/Kconfig | 3 +++ mm/Makefile | 1 + mm/execmem.c | 32 ++++++++++++++++++++++ 16 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 include/linux/execmem.h create mode 100644 mm/execmem.c (limited to 'kernel') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbca90a5e2ec..9fcd01bb2ce6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -130,7 +130,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; @@ -142,7 +142,7 @@ void *alloc_insn_page(void) } return page; error: - module_memfree(page); + execmem_free(page); return NULL; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..798249ef5646 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,13 @@ * Author(s): Martin Schwidefsky */ -#include #include #include #include #include #include #include +#include #include #include #include @@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void) { const char *start, *end; - ftrace_plt = module_alloc(PAGE_SIZE); + ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index f0cf20d4b3c5..3c1b1be744de 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +38,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; set_memory_rox((unsigned long)page, 1); diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 42215f9404af..ac97a905e8cd 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ void *module_alloc(unsigned long size) #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM; set_memory_rox((unsigned long)start, numpages); diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index da2df1e84ed4..bda2dbd3f4c5 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include #include #include #include #include +#include #include #include @@ -713,7 +713,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpb_jit_compile fatal error\n"); kfree(addrs); - module_memfree(image); + execmem_free(image); return; } memcpy(image + proglen, temp, ilen); @@ -736,7 +736,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; break; } if (proglen == oldproglen) { - image = module_alloc(proglen); + image = execmem_alloc(EXECMEM_BPF, proglen); if (!image) goto out; } @@ -758,7 +758,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_memfree(fp->bpf_func); + execmem_free(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 70139d9d2e01..c8ddb7abda7c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -261,15 +262,14 @@ void arch_ftrace_update_code(int command) #ifdef CONFIG_X86_64 #ifdef CONFIG_MODULES -#include /* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_FTRACE, size); } static inline void tramp_free(void *tramp) { - module_memfree(tramp); + execmem_free(tramp); } #else /* Trampolines can only be created if modules are supported */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d0e49bd7c6f3..72e6a45e7ec2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -40,12 +40,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include @@ -495,7 +495,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; diff --git a/include/linux/execmem.h b/include/linux/execmem.h new file mode 100644 index 000000000000..8eebc8ef66e7 --- /dev/null +++ b/include/linux/execmem.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_EXECMEM_ALLOC_H +#define _LINUX_EXECMEM_ALLOC_H + +#include +#include + +/** + * enum execmem_type - types of executable memory ranges + * + * There are several subsystems that allocate executable memory. + * Architectures define different restrictions on placement, + * permissions, alignment and other parameters for memory that can be used + * by these subsystems. + * Types in this enum identify subsystems that allocate executable memory + * and let architectures define parameters for ranges suitable for + * allocations by each subsystem. + * + * @EXECMEM_DEFAULT: default parameters that would be used for types that + * are not explicitly defined. + * @EXECMEM_MODULE_TEXT: parameters for module text sections + * @EXECMEM_KPROBES: parameters for kprobes + * @EXECMEM_FTRACE: parameters for ftrace + * @EXECMEM_BPF: parameters for BPF + * @EXECMEM_TYPE_MAX: + */ +enum execmem_type { + EXECMEM_DEFAULT, + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT, + EXECMEM_KPROBES, + EXECMEM_FTRACE, + EXECMEM_BPF, + EXECMEM_TYPE_MAX, +}; + +/** + * execmem_alloc - allocate executable memory + * @type: type of the allocation + * @size: how many bytes of memory are required + * + * Allocates memory that will contain executable code, either generated or + * loaded from kernel modules. + * + * The memory will have protections defined by architecture for executable + * region of the @type. + * + * Return: a pointer to the allocated memory or %NULL + */ +void *execmem_alloc(enum execmem_type type, size_t size); + +/** + * execmem_free - free executable memory + * @ptr: pointer to the memory that should be freed + */ +void execmem_free(void *ptr); + +#endif /* _LINUX_EXECMEM_ALLOC_H */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 89b1e0ed9811..a3b8caee9405 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,9 +29,6 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); sections. Returns NULL on failure. */ void *module_alloc(unsigned long size); -/* Free memory returned from module_alloc. */ -void module_memfree(void *module_region); - /* Determines if the section name is an init section (that is only used during * module loading). */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1ea5ce5bb599..892e50afda59 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,7 @@ #include #include #include +#include #include #include @@ -1050,12 +1050,12 @@ void bpf_jit_uncharge_modmem(u32 size) void *__weak bpf_jit_alloc_exec(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { - module_memfree(addr); + execmem_free(addr); } struct bpf_binary_header * diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 65adc815fc6e..ddd7cdc16edf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #include #include @@ -113,17 +113,17 @@ enum kprobe_slot_state { void __weak *alloc_insn_page(void) { /* - * Use module_alloc() so this page is within +/- 2GB of where the + * Use execmem_alloc() so this page is within +/- 2GB of where the * kernel image and loaded module images reside. This is required * for most of the architectures. * (e.g. x86-64 needs this to handle the %rip-relative fixups.) */ - return module_alloc(PAGE_SIZE); + return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); } static void free_insn_page(void *page) { - module_memfree(page); + execmem_free(page); } struct kprobe_insn_cache kprobe_insn_slots = { diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index cb8377a18927..4047b6d48255 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -2,6 +2,7 @@ menuconfig MODULES bool "Enable loadable module support" modules + select EXECMEM help Kernel modules are small pieces of compiled code which can be inserted in the running kernel, rather than being diff --git a/kernel/module/main.c b/kernel/module/main.c index 5b82b069e0d3..d56b7df0cbb6 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1179,16 +1180,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -void __weak module_memfree(void *module_region) -{ - /* - * This memory may be RO, and freeing RO memory in an interrupt is not - * supported by vmalloc. - */ - WARN_ON(in_interrupt()); - vfree(module_region); -} - void __weak module_arch_cleanup(struct module *mod) { } @@ -1213,7 +1204,7 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (mod_mem_use_vmalloc(type)) ptr = vmalloc(size); else - ptr = module_alloc(size); + ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size); if (!ptr) return -ENOMEM; @@ -1244,7 +1235,7 @@ static void module_memory_free(struct module *mod, enum mod_mem_type type) if (mod_mem_use_vmalloc(type)) vfree(ptr); else - module_memfree(ptr); + execmem_free(ptr); } static void free_mod_mem(struct module *mod) @@ -2496,9 +2487,9 @@ static void do_free_init(struct work_struct *w) llist_for_each_safe(pos, n, list) { initfree = container_of(pos, struct mod_initfree, node); - module_memfree(initfree->init_text); - module_memfree(initfree->init_data); - module_memfree(initfree->init_rodata); + execmem_free(initfree->init_text); + execmem_free(initfree->init_data); + execmem_free(initfree->init_rodata); kfree(initfree); } } @@ -2608,10 +2599,10 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path. module_memfree() cannot be called in an interrupt, so do the + * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. * - * Note that module_alloc() on most architectures creates W+X page + * Note that execmem_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work by invoking diff --git a/mm/Kconfig b/mm/Kconfig index f30a18a0e37d..8bb60fdaba3c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1240,6 +1240,9 @@ config LOCK_MM_AND_FIND_VMA config IOMMU_MM_DATA bool +config EXECMEM + bool + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 4abb40b911ec..001336c91864 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -133,3 +133,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_EXECMEM) += execmem.o diff --git a/mm/execmem.c b/mm/execmem.c new file mode 100644 index 000000000000..480adc69b20d --- /dev/null +++ b/mm/execmem.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 Richard Henderson + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. + * Copyright (C) 2023 Luis Chamberlain + * Copyright (C) 2024 Mike Rapoport IBM. + */ + +#include +#include +#include +#include + +static void *__execmem_alloc(size_t size) +{ + return module_alloc(size); +} + +void *execmem_alloc(enum execmem_type type, size_t size) +{ + return __execmem_alloc(size); +} + +void execmem_free(void *ptr) +{ + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. + */ + WARN_ON(in_interrupt()); + vfree(ptr); +} -- cgit v1.2.3 From 223b5e57d0d50b0c07b933350dbcde92018d3080 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:20 +0300 Subject: mm/execmem, arch: convert remaining overrides of module_alloc to execmem Extend execmem parameters to accommodate more complex overrides of module_alloc() by architectures. This includes specification of a fallback range required by arm, arm64 and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for allocation of KASAN shadow required by s390 and x86 and support for late initialization of execmem required by arm64. The core implementation of execmem_alloc() takes care of suppressing warnings when the initial allocation fails but there is a fallback range defined. Signed-off-by: Mike Rapoport (IBM) Acked-by: Will Deacon Acked-by: Song Liu Tested-by: Liviu Dudau Signed-off-by: Luis Chamberlain --- arch/Kconfig | 8 +++++ arch/arm/kernel/module.c | 41 ++++++++++++++---------- arch/arm64/Kconfig | 1 + arch/arm64/kernel/module.c | 55 ++++++++++++++++++-------------- arch/powerpc/kernel/module.c | 60 ++++++++++++++++++++++------------- arch/s390/kernel/module.c | 54 +++++++++++++------------------- arch/x86/kernel/module.c | 70 ++++++++++++++--------------------------- include/linux/execmem.h | 30 +++++++++++++++++- include/linux/moduleloader.h | 12 ------- kernel/module/main.c | 26 ++++------------ mm/execmem.c | 74 +++++++++++++++++++++++++++++++++++++------- 11 files changed, 246 insertions(+), 185 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 93404c802d29..ee1ccbde50ef 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -977,6 +977,14 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC For architectures like powerpc/32 which have constraints on module allocation and need to allocate module data outside of module area. +config ARCH_WANTS_EXECMEM_LATE + bool + help + For architectures that do not allocate executable memory early on + boot, but rather require its initialization late when there is + enough entropy for module space randomization, for instance + arm64. + config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e74d84f58b77..a98fdf6ff26c 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,23 +35,31 @@ #endif #ifdef CONFIG_MMU -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) - return p; - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end = MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, + }; + + return &execmem_info; } #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b11c98b3e84..74b34a78b7ac 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -105,6 +105,7 @@ config ARM64 select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANTS_EXECMEM_LATE if EXECMEM select ARCH_WANTS_NO_INSTR select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES select ARCH_HAS_UBSAN diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e92da4da1b2a..b7a7a23f9f8f 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,41 +109,47 @@ static int __init module_init_limits(void) return 0; } -subsys_initcall(module_init_limits); -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - void *p = NULL; + unsigned long fallback_start = 0, fallback_end = 0; + unsigned long start = 0, end = 0; + + module_init_limits(); /* * Where possible, prefer to allocate within direct branch range of the * kernel such that no PLTs are necessary. */ if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, - module_direct_base, - module_direct_base + SZ_128M, - GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - } + start = module_direct_base; + end = module_direct_base + SZ_128M; - if (!p && module_plt_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, - module_plt_base, - module_plt_base + SZ_2G, - GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - } - - if (!p) { - pr_warn_ratelimited("%s: unable to allocate memory\n", - __func__); + if (module_plt_base) { + fallback_start = module_plt_base; + fallback_end = module_plt_base + SZ_2G; + } + } else if (module_plt_base) { + start = module_plt_base; + end = module_plt_base + SZ_2G; } - /* Memory is intended to be executable, reset the pointer tag. */ - return kasan_reset_tag(p); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = start, + .end = end, + .pgprot = PAGE_KERNEL, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, + }; + + return &execmem_info; } enum aarch64_reloc_op { diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index f6d6ae0a1692..ac80559015a3 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -89,39 +90,56 @@ int module_finalize(const Elf_Ehdr *hdr, return 0; } -static __always_inline void * -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); + unsigned long fallback_start = 0, fallback_end = 0; + unsigned long start, end; /* - * Don't do huge page allocations for modules yet until more testing - * is done. STRICT_MODULE_RWX may require extra work to support this - * too. + * BOOK3S_32 and 8xx define MODULES_VADDR for text allocations and + * allow allocating data in the entire vmalloc space */ - return __vmalloc_node_range(size, 1, start, end, gfp, prot, - VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - -void *module_alloc(unsigned long size) -{ #ifdef MODULES_VADDR unsigned long limit = (unsigned long)_etext - SZ_32M; - void *ptr = NULL; BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); /* First try within 32M limit from _etext to avoid branch trampolines */ - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) - ptr = __module_alloc(size, limit, MODULES_END, true); - - if (!ptr) - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) { + start = limit; + fallback_start = MODULES_VADDR; + fallback_end = MODULES_END; + } else { + start = MODULES_VADDR; + } - return ptr; + end = MODULES_END; #else - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); + start = VMALLOC_START; + end = VMALLOC_END; #endif + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = start, + .end = end, + .pgprot = prot, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + [EXECMEM_MODULE_DATA] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return &execmem_info; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index ac97a905e8cd..7fee64fdc1bb 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -37,41 +37,31 @@ #define PLT_ENTRY_SIZE 22 -static unsigned long get_module_load_offset(void) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - static DEFINE_MUTEX(module_kaslr_mutex); - static unsigned long module_load_offset; - - if (!kaslr_enabled()) - return 0; - /* - * Calculate the module_load_offset the first time this code - * is called. Once calculated it stays the same until reboot. - */ - mutex_lock(&module_kaslr_mutex); - if (!module_load_offset) + unsigned long module_load_offset = 0; + unsigned long start; + + if (kaslr_enabled()) module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; - mutex_unlock(&module_kaslr_mutex); - return module_load_offset; -} -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, PAGE_KERNEL, - VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - return p; + start = MODULES_VADDR + module_load_offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; } #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e18914c0e38a..45b1a7c03379 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -36,55 +37,30 @@ do { \ } while (0) #endif -#ifdef CONFIG_RANDOMIZE_BASE -static unsigned long module_load_offset; +static struct execmem_info execmem_info __ro_after_init; -/* Mutex protects the module_load_offset. */ -static DEFINE_MUTEX(module_kaslr_mutex); - -static unsigned long int get_module_load_offset(void) -{ - if (kaslr_enabled()) { - mutex_lock(&module_kaslr_mutex); - /* - * Calculate the module_load_offset the first time this - * code is called. Once calculated it stays the same until - * reboot. - */ - if (module_load_offset == 0) - module_load_offset = - get_random_u32_inclusive(1, 1024) * PAGE_SIZE; - mutex_unlock(&module_kaslr_mutex); - } - return module_load_offset; -} -#else -static unsigned long int get_module_load_offset(void) +struct execmem_info __init *execmem_arch_setup(void) { - return 0; -} -#endif - -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - - p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, PAGE_KERNEL, - VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); - - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - - return p; + unsigned long start, offset = 0; + + if (kaslr_enabled()) + offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + + start = MODULES_VADDR + offset; + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + }, + }; + + return &execmem_info; } #ifdef CONFIG_X86_32 diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 96fc59258467..32cef1144117 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -5,6 +5,14 @@ #include #include +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) +#include +#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) +#else +#define MODULE_ALIGN PAGE_SIZE +#endif + /** * enum execmem_type - types of executable memory ranges * @@ -22,6 +30,7 @@ * @EXECMEM_KPROBES: parameters for kprobes * @EXECMEM_FTRACE: parameters for ftrace * @EXECMEM_BPF: parameters for BPF + * @EXECMEM_MODULE_DATA: parameters for module data sections * @EXECMEM_TYPE_MAX: */ enum execmem_type { @@ -30,22 +39,38 @@ enum execmem_type { EXECMEM_KPROBES, EXECMEM_FTRACE, EXECMEM_BPF, + EXECMEM_MODULE_DATA, EXECMEM_TYPE_MAX, }; +/** + * enum execmem_range_flags - options for executable memory allocations + * @EXECMEM_KASAN_SHADOW: allocate kasan shadow + */ +enum execmem_range_flags { + EXECMEM_KASAN_SHADOW = (1 << 0), +}; + /** * struct execmem_range - definition of an address space suitable for code and * related data allocations * @start: address space start * @end: address space end (inclusive) + * @fallback_start: start of the secondary address space range for fallback + * allocations on architectures that require it + * @fallback_end: start of the secondary address space (inclusive) * @pgprot: permissions for memory in this address space * @alignment: alignment required for text allocations + * @flags: options for memory allocations for this range */ struct execmem_range { unsigned long start; unsigned long end; + unsigned long fallback_start; + unsigned long fallback_end; pgprot_t pgprot; unsigned int alignment; + enum execmem_range_flags flags; }; /** @@ -82,6 +107,9 @@ struct execmem_info *execmem_arch_setup(void); * Allocates memory that will contain executable code, either generated or * loaded from kernel modules. * + * Allocates memory that will contain data coupled with executable code, + * like data sections in kernel modules. + * * The memory will have protections defined by architecture for executable * region of the @type. * @@ -95,7 +123,7 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); -#ifdef CONFIG_EXECMEM +#if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE) void execmem_init(void); #else static inline void execmem_init(void) {} diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index a3b8caee9405..e395461d59e5 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -25,10 +25,6 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, /* Additional bytes needed by arch in front of individual sections */ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); -/* Allocator used for allocating struct module, core sections and init - sections. Returns NULL on failure. */ -void *module_alloc(unsigned long size); - /* Determines if the section name is an init section (that is only used during * module loading). */ @@ -126,12 +122,4 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); -#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ - !defined(CONFIG_KASAN_VMALLOC) -#include -#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) -#else -#define MODULE_ALIGN PAGE_SIZE -#endif - #endif diff --git a/kernel/module/main.c b/kernel/module/main.c index d56b7df0cbb6..91e185607d4b 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1188,24 +1188,20 @@ void __weak module_arch_freeing_init(struct module *mod) { } -static bool mod_mem_use_vmalloc(enum mod_mem_type type) -{ - return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) && - mod_mem_type_is_core_data(type); -} - static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); + enum execmem_type execmem_type; void *ptr; mod->mem[type].size = size; - if (mod_mem_use_vmalloc(type)) - ptr = vmalloc(size); + if (mod_mem_type_is_data(type)) + execmem_type = EXECMEM_MODULE_DATA; else - ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size); + execmem_type = EXECMEM_MODULE_TEXT; + ptr = execmem_alloc(execmem_type, size); if (!ptr) return -ENOMEM; @@ -1232,10 +1228,7 @@ static void module_memory_free(struct module *mod, enum mod_mem_type type) { void *ptr = mod->mem[type].base; - if (mod_mem_use_vmalloc(type)) - vfree(ptr); - else - execmem_free(ptr); + execmem_free(ptr); } static void free_mod_mem(struct module *mod) @@ -1630,13 +1623,6 @@ static void free_modinfo(struct module *mod) } } -void * __weak module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - bool __weak module_init_section(const char *name) { return strstarts(name, ".init"); diff --git a/mm/execmem.c b/mm/execmem.c index 80e61c1e7319..0c4b36bc6d10 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -12,27 +12,49 @@ #include static struct execmem_info *execmem_info __ro_after_init; +static struct execmem_info default_execmem_info __ro_after_init; static void *__execmem_alloc(struct execmem_range *range, size_t size) { + bool kasan = range->flags & EXECMEM_KASAN_SHADOW; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; unsigned long start = range->start; unsigned long end = range->end; unsigned int align = range->alignment; pgprot_t pgprot = range->pgprot; + void *p; + + if (kasan) + vm_flags |= VM_DEFER_KMEMLEAK; + + p = __vmalloc_node_range(size, align, start, end, gfp_flags, + pgprot, vm_flags, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!p && range->fallback_start) { + start = range->fallback_start; + end = range->fallback_end; + p = __vmalloc_node_range(size, align, start, end, gfp_flags, + pgprot, vm_flags, NUMA_NO_NODE, + __builtin_return_address(0)); + } + + if (!p) { + pr_warn_ratelimited("execmem: unable to allocate memory\n"); + return NULL; + } + + if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { + vfree(p); + return NULL; + } - return __vmalloc_node_range(size, align, start, end, - GFP_KERNEL, pgprot, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); + return kasan_reset_tag(p); } void *execmem_alloc(enum execmem_type type, size_t size) { - struct execmem_range *range; - - if (!execmem_info) - return module_alloc(size); - - range = &execmem_info->ranges[type]; + struct execmem_range *range = &execmem_info->ranges[type]; return __execmem_alloc(range, size); } @@ -67,10 +89,16 @@ static void execmem_init_missing(struct execmem_info *info) struct execmem_range *r = &info->ranges[i]; if (!r->start) { - r->pgprot = default_range->pgprot; + if (i == EXECMEM_MODULE_DATA) + r->pgprot = PAGE_KERNEL; + else + r->pgprot = default_range->pgprot; r->alignment = default_range->alignment; r->start = default_range->start; r->end = default_range->end; + r->flags = default_range->flags; + r->fallback_start = default_range->fallback_start; + r->fallback_end = default_range->fallback_end; } } } @@ -80,14 +108,36 @@ struct execmem_info * __weak execmem_arch_setup(void) return NULL; } -void __init execmem_init(void) +static void __init __execmem_init(void) { struct execmem_info *info = execmem_arch_setup(); - if (!info || !execmem_validate(info)) + if (!info) { + info = execmem_info = &default_execmem_info; + info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START; + info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END; + info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC; + info->ranges[EXECMEM_DEFAULT].alignment = 1; + } + + if (!execmem_validate(info)) return; execmem_init_missing(info); execmem_info = info; } + +#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE +static int __init execmem_late_init(void) +{ + __execmem_init(); + return 0; +} +core_initcall(execmem_late_init); +#else +void __init execmem_init(void) +{ + __execmem_init(); +} +#endif -- cgit v1.2.3 From 7582b7be16d0ba90e3dbd9575a730cabd9eb852a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:27 +0300 Subject: kprobes: remove dependency on CONFIG_MODULES kprobes depended on CONFIG_MODULES because it has to allocate memory for code. Since code allocations are now implemented with execmem, kprobes can be enabled in non-modular kernels. Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the dependency of CONFIG_KPROBES on CONFIG_MODULES. Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) [mcgrof: rebase in light of NEED_TASKS_RCU ] Signed-off-by: Luis Chamberlain --- arch/Kconfig | 2 +- include/linux/module.h | 9 ++++++++ kernel/kprobes.c | 55 ++++++++++++++++++++++++++++----------------- kernel/trace/trace_kprobe.c | 20 ++++++++++++++++- 4 files changed, 63 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index ee1ccbde50ef..b34946d90e4b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -60,9 +60,9 @@ config GENERIC_ENTRY config KPROBES bool "Kprobes" - depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select EXECMEM select NEED_TASKS_RCU help Kprobes allows you to trap at almost any kernel address and diff --git a/include/linux/module.h b/include/linux/module.h index 1153b0d99a80..ffa1c603163c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } +static inline bool module_is_coming(struct module *mod) +{ + return mod->state == MODULE_STATE_COMING; +} + struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) return ptr; } +static inline bool module_is_coming(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddd7cdc16edf..ca2c6cbd42d2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(&ent->list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? : arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2674,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(&kprobe_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + void kprobe_free_init_mem(void) { void *start = (void *)(&__init_begin); @@ -2731,7 +2744,7 @@ static int __init init_kprobes(void) if (!err) err = register_die_notifier(&kprobe_exceptions_nb); if (!err) - err = register_module_notifier(&kprobe_module_nb); + err = kprobe_register_module_notifier(); kprobes_initialized = (err == 0); kprobe_sysctls_init(); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14099cc17fc9..2cb2a3951b4f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } +#ifdef CONFIG_MODULES static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { char *p; @@ -129,6 +130,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) return ret; } +#else +static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) +{ + return false; +} +#endif static bool trace_kprobe_is_busy(struct dyn_event *ev) { @@ -670,6 +677,7 @@ end: return ret; } +#ifdef CONFIG_MODULES /* Module notifier call back, checking event on the module */ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -704,6 +712,16 @@ static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; +static int trace_kprobe_register_module_notifier(void) +{ + return register_module_notifier(&trace_kprobe_module_nb); +} +#else +static int trace_kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ static int count_symbols(void *data, unsigned long unused) { @@ -1933,7 +1951,7 @@ static __init int init_kprobe_trace_early(void) if (ret) return ret; - if (register_module_notifier(&trace_kprobe_module_nb)) + if (trace_kprobe_register_module_notifier()) return -EINVAL; return 0; -- cgit v1.2.3 From 2c9e5d4a008293407836d29d35dfd4353615bd2f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:28 +0300 Subject: bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF just-in-time compiler depended on CONFIG_MODULES because it used module_alloc() to allocate memory for the generated code. Since code allocations are now implemented with execmem, drop dependency of CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM. Suggested-by: Björn Töpel Signed-off-by: Mike Rapoport (IBM) Signed-off-by: Luis Chamberlain --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 4100df44c665..17067dcb4386 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded -- cgit v1.2.3 From e60b613df8b6253def41215402f72986fee3fc8d Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Fri, 10 May 2024 03:28:59 +0800 Subject: ftrace: Fix possible use-after-free issue in ftrace_location() KASAN reports a bug: BUG: KASAN: use-after-free in ftrace_location+0x90/0x120 Read of size 8 at addr ffff888141d40010 by task insmod/424 CPU: 8 PID: 424 Comm: insmod Tainted: G W 6.9.0-rc2+ [...] Call Trace: dump_stack_lvl+0x68/0xa0 print_report+0xcf/0x610 kasan_report+0xb5/0xe0 ftrace_location+0x90/0x120 register_kprobe+0x14b/0xa40 kprobe_init+0x2d/0xff0 [kprobe_example] do_one_initcall+0x8f/0x2d0 do_init_module+0x13a/0x3c0 load_module+0x3082/0x33d0 init_module_from_file+0xd2/0x130 __x64_sys_finit_module+0x306/0x440 do_syscall_64+0x68/0x140 entry_SYSCALL_64_after_hwframe+0x71/0x79 The root cause is that, in lookup_rec(), ftrace record of some address is being searched in ftrace pages of some module, but those ftrace pages at the same time is being freed in ftrace_release_mod() as the corresponding module is being deleted: CPU1 | CPU2 register_kprobes() { | delete_module() { check_kprobe_address_safe() { | arch_check_ftrace_location() { | ftrace_location() { | lookup_rec() // USE! | ftrace_release_mod() // Free! To fix this issue: 1. Hold rcu lock as accessing ftrace pages in ftrace_location_range(); 2. Use ftrace_location_range() instead of lookup_rec() in ftrace_location(); 3. Call synchronize_rcu() before freeing any ftrace pages both in ftrace_process_locs()/ftrace_release_mod()/ftrace_free_mem(). Link: https://lore.kernel.org/linux-trace-kernel/20240509192859.1273558-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Cc: Cc: Cc: Fixes: ae6aa16fdc16 ("kprobes: introduce ftrace based optimization") Suggested-by: Steven Rostedt Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5a01d72f66db..2308c0a2fd29 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1595,12 +1595,15 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct dyn_ftrace *rec; + unsigned long ip = 0; + rcu_read_lock(); rec = lookup_rec(start, end); if (rec) - return rec->ip; + ip = rec->ip; + rcu_read_unlock(); - return 0; + return ip; } /** @@ -1614,25 +1617,22 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) */ unsigned long ftrace_location(unsigned long ip) { - struct dyn_ftrace *rec; + unsigned long loc; unsigned long offset; unsigned long size; - rec = lookup_rec(ip, ip); - if (!rec) { + loc = ftrace_location_range(ip, ip); + if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) goto out; /* map sym+0 to __fentry__ */ if (!offset) - rec = lookup_rec(ip, ip + size - 1); + loc = ftrace_location_range(ip, ip + size - 1); } - if (rec) - return rec->ip; - out: - return 0; + return loc; } /** @@ -6591,6 +6591,8 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { WARN_ON(!skipped); + /* Need to synchronize with ftrace_location_range() */ + synchronize_rcu(); ftrace_free_pages(pg_unuse); } return ret; @@ -6804,6 +6806,9 @@ void ftrace_release_mod(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) + synchronize_rcu(); for (pg = tmp_page; pg; pg = tmp_page) { /* Needs to be called outside of ftrace_lock */ @@ -7137,6 +7142,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; @@ -7178,12 +7184,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - ftrace_number_of_groups--; - kfree(pg); + pg->next = tmp_page; + tmp_page = pg; pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) ftrace_pages = pg; @@ -7200,6 +7202,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) clear_func_from_hashes(func); kfree(func); } + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) { + synchronize_rcu(); + ftrace_free_pages(tmp_page); + } } void __init ftrace_free_init_mem(void) -- cgit v1.2.3 From 21c38a3bd4ee3fb7337d013a638302fb5e5f9dc2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 May 2024 16:04:11 +0200 Subject: cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints This closely resembles helpers added for the global cgroup_rstat_lock in commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock. Based on production workloads, we observe the fast-path "update" function cgroup_rstat_updated() is invoked around 3 million times per sec, while the "flush" function cgroup_rstat_flush_locked(), walking each possible CPU, can see periodic spikes of 700 invocations/sec. For this reason, the tracepoints are split into normal and fastpath versions for this per-CPU lock. Making it feasible for production to continuously monitor the non-fastpath tracepoint to detect lock contention issues. The reason for monitoring is that lock disables IRQs which can disturb e.g. softirq processing on the local CPUs involved. When the global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex), this per CPU lock becomes the next bottleneck that can introduce latency variations. A practical bpftrace script for monitoring contention latency: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_cpu_lock_contended { @start[tid]=nsecs; @cnt[probe]=count()} tracepoint:cgroup:cgroup_rstat_cpu_locked { if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);} @cnt[probe]=count()} interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}' Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Tejun Heo --- include/trace/events/cgroup.h | 56 ++++++++++++++++++++++++++++++---- kernel/cgroup/rstat.c | 70 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 108 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index 13f375800135..0b95865a90f3 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, DECLARE_EVENT_CLASS(cgroup_rstat, - TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended), + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - TP_ARGS(cgrp, cpu_in_loop, contended), + TP_ARGS(cgrp, cpu, contended), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) - __field( int, cpu_in_loop ) + __field( int, cpu ) __field( bool, contended ) ), @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat, __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; - __entry->cpu_in_loop = cpu_in_loop; + __entry->cpu = cpu; __entry->contended = contended; ), - TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d", + TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d", __entry->root, __entry->id, __entry->level, - __entry->cpu_in_loop, __entry->contended) + __entry->cpu, __entry->contended) ); +/* Related to global: cgroup_rstat_lock */ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock, TP_ARGS(cgrp, cpu, contended) ); +/* Related to per CPU: cgroup_rstat_cpu_lock */ +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath, + + TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), + + TP_ARGS(cgrp, cpu, contended) +); + #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 52e3b0ed1cee..fb8b49437573 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +/* + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @fast_path determine the + * tracepoints being added, allowing us to diagnose "flush" related + * operations without handling high-frequency fast-path "update" events. + */ +static __always_inline +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, const bool fast_path) +{ + unsigned long flags; + bool contended; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + contended = !raw_spin_trylock_irqsave(cpu_lock, flags); + if (contended) { + if (fast_path) + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); + + raw_spin_lock_irqsave(cpu_lock, flags); + } + + if (fast_path) + trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); + + return flags; +} + +static __always_inline +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, unsigned long flags, + const bool fast_path) +{ + if (fast_path) + trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); + else + trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); } /** @@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); return head; } -- cgit v1.2.3 From b9c6820f029abaabbc37646093866aa730ca0928 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 15 May 2024 01:05:58 -0400 Subject: ring-buffer: Add cast to unsigned long addr passed to virt_to_page() The sub-buffer pages are held in an unsigned long array, and when it is passed to virt_to_page() a cast is needed. Link: https://lore.kernel.org/all/20240515124808.06279d04@canb.auug.org.au/ Link: https://lore.kernel.org/linux-trace-kernel/20240515010558.4abaefdd@rorschach.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 117c39200d9d ("ring-buffer: Introducing ring-buffer mapping functions") Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a02c7a52a0f5..7345a8b625fb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6283,7 +6283,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { - struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]); + struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) { -- cgit v1.2.3 From 9ee98229083186837199912a7debb666146b8c17 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 14 May 2024 23:24:39 -0700 Subject: bpf: save extended inner map info for percpu array maps as well ARRAY_OF_MAPS and HASH_OF_MAPS map types have special logic to save a few extra fields required for correct operations of ARRAY maps, when they are used as inner maps. PERCPU_ARRAY maps have similar requirements as they now support generating inline element lookup logic. So make sure that both classes of maps are handled correctly. Reported-by: Jakub Kicinski Fixes: db69718b8efa ("bpf: inline bpf_map_lookup_elem() for PERCPU_ARRAY maps") Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240515062440.846086-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 8ef269e66ba5..b4f18c85d7bc 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -32,7 +32,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ - if (inner_map->ops == &array_map_ops) + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); @@ -68,7 +68,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; - if (inner_map->ops == &array_map_ops) { + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); -- cgit v1.2.3 From 1a7d0890dd4a502a202aaec792a6c04e6e049547 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 1 May 2024 09:29:56 -0700 Subject: kprobe/ftrace: bail out if ftrace was killed If an error happens in ftrace, ftrace_kill() will prevent disarming kprobes. Eventually, the ftrace_ops associated with the kprobes will be freed, yet the kprobes will still be active, and when triggered, they will use the freed memory, likely resulting in a page fault and panic. This behavior can be reproduced quite easily, by creating a kprobe and then triggering a ftrace_kill(). For simplicity, we can simulate an ftrace error with a kernel module like [1]: [1]: https://github.com/brenns10/kernel_stuff/tree/master/ftrace_killer sudo perf probe --add commit_creds sudo perf trace -e probe:commit_creds # In another terminal make sudo insmod ftrace_killer.ko # calls ftrace_kill(), simulating bug # Back to perf terminal # ctrl-c sudo perf probe --del commit_creds After a short period, a page fault and panic would occur as the kprobe continues to execute and uses the freed ftrace_ops. While ftrace_kill() is supposed to be used only in extreme circumstances, it is invoked in FTRACE_WARN_ON() and so there are many places where an unexpected bug could be triggered, yet the system may continue operating, possibly without the administrator noticing. If ftrace_kill() does not panic the system, then we should do everything we can to continue operating, rather than leave a ticking time bomb. Link: https://lore.kernel.org/all/20240501162956.229427-1-stephen.s.brennan@oracle.com/ Signed-off-by: Stephen Brennan Acked-by: Masami Hiramatsu (Google) Acked-by: Guo Ren Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- arch/csky/kernel/probes/ftrace.c | 3 +++ arch/loongarch/kernel/ftrace_dyn.c | 3 +++ arch/parisc/kernel/ftrace.c | 3 +++ arch/powerpc/kernel/kprobes-ftrace.c | 3 +++ arch/riscv/kernel/probes/ftrace.c | 3 +++ arch/s390/kernel/ftrace.c | 3 +++ arch/x86/kernel/kprobes/ftrace.c | 3 +++ include/linux/kprobes.h | 7 +++++++ kernel/kprobes.c | 6 ++++++ kernel/trace/ftrace.c | 1 + 10 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 834cffcfbce3..7ba4b98076de 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -12,6 +12,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; struct pt_regs *regs; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index 73858c9029cc..bff058317062 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -287,6 +287,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 621a4b386ae4..c91f9c2e61ed 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -206,6 +206,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 072ebe7f290b..f8208c027148 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct pt_regs *regs; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index 7142ec42e889..a69dfa610aa8 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -11,6 +11,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..7f6f8c438c26 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -296,6 +296,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index dd2ec14adb77..15af7e98e161 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0ff44d6633e3..5fcbc254d186 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -378,11 +378,15 @@ static inline void wait_for_kprobe_optimizer(void) { } extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); +/* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */ +extern bool kprobe_ftrace_disabled __read_mostly; +extern void kprobe_ftrace_kill(void); #else static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) { return -EINVAL; } +static inline void kprobe_ftrace_kill(void) {} #endif /* CONFIG_KPROBES_ON_FTRACE */ /* Get the kprobe at this addr (if any) - called with preemption disabled */ @@ -495,6 +499,9 @@ static inline void kprobe_flush_task(struct task_struct *tk) static inline void kprobe_free_init_mem(void) { } +static inline void kprobe_ftrace_kill(void) +{ +} static inline int disable_kprobe(struct kprobe *kp) { return -EOPNOTSUPP; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 65adc815fc6e..166ebf81dc45 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1068,6 +1068,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; +bool kprobe_ftrace_disabled; static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1136,6 +1137,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } + +void kprobe_ftrace_kill() +{ + kprobe_ftrace_disabled = true; +} #else /* !CONFIG_KPROBES_ON_FTRACE */ static inline int arm_kprobe_ftrace(struct kprobe *p) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da1710499698..96db99c347b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7895,6 +7895,7 @@ void ftrace_kill(void) ftrace_disabled = 1; ftrace_enabled = 0; ftrace_trace_function = ftrace_stub; + kprobe_ftrace_kill(); } /** -- cgit v1.2.3 From a1fd0b9d751f840df23ef0e75b691fc00cfd4743 Mon Sep 17 00:00:00 2001 From: Vitalii Bursov Date: Tue, 30 Apr 2024 18:05:23 +0300 Subject: sched/fair: Allow disabling sched_balance_newidle with sched_relax_domain_level Change relax_domain_level checks so that it would be possible to include or exclude all domains from newidle balancing. This matches the behavior described in the documentation: -1 no request. use system default or follow request of others. 0 no search. 1 search siblings (hyperthreads in a core). "2" enables levels 0 and 1, level_max excludes the last (level_max) level, and level_max+1 includes all levels. Fixes: 1d3504fcf560 ("sched, cpuset: customize sched domains, core") Signed-off-by: Vitalii Bursov Signed-off-by: Ingo Molnar Tested-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/bd6de28e80073c79466ec6401cdeae78f0d4423d.1714488502.git.vitaly@bursov.com --- kernel/cgroup/cpuset.c | 2 +- kernel/sched/topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..da24187c4e02 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2948,7 +2948,7 @@ bool current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) + if (val < -1 || val > sched_domain_level_max + 1) return -EINVAL; #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 63aecd2a7a9f..67a777b31743 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1475,7 +1475,7 @@ static void set_domain_attribute(struct sched_domain *sd, } else request = attr->relax_domain_level; - if (sd->level > request) { + if (sd->level >= request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } -- cgit v1.2.3 From 287372fa39f579a61e17b000aa74c8418d230528 Mon Sep 17 00:00:00 2001 From: Vitalii Bursov Date: Tue, 30 Apr 2024 18:05:24 +0300 Subject: sched/debug: Dump domains' level Knowing domain's level exactly can be useful when setting relax_domain_level or cpuset.sched_relax_domain_level Usage: cat /debug/sched/domains/cpu0/domain1/level to dump cpu0 domain1's level. SDM macro is not used because sd->level is 'int' and it would hide the type mismatch between 'int' and 'u32'. Signed-off-by: Vitalii Bursov Signed-off-by: Ingo Molnar Tested-by: Dietmar Eggemann Acked-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/9489b6475f6dd6fbc67c617752d4216fa094da53.1714488502.git.vitaly@bursov.com --- kernel/sched/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8d5d98a5834d..c1eb9a1afd13 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -425,6 +425,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); } void update_sched_domain_debugfs(void) -- cgit v1.2.3 From 72bffbf57c5247ac6146d1103ef42e9f8d094bc8 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Thu, 14 Mar 2024 18:59:16 -0700 Subject: sched/fair: Fix initial util_avg calculation Change se->load.weight to se_weight(se) in the calculation for the initial util_avg to avoid unnecessarily inflating the util_avg by 1024 times. The reason is that se->load.weight has the unit/scale as the scaled-up load, while cfs_rg->avg.load_avg has the unit/scale as the true task weight (as mapped directly from the task's nice/priority value). With CONFIG_32BIT, the scaled-up load is equal to the true task weight. With CONFIG_64BIT, the scaled-up load is 1024 times the true task weight. Thus, the current code may inflate the util_avg by 1024 times. The follow-up capping will not allow the util_avg value to go wild. But the calculation should have the correct logic. Signed-off-by: Dawei Li Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Vishal Chourasia Link: https://lore.kernel.org/r/20240315015916.21545-1-daweilics@gmail.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 146ecf9cc3af..900978741a81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1031,7 +1031,8 @@ void init_entity_runnable_average(struct sched_entity *se) * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -1078,7 +1079,7 @@ void post_init_entity_util_avg(struct task_struct *p) if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); sa->util_avg /= (cfs_rq->avg.load_avg + 1); if (sa->util_avg > cap) -- cgit v1.2.3 From 7cb7fb5b49399fc59f1c44686d82c0df0776c8c6 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 5 Mar 2024 15:18:20 +0000 Subject: sched/fair: Remove stale FREQUENCY_UTIL comment On 05/03/2024 15:05, Vincent Guittot wrote: I'm fine with either and that was my first thought here, too, but it did seem like the comment was mostly placed there to justify the 'unexpected' high utilization when explicitly passing FREQUENCY_UTIL and the need to clamp it then. So removing did feel slightly more natural to me anyway. So alternatively: From: Christian Loehle Date: Tue, 5 Mar 2024 09:34:41 +0000 Subject: [PATCH] sched/fair: Remove stale FREQUENCY_UTIL mention effective_cpu_util() flags were removed, so remove mentioning of the flag. commit 9c0b4bb7f6303 ("sched/cpufreq: Rework schedutil governor performance estimation") reworked effective_cpu_util() removing enum cpu_util_type. Modify the comment accordingly. Signed-off-by: Christian Loehle Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/0e2833ee-0939-44e0-82a2-520a585a0153@arm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 900978741a81..9744b5036dd8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7900,8 +7900,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. */ eff_util = effective_cpu_util(cpu, util, &min, &max); -- cgit v1.2.3 From 49217ea147df7647cb89161b805c797487783fc0 Mon Sep 17 00:00:00 2001 From: Cheng Yu Date: Wed, 24 Apr 2024 21:24:38 +0800 Subject: sched/core: Fix incorrect initialization of the 'burst' parameter in cpu_max_write() In the cgroup v2 CPU subsystem, assuming we have a cgroup named 'test', and we set cpu.max and cpu.max.burst: # echo 1000000 > /sys/fs/cgroup/test/cpu.max # echo 1000000 > /sys/fs/cgroup/test/cpu.max.burst then we check cpu.max and cpu.max.burst: # cat /sys/fs/cgroup/test/cpu.max 1000000 100000 # cat /sys/fs/cgroup/test/cpu.max.burst 1000000 Next we set cpu.max again and check cpu.max and cpu.max.burst: # echo 2000000 > /sys/fs/cgroup/test/cpu.max # cat /sys/fs/cgroup/test/cpu.max 2000000 100000 # cat /sys/fs/cgroup/test/cpu.max.burst 1000 ... we find that the cpu.max.burst value changed unexpectedly. In cpu_max_write(), the unit of the burst value returned by tg_get_cfs_burst() is microseconds, while in cpu_max_write(), the burst unit used for calculation should be nanoseconds, which leads to the bug. To fix it, get the burst value directly from tg->cfs_bandwidth.burst. Fixes: f4183717b370 ("sched/fair: Introduce the burstable CFS controller") Reported-by: Qixin Liao Signed-off-by: Cheng Yu Signed-off-by: Zhang Qiao Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Tested-by: Vincent Guittot Link: https://lore.kernel.org/r/20240424132438.514720-1-serein.chengyu@huawei.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a914388144a..f88f50552acb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11402,7 +11402,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); - u64 burst = tg_get_cfs_burst(tg); + u64 burst = tg->cfs_bandwidth.burst; u64 quota; int ret; -- cgit v1.2.3 From 4b377b4868ef17b040065bd468668c707d2477a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 17 May 2024 19:17:55 -0700 Subject: kprobe/ftrace: fix build error due to bad function definition Commit 1a7d0890dd4a ("kprobe/ftrace: bail out if ftrace was killed") introduced a bad K&R function definition, which we haven't accepted in a long long time. Gcc seems to let it slide, but clang notices with the appropriate error: kernel/kprobes.c:1140:24: error: a function declaration without a prototype is deprecated in all > 1140 | void kprobe_ftrace_kill() | ^ | void but this commit was apparently never in linux-next before it was sent upstream, so it didn't get the appropriate build test coverage. Fixes: 1a7d0890dd4a kprobe/ftrace: bail out if ftrace was killed Cc: Stephen Brennan Cc: Masami Hiramatsu (Google) Cc: Guo Ren Cc: Steven Rostedt (Google) Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f2cea3c80cb5..6a76a8100073 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1137,7 +1137,7 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } -void kprobe_ftrace_kill() +void kprobe_ftrace_kill(void) { kprobe_ftrace_disabled = true; } -- cgit v1.2.3 From ea70a9628e93168bb5a07b35f7b9a162a65afe9e Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 17 May 2024 15:40:07 +0200 Subject: ring-buffer: Correct stale comments related to non-consuming readers Adjust the following code documentation: * Kernel-doc comments for ring_buffer_read_prepare() and ring_buffer_read_finish() mention that recording to the ring buffer is disabled when the read is active. Remove mention of this restriction because it was already lifted in commit 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator"). * Function ring_buffer_read_finish() performs a self-check of the ring-buffer by locking cpu_buffer->reader_lock and then calling rb_check_pages(). The preceding comment explains that the lock is needed because rb_check_pages() clears the HEAD flag required by readers which might be running in parallel. Remove this explanation because commit 8843e06f67b1 ("ring-buffer: Handle race between rb_move_tail and rb_check_pages") simplified the function so it no longer resets the mentioned flag. Nonetheless, the lock is still needed because a reader swapping a page into the ring buffer can make the underlying doubly-linked list temporarily inconsistent. This is a non-functional change. Link: https://lore.kernel.org/linux-trace-kernel/20240517134008.24529-2-petr.pavlu@suse.com Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7345a8b625fb..42227727a49d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5046,13 +5046,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer recording + * through the buffer. Memory is allocated, buffer resizing * is disabled, and the iterator pointer is returned to the caller. * - * Disabling buffer recording prevents the reading from being - * corrupted. This is not a consuming read, so a producer is not - * expected. - * * After a sequence of ring_buffer_read_prepare calls, the user is * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going @@ -5139,8 +5135,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_start); * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * - * This re-enables the recording to the buffer, and frees the - * iterator. + * This re-enables resizing of the buffer, and frees the iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) @@ -5148,12 +5143,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - /* - * Ring buffer is disabled from recording, here's a good place - * to check the integrity of the ring buffer. - * Must prevent readers from trying to read, as the check - * clears the HEAD page and readers require it. - */ + /* Use this opportunity to check the integrity of the ring buffer. */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3 From c2274b908db05529980ec056359fae916939fdaa Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 17 May 2024 15:40:08 +0200 Subject: ring-buffer: Fix a race between readers and resize checks The reader code in rb_get_reader_page() swaps a new reader page into the ring buffer by doing cmpxchg on old->list.prev->next to point it to the new page. Following that, if the operation is successful, old->list.next->prev gets updated too. This means the underlying doubly-linked list is temporarily inconsistent, page->prev->next or page->next->prev might not be equal back to page for some page in the ring buffer. The resize operation in ring_buffer_resize() can be invoked in parallel. It calls rb_check_pages() which can detect the described inconsistency and stop further tracing: [ 190.271762] ------------[ cut here ]------------ [ 190.271771] WARNING: CPU: 1 PID: 6186 at kernel/trace/ring_buffer.c:1467 rb_check_pages.isra.0+0x6a/0xa0 [ 190.271789] Modules linked in: [...] [ 190.271991] Unloaded tainted modules: intel_uncore_frequency(E):1 skx_edac(E):1 [ 190.272002] CPU: 1 PID: 6186 Comm: cmd.sh Kdump: loaded Tainted: G E 6.9.0-rc6-default #5 158d3e1e6d0b091c34c3b96bfd99a1c58306d79f [ 190.272011] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552c-rebuilt.opensuse.org 04/01/2014 [ 190.272015] RIP: 0010:rb_check_pages.isra.0+0x6a/0xa0 [ 190.272023] Code: [...] [ 190.272028] RSP: 0018:ffff9c37463abb70 EFLAGS: 00010206 [ 190.272034] RAX: ffff8eba04b6cb80 RBX: 0000000000000007 RCX: ffff8eba01f13d80 [ 190.272038] RDX: ffff8eba01f130c0 RSI: ffff8eba04b6cd00 RDI: ffff8eba0004c700 [ 190.272042] RBP: ffff8eba0004c700 R08: 0000000000010002 R09: 0000000000000000 [ 190.272045] R10: 00000000ffff7f52 R11: ffff8eba7f600000 R12: ffff8eba0004c720 [ 190.272049] R13: ffff8eba00223a00 R14: 0000000000000008 R15: ffff8eba067a8000 [ 190.272053] FS: 00007f1bd64752c0(0000) GS:ffff8eba7f680000(0000) knlGS:0000000000000000 [ 190.272057] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 190.272061] CR2: 00007f1bd6662590 CR3: 000000010291e001 CR4: 0000000000370ef0 [ 190.272070] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 190.272073] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 190.272077] Call Trace: [ 190.272098] [ 190.272189] ring_buffer_resize+0x2ab/0x460 [ 190.272199] __tracing_resize_ring_buffer.part.0+0x23/0xa0 [ 190.272206] tracing_resize_ring_buffer+0x65/0x90 [ 190.272216] tracing_entries_write+0x74/0xc0 [ 190.272225] vfs_write+0xf5/0x420 [ 190.272248] ksys_write+0x67/0xe0 [ 190.272256] do_syscall_64+0x82/0x170 [ 190.272363] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 190.272373] RIP: 0033:0x7f1bd657d263 [ 190.272381] Code: [...] [ 190.272385] RSP: 002b:00007ffe72b643f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 190.272391] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f1bd657d263 [ 190.272395] RDX: 0000000000000002 RSI: 0000555a6eb538e0 RDI: 0000000000000001 [ 190.272398] RBP: 0000555a6eb538e0 R08: 000000000000000a R09: 0000000000000000 [ 190.272401] R10: 0000555a6eb55190 R11: 0000000000000246 R12: 00007f1bd6662500 [ 190.272404] R13: 0000000000000002 R14: 00007f1bd6667c00 R15: 0000000000000002 [ 190.272412] [ 190.272414] ---[ end trace 0000000000000000 ]--- Note that ring_buffer_resize() calls rb_check_pages() only if the parent trace_buffer has recording disabled. Recent commit d78ab792705c ("tracing: Stop current tracer when resizing buffer") causes that it is now always the case which makes it more likely to experience this issue. The window to hit this race is nonetheless very small. To help reproducing it, one can add a delay loop in rb_get_reader_page(): ret = rb_head_page_replace(reader, cpu_buffer->reader_page); if (!ret) goto spin; for (unsigned i = 0; i < 1U << 26; i++) /* inserted delay loop */ __asm__ __volatile__ ("" : : : "memory"); rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; .. and then run the following commands on the target system: echo 1 > /sys/kernel/tracing/events/sched/sched_switch/enable while true; do echo 16 > /sys/kernel/tracing/buffer_size_kb; sleep 0.1 echo 8 > /sys/kernel/tracing/buffer_size_kb; sleep 0.1 done & while true; do for i in /sys/kernel/tracing/per_cpu/*; do timeout 0.1 cat $i/trace_pipe; sleep 0.2 done done To fix the problem, make sure ring_buffer_resize() doesn't invoke rb_check_pages() concurrently with a reader operating on the same ring_buffer_per_cpu by taking its cpu_buffer->reader_lock. Link: https://lore.kernel.org/linux-trace-kernel/20240517134008.24529-3-petr.pavlu@suse.com Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 659f451ff213 ("ring-buffer: Add integrity check at end of iter read") Signed-off-by: Petr Pavlu [ Fixed whitespace ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 42227727a49d..28853966aa9a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1460,6 +1460,11 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, * * As a safety measure we check to make sure the data pages have not * been corrupted. + * + * Callers of this function need to guarantee that the list of pages doesn't get + * modified during the check. In particular, if it's possible that the function + * is invoked with concurrent readers which can swap in a new reader page then + * the caller should take cpu_buffer->reader_lock. */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { @@ -2210,8 +2215,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { + unsigned long flags; + cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } atomic_dec(&buffer->record_disabled); } -- cgit v1.2.3 From 23748e3e0fbfe471eff5ce439921629f6a427828 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sat, 18 May 2024 15:54:49 -0700 Subject: tracing: Add MODULE_DESCRIPTION() to preemptirq_delay_test Fix the 'make W=1' warning: WARNING: modpost: missing MODULE_DESCRIPTION() in kernel/trace/preemptirq_delay_test.o Link: https://lore.kernel.org/linux-trace-kernel/20240518-md-preemptirq_delay_test-v1-1-387d11b30d85@quicinc.com Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Fixes: f96e8577da10 ("lib: Add module for testing preemptoff/irqsoff latency tracers") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jeff Johnson Signed-off-by: Steven Rostedt (Google) --- kernel/trace/preemptirq_delay_test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 8c4ffd076162..cb0871fbdb07 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -215,4 +215,5 @@ static void __exit preemptirq_delay_exit(void) module_init(preemptirq_delay_init) module_exit(preemptirq_delay_exit) +MODULE_DESCRIPTION("Preempt / IRQ disable delay thread to test latency tracers"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 1e8b7b3dbb3103d577a586ca72bc329f7b67120b Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 20 May 2024 13:42:39 +0800 Subject: rv: Update rv_en(dis)able_monitor doc to match kernel-doc The patch updates the function documentation comment for rv_en(dis)able_monitor to adhere to the kernel-doc specification. Link: https://lore.kernel.org/linux-trace-kernel/20240520054239.61784-1-yang.lee@linux.alibaba.com Fixes: 102227b970a15 ("rv: Add Runtime Verification (RV) interface") Signed-off-by: Yang Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 2f68e93fff0b..df0745a42a3f 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -245,6 +245,7 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) /** * rv_disable_monitor - disable a given runtime monitor + * @mdef: Pointer to the monitor definition structure. * * Returns 0 on success. */ @@ -256,6 +257,7 @@ int rv_disable_monitor(struct rv_monitor_def *mdef) /** * rv_enable_monitor - enable a given runtime monitor + * @mdef: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. */ -- cgit v1.2.3 From db5247d9bf5c6ade9fd70b4e4897441e0269b233 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 15 Mar 2024 19:47:06 -0500 Subject: vhost_task: Handle SIGKILL by flushing work and exiting Instead of lingering until the device is closed, this has us handle SIGKILL by: 1. marking the worker as killed so we no longer try to use it with new virtqueues and new flush operations. 2. setting the virtqueue to worker mapping so no new works are queued. 3. running all the exiting works. Suggested-by: Edward Adam Davis Reported-and-tested-by: syzbot+98edc2df894917b3431f@syzkaller.appspotmail.com Message-Id: Signed-off-by: Mike Christie Message-Id: <20240316004707.45557-9-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 54 +++++++++++++++++++++++++++++++++++++--- drivers/vhost/vhost.h | 2 ++ include/linux/sched/vhost_task.h | 3 ++- kernel/vhost_task.c | 53 +++++++++++++++++++++++++-------------- 4 files changed, 88 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c6448ff37768..b60955682474 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -273,7 +273,7 @@ static void __vhost_worker_flush(struct vhost_worker *worker) { struct vhost_flush_struct flush; - if (!worker->attachment_cnt) + if (!worker->attachment_cnt || worker->killed) return; init_completion(&flush.wait_event); @@ -388,7 +388,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } -static bool vhost_worker(void *data) +static bool vhost_run_work_list(void *data) { struct vhost_worker *worker = data; struct vhost_work *work, *work_next; @@ -413,6 +413,40 @@ static bool vhost_worker(void *data) return !!node; } +static void vhost_worker_killed(void *data) +{ + struct vhost_worker *worker = data; + struct vhost_dev *dev = worker->dev; + struct vhost_virtqueue *vq; + int i, attach_cnt = 0; + + mutex_lock(&worker->mutex); + worker->killed = true; + + for (i = 0; i < dev->nvqs; i++) { + vq = dev->vqs[i]; + + mutex_lock(&vq->mutex); + if (worker == + rcu_dereference_check(vq->worker, + lockdep_is_held(&vq->mutex))) { + rcu_assign_pointer(vq->worker, NULL); + attach_cnt++; + } + mutex_unlock(&vq->mutex); + } + + worker->attachment_cnt -= attach_cnt; + if (attach_cnt) + synchronize_rcu(); + /* + * Finish vhost_worker_flush calls and any other works that snuck in + * before the synchronize_rcu. + */ + vhost_run_work_list(worker); + mutex_unlock(&worker->mutex); +} + static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) { kfree(vq->indirect); @@ -627,9 +661,11 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) if (!worker) return NULL; + worker->dev = dev; snprintf(name, sizeof(name), "vhost-%d", current->pid); - vtsk = vhost_task_create(vhost_worker, worker, name); + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, + worker, name); if (!vtsk) goto free_worker; @@ -661,6 +697,11 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, struct vhost_worker *old_worker; mutex_lock(&worker->mutex); + if (worker->killed) { + mutex_unlock(&worker->mutex); + return; + } + mutex_lock(&vq->mutex); old_worker = rcu_dereference_check(vq->worker, @@ -681,6 +722,11 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, * device wide flushes which doesn't use RCU for execution. */ mutex_lock(&old_worker->mutex); + if (old_worker->killed) { + mutex_unlock(&old_worker->mutex); + return; + } + /* * We don't want to call synchronize_rcu for every vq during setup * because it will slow down VM startup. If we haven't done @@ -758,7 +804,7 @@ static int vhost_free_worker(struct vhost_dev *dev, return -ENODEV; mutex_lock(&worker->mutex); - if (worker->attachment_cnt) { + if (worker->attachment_cnt || worker->killed) { mutex_unlock(&worker->mutex); return -EBUSY; } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 91ade037f08e..bb75a292d50c 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -28,12 +28,14 @@ struct vhost_work { struct vhost_worker { struct vhost_task *vtsk; + struct vhost_dev *dev; /* Used to serialize device wide flushing with worker swapping. */ struct mutex mutex; struct llist_head work_list; u64 kcov_handle; u32 id; int attachment_cnt; + bool killed; }; /* Poll a file (eventfd or socket) */ diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h index bc60243d43b3..25446c5d3508 100644 --- a/include/linux/sched/vhost_task.h +++ b/include/linux/sched/vhost_task.h @@ -4,7 +4,8 @@ struct vhost_task; -struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, +struct vhost_task *vhost_task_create(bool (*fn)(void *), + void (*handle_kill)(void *), void *arg, const char *name); void vhost_task_start(struct vhost_task *vtsk); void vhost_task_stop(struct vhost_task *vtsk); diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index da35e5b7f047..8800f5acc007 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -10,38 +10,32 @@ enum vhost_task_flags { VHOST_TASK_FLAGS_STOP, + VHOST_TASK_FLAGS_KILLED, }; struct vhost_task { bool (*fn)(void *data); + void (*handle_sigkill)(void *data); void *data; struct completion exited; unsigned long flags; struct task_struct *task; + /* serialize SIGKILL and vhost_task_stop calls */ + struct mutex exit_mutex; }; static int vhost_task_fn(void *data) { struct vhost_task *vtsk = data; - bool dead = false; for (;;) { bool did_work; - if (!dead && signal_pending(current)) { + if (signal_pending(current)) { struct ksignal ksig; - /* - * Calling get_signal will block in SIGSTOP, - * or clear fatal_signal_pending, but remember - * what was set. - * - * This thread won't actually exit until all - * of the file descriptors are closed, and - * the release function is called. - */ - dead = get_signal(&ksig); - if (dead) - clear_thread_flag(TIF_SIGPENDING); + + if (get_signal(&ksig)) + break; } /* mb paired w/ vhost_task_stop */ @@ -57,7 +51,19 @@ static int vhost_task_fn(void *data) schedule(); } + mutex_lock(&vtsk->exit_mutex); + /* + * If a vhost_task_stop and SIGKILL race, we can ignore the SIGKILL. + * When the vhost layer has called vhost_task_stop it's already stopped + * new work and flushed. + */ + if (!test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) { + set_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags); + vtsk->handle_sigkill(vtsk->data); + } + mutex_unlock(&vtsk->exit_mutex); complete(&vtsk->exited); + do_exit(0); } @@ -78,12 +84,17 @@ EXPORT_SYMBOL_GPL(vhost_task_wake); * @vtsk: vhost_task to stop * * vhost_task_fn ensures the worker thread exits after - * VHOST_TASK_FLAGS_SOP becomes true. + * VHOST_TASK_FLAGS_STOP becomes true. */ void vhost_task_stop(struct vhost_task *vtsk) { - set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); - vhost_task_wake(vtsk); + mutex_lock(&vtsk->exit_mutex); + if (!test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags)) { + set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); + vhost_task_wake(vtsk); + } + mutex_unlock(&vtsk->exit_mutex); + /* * Make sure vhost_task_fn is no longer accessing the vhost_task before * freeing it below. @@ -96,14 +107,16 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); /** * vhost_task_create - create a copy of a task to be used by the kernel * @fn: vhost worker function - * @arg: data to be passed to fn + * @handle_sigkill: vhost function to handle when we are killed + * @arg: data to be passed to fn and handled_kill * @name: the thread's name * * This returns a specialized task for use by the vhost layer or NULL on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ -struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, +struct vhost_task *vhost_task_create(bool (*fn)(void *), + void (*handle_sigkill)(void *), void *arg, const char *name) { struct kernel_clone_args args = { @@ -122,8 +135,10 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, if (!vtsk) return NULL; init_completion(&vtsk->exited); + mutex_init(&vtsk->exit_mutex); vtsk->data = arg; vtsk->fn = fn; + vtsk->handle_sigkill = handle_sigkill; args.fn_arg = vtsk; -- cgit v1.2.3 From 240a1853b4d2bce51e5cac9ba65cd646152ab6d6 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 15 Mar 2024 19:47:07 -0500 Subject: kernel: Remove signal hacks for vhost_tasks This removes the signal/coredump hacks added for vhost_tasks in: Commit f9010dbdce91 ("fork, vhost: Use CLONE_THREAD to fix freezer/ps regression") When that patch was added vhost_tasks did not handle SIGKILL and would try to ignore/clear the signal and continue on until the device's close function was called. In the previous patches vhost_tasks and the vhost drivers were converted to support SIGKILL by cleaning themselves up and exiting. The hacks are no longer needed so this removes them. Signed-off-by: Mike Christie Message-Id: <20240316004707.45557-10-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- fs/coredump.c | 4 +--- kernel/exit.c | 5 +---- kernel/signal.c | 4 +--- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/fs/coredump.c b/fs/coredump.c index be6403b4b14b..8eae24afb3cb 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -371,9 +371,7 @@ static int zap_process(struct task_struct *start, int exit_code) if (t != current && !(t->flags & PF_POSTCOREDUMP)) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - /* The vhost_worker does not particpate in coredumps */ - if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER) - nr++; + nr++; } } diff --git a/kernel/exit.c b/kernel/exit.c index 41a12630cbbc..fca3a3234954 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -414,10 +414,7 @@ static void coredump_task_exit(struct task_struct *tsk) tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); - - /* The vhost_worker does not particpate in coredumps */ - if (core_state && - ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { + if (core_state) { struct core_thread self; self.task = current; diff --git a/kernel/signal.c b/kernel/signal.c index 7bdbcf1b78d0..41d5cbccab2a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1375,9 +1375,7 @@ int zap_other_threads(struct task_struct *p) for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - /* Don't require de_thread to wait for the vhost_worker */ - if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) - count++; + count++; /* Don't bother with already dead threads */ if (t->exit_state) -- cgit v1.2.3 From 2c92ca849fcc6ee7d0c358e9959abc9f58661aea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 16 May 2024 13:34:54 -0400 Subject: tracing/treewide: Remove second parameter of __assign_str() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the rework of how the __string() handles dynamic strings where it saves off the source string in field in the helper structure[1], the assignment of that value to the trace event field is stored in the helper value and does not need to be passed in again. This means that with: __string(field, mystring) Which use to be assigned with __assign_str(field, mystring), no longer needs the second parameter and it is unused. With this, __assign_str() will now only get a single parameter. There's over 700 users of __assign_str() and because coccinelle does not handle the TRACE_EVENT() macro I ended up using the following sed script: git grep -l __assign_str | while read a ; do sed -e 's/\(__assign_str([^,]*[^ ,]\) *,[^;]*/\1)/' $a > /tmp/test-file; mv /tmp/test-file $a; done I then searched for __assign_str() that did not end with ';' as those were multi line assignments that the sed script above would fail to catch. Note, the same updates will need to be done for: __assign_str_len() __assign_rel_str() __assign_rel_str_len() I tested this with both an allmodconfig and an allyesconfig (build only for both). [1] https://lore.kernel.org/linux-trace-kernel/20240222211442.634192653@goodmis.org/ Link: https://lore.kernel.org/linux-trace-kernel/20240516133454.681ba6a0@rorschach.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Julia Lawall Signed-off-by: Steven Rostedt (Google) Acked-by: Jani Nikula Acked-by: Christian König for the amdgpu parts. Acked-by: Thomas Hellström #for Acked-by: Rafael J. Wysocki # for thermal Acked-by: Takashi Iwai Acked-by: Darrick J. Wong # xfs Tested-by: Guenter Roeck --- arch/arm64/kernel/trace-events-emulation.h | 2 +- arch/powerpc/include/asm/trace.h | 4 +- arch/x86/kvm/trace.h | 2 +- drivers/base/regmap/trace.h | 18 ++-- drivers/base/trace.h | 2 +- drivers/block/rnbd/rnbd-srv-trace.h | 12 +-- drivers/bus/mhi/host/trace.h | 12 +-- drivers/cxl/core/trace.h | 32 +++--- drivers/dma-buf/sync_trace.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 16 +-- .../drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h | 2 +- drivers/gpu/drm/i915/display/intel_display_trace.h | 56 +++++----- drivers/gpu/drm/lima/lima_trace.h | 2 +- drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h | 12 +-- drivers/gpu/drm/scheduler/gpu_scheduler_trace.h | 4 +- drivers/gpu/drm/virtio/virtgpu_trace.h | 2 +- drivers/infiniband/core/cma_trace.h | 4 +- drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/trace_dbg.h | 2 +- drivers/infiniband/hw/hfi1/trace_rx.h | 2 +- drivers/infiniband/hw/hfi1/trace_tid.h | 4 +- drivers/infiniband/hw/hfi1/trace_tx.h | 4 +- drivers/infiniband/sw/rdmavt/trace.h | 2 +- drivers/infiniband/sw/rdmavt/trace_rvt.h | 2 +- drivers/interconnect/trace.h | 10 +- drivers/iommu/intel/trace.h | 14 +-- drivers/media/platform/nvidia/tegra-vde/trace.h | 2 +- drivers/misc/mei/mei-trace.h | 6 +- drivers/net/dsa/mv88e6xxx/trace.h | 4 +- .../net/ethernet/freescale/dpaa/dpaa_eth_trace.h | 2 +- .../net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h | 4 +- .../net/ethernet/fungible/funeth/funeth_trace.h | 6 +- drivers/net/ethernet/hisilicon/hns3/hns3_trace.h | 4 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_trace.h | 12 +-- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h | 10 +- drivers/net/ethernet/intel/i40e/i40e_trace.h | 10 +- drivers/net/ethernet/intel/iavf/iavf_trace.h | 6 +- drivers/net/ethernet/intel/ice/ice_trace.h | 12 +-- .../net/ethernet/marvell/octeontx2/af/rvu_trace.h | 12 +-- .../mellanox/mlx5/core/diag/cmd_tracepoint.h | 4 +- .../mellanox/mlx5/core/diag/en_rep_tracepoint.h | 2 +- .../mellanox/mlx5/core/diag/en_tc_tracepoint.h | 2 +- .../mellanox/mlx5/core/diag/fw_tracer_tracepoint.h | 5 +- .../mellanox/mlx5/core/esw/diag/qos_tracepoint.h | 8 +- .../mlx5/core/sf/dev/diag/dev_tracepoint.h | 2 +- .../mellanox/mlx5/core/sf/diag/sf_tracepoint.h | 14 +-- .../mellanox/mlx5/core/sf/diag/vhca_tracepoint.h | 2 +- drivers/net/fjes/fjes_trace.h | 10 +- drivers/net/hyperv/netvsc_trace.h | 8 +- drivers/net/wireless/ath/ath10k/trace.h | 64 +++++------ drivers/net/wireless/ath/ath11k/trace.h | 44 ++++---- drivers/net/wireless/ath/ath12k/trace.h | 16 +-- drivers/net/wireless/ath/ath6kl/trace.h | 4 +- drivers/net/wireless/ath/trace.h | 4 +- .../broadcom/brcm80211/brcmfmac/tracepoint.h | 4 +- .../brcm80211/brcmsmac/brcms_trace_brcmsmac.h | 2 +- .../brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h | 2 +- .../brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h | 6 +- .../net/wireless/intel/iwlwifi/iwl-devtrace-msg.h | 2 +- drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h | 2 +- drivers/soc/qcom/pmic_pdcharger_ulog.h | 2 +- drivers/soc/qcom/trace-aoss.h | 4 +- drivers/soc/qcom/trace-rpmh.h | 4 +- drivers/thermal/thermal_trace.h | 10 +- drivers/usb/cdns3/cdns3-trace.h | 26 ++--- drivers/usb/cdns3/cdnsp-trace.h | 10 +- drivers/usb/chipidea/trace.h | 4 +- drivers/usb/dwc3/trace.h | 8 +- drivers/usb/gadget/udc/cdns2/cdns2-trace.h | 22 ++-- drivers/usb/gadget/udc/trace.h | 4 +- drivers/usb/mtu3/mtu3_trace.h | 8 +- drivers/usb/musb/musb_trace.h | 12 +-- fs/bcachefs/trace.h | 6 +- fs/nfs/nfs4trace.h | 42 ++++---- fs/nfs/nfstrace.h | 41 ++++--- fs/nfsd/trace.h | 40 +++---- fs/ocfs2/ocfs2_trace.h | 60 +++++------ fs/smb/client/trace.h | 18 ++-- fs/xfs/scrub/trace.h | 10 +- fs/xfs/xfs_trace.h | 28 ++--- include/ras/ras_event.h | 12 +-- include/trace/events/asoc.h | 22 ++-- include/trace/events/avc.h | 6 +- include/trace/events/bridge.h | 16 +-- include/trace/events/btrfs.h | 6 +- include/trace/events/cgroup.h | 10 +- include/trace/events/clk.h | 18 ++-- include/trace/events/cma.h | 8 +- include/trace/events/devfreq.h | 4 +- include/trace/events/devlink.h | 50 ++++----- include/trace/events/dma_fence.h | 4 +- include/trace/events/erofs.h | 2 +- include/trace/events/f2fs.h | 20 ++-- include/trace/events/habanalabs.h | 10 +- include/trace/events/huge_memory.h | 4 +- include/trace/events/hwmon.h | 6 +- include/trace/events/initcall.h | 2 +- include/trace/events/intel_ish.h | 2 +- include/trace/events/io_uring.h | 14 +-- include/trace/events/iocost.h | 14 +-- include/trace/events/iommu.h | 8 +- include/trace/events/irq.h | 2 +- include/trace/events/iscsi.h | 2 +- include/trace/events/kmem.h | 2 +- include/trace/events/lock.h | 4 +- include/trace/events/mmap_lock.h | 4 +- include/trace/events/mmc.h | 4 +- include/trace/events/module.h | 8 +- include/trace/events/napi.h | 2 +- include/trace/events/neigh.h | 6 +- include/trace/events/net.h | 12 +-- include/trace/events/netlink.h | 2 +- include/trace/events/oom.h | 2 +- include/trace/events/osnoise.h | 2 +- include/trace/events/power.h | 23 ++-- include/trace/events/pwc.h | 4 +- include/trace/events/qdisc.h | 12 +-- include/trace/events/qla.h | 2 +- include/trace/events/qrtr.h | 2 +- include/trace/events/regulator.h | 6 +- include/trace/events/rpcgss.h | 20 ++-- include/trace/events/rpcrdma.h | 52 ++++----- include/trace/events/rpm.h | 6 +- include/trace/events/sched.h | 8 +- include/trace/events/sof.h | 12 +-- include/trace/events/sof_intel.h | 16 +-- include/trace/events/sunrpc.h | 118 ++++++++++----------- include/trace/events/swiotlb.h | 2 +- include/trace/events/target.h | 4 +- include/trace/events/tegra_apb_dma.h | 6 +- include/trace/events/ufs.h | 24 ++--- include/trace/events/workqueue.h | 2 +- include/trace/events/xdp.h | 2 +- include/trace/stages/stage6_event_callback.h | 4 +- kernel/trace/bpf_trace.h | 2 +- net/batman-adv/trace.h | 4 +- net/dsa/trace.h | 34 +++--- net/ieee802154/trace.h | 2 +- net/mac80211/trace.h | 2 +- net/openvswitch/openvswitch_trace.h | 8 +- net/smc/smc_tracepoint.h | 4 +- net/tipc/trace.h | 16 +-- net/wireless/trace.h | 2 +- samples/trace_events/trace-events-sample.h | 19 ++-- sound/core/pcm_trace.h | 2 +- sound/hda/trace.h | 6 +- sound/soc/intel/avs/trace.h | 4 +- 147 files changed, 794 insertions(+), 808 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/trace-events-emulation.h b/arch/arm64/kernel/trace-events-emulation.h index 6c40f58b844a..c51b547b583e 100644 --- a/arch/arm64/kernel/trace-events-emulation.h +++ b/arch/arm64/kernel/trace-events-emulation.h @@ -18,7 +18,7 @@ TRACE_EVENT(instruction_emulation, ), TP_fast_assign( - __assign_str(instr, instr); + __assign_str(instr); __entry->addr = addr; ), diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index d9ac3a4f46e1..a7b69b25296b 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -137,7 +137,7 @@ TRACE_EVENT(rtas_input, TP_fast_assign( __entry->nargs = be32_to_cpu(rtas_args->nargs); - __assign_str(name, name); + __assign_str(name); be32_to_cpu_array(__get_dynamic_array(inputs), rtas_args->args, __entry->nargs); ), @@ -162,7 +162,7 @@ TRACE_EVENT(rtas_output, TP_fast_assign( __entry->nr_other = be32_to_cpu(rtas_args->nret) - 1; __entry->status = be32_to_cpu(rtas_args->rets[0]); - __assign_str(name, name); + __assign_str(name); be32_to_cpu_array(__get_dynamic_array(other_outputs), &rtas_args->rets[1], __entry->nr_other); ), diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 9d0b02ef307e..e19fed438a67 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1678,7 +1678,7 @@ TRACE_EVENT(kvm_nested_vmenter_failed, ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); __entry->err = err; ), diff --git a/drivers/base/regmap/trace.h b/drivers/base/regmap/trace.h index 704e106e5dbd..bcc5a8b226a6 100644 --- a/drivers/base/regmap/trace.h +++ b/drivers/base/regmap/trace.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(regmap_reg, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); + __assign_str(name); __entry->reg = reg; __entry->val = val; ), @@ -74,7 +74,7 @@ DECLARE_EVENT_CLASS(regmap_bulk, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); + __assign_str(name); __entry->reg = reg; __entry->val_len = val_len; memcpy(__get_dynamic_array(buf), val, val_len); @@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(regmap_block, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); + __assign_str(name); __entry->reg = reg; __entry->count = count; ), @@ -163,9 +163,9 @@ TRACE_EVENT(regcache_sync, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); - __assign_str(status, status); - __assign_str(type, type); + __assign_str(name); + __assign_str(status); + __assign_str(type); ), TP_printk("%s type=%s status=%s", __get_str(name), @@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(regmap_bool, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); + __assign_str(name); __entry->flag = flag; ), @@ -216,7 +216,7 @@ DECLARE_EVENT_CLASS(regmap_async, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); + __assign_str(name); ), TP_printk("%s", __get_str(name)) @@ -264,7 +264,7 @@ TRACE_EVENT(regcache_drop_region, ), TP_fast_assign( - __assign_str(name, regmap_name(map)); + __assign_str(name); __entry->from = from; __entry->to = to; ), diff --git a/drivers/base/trace.h b/drivers/base/trace.h index 3192e18f877e..e52b6eae060d 100644 --- a/drivers/base/trace.h +++ b/drivers/base/trace.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(devres, __field(size_t, size) ), TP_fast_assign( - __assign_str(devname, dev_name(dev)); + __assign_str(devname); __entry->op = op; __entry->node = node; __entry->name = name; diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h index 8dedf73bdd28..89d0bcb17195 100644 --- a/drivers/block/rnbd/rnbd-srv-trace.h +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(rnbd_srv_link_class, TP_fast_assign( __entry->qdepth = srv->queue_depth; - __assign_str(sessname, srv->sessname); + __assign_str(sessname); ), TP_printk("sessname: %s qdepth: %d", @@ -85,7 +85,7 @@ TRACE_EVENT(process_rdma, ), TP_fast_assign( - __assign_str(sessname, srv->sessname); + __assign_str(sessname); __entry->dir = id->dir; __entry->ver = srv->ver; __entry->device_id = le32_to_cpu(msg->device_id); @@ -130,7 +130,7 @@ TRACE_EVENT(process_msg_sess_info, __entry->proto_ver = srv->ver; __entry->clt_ver = msg->ver; __entry->srv_ver = RNBD_PROTO_VER_MAJOR; - __assign_str(sessname, srv->sessname); + __assign_str(sessname); ), TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)", @@ -165,8 +165,8 @@ TRACE_EVENT(process_msg_open, TP_fast_assign( __entry->access_mode = msg->access_mode; - __assign_str(sessname, srv->sessname); - __assign_str(dev_name, msg->dev_name); + __assign_str(sessname); + __assign_str(dev_name); ), TP_printk("Open message received: session='%s' path='%s' access_mode=%s", @@ -189,7 +189,7 @@ TRACE_EVENT(process_msg_close, TP_fast_assign( __entry->device_id = le32_to_cpu(msg->device_id); - __assign_str(sessname, srv->sessname); + __assign_str(sessname); ), TP_printk("Close message received: session='%s' device id='%d'", diff --git a/drivers/bus/mhi/host/trace.h b/drivers/bus/mhi/host/trace.h index 368515dcb22d..95613c8ebe06 100644 --- a/drivers/bus/mhi/host/trace.h +++ b/drivers/bus/mhi/host/trace.h @@ -103,7 +103,7 @@ TRACE_EVENT(mhi_gen_tre, ), TP_fast_assign( - __assign_str(name, mhi_cntrl->mhi_dev->name); + __assign_str(name); __entry->ch_num = mhi_chan->chan; __entry->wp = mhi_tre; __entry->tre_ptr = mhi_tre->ptr; @@ -131,7 +131,7 @@ TRACE_EVENT(mhi_intvec_states, ), TP_fast_assign( - __assign_str(name, mhi_cntrl->mhi_dev->name); + __assign_str(name); __entry->local_ee = mhi_cntrl->ee; __entry->state = mhi_cntrl->dev_state; __entry->dev_ee = dev_ee; @@ -158,7 +158,7 @@ TRACE_EVENT(mhi_tryset_pm_state, ), TP_fast_assign( - __assign_str(name, mhi_cntrl->mhi_dev->name); + __assign_str(name); if (pm_state) pm_state = __fls(pm_state); __entry->pm_state = pm_state; @@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(mhi_process_event_ring, ), TP_fast_assign( - __assign_str(name, mhi_cntrl->mhi_dev->name); + __assign_str(name); __entry->rp = rp; __entry->ptr = rp->ptr; __entry->dword0 = rp->dword[0]; @@ -226,7 +226,7 @@ DECLARE_EVENT_CLASS(mhi_update_channel_state, ), TP_fast_assign( - __assign_str(name, mhi_cntrl->mhi_dev->name); + __assign_str(name); __entry->ch_num = mhi_chan->chan; __entry->state = state; __entry->reason = reason; @@ -265,7 +265,7 @@ TRACE_EVENT(mhi_pm_st_transition, ), TP_fast_assign( - __assign_str(name, mhi_cntrl->mhi_dev->name); + __assign_str(name); __entry->state = state; ), diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index 07a0394b1d99..ee5cd4eb2f16 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -60,8 +60,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) ), TP_fast_assign( - __assign_str(memdev, dev_name(&cxlmd->dev)); - __assign_str(host, dev_name(cxlmd->dev.parent)); + __assign_str(memdev); + __assign_str(host); __entry->serial = cxlmd->cxlds->serial; __entry->status = status; __entry->first_error = fe; @@ -106,8 +106,8 @@ TRACE_EVENT(cxl_aer_correctable_error, __field(u32, status) ), TP_fast_assign( - __assign_str(memdev, dev_name(&cxlmd->dev)); - __assign_str(host, dev_name(cxlmd->dev.parent)); + __assign_str(memdev); + __assign_str(host); __entry->serial = cxlmd->cxlds->serial; __entry->status = status; ), @@ -142,8 +142,8 @@ TRACE_EVENT(cxl_overflow, ), TP_fast_assign( - __assign_str(memdev, dev_name(&cxlmd->dev)); - __assign_str(host, dev_name(cxlmd->dev.parent)); + __assign_str(memdev); + __assign_str(host); __entry->serial = cxlmd->cxlds->serial; __entry->log = log; __entry->count = le16_to_cpu(payload->overflow_err_count); @@ -200,8 +200,8 @@ TRACE_EVENT(cxl_overflow, __field(u8, hdr_maint_op_class) #define CXL_EVT_TP_fast_assign(cxlmd, l, hdr) \ - __assign_str(memdev, dev_name(&(cxlmd)->dev)); \ - __assign_str(host, dev_name((cxlmd)->dev.parent)); \ + __assign_str(memdev); \ + __assign_str(host); \ __entry->log = (l); \ __entry->serial = (cxlmd)->cxlds->serial; \ __entry->hdr_length = (hdr).length; \ @@ -359,10 +359,10 @@ TRACE_EVENT(cxl_general_media, __entry->validity_flags = get_unaligned_le16(&rec->validity_flags); __entry->hpa = hpa; if (cxlr) { - __assign_str(region_name, dev_name(&cxlr->dev)); + __assign_str(region_name); uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); } else { - __assign_str(region_name, ""); + __assign_str(region_name); uuid_copy(&__entry->region_uuid, &uuid_null); } ), @@ -462,10 +462,10 @@ TRACE_EVENT(cxl_dram, CXL_EVENT_DER_CORRECTION_MASK_SIZE); __entry->hpa = hpa; if (cxlr) { - __assign_str(region_name, dev_name(&cxlr->dev)); + __assign_str(region_name); uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); } else { - __assign_str(region_name, ""); + __assign_str(region_name); uuid_copy(&__entry->region_uuid, &uuid_null); } ), @@ -692,8 +692,8 @@ TRACE_EVENT(cxl_poison, ), TP_fast_assign( - __assign_str(memdev, dev_name(&cxlmd->dev)); - __assign_str(host, dev_name(cxlmd->dev.parent)); + __assign_str(memdev); + __assign_str(host); __entry->serial = cxlmd->cxlds->serial; __entry->overflow_ts = cxl_poison_overflow(flags, overflow_ts); __entry->dpa = cxl_poison_record_dpa(record); @@ -702,12 +702,12 @@ TRACE_EVENT(cxl_poison, __entry->trace_type = trace_type; __entry->flags = flags; if (cxlr) { - __assign_str(region, dev_name(&cxlr->dev)); + __assign_str(region); memcpy(__entry->uuid, &cxlr->params.uuid, 16); __entry->hpa = cxl_trace_hpa(cxlr, cxlmd, __entry->dpa); } else { - __assign_str(region, ""); + __assign_str(region); memset(__entry->uuid, 0, 16); __entry->hpa = ULLONG_MAX; } diff --git a/drivers/dma-buf/sync_trace.h b/drivers/dma-buf/sync_trace.h index 06e468a218ff..d71dcf954b8d 100644 --- a/drivers/dma-buf/sync_trace.h +++ b/drivers/dma-buf/sync_trace.h @@ -20,7 +20,7 @@ TRACE_EVENT(sync_timeline, ), TP_fast_assign( - __assign_str(name, timeline->name); + __assign_str(name); __entry->value = timeline->value; ), diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index f539b1d00234..7aafeb763e5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -178,10 +178,10 @@ TRACE_EVENT(amdgpu_cs_ioctl, TP_fast_assign( __entry->sched_job_id = job->base.id; - __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job)); + __assign_str(timeline); __entry->context = job->base.s_fence->finished.context; __entry->seqno = job->base.s_fence->finished.seqno; - __assign_str(ring, to_amdgpu_ring(job->base.sched)->name); + __assign_str(ring); __entry->num_ibs = job->num_ibs; ), TP_printk("sched_job=%llu, timeline=%s, context=%u, seqno=%u, ring_name=%s, num_ibs=%u", @@ -203,10 +203,10 @@ TRACE_EVENT(amdgpu_sched_run_job, TP_fast_assign( __entry->sched_job_id = job->base.id; - __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job)); + __assign_str(timeline); __entry->context = job->base.s_fence->finished.context; __entry->seqno = job->base.s_fence->finished.seqno; - __assign_str(ring, to_amdgpu_ring(job->base.sched)->name); + __assign_str(ring); __entry->num_ibs = job->num_ibs; ), TP_printk("sched_job=%llu, timeline=%s, context=%u, seqno=%u, ring_name=%s, num_ibs=%u", @@ -231,7 +231,7 @@ TRACE_EVENT(amdgpu_vm_grab_id, TP_fast_assign( __entry->pasid = vm->pasid; - __assign_str(ring, ring->name); + __assign_str(ring); __entry->vmid = job->vmid; __entry->vm_hub = ring->vm_hub, __entry->pd_addr = job->vm_pd_addr; @@ -425,7 +425,7 @@ TRACE_EVENT(amdgpu_vm_flush, ), TP_fast_assign( - __assign_str(ring, ring->name); + __assign_str(ring); __entry->vmid = vmid; __entry->vm_hub = ring->vm_hub; __entry->pd_addr = pd_addr; @@ -526,7 +526,7 @@ TRACE_EVENT(amdgpu_ib_pipe_sync, ), TP_fast_assign( - __assign_str(ring, sched_job->base.sched->name); + __assign_str(ring); __entry->id = sched_job->base.id; __entry->fence = fence; __entry->ctx = fence->context; @@ -563,7 +563,7 @@ TRACE_EVENT(amdgpu_runpm_reference_dumps, ), TP_fast_assign( __entry->index = index; - __assign_str(func, func); + __assign_str(func); ), TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n", __entry->index, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h index 133af994a08c..4686d4b0cbad 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h @@ -87,7 +87,7 @@ TRACE_EVENT(amdgpu_dc_performance, __entry->writes = write_count; __entry->read_delta = read_count - *last_read; __entry->write_delta = write_count - *last_write; - __assign_str(func, func); + __assign_str(func); __entry->line = line; *last_read = read_count; *last_write = write_count; diff --git a/drivers/gpu/drm/i915/display/intel_display_trace.h b/drivers/gpu/drm/i915/display/intel_display_trace.h index 7862e7cefe02..49a5e6d9dc0d 100644 --- a/drivers/gpu/drm/i915/display/intel_display_trace.h +++ b/drivers/gpu/drm/i915/display/intel_display_trace.h @@ -34,7 +34,7 @@ TRACE_EVENT(intel_pipe_enable, TP_fast_assign( struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); struct intel_crtc *it__; - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); for_each_intel_crtc(&dev_priv->drm, it__) { __entry->frame[it__->pipe] = intel_crtc_get_vblank_counter(it__); __entry->scanline[it__->pipe] = intel_get_crtc_scanline(it__); @@ -63,7 +63,7 @@ TRACE_EVENT(intel_pipe_disable, TP_fast_assign( struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); struct intel_crtc *it__; - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); for_each_intel_crtc(&dev_priv->drm, it__) { __entry->frame[it__->pipe] = intel_crtc_get_vblank_counter(it__); __entry->scanline[it__->pipe] = intel_get_crtc_scanline(it__); @@ -91,7 +91,7 @@ TRACE_EVENT(intel_pipe_crc, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -119,7 +119,7 @@ TRACE_EVENT(intel_cpu_fifo_underrun, TP_fast_assign( struct intel_crtc *crtc = intel_crtc_for_pipe(dev_priv, pipe); - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -144,7 +144,7 @@ TRACE_EVENT(intel_pch_fifo_underrun, TP_fast_assign( enum pipe pipe = pch_transcoder; struct intel_crtc *crtc = intel_crtc_for_pipe(dev_priv, pipe); - __assign_str(dev, __dev_name_i915(dev_priv)); + __assign_str(dev); __entry->pipe = pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -169,7 +169,7 @@ TRACE_EVENT(intel_memory_cxsr, TP_fast_assign( struct intel_crtc *crtc; - __assign_str(dev, __dev_name_i915(dev_priv)); + __assign_str(dev); for_each_intel_crtc(&dev_priv->drm, crtc) { __entry->frame[crtc->pipe] = intel_crtc_get_vblank_counter(crtc); __entry->scanline[crtc->pipe] = intel_get_crtc_scanline(crtc); @@ -209,7 +209,7 @@ TRACE_EVENT(g4x_wm, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -256,7 +256,7 @@ TRACE_EVENT(vlv_wm, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -293,7 +293,7 @@ TRACE_EVENT(vlv_fifo_size, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -323,8 +323,8 @@ TRACE_EVENT(intel_plane_update_noarm, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name); + __assign_str(dev); + __assign_str(name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -354,8 +354,8 @@ TRACE_EVENT(intel_plane_update_arm, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name); + __assign_str(dev); + __assign_str(name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -383,8 +383,8 @@ TRACE_EVENT(intel_plane_disable_arm, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name); + __assign_str(dev); + __assign_str(name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -410,8 +410,8 @@ TRACE_EVENT(intel_fbc_activate, TP_fast_assign( struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe); - __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name); + __assign_str(dev); + __assign_str(name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -437,8 +437,8 @@ TRACE_EVENT(intel_fbc_deactivate, TP_fast_assign( struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe); - __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name); + __assign_str(dev); + __assign_str(name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -464,8 +464,8 @@ TRACE_EVENT(intel_fbc_nuke, TP_fast_assign( struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe); - __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name); + __assign_str(dev); + __assign_str(name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -488,7 +488,7 @@ TRACE_EVENT(intel_crtc_vblank_work_start, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -511,7 +511,7 @@ TRACE_EVENT(intel_crtc_vblank_work_end, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -536,7 +536,7 @@ TRACE_EVENT(intel_pipe_update_start, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -564,7 +564,7 @@ TRACE_EVENT(intel_pipe_update_vblank_evaded, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = crtc->debug.start_vbl_count; __entry->scanline = crtc->debug.scanline_start; @@ -590,7 +590,7 @@ TRACE_EVENT(intel_pipe_update_end, ), TP_fast_assign( - __assign_str(dev, __dev_name_kms(crtc)); + __assign_str(dev); __entry->pipe = crtc->pipe; __entry->frame = frame; __entry->scanline = scanline_end; @@ -613,7 +613,7 @@ TRACE_EVENT(intel_frontbuffer_invalidate, ), TP_fast_assign( - __assign_str(dev, __dev_name_i915(i915)); + __assign_str(dev); __entry->frontbuffer_bits = frontbuffer_bits; __entry->origin = origin; ), @@ -634,7 +634,7 @@ TRACE_EVENT(intel_frontbuffer_flush, ), TP_fast_assign( - __assign_str(dev, __dev_name_i915(i915)); + __assign_str(dev); __entry->frontbuffer_bits = frontbuffer_bits; __entry->origin = origin; ), diff --git a/drivers/gpu/drm/lima/lima_trace.h b/drivers/gpu/drm/lima/lima_trace.h index 494b9790b1da..3a349d10304e 100644 --- a/drivers/gpu/drm/lima/lima_trace.h +++ b/drivers/gpu/drm/lima/lima_trace.h @@ -24,7 +24,7 @@ DECLARE_EVENT_CLASS(lima_task, __entry->task_id = task->base.id; __entry->context = task->base.s_fence->finished.context; __entry->seqno = task->base.s_fence->finished.seqno; - __assign_str(pipe, task->base.sched->name); + __assign_str(pipe); ), TP_printk("task=%llu, context=%u seqno=%u pipe=%s", diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h index bd92fb2979aa..0fdd41162e4b 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h @@ -113,7 +113,7 @@ TRACE_EVENT(tracing_mark_write, ), TP_fast_assign( __entry->pid = pid; - __assign_str(trace_name, name); + __assign_str(trace_name); __entry->trace_begin = trace_begin; ), TP_printk("%s|%d|%s", __entry->trace_begin ? "B" : "E", @@ -130,7 +130,7 @@ TRACE_EVENT(dpu_trace_counter, ), TP_fast_assign( __entry->pid = current->tgid; - __assign_str(counter_name, name); + __assign_str(counter_name); __entry->value = value; ), TP_printk("%d|%s|%d", __entry->pid, @@ -379,7 +379,7 @@ TRACE_EVENT(dpu_enc_rc, __entry->sw_event = sw_event; __entry->idle_pc_supported = idle_pc_supported; __entry->rc_state = rc_state; - __assign_str(stage_str, stage); + __assign_str(stage_str); ), TP_printk("%s: id:%u, sw_event:%d, idle_pc_supported:%s, rc_state:%d", __get_str(stage_str), __entry->drm_id, __entry->sw_event, @@ -401,7 +401,7 @@ TRACE_EVENT(dpu_enc_frame_done_cb_not_busy, TP_fast_assign( __entry->drm_id = drm_id; __entry->event = event; - __assign_str(intf_mode_str, intf_mode); + __assign_str(intf_mode_str); __entry->intf_idx = intf_idx; __entry->wb_idx = wb_idx; ), @@ -446,7 +446,7 @@ TRACE_EVENT(dpu_enc_trigger_flush, ), TP_fast_assign( __entry->drm_id = drm_id; - __assign_str(intf_mode_str, intf_mode); + __assign_str(intf_mode_str); __entry->intf_idx = intf_idx; __entry->wb_idx = wb_idx; __entry->pending_kickoff_cnt = pending_kickoff_cnt; @@ -946,7 +946,7 @@ TRACE_EVENT(dpu_core_perf_update_clk, __field( u64, clk_rate ) ), TP_fast_assign( - __assign_str(dev_name, dev->unique); + __assign_str(dev_name); __entry->stop_req = stop_req; __entry->clk_rate = clk_rate; ), diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h index f8ed093b7356..c75302ca3427 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h @@ -48,7 +48,7 @@ DECLARE_EVENT_CLASS(drm_sched_job, __entry->entity = entity; __entry->id = sched_job->id; __entry->fence = &sched_job->s_fence->finished; - __assign_str(name, sched_job->sched->name); + __assign_str(name); __entry->job_count = spsc_queue_count(&entity->job_queue); __entry->hw_job_count = atomic_read( &sched_job->sched->credit_count); @@ -94,7 +94,7 @@ TRACE_EVENT(drm_sched_job_wait_dep, ), TP_fast_assign( - __assign_str(name, sched_job->sched->name); + __assign_str(name); __entry->id = sched_job->id; __entry->fence = fence; __entry->ctx = fence->context; diff --git a/drivers/gpu/drm/virtio/virtgpu_trace.h b/drivers/gpu/drm/virtio/virtgpu_trace.h index 031bc77689d5..227bf0ae7ed5 100644 --- a/drivers/gpu/drm/virtio/virtgpu_trace.h +++ b/drivers/gpu/drm/virtio/virtgpu_trace.h @@ -25,7 +25,7 @@ DECLARE_EVENT_CLASS(virtio_gpu_cmd, TP_fast_assign( __entry->dev = vq->vdev->index; __entry->vq = vq->index; - __assign_str(name, vq->name); + __assign_str(name); __entry->type = le32_to_cpu(hdr->type); __entry->flags = le32_to_cpu(hdr->flags); __entry->fence_id = le64_to_cpu(hdr->fence_id); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index 47f3c6e4be89..dc622f3778be 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -84,7 +84,7 @@ TRACE_EVENT(cm_id_attach, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); - __assign_str(devname, device->name); + __assign_str(devname); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", @@ -334,7 +334,7 @@ DECLARE_EVENT_CLASS(cma_client_class, ), TP_fast_assign( - __assign_str(name, device->name); + __assign_str(name); ), TP_printk("device name=%s", diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4b3f1cb125fc..eb38f81aeeb1 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -2425,7 +2425,7 @@ static inline bool hfi1_need_drop(struct hfi1_devdata *dd) int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) -#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev) static inline void hfi1_update_ah_attr(struct ib_device *ibdev, struct rdma_ah_attr *attr) diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 75599d5168db..58304b91380f 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(hfi1_trace_template, TP_STRUCT__entry(__string(function, function) __vstring(msg, vaf->fmt, vaf->va) ), - TP_fast_assign(__assign_str(function, function); + TP_fast_assign(__assign_str(function); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("(%s) %s", diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index e6904aa80c00..8d5e12fe88a5 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -90,7 +90,7 @@ TRACE_EVENT(hfi1_mmu_invalidate, TP_fast_assign( __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __assign_str(type, type); + __assign_str(type); __entry->start = start; __entry->end = end; ), diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index d129b8195959..e358f5b885fa 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(/* msg */ ), TP_fast_assign(/* assign */ __entry->qpn = qp ? qp->ibqp.qp_num : 0; - __assign_str(msg, msg); + __assign_str(msg); __entry->more = more; ), TP_printk(/* print */ @@ -651,7 +651,7 @@ DECLARE_EVENT_CLASS(/* tid_node */ TP_fast_assign(/* assign */ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; - __assign_str(msg, msg); + __assign_str(msg); __entry->index = index; __entry->base = base; __entry->map = map; diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index c79856d4fdfb..c0ba6b0a2c4e 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -740,8 +740,8 @@ TRACE_EVENT(hfi1_sdma_state, __string(newstate, nstate) ), TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __assign_str(curstate, cstate); - __assign_str(newstate, nstate); + __assign_str(curstate); + __assign_str(newstate); ), TP_printk("[%s] current state %s new state %s", __get_str(dev), diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h index 4341965a5ea7..bdb6b9326b64 100644 --- a/drivers/infiniband/sw/rdmavt/trace.h +++ b/drivers/infiniband/sw/rdmavt/trace.h @@ -4,7 +4,7 @@ */ #define RDI_DEV_ENTRY(rdi) __string(dev, rvt_get_ibdev_name(rdi)) -#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rvt_get_ibdev_name(rdi)) +#define RDI_DEV_ASSIGN(rdi) __assign_str(dev) #include "trace_rvt.h" #include "trace_qp.h" diff --git a/drivers/infiniband/sw/rdmavt/trace_rvt.h b/drivers/infiniband/sw/rdmavt/trace_rvt.h index df33c2ca9710..a00489e66ddf 100644 --- a/drivers/infiniband/sw/rdmavt/trace_rvt.h +++ b/drivers/infiniband/sw/rdmavt/trace_rvt.h @@ -24,7 +24,7 @@ TRACE_EVENT(rvt_dbg, ), TP_fast_assign( RDI_DEV_ASSIGN(rdi); - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("[%s]: %s", __get_str(dev), __get_str(msg)) ); diff --git a/drivers/interconnect/trace.h b/drivers/interconnect/trace.h index 3d668ff566bf..206373546528 100644 --- a/drivers/interconnect/trace.h +++ b/drivers/interconnect/trace.h @@ -32,9 +32,9 @@ TRACE_EVENT(icc_set_bw, ), TP_fast_assign( - __assign_str(path_name, p->name); - __assign_str(dev, dev_name(p->reqs[i].dev)); - __assign_str(node_name, n->name); + __assign_str(path_name); + __assign_str(dev); + __assign_str(node_name); __entry->avg_bw = avg_bw; __entry->peak_bw = peak_bw; __entry->node_avg_bw = n->avg_bw; @@ -64,8 +64,8 @@ TRACE_EVENT(icc_set_bw_end, ), TP_fast_assign( - __assign_str(path_name, p->name); - __assign_str(dev, dev_name(p->reqs[0].dev)); + __assign_str(path_name); + __assign_str(dev); __entry->ret = ret; ), diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h index 961ac1c1bc21..9defdae6ebae 100644 --- a/drivers/iommu/intel/trace.h +++ b/drivers/iommu/intel/trace.h @@ -32,7 +32,7 @@ TRACE_EVENT(qi_submit, ), TP_fast_assign( - __assign_str(iommu, iommu->name); + __assign_str(iommu); __entry->qw0 = qw0; __entry->qw1 = qw1; __entry->qw2 = qw2; @@ -79,8 +79,8 @@ TRACE_EVENT(prq_report, __entry->dw2 = dw2; __entry->dw3 = dw3; __entry->seq = seq; - __assign_str(iommu, iommu->name); - __assign_str(dev, dev_name(dev)); + __assign_str(iommu); + __assign_str(dev); ), TP_printk("%s/%s seq# %ld: %s", @@ -102,8 +102,8 @@ DECLARE_EVENT_CLASS(cache_tag_log, __field(u32, users) ), TP_fast_assign( - __assign_str(iommu, tag->iommu->name); - __assign_str(dev, dev_name(tag->dev)); + __assign_str(iommu); + __assign_str(dev); __entry->type = tag->type; __entry->domain_id = tag->domain_id; __entry->pasid = tag->pasid; @@ -152,8 +152,8 @@ DECLARE_EVENT_CLASS(cache_tag_flush, __field(unsigned long, mask) ), TP_fast_assign( - __assign_str(iommu, tag->iommu->name); - __assign_str(dev, dev_name(tag->dev)); + __assign_str(iommu); + __assign_str(dev); __entry->type = tag->type; __entry->domain_id = tag->domain_id; __entry->pasid = tag->pasid; diff --git a/drivers/media/platform/nvidia/tegra-vde/trace.h b/drivers/media/platform/nvidia/tegra-vde/trace.h index 7853ab095ca4..e8a75a7bd05d 100644 --- a/drivers/media/platform/nvidia/tegra-vde/trace.h +++ b/drivers/media/platform/nvidia/tegra-vde/trace.h @@ -20,7 +20,7 @@ DECLARE_EVENT_CLASS(register_access, __field(u32, value) ), TP_fast_assign( - __assign_str(hw_name, tegra_vde_reg_base_name(vde, base)); + __assign_str(hw_name); __entry->offset = offset; __entry->value = value; ), diff --git a/drivers/misc/mei/mei-trace.h b/drivers/misc/mei/mei-trace.h index fe46ff2b9d69..5312edbf5190 100644 --- a/drivers/misc/mei/mei-trace.h +++ b/drivers/misc/mei/mei-trace.h @@ -26,7 +26,7 @@ TRACE_EVENT(mei_reg_read, __field(u32, val) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); __entry->reg = reg; __entry->offs = offs; __entry->val = val; @@ -45,7 +45,7 @@ TRACE_EVENT(mei_reg_write, __field(u32, val) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); __entry->reg = reg; __entry->offs = offs; __entry->val = val; @@ -64,7 +64,7 @@ TRACE_EVENT(mei_pci_cfg_read, __field(u32, val) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); __entry->reg = reg; __entry->offs = offs; __entry->val = val; diff --git a/drivers/net/dsa/mv88e6xxx/trace.h b/drivers/net/dsa/mv88e6xxx/trace.h index f59ca04768e7..5bd015b2b97a 100644 --- a/drivers/net/dsa/mv88e6xxx/trace.h +++ b/drivers/net/dsa/mv88e6xxx/trace.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(mv88e6xxx_atu_violation, ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name); __entry->spid = spid; __entry->portvec = portvec; memcpy(__entry->addr, addr, ETH_ALEN); @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(mv88e6xxx_vtu_violation, ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name); __entry->spid = spid; __entry->vid = vid; ), diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h index 889f89df9930..6f0e58a2a58a 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h @@ -57,7 +57,7 @@ DECLARE_EVENT_CLASS(dpaa_eth_fd, __entry->fd_offset = qm_fd_get_offset(fd); __entry->fd_length = qm_fd_get_length(fd); __entry->fd_status = fd->status; - __assign_str(name, netdev->name); + __assign_str(name); ), /* This is what gets printed when the trace event is triggered */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h index 9b43fadb9b11..956767e0869c 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h @@ -48,7 +48,7 @@ DECLARE_EVENT_CLASS(dpaa2_eth_fd, __entry->fd_addr = dpaa2_fd_get_addr(fd); __entry->fd_len = dpaa2_fd_get_len(fd); __entry->fd_offset = dpaa2_fd_get_offset(fd); - __assign_str(name, netdev->name); + __assign_str(name); ), /* This is what gets printed when the trace event is @@ -144,7 +144,7 @@ DECLARE_EVENT_CLASS(dpaa2_eth_buf, __entry->dma_addr = dma_addr; __entry->map_size = map_size; __entry->bpid = bpid; - __assign_str(name, netdev->name); + __assign_str(name); ), /* This is what gets printed when the trace event is diff --git a/drivers/net/ethernet/fungible/funeth/funeth_trace.h b/drivers/net/ethernet/fungible/funeth/funeth_trace.h index 9e58dfec19d5..b9985900f30b 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_trace.h +++ b/drivers/net/ethernet/fungible/funeth/funeth_trace.h @@ -32,7 +32,7 @@ TRACE_EVENT(funeth_tx, __entry->len = len; __entry->sqe_idx = sqe_idx; __entry->ngle = ngle; - __assign_str(devname, txq->netdev->name); + __assign_str(devname); ), TP_printk("%s: Txq %u, SQE idx %u, len %u, num GLEs %u", @@ -62,7 +62,7 @@ TRACE_EVENT(funeth_tx_free, __entry->sqe_idx = sqe_idx; __entry->num_sqes = num_sqes; __entry->hw_head = hw_head; - __assign_str(devname, txq->netdev->name); + __assign_str(devname); ), TP_printk("%s: Txq %u, SQE idx %u, SQEs %u, HW head %u", @@ -97,7 +97,7 @@ TRACE_EVENT(funeth_rx, __entry->len = pkt_len; __entry->hash = hash; __entry->cls_vec = cls_vec; - __assign_str(devname, rxq->netdev->name); + __assign_str(devname); ), TP_printk("%s: Rxq %u, CQ head %u, RQEs %u, len %u, hash %u, CV %#x", diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h index b8a1ecb4b8fb..3362b8d14d4f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h @@ -84,7 +84,7 @@ TRACE_EVENT(hns3_tx_desc, __entry->desc_dma = ring->desc_dma_addr, memcpy(__entry->desc, &ring->desc[cur_ntu], sizeof(struct hns3_desc)); - __assign_str(devname, ring->tqp->handle->kinfo.netdev->name); + __assign_str(devname); ), TP_printk( @@ -117,7 +117,7 @@ TRACE_EVENT(hns3_rx_desc, __entry->buf_dma = ring->desc_cb[ring->next_to_clean].dma; memcpy(__entry->desc, &ring->desc[ring->next_to_clean], sizeof(struct hns3_desc)); - __assign_str(devname, ring->tqp->handle->kinfo.netdev->name); + __assign_str(devname); ), TP_printk( diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h index 7e47f0c21d88..7103cf04bffc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h @@ -33,8 +33,8 @@ TRACE_EVENT(hclge_pf_mbx_get, __entry->vfid = req->mbx_src_vfid; __entry->code = req->msg.code; __entry->subcode = req->msg.subcode; - __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name); + __assign_str(pciname); + __assign_str(devname); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_vf_to_pf_cmd)); ), @@ -64,8 +64,8 @@ TRACE_EVENT(hclge_pf_mbx_send, TP_fast_assign( __entry->vfid = req->dest_vfid; __entry->code = le16_to_cpu(req->msg.code); - __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name); + __assign_str(pciname); + __assign_str(devname); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_pf_to_vf_cmd)); ), @@ -101,7 +101,7 @@ DECLARE_EVENT_CLASS(hclge_pf_cmd_template, __entry->rsv = le16_to_cpu(desc->rsv); __entry->index = index; __entry->num = num; - __assign_str(pciname, pci_name(hw->cmq.csq.pdev)); + __assign_str(pciname); for (i = 0; i < HCLGE_DESC_DATA_LEN; i++) __entry->data[i] = le32_to_cpu(desc->data[i]);), @@ -144,7 +144,7 @@ DECLARE_EVENT_CLASS(hclge_pf_special_cmd_template, TP_fast_assign(int i; __entry->index = index; __entry->num = num; - __assign_str(pciname, pci_name(hw->cmq.csq.pdev)); + __assign_str(pciname); for (i = 0; i < PF_DESC_LEN; i++) __entry->data[i] = le32_to_cpu(data[i]); ), diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h index e2e3a2602b6a..66b084309c91 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h @@ -30,8 +30,8 @@ TRACE_EVENT(hclge_vf_mbx_get, TP_fast_assign( __entry->vfid = req->dest_vfid; __entry->code = le16_to_cpu(req->msg.code); - __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, hdev->nic.kinfo.netdev->name); + __assign_str(pciname); + __assign_str(devname); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_pf_to_vf_cmd)); ), @@ -63,8 +63,8 @@ TRACE_EVENT(hclge_vf_mbx_send, __entry->vfid = req->mbx_src_vfid; __entry->code = req->msg.code; __entry->subcode = req->msg.subcode; - __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, hdev->nic.kinfo.netdev->name); + __assign_str(pciname); + __assign_str(devname); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_vf_to_pf_cmd)); ), @@ -101,7 +101,7 @@ DECLARE_EVENT_CLASS(hclge_vf_cmd_template, __entry->rsv = le16_to_cpu(desc->rsv); __entry->index = index; __entry->num = num; - __assign_str(pciname, pci_name(hw->cmq.csq.pdev)); + __assign_str(pciname); for (i = 0; i < HCLGE_DESC_DATA_LEN; i++) __entry->data[i] = le32_to_cpu(desc->data[i]);), diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h index 33b4e30f5e00..759f3d1c4c8f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_trace.h +++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h @@ -89,8 +89,8 @@ TRACE_EVENT(i40e_napi_poll, __entry->tx_clean_complete = tx_clean_complete; __entry->irq_num = q->irq_num; __entry->curr_cpu = get_cpu(); - __assign_str(qname, q->name); - __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV); + __assign_str(qname); + __assign_str(dev_name); __assign_bitmask(irq_affinity, cpumask_bits(&q->affinity_mask), nr_cpumask_bits); ), @@ -132,7 +132,7 @@ DECLARE_EVENT_CLASS( __entry->ring = ring; __entry->desc = desc; __entry->buf = buf; - __assign_str(devname, ring->netdev->name); + __assign_str(devname); ), TP_printk( @@ -177,7 +177,7 @@ DECLARE_EVENT_CLASS( __entry->ring = ring; __entry->desc = desc; __entry->xdp = xdp; - __assign_str(devname, ring->netdev->name); + __assign_str(devname); ), TP_printk( @@ -219,7 +219,7 @@ DECLARE_EVENT_CLASS( TP_fast_assign( __entry->skb = skb; __entry->ring = ring; - __assign_str(devname, ring->netdev->name); + __assign_str(devname); ), TP_printk( diff --git a/drivers/net/ethernet/intel/iavf/iavf_trace.h b/drivers/net/ethernet/intel/iavf/iavf_trace.h index 82fda6f5abf0..62212011c807 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_trace.h +++ b/drivers/net/ethernet/intel/iavf/iavf_trace.h @@ -83,7 +83,7 @@ DECLARE_EVENT_CLASS( __entry->ring = ring; __entry->desc = desc; __entry->buf = buf; - __assign_str(devname, ring->netdev->name); + __assign_str(devname); ), TP_printk( @@ -128,7 +128,7 @@ DECLARE_EVENT_CLASS( __entry->ring = ring; __entry->desc = desc; __entry->skb = skb; - __assign_str(devname, ring->netdev->name); + __assign_str(devname); ), TP_printk( @@ -170,7 +170,7 @@ DECLARE_EVENT_CLASS( TP_fast_assign( __entry->skb = skb; __entry->ring = ring; - __assign_str(devname, ring->netdev->name); + __assign_str(devname); ), TP_printk( diff --git a/drivers/net/ethernet/intel/ice/ice_trace.h b/drivers/net/ethernet/intel/ice/ice_trace.h index b2f5c9fe0149..244cddd2a9ea 100644 --- a/drivers/net/ethernet/intel/ice/ice_trace.h +++ b/drivers/net/ethernet/intel/ice/ice_trace.h @@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(ice_rx_dim_template, TP_fast_assign(__entry->q_vector = q_vector; __entry->dim = dim; - __assign_str(devname, q_vector->rx.rx_ring->netdev->name);), + __assign_str(devname);), TP_printk("netdev: %s Rx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d", __get_str(devname), @@ -96,7 +96,7 @@ DECLARE_EVENT_CLASS(ice_tx_dim_template, TP_fast_assign(__entry->q_vector = q_vector; __entry->dim = dim; - __assign_str(devname, q_vector->tx.tx_ring->netdev->name);), + __assign_str(devname);), TP_printk("netdev: %s Tx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d", __get_str(devname), @@ -128,7 +128,7 @@ DECLARE_EVENT_CLASS(ice_tx_template, TP_fast_assign(__entry->ring = ring; __entry->desc = desc; __entry->buf = buf; - __assign_str(devname, ring->netdev->name);), + __assign_str(devname);), TP_printk("netdev: %s ring: %pK desc: %pK buf %pK", __get_str(devname), __entry->ring, __entry->desc, __entry->buf) @@ -156,7 +156,7 @@ DECLARE_EVENT_CLASS(ice_rx_template, TP_fast_assign(__entry->ring = ring; __entry->desc = desc; - __assign_str(devname, ring->netdev->name);), + __assign_str(devname);), TP_printk("netdev: %s ring: %pK desc: %pK", __get_str(devname), __entry->ring, __entry->desc) @@ -180,7 +180,7 @@ DECLARE_EVENT_CLASS(ice_rx_indicate_template, TP_fast_assign(__entry->ring = ring; __entry->desc = desc; __entry->skb = skb; - __assign_str(devname, ring->netdev->name);), + __assign_str(devname);), TP_printk("netdev: %s ring: %pK desc: %pK skb %pK", __get_str(devname), __entry->ring, __entry->desc, __entry->skb) @@ -203,7 +203,7 @@ DECLARE_EVENT_CLASS(ice_xmit_template, TP_fast_assign(__entry->ring = ring; __entry->skb = skb; - __assign_str(devname, ring->netdev->name);), + __assign_str(devname);), TP_printk("netdev: %s skb: %pK ring: %pK", __get_str(devname), __entry->skb, __entry->ring) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h index 28984d0e848a..5704520f9b02 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h @@ -24,7 +24,7 @@ TRACE_EVENT(otx2_msg_alloc, __field(u16, id) __field(u64, size) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)); + TP_fast_assign(__assign_str(dev); __entry->id = id; __entry->size = size; ), @@ -39,7 +39,7 @@ TRACE_EVENT(otx2_msg_send, __field(u16, num_msgs) __field(u64, msg_size) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)); + TP_fast_assign(__assign_str(dev); __entry->num_msgs = num_msgs; __entry->msg_size = msg_size; ), @@ -55,7 +55,7 @@ TRACE_EVENT(otx2_msg_check, __field(u16, rspid) __field(int, rc) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)); + TP_fast_assign(__assign_str(dev); __entry->reqid = reqid; __entry->rspid = rspid; __entry->rc = rc; @@ -72,8 +72,8 @@ TRACE_EVENT(otx2_msg_interrupt, __string(str, msg) __field(u64, intr) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)); - __assign_str(str, msg); + TP_fast_assign(__assign_str(dev); + __assign_str(str); __entry->intr = intr; ), TP_printk("[%s] mbox interrupt %s (0x%llx)\n", __get_str(dev), @@ -87,7 +87,7 @@ TRACE_EVENT(otx2_msg_process, __field(u16, id) __field(int, err) ), - TP_fast_assign(__assign_str(dev, pci_name(pdev)); + TP_fast_assign(__assign_str(dev); __entry->id = id; __entry->err = err; ), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h index 406ebe17405f..b4b3a43e56a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h @@ -22,10 +22,10 @@ TRACE_EVENT(mlx5_cmd, __field(u32, syndrome) __field(int, err) ), - TP_fast_assign(__assign_str(command_str, command_str); + TP_fast_assign(__assign_str(command_str); __entry->opcode = opcode; __entry->op_mod = op_mod; - __assign_str(status_str, status_str); + __assign_str(status_str); __entry->status = status; __entry->syndrome = syndrome; __entry->err = err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h index f15718db5d0e..78e481b2c015 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h @@ -25,7 +25,7 @@ TRACE_EVENT(mlx5e_rep_neigh_update, struct in6_addr *pin6; __be32 *p32; - __assign_str(devname, nhe->neigh_dev->name); + __assign_str(devname); __entry->neigh_connected = neigh_connected; memcpy(__entry->ha, ha, ETH_ALEN); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h index ac52ef37f38a..4b1ca228012b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h @@ -86,7 +86,7 @@ TRACE_EVENT(mlx5e_tc_update_neigh_used_value, struct in6_addr *pin6; __be32 *p32; - __assign_str(devname, nhe->neigh_dev->name); + __assign_str(devname); __entry->neigh_used = neigh_used; p32 = (__be32 *)__entry->v4; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h index 3038be575923..50f8a7630f86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h @@ -55,12 +55,11 @@ TRACE_EVENT(mlx5_fw, ), TP_fast_assign( - __assign_str(dev_name, - dev_name(tracer->dev->device)); + __assign_str(dev_name); __entry->trace_timestamp = trace_timestamp; __entry->lost = lost; __entry->event_id = event_id; - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("%s [0x%llx] %d [0x%x] %s", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h index 458baf0c6415..1ce332f21ebe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h @@ -17,7 +17,7 @@ TRACE_EVENT(mlx5_esw_vport_qos_destroy, __field(unsigned short, vport_id) __field(unsigned int, tsar_ix) ), - TP_fast_assign(__assign_str(devname, dev_name(vport->dev->device)); + TP_fast_assign(__assign_str(devname); __entry->vport_id = vport->vport; __entry->tsar_ix = vport->qos.esw_tsar_ix; ), @@ -36,7 +36,7 @@ DECLARE_EVENT_CLASS(mlx5_esw_vport_qos_template, __field(unsigned int, max_rate) __field(void *, group) ), - TP_fast_assign(__assign_str(devname, dev_name(vport->dev->device)); + TP_fast_assign(__assign_str(devname); __entry->vport_id = vport->vport; __entry->tsar_ix = vport->qos.esw_tsar_ix; __entry->bw_share = bw_share; @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(mlx5_esw_group_qos_template, __field(const void *, group) __field(unsigned int, tsar_ix) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->group = group; __entry->tsar_ix = tsar_ix; ), @@ -102,7 +102,7 @@ TRACE_EVENT(mlx5_esw_group_qos_config, __field(unsigned int, bw_share) __field(unsigned int, max_rate) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->group = group; __entry->tsar_ix = tsar_ix; __entry->bw_share = bw_share; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h index 7f7c9af5deed..0537de86f981 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h @@ -22,7 +22,7 @@ DECLARE_EVENT_CLASS(mlx5_sf_dev_template, __field(u16, hw_fn_id) __field(u32, sfnum) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->sfdev = sfdev; __entry->aux_id = aux_id; __entry->hw_fn_id = sfdev->fn_id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h index 8bf1cd90930d..302ce00da5a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h @@ -24,7 +24,7 @@ TRACE_EVENT(mlx5_sf_add, __field(u16, hw_fn_id) __field(u32, sfnum) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->port_index = port_index; __entry->controller = controller; __entry->hw_fn_id = hw_fn_id; @@ -46,7 +46,7 @@ TRACE_EVENT(mlx5_sf_free, __field(u32, controller) __field(u16, hw_fn_id) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->port_index = port_index; __entry->controller = controller; __entry->hw_fn_id = hw_fn_id; @@ -67,7 +67,7 @@ TRACE_EVENT(mlx5_sf_hwc_alloc, __field(u16, hw_fn_id) __field(u32, sfnum) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->controller = controller; __entry->hw_fn_id = hw_fn_id; __entry->sfnum = sfnum; @@ -84,7 +84,7 @@ TRACE_EVENT(mlx5_sf_hwc_free, TP_STRUCT__entry(__string(devname, dev_name(dev->device)) __field(u16, hw_fn_id) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->hw_fn_id = hw_fn_id; ), TP_printk("(%s) hw_id=0x%x\n", __get_str(devname), __entry->hw_fn_id) @@ -97,7 +97,7 @@ TRACE_EVENT(mlx5_sf_hwc_deferred_free, TP_STRUCT__entry(__string(devname, dev_name(dev->device)) __field(u16, hw_fn_id) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->hw_fn_id = hw_fn_id; ), TP_printk("(%s) hw_id=0x%x\n", __get_str(devname), __entry->hw_fn_id) @@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(mlx5_sf_state_template, __field(unsigned int, port_index) __field(u32, controller) __field(u16, hw_fn_id)), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->port_index = port_index; __entry->controller = controller; __entry->hw_fn_id = hw_fn_id; @@ -152,7 +152,7 @@ TRACE_EVENT(mlx5_sf_update_state, __field(u16, hw_fn_id) __field(u8, state) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->port_index = port_index; __entry->controller = controller; __entry->hw_fn_id = hw_fn_id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h index fd814a190b8b..6352cb004a18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h @@ -20,7 +20,7 @@ TRACE_EVENT(mlx5_sf_vhca_event, __field(u32, sfnum) __field(u8, vhca_state) ), - TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + TP_fast_assign(__assign_str(devname); __entry->hw_fn_id = event->function_id; __entry->sfnum = event->sw_function_id; __entry->vhca_state = event->new_vhca_state; diff --git a/drivers/net/fjes/fjes_trace.h b/drivers/net/fjes/fjes_trace.h index 6437ddbd7842..166ef015262b 100644 --- a/drivers/net/fjes/fjes_trace.h +++ b/drivers/net/fjes/fjes_trace.h @@ -85,7 +85,7 @@ TRACE_EVENT(fjes_hw_request_info_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err); + __assign_str(err); ), TP_printk("%s", __get_str(err)) ); @@ -145,7 +145,7 @@ TRACE_EVENT(fjes_hw_register_buff_addr_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err); + __assign_str(err); ), TP_printk("%s", __get_str(err)) ); @@ -189,7 +189,7 @@ TRACE_EVENT(fjes_hw_unregister_buff_addr_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err); + __assign_str(err); ), TP_printk("%s", __get_str(err)) ); @@ -232,7 +232,7 @@ TRACE_EVENT(fjes_hw_start_debug_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err); + __assign_str(err); ), TP_printk("%s", __get_str(err)) ); @@ -258,7 +258,7 @@ TRACE_EVENT(fjes_hw_stop_debug_err, __string(err, err) ), TP_fast_assign( - __assign_str(err, err); + __assign_str(err); ), TP_printk("%s", __get_str(err)) ); diff --git a/drivers/net/hyperv/netvsc_trace.h b/drivers/net/hyperv/netvsc_trace.h index f7585563dea5..05e620cbdd29 100644 --- a/drivers/net/hyperv/netvsc_trace.h +++ b/drivers/net/hyperv/netvsc_trace.h @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(rndis_msg_class, __field( u32, msg_len ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->queue = q; __entry->req_id = msg->msg.init_req.req_id; __entry->msg_type = msg->ndis_msg_type; @@ -121,7 +121,7 @@ TRACE_EVENT(nvsp_send, __field( u32, msg_type ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->msg_type = msg->hdr.msg_type; ), TP_printk("dev=%s type=%s", @@ -142,7 +142,7 @@ TRACE_EVENT(nvsp_send_pkt, __field( u32, section_size ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->qid = chan->offermsg.offer.sub_channel_index; __entry->channel_type = rpkt->channel_type; __entry->section_index = rpkt->send_buf_section_index; @@ -165,7 +165,7 @@ TRACE_EVENT(nvsp_recv, __field( u32, msg_type ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->qid = chan->offermsg.offer.sub_channel_index; __entry->msg_type = msg->hdr.msg_type; ), diff --git a/drivers/net/wireless/ath/ath10k/trace.h b/drivers/net/wireless/ath/ath10k/trace.h index 64e7a767d963..68b78ca17eaa 100644 --- a/drivers/net/wireless/ath/ath10k/trace.h +++ b/drivers/net/wireless/ath/ath10k/trace.h @@ -55,8 +55,8 @@ DECLARE_EVENT_CLASS(ath10k_log_event, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk( @@ -92,8 +92,8 @@ TRACE_EVENT(ath10k_log_dbg, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->level = level; __assign_vstr(msg, vaf->fmt, vaf->va); ), @@ -121,10 +121,10 @@ TRACE_EVENT(ath10k_log_dbg_dump, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); - __assign_str(msg, msg); - __assign_str(prefix, prefix); + __assign_str(device); + __assign_str(driver); + __assign_str(msg); + __assign_str(prefix); __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); ), @@ -152,8 +152,8 @@ TRACE_EVENT(ath10k_wmi_cmd, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->id = id; __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); @@ -182,8 +182,8 @@ TRACE_EVENT(ath10k_wmi_event, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->id = id; __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); @@ -211,8 +211,8 @@ TRACE_EVENT(ath10k_htt_stats, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); ), @@ -239,8 +239,8 @@ TRACE_EVENT(ath10k_wmi_dbglog, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->hw_type = ar->hw_rev; __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); @@ -269,8 +269,8 @@ TRACE_EVENT(ath10k_htt_pktlog, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->hw_type = ar->hw_rev; __entry->buf_len = buf_len; memcpy(__get_dynamic_array(pktlog), buf, buf_len); @@ -301,8 +301,8 @@ TRACE_EVENT(ath10k_htt_tx, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->msdu_id = msdu_id; __entry->msdu_len = msdu_len; __entry->vdev_id = vdev_id; @@ -332,8 +332,8 @@ TRACE_EVENT(ath10k_txrx_tx_unref, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->msdu_id = msdu_id; ), @@ -358,8 +358,8 @@ DECLARE_EVENT_CLASS(ath10k_hdr_event, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = ath10k_frm_hdr_len(data, len); memcpy(__get_dynamic_array(data), data, __entry->len); ), @@ -386,8 +386,8 @@ DECLARE_EVENT_CLASS(ath10k_payload_event, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len - ath10k_frm_hdr_len(data, len); memcpy(__get_dynamic_array(payload), data + ath10k_frm_hdr_len(data, len), __entry->len); @@ -435,8 +435,8 @@ TRACE_EVENT(ath10k_htt_rx_desc, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->hw_type = ar->hw_rev; __entry->len = len; memcpy(__get_dynamic_array(rxdesc), data, len); @@ -472,8 +472,8 @@ TRACE_EVENT(ath10k_wmi_diag_container, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->type = type; __entry->timestamp = timestamp; __entry->code = code; @@ -505,8 +505,8 @@ TRACE_EVENT(ath10k_wmi_diag, ), TP_fast_assign( - __assign_str(device, dev_name(ar->dev)); - __assign_str(driver, dev_driver_string(ar->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; memcpy(__get_dynamic_array(data), data, len); ), diff --git a/drivers/net/wireless/ath/ath11k/trace.h b/drivers/net/wireless/ath/ath11k/trace.h index 235ab8ea715f..75246b0a82e3 100644 --- a/drivers/net/wireless/ath/ath11k/trace.h +++ b/drivers/net/wireless/ath/ath11k/trace.h @@ -48,8 +48,8 @@ TRACE_EVENT(ath11k_htt_pktlog, ), TP_fast_assign( - __assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->buf_len = buf_len; __entry->pktlog_checksum = pktlog_checksum; memcpy(__get_dynamic_array(pktlog), buf, buf_len); @@ -77,8 +77,8 @@ TRACE_EVENT(ath11k_htt_ppdu_stats, ), TP_fast_assign( - __assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; memcpy(__get_dynamic_array(ppdu), data, len); ), @@ -105,8 +105,8 @@ TRACE_EVENT(ath11k_htt_rxdesc, ), TP_fast_assign( - __assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; __entry->log_type = log_type; memcpy(__get_dynamic_array(rxdesc), data, len); @@ -130,8 +130,8 @@ DECLARE_EVENT_CLASS(ath11k_log_event, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); + __assign_str(device); + __assign_str(driver); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk( @@ -171,8 +171,8 @@ TRACE_EVENT(ath11k_wmi_cmd, ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->id = id; __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); @@ -201,8 +201,8 @@ TRACE_EVENT(ath11k_wmi_event, ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->id = id; __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); @@ -230,8 +230,8 @@ TRACE_EVENT(ath11k_log_dbg, ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->level = level; WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), ATH11K_MSG_MAX, vaf->fmt, @@ -262,10 +262,10 @@ TRACE_EVENT(ath11k_log_dbg_dump, ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); - __assign_str(msg, msg); - __assign_str(prefix, prefix); + __assign_str(device); + __assign_str(driver); + __assign_str(msg); + __assign_str(prefix); __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); ), @@ -292,8 +292,8 @@ TRACE_EVENT(ath11k_wmi_diag, ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; memcpy(__get_dynamic_array(data), data, len); ), @@ -318,8 +318,8 @@ TRACE_EVENT(ath11k_ps_timekeeper, __field(u32, peer_ps_timestamp) ), - TP_fast_assign(__assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + TP_fast_assign(__assign_str(device); + __assign_str(driver); memcpy(__get_dynamic_array(peer_addr), peer_addr, ETH_ALEN); __entry->peer_ps_state = peer_ps_state; diff --git a/drivers/net/wireless/ath/ath12k/trace.h b/drivers/net/wireless/ath/ath12k/trace.h index 240737e1542d..253c67accb0e 100644 --- a/drivers/net/wireless/ath/ath12k/trace.h +++ b/drivers/net/wireless/ath/ath12k/trace.h @@ -36,8 +36,8 @@ TRACE_EVENT(ath12k_htt_pktlog, ), TP_fast_assign( - __assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->buf_len = buf_len; __entry->pktlog_checksum = pktlog_checksum; memcpy(__get_dynamic_array(pktlog), buf, buf_len); @@ -73,8 +73,8 @@ TRACE_EVENT(ath12k_htt_ppdu_stats, ), TP_fast_assign( - __assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; __entry->info = ar->pdev->timestamp.info; __entry->sync_tstmp_lo_us = ar->pdev->timestamp.sync_timestamp_hi_us; @@ -117,8 +117,8 @@ TRACE_EVENT(ath12k_htt_rxdesc, ), TP_fast_assign( - __assign_str(device, dev_name(ar->ab->dev)); - __assign_str(driver, dev_driver_string(ar->ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; __entry->type = type; __entry->info = ar->pdev->timestamp.info; @@ -153,8 +153,8 @@ TRACE_EVENT(ath12k_wmi_diag, ), TP_fast_assign( - __assign_str(device, dev_name(ab->dev)); - __assign_str(driver, dev_driver_string(ab->dev)); + __assign_str(device); + __assign_str(driver); __entry->len = len; memcpy(__get_dynamic_array(data), data, len); ), diff --git a/drivers/net/wireless/ath/ath6kl/trace.h b/drivers/net/wireless/ath/ath6kl/trace.h index 231a94769ddb..8577aa459c58 100644 --- a/drivers/net/wireless/ath/ath6kl/trace.h +++ b/drivers/net/wireless/ath/ath6kl/trace.h @@ -304,8 +304,8 @@ TRACE_EVENT(ath6kl_log_dbg_dump, ), TP_fast_assign( - __assign_str(msg, msg); - __assign_str(prefix, prefix); + __assign_str(msg); + __assign_str(prefix); __entry->buf_len = buf_len; memcpy(__get_dynamic_array(buf), buf, buf_len); ), diff --git a/drivers/net/wireless/ath/trace.h b/drivers/net/wireless/ath/trace.h index 9935cf475b6d..82aac0a4baff 100644 --- a/drivers/net/wireless/ath/trace.h +++ b/drivers/net/wireless/ath/trace.h @@ -44,8 +44,8 @@ TRACE_EVENT(ath_log, ), TP_fast_assign( - __assign_str(device, wiphy_name(wiphy)); - __assign_str(driver, KBUILD_MODNAME); + __assign_str(device); + __assign_str(driver); __assign_vstr(msg, vaf->fmt, vaf->va); ), diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h index 5d66e94c806d..96032322b165 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h @@ -41,7 +41,7 @@ TRACE_EVENT(brcmf_err, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(func, func); + __assign_str(func); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s: %s", __get_str(func), __get_str(msg)) @@ -57,7 +57,7 @@ TRACE_EVENT(brcmf_dbg, ), TP_fast_assign( __entry->level = level; - __assign_str(func, func); + __assign_str(func); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s: %s", __get_str(func), __get_str(msg)) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h index a0da3248b942..53b3dba50737 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h @@ -81,7 +81,7 @@ TRACE_EVENT(brcms_macintstatus, __field(u32, mask) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); __entry->in_isr = in_isr; __entry->macintstatus = macintstatus; __entry->mask = mask; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h index 42b0a91656c4..908ce3c864fe 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h @@ -71,7 +71,7 @@ TRACE_EVENT(brcms_dbg, ), TP_fast_assign( __entry->level = level; - __assign_str(func, func); + __assign_str(func); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s: %s", __get_str(func), __get_str(msg)) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h index cf2cc070f1e5..24ac34fa0207 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h @@ -31,7 +31,7 @@ TRACE_EVENT(brcms_txdesc, __dynamic_array(u8, txh, txh_len) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); memcpy(__get_dynamic_array(txh), txh, txh_len); ), TP_printk("[%s] txdesc", __get_str(dev)) @@ -54,7 +54,7 @@ TRACE_EVENT(brcms_txstatus, __field(u16, ackphyrxsh) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); __entry->framelen = framelen; __entry->frameid = frameid; __entry->status = status; @@ -85,7 +85,7 @@ TRACE_EVENT(brcms_ampdu_session, __field(u16, dma_len) ), TP_fast_assign( - __assign_str(dev, dev_name(dev)); + __assign_str(dev); __entry->max_ampdu_len = max_ampdu_len; __entry->max_ampdu_frames = max_ampdu_frames; __entry->ampdu_len = ampdu_len; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h index 1d6c292cf545..0db1fa5477af 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h @@ -57,7 +57,7 @@ TRACE_EVENT(iwlwifi_dbg, ), TP_fast_assign( __entry->level = level; - __assign_str(function, function); + __assign_str(function); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s", __get_str(msg)) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h index c3e09f4fefeb..76166e1b10e5 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h @@ -87,7 +87,7 @@ static inline void trace_ ## name(proto) {} #endif #define DEV_ENTRY __string(dev, dev_name(dev)) -#define DEV_ASSIGN __assign_str(dev, dev_name(dev)) +#define DEV_ASSIGN __assign_str(dev) #include "iwl-devtrace-io.h" #include "iwl-devtrace-ucode.h" diff --git a/drivers/soc/qcom/pmic_pdcharger_ulog.h b/drivers/soc/qcom/pmic_pdcharger_ulog.h index 152e3a6b5480..1cfa58f0e34c 100644 --- a/drivers/soc/qcom/pmic_pdcharger_ulog.h +++ b/drivers/soc/qcom/pmic_pdcharger_ulog.h @@ -18,7 +18,7 @@ TRACE_EVENT(pmic_pdcharger_ulog_msg, __string(msg, msg) ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("%s", __get_str(msg)) ); diff --git a/drivers/soc/qcom/trace-aoss.h b/drivers/soc/qcom/trace-aoss.h index 554029b33b44..fb5b0470c40d 100644 --- a/drivers/soc/qcom/trace-aoss.h +++ b/drivers/soc/qcom/trace-aoss.h @@ -18,7 +18,7 @@ TRACE_EVENT(aoss_send, __string(msg, msg) ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("%s", __get_str(msg)) ); @@ -31,7 +31,7 @@ TRACE_EVENT(aoss_send_done, __field(int, ret) ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); __entry->ret = ret; ), TP_printk("%s: %d", __get_str(msg), __entry->ret) diff --git a/drivers/soc/qcom/trace-rpmh.h b/drivers/soc/qcom/trace-rpmh.h index be6b42ecc1f8..593ec1d4e010 100644 --- a/drivers/soc/qcom/trace-rpmh.h +++ b/drivers/soc/qcom/trace-rpmh.h @@ -26,7 +26,7 @@ TRACE_EVENT(rpmh_tx_done, ), TP_fast_assign( - __assign_str(name, d->name); + __assign_str(name); __entry->m = m; __entry->addr = r->cmds[0].addr; __entry->data = r->cmds[0].data; @@ -55,7 +55,7 @@ TRACE_EVENT(rpmh_send_msg, ), TP_fast_assign( - __assign_str(name, d->name); + __assign_str(name); __entry->m = m; __entry->state = state; __entry->n = n; diff --git a/drivers/thermal/thermal_trace.h b/drivers/thermal/thermal_trace.h index 88a962f560f2..df8f4edd6068 100644 --- a/drivers/thermal/thermal_trace.h +++ b/drivers/thermal/thermal_trace.h @@ -37,7 +37,7 @@ TRACE_EVENT(thermal_temperature, ), TP_fast_assign( - __assign_str(thermal_zone, tz->type); + __assign_str(thermal_zone); __entry->id = tz->id; __entry->temp_prev = tz->last_temperature; __entry->temp = tz->temperature; @@ -60,7 +60,7 @@ TRACE_EVENT(cdev_update, ), TP_fast_assign( - __assign_str(type, cdev->type); + __assign_str(type); __entry->target = target; ), @@ -82,7 +82,7 @@ TRACE_EVENT(thermal_zone_trip, ), TP_fast_assign( - __assign_str(thermal_zone, tz->type); + __assign_str(thermal_zone); __entry->id = tz->id; __entry->trip = trip; __entry->trip_type = trip_type; @@ -156,7 +156,7 @@ TRACE_EVENT(thermal_power_devfreq_get_power, ), TP_fast_assign( - __assign_str(type, cdev->type); + __assign_str(type); __entry->freq = freq; __entry->busy_time = status->busy_time; __entry->total_time = status->total_time; @@ -184,7 +184,7 @@ TRACE_EVENT(thermal_power_devfreq_limit, ), TP_fast_assign( - __assign_str(type, cdev->type); + __assign_str(type); __entry->freq = freq; __entry->cdev_state = cdev_state; __entry->power = power; diff --git a/drivers/usb/cdns3/cdns3-trace.h b/drivers/usb/cdns3/cdns3-trace.h index 40db89e3333c..c4e542f1b9b7 100644 --- a/drivers/usb/cdns3/cdns3-trace.h +++ b/drivers/usb/cdns3/cdns3-trace.h @@ -33,7 +33,7 @@ TRACE_EVENT(cdns3_halt, __field(u8, flush) ), TP_fast_assign( - __assign_str(name, ep_priv->name); + __assign_str(name); __entry->halt = halt; __entry->flush = flush; ), @@ -49,8 +49,8 @@ TRACE_EVENT(cdns3_wa1, __string(msg, msg) ), TP_fast_assign( - __assign_str(ep_name, ep_priv->name); - __assign_str(msg, msg); + __assign_str(ep_name); + __assign_str(msg); ), TP_printk("WA1: %s %s", __get_str(ep_name), __get_str(msg)) ); @@ -63,8 +63,8 @@ TRACE_EVENT(cdns3_wa2, __string(msg, msg) ), TP_fast_assign( - __assign_str(ep_name, ep_priv->name); - __assign_str(msg, msg); + __assign_str(ep_name); + __assign_str(msg); ), TP_printk("WA2: %s %s", __get_str(ep_name), __get_str(msg)) ); @@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(cdns3_log_doorbell, __field(u32, ep_trbaddr) ), TP_fast_assign( - __assign_str(name, ep_name); + __assign_str(name); __entry->ep_trbaddr = ep_trbaddr; ), TP_printk("%s, ep_trbaddr %08x", __get_str(name), @@ -125,7 +125,7 @@ DECLARE_EVENT_CLASS(cdns3_log_epx_irq, __field(u32, use_streams) ), TP_fast_assign( - __assign_str(ep_name, priv_ep->name); + __assign_str(ep_name); __entry->ep_sts = readl(&priv_dev->regs->ep_sts); __entry->ep_traddr = readl(&priv_dev->regs->ep_traddr); __entry->ep_last_sid = priv_ep->last_stream_id; @@ -214,7 +214,7 @@ DECLARE_EVENT_CLASS(cdns3_log_request, __field(unsigned int, stream_id) ), TP_fast_assign( - __assign_str(name, req->priv_ep->name); + __assign_str(name); __entry->req = req; __entry->buf = req->request.buf; __entry->actual = req->request.actual; @@ -294,7 +294,7 @@ DECLARE_EVENT_CLASS(cdns3_stream_split_transfer_len, __field(unsigned int, stream_id) ), TP_fast_assign( - __assign_str(name, req->priv_ep->name); + __assign_str(name); __entry->req = req; __entry->actual = req->request.length; __entry->length = req->request.actual; @@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(cdns3_log_aligned_request, __field(u32, aligned_buf_size) ), TP_fast_assign( - __assign_str(name, priv_req->priv_ep->name); + __assign_str(name); __entry->req = &priv_req->request; __entry->buf = priv_req->request.buf; __entry->dma = priv_req->request.dma; @@ -364,7 +364,7 @@ DECLARE_EVENT_CLASS(cdns3_log_map_request, __field(dma_addr_t, dma) ), TP_fast_assign( - __assign_str(name, priv_req->priv_ep->name); + __assign_str(name); __entry->req = &priv_req->request; __entry->buf = priv_req->request.buf; __entry->dma = priv_req->request.dma; @@ -395,7 +395,7 @@ DECLARE_EVENT_CLASS(cdns3_log_trb, __field(unsigned int, last_stream_id) ), TP_fast_assign( - __assign_str(name, priv_ep->name); + __assign_str(name); __entry->trb = trb; __entry->buffer = le32_to_cpu(trb->buffer); __entry->length = le32_to_cpu(trb->length); @@ -467,7 +467,7 @@ DECLARE_EVENT_CLASS(cdns3_log_ep, __field(u8, dequeue) ), TP_fast_assign( - __assign_str(name, priv_ep->name); + __assign_str(name); __entry->maxpacket = priv_ep->endpoint.maxpacket; __entry->maxpacket_limit = priv_ep->endpoint.maxpacket_limit; __entry->max_streams = priv_ep->endpoint.max_streams; diff --git a/drivers/usb/cdns3/cdnsp-trace.h b/drivers/usb/cdns3/cdnsp-trace.h index 4b51011eb00b..f2bcf77a5d0a 100644 --- a/drivers/usb/cdns3/cdnsp-trace.h +++ b/drivers/usb/cdns3/cdnsp-trace.h @@ -48,7 +48,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_ep, __field(u8, drbls_count) ), TP_fast_assign( - __assign_str(name, pep->name); + __assign_str(name); __entry->state = pep->ep_state; __entry->stream_id = stream_id; __entry->enabled = pep->ep_state & EP_HAS_STREAMS; @@ -138,7 +138,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_simple, __string(text, msg) ), TP_fast_assign( - __assign_str(text, msg); + __assign_str(text); ), TP_printk("%s", __get_str(text)) ); @@ -303,7 +303,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_bounce, __field(unsigned int, unalign) ), TP_fast_assign( - __assign_str(name, preq->pep->name); + __assign_str(name); __entry->new_buf_len = new_buf_len; __entry->offset = offset; __entry->dma = dma; @@ -470,7 +470,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_request, ), TP_fast_assign( - __assign_str(name, req->pep->name); + __assign_str(name); __entry->request = &req->request; __entry->preq = req; __entry->buf = req->request.buf; @@ -674,7 +674,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_td_info, __field(dma_addr_t, trb_dma) ), TP_fast_assign( - __assign_str(name, preq->pep->name); + __assign_str(name); __entry->request = &preq->request; __entry->preq = preq; __entry->first_trb = preq->td.first_trb; diff --git a/drivers/usb/chipidea/trace.h b/drivers/usb/chipidea/trace.h index ca0e65b48f0a..1875419cd17f 100644 --- a/drivers/usb/chipidea/trace.h +++ b/drivers/usb/chipidea/trace.h @@ -31,7 +31,7 @@ TRACE_EVENT(ci_log, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(name, dev_name(ci->dev)); + __assign_str(name); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s: %s", __get_str(name), __get_str(msg)) @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(ci_log_trb, __field(u32, type) ), TP_fast_assign( - __assign_str(name, hwep->name); + __assign_str(name); __entry->req = &hwreq->req; __entry->td = td; __entry->dma = td->dma; diff --git a/drivers/usb/dwc3/trace.h b/drivers/usb/dwc3/trace.h index d2997d17cfbe..bdeb1aaf65d8 100644 --- a/drivers/usb/dwc3/trace.h +++ b/drivers/usb/dwc3/trace.h @@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(dwc3_log_request, __field(int, no_interrupt) ), TP_fast_assign( - __assign_str(name, req->dep->name); + __assign_str(name); __entry->req = req; __entry->actual = req->request.actual; __entry->length = req->request.length; @@ -193,7 +193,7 @@ DECLARE_EVENT_CLASS(dwc3_log_gadget_ep_cmd, __field(int, cmd_status) ), TP_fast_assign( - __assign_str(name, dep->name); + __assign_str(name); __entry->cmd = cmd; __entry->param0 = params->param0; __entry->param1 = params->param1; @@ -229,7 +229,7 @@ DECLARE_EVENT_CLASS(dwc3_log_trb, __field(u32, dequeue) ), TP_fast_assign( - __assign_str(name, dep->name); + __assign_str(name); __entry->trb = trb; __entry->bpl = trb->bpl; __entry->bph = trb->bph; @@ -301,7 +301,7 @@ DECLARE_EVENT_CLASS(dwc3_log_ep, __field(u8, trb_dequeue) ), TP_fast_assign( - __assign_str(name, dep->name); + __assign_str(name); __entry->maxpacket = dep->endpoint.maxpacket; __entry->maxpacket_limit = dep->endpoint.maxpacket_limit; __entry->max_streams = dep->endpoint.max_streams; diff --git a/drivers/usb/gadget/udc/cdns2/cdns2-trace.h b/drivers/usb/gadget/udc/cdns2/cdns2-trace.h index 61f241634ea5..ade1752956b1 100644 --- a/drivers/usb/gadget/udc/cdns2/cdns2-trace.h +++ b/drivers/usb/gadget/udc/cdns2/cdns2-trace.h @@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(cdns2_log_simple, __string(text, msg) ), TP_fast_assign( - __assign_str(text, msg); + __assign_str(text); ), TP_printk("%s", __get_str(text)) ); @@ -103,7 +103,7 @@ TRACE_EVENT(cdns2_ep_halt, __field(u8, flush) ), TP_fast_assign( - __assign_str(name, ep_priv->name); + __assign_str(name); __entry->halt = halt; __entry->flush = flush; ), @@ -119,8 +119,8 @@ TRACE_EVENT(cdns2_wa1, __string(msg, msg) ), TP_fast_assign( - __assign_str(ep_name, ep_priv->name); - __assign_str(msg, msg); + __assign_str(ep_name); + __assign_str(msg); ), TP_printk("WA1: %s %s", __get_str(ep_name), __get_str(msg)) ); @@ -134,7 +134,7 @@ DECLARE_EVENT_CLASS(cdns2_log_doorbell, __field(u32, ep_trbaddr) ), TP_fast_assign( - __assign_str(name, pep->name); + __assign_str(name); __entry->ep_trbaddr = ep_trbaddr; ), TP_printk("%s, ep_trbaddr %08x", __get_str(name), @@ -196,7 +196,7 @@ DECLARE_EVENT_CLASS(cdns2_log_epx_irq, __field(u32, ep_traddr) ), TP_fast_assign( - __assign_str(ep_name, pep->name); + __assign_str(ep_name); __entry->ep_sts = readl(&pdev->adma_regs->ep_sts); __entry->ep_ists = readl(&pdev->adma_regs->ep_ists); __entry->ep_traddr = readl(&pdev->adma_regs->ep_traddr); @@ -288,7 +288,7 @@ DECLARE_EVENT_CLASS(cdns2_log_request, __field(int, end_trb) ), TP_fast_assign( - __assign_str(name, preq->pep->name); + __assign_str(name); __entry->request = &preq->request; __entry->preq = preq; __entry->buf = preq->request.buf; @@ -380,7 +380,7 @@ DECLARE_EVENT_CLASS(cdns2_log_map_request, __field(dma_addr_t, dma) ), TP_fast_assign( - __assign_str(name, priv_req->pep->name); + __assign_str(name); __entry->req = &priv_req->request; __entry->buf = priv_req->request.buf; __entry->dma = priv_req->request.dma; @@ -411,7 +411,7 @@ DECLARE_EVENT_CLASS(cdns2_log_trb, __field(u32, type) ), TP_fast_assign( - __assign_str(name, pep->name); + __assign_str(name); __entry->trb = trb; __entry->buffer = le32_to_cpu(trb->buffer); __entry->length = le32_to_cpu(trb->length); @@ -476,7 +476,7 @@ DECLARE_EVENT_CLASS(cdns2_log_ep, __field(u8, dequeue) ), TP_fast_assign( - __assign_str(name, pep->name); + __assign_str(name); __entry->maxpacket = pep->endpoint.maxpacket; __entry->maxpacket_limit = pep->endpoint.maxpacket_limit; __entry->flags = pep->ep_state; @@ -568,7 +568,7 @@ DECLARE_EVENT_CLASS(cdns2_log_epx_reg_config, __field(u32, ep_cfg_reg) ), TP_fast_assign( - __assign_str(ep_name, pep->name); + __assign_str(ep_name); __entry->burst_size = pep->trb_burst_size; __entry->maxpack_reg = pep->dir ? readw(&pdev->epx_regs->txmaxpack[pep->num - 1]) : readw(&pdev->epx_regs->rxmaxpack[pep->num - 1]); diff --git a/drivers/usb/gadget/udc/trace.h b/drivers/usb/gadget/udc/trace.h index a5ed26fbc2da..4e334298b0e8 100644 --- a/drivers/usb/gadget/udc/trace.h +++ b/drivers/usb/gadget/udc/trace.h @@ -157,7 +157,7 @@ DECLARE_EVENT_CLASS(udc_log_ep, __field(int, ret) ), TP_fast_assign( - __assign_str(name, ep->name); + __assign_str(name); __entry->maxpacket = ep->maxpacket; __entry->maxpacket_limit = ep->maxpacket_limit; __entry->max_streams = ep->max_streams; @@ -233,7 +233,7 @@ DECLARE_EVENT_CLASS(udc_log_req, __field(struct usb_request *, req) ), TP_fast_assign( - __assign_str(name, ep->name); + __assign_str(name); __entry->length = req->length; __entry->actual = req->actual; __entry->num_sgs = req->num_sgs; diff --git a/drivers/usb/mtu3/mtu3_trace.h b/drivers/usb/mtu3/mtu3_trace.h index 03d2a9bac27e..89870175d635 100644 --- a/drivers/usb/mtu3/mtu3_trace.h +++ b/drivers/usb/mtu3/mtu3_trace.h @@ -26,7 +26,7 @@ TRACE_EVENT(mtu3_log, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s: %s", __get_str(name), __get_str(msg)) @@ -127,7 +127,7 @@ DECLARE_EVENT_CLASS(mtu3_log_request, __field(int, no_interrupt) ), TP_fast_assign( - __assign_str(name, mreq->mep->name); + __assign_str(name); __entry->mreq = mreq; __entry->gpd = mreq->gpd; __entry->actual = mreq->request.actual; @@ -182,7 +182,7 @@ DECLARE_EVENT_CLASS(mtu3_log_gpd, __field(u32, dw3) ), TP_fast_assign( - __assign_str(name, mep->name); + __assign_str(name); __entry->gpd = gpd; __entry->dw0 = le32_to_cpu(gpd->dw0_info); __entry->dw1 = le32_to_cpu(gpd->next_gpd); @@ -226,7 +226,7 @@ DECLARE_EVENT_CLASS(mtu3_log_ep, __field(struct mtu3_gpd_ring *, gpd_ring) ), TP_fast_assign( - __assign_str(name, mep->name); + __assign_str(name); __entry->type = mep->type; __entry->slot = mep->slot; __entry->maxp = mep->ep.maxpacket; diff --git a/drivers/usb/musb/musb_trace.h b/drivers/usb/musb/musb_trace.h index f246b14394c4..726e6697d475 100644 --- a/drivers/usb/musb/musb_trace.h +++ b/drivers/usb/musb/musb_trace.h @@ -31,7 +31,7 @@ TRACE_EVENT(musb_log, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(name, dev_name(musb->controller)); + __assign_str(name); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s: %s", __get_str(name), __get_str(msg)) @@ -46,9 +46,9 @@ TRACE_EVENT(musb_state, __string(desc, desc) ), TP_fast_assign( - __assign_str(name, dev_name(musb->controller)); + __assign_str(name); __entry->devctl = devctl; - __assign_str(desc, desc); + __assign_str(desc); ), TP_printk("%s: devctl: %02x %s", __get_str(name), __entry->devctl, __get_str(desc)) @@ -160,7 +160,7 @@ TRACE_EVENT(musb_isr, __field(u16, int_rx) ), TP_fast_assign( - __assign_str(name, dev_name(musb->controller)); + __assign_str(name); __entry->int_usb = musb->int_usb; __entry->int_tx = musb->int_tx; __entry->int_rx = musb->int_rx; @@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(musb_urb, __field(u32, actual_len) ), TP_fast_assign( - __assign_str(name, dev_name(musb->controller)); + __assign_str(name); __entry->urb = urb; __entry->pipe = urb->pipe; __entry->status = urb->status; @@ -325,7 +325,7 @@ DECLARE_EVENT_CLASS(musb_cppi41, ), TP_fast_assign( __entry->ch = ch; - __assign_str(name, dev_name(ch->hw_ep->musb->controller)); + __assign_str(name); __entry->hwep = ch->hw_ep->epnum; __entry->port = ch->port_num; __entry->is_tx = ch->is_tx; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 362e1fc7ef6a..84fcf26e306e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str, TP_fast_assign( __entry->dev = c->dev; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) @@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str, __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %pS %s", @@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, TP_fast_assign( __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %s", diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 10985a4b8259..4de8780a7c48 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -47,7 +47,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_fast_assign( __entry->error = error < 0 ? -error : 0; - __assign_str(dstaddr, clp->cl_hostname); + __assign_str(dstaddr); ), TP_printk( @@ -94,8 +94,8 @@ TRACE_EVENT(nfs4_trunked_exchange_id, TP_fast_assign( __entry->error = error < 0 ? -error : 0; - __assign_str(main_addr, clp->cl_hostname); - __assign_str(trunk_addr, addr); + __assign_str(main_addr); + __assign_str(trunk_addr); ), TP_printk( @@ -365,7 +365,7 @@ TRACE_EVENT(nfs4_state_mgr, TP_fast_assign( __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname); + __assign_str(hostname); ), TP_printk( @@ -393,8 +393,8 @@ TRACE_EVENT(nfs4_state_mgr_failed, TP_fast_assign( __entry->error = status < 0 ? -status : 0; __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname); - __assign_str(section, section); + __assign_str(hostname); + __assign_str(section); ), TP_printk( @@ -578,7 +578,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle = 0; } __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -1072,7 +1072,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = -error; - __assign_str(name, name->name); + __assign_str(name); ), TP_printk( @@ -1156,8 +1156,8 @@ TRACE_EVENT(nfs4_rename, __entry->olddir = NFS_FILEID(olddir); __entry->newdir = NFS_FILEID(newdir); __entry->error = error < 0 ? -error : 0; - __assign_str(oldname, oldname->name); - __assign_str(newname, newname->name); + __assign_str(oldname); + __assign_str(newname); ), TP_printk( @@ -1359,7 +1359,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -1416,7 +1416,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); + __assign_str(dstaddr); __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = @@ -1960,7 +1960,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_event, ), TP_fast_assign( - __assign_str(dstaddr, clp->cl_hostname); + __assign_str(dstaddr); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -1998,7 +1998,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status, TP_fast_assign( __entry->dev = server->s_dev; __entry->status = status; - __assign_str(dstaddr, server->nfs_client->cl_hostname); + __assign_str(dstaddr); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -2036,8 +2036,8 @@ TRACE_EVENT(fl_getdevinfo, ), TP_fast_assign( - __assign_str(mds_addr, server->nfs_client->cl_hostname); - __assign_str(ds_ips, ds_remotestr); + __assign_str(mds_addr); + __assign_str(ds_ips); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -2083,9 +2083,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, be32_to_cpu(hdr->args.stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&hdr->args.stateid); - __assign_str(dstaddr, hdr->ds_clp ? - rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -2139,9 +2137,7 @@ TRACE_EVENT(ff_layout_commit_error, __entry->dev = inode->i_sb->s_dev; __entry->offset = data->args.offset; __entry->count = data->args.count; - __assign_str(dstaddr, data->ds_clp ? - rpc_peeraddr2str(data->ds_clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -2579,7 +2575,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __assign_str(name, name); + __assign_str(name); ), TP_printk( diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index afedb449b54f..1e710654af11 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -457,7 +457,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __entry->error = error < 0 ? -error : 0; __entry->flags = flags; __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -512,7 +512,7 @@ TRACE_EVENT(nfs_atomic_open_enter, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -551,7 +551,7 @@ TRACE_EVENT(nfs_atomic_open_exit, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -587,7 +587,7 @@ TRACE_EVENT(nfs_create_enter, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -623,7 +623,7 @@ TRACE_EVENT(nfs_create_exit, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -654,7 +654,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -693,7 +693,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -747,7 +747,7 @@ TRACE_EVENT(nfs_link_enter, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -783,7 +783,7 @@ TRACE_EVENT(nfs_link_exit, __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -819,8 +819,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event, __entry->dev = old_dir->i_sb->s_dev; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -868,8 +868,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, __entry->error = -error; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -1636,8 +1636,8 @@ TRACE_EVENT(nfs_mount_assign, ), TP_fast_assign( - __assign_str(option, option); - __assign_str(value, value); + __assign_str(option); + __assign_str(value); ), TP_printk("option %s=%s", @@ -1657,7 +1657,7 @@ TRACE_EVENT(nfs_mount_option, ), TP_fast_assign( - __assign_str(option, param->key); + __assign_str(option); ), TP_printk("option %s", __get_str(option)) @@ -1675,7 +1675,7 @@ TRACE_EVENT(nfs_mount_path, ), TP_fast_assign( - __assign_str(path, path); + __assign_str(path); ), TP_printk("path='%s'", __get_str(path)) @@ -1710,9 +1710,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event, __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->version = task->tk_client->cl_vers; __entry->error = error; - __assign_str(program, - task->tk_client->cl_program->name); - __assign_str(procedure, task->tk_msg.rpc_proc->p_name); + __assign_str(program); + __assign_str(procedure); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index b5e48d504062..77bbd23aa150 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound, TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->opcnt = opcnt; - __assign_str(tag, tag); + __assign_str(tag); ), TP_printk("xid=0x%08x opcnt=%u tag=%s", __entry->xid, __entry->opcnt, __get_str(tag) @@ -127,7 +127,7 @@ TRACE_EVENT(nfsd_compound_status, __entry->args_opcnt = args_opcnt; __entry->resp_opcnt = resp_opcnt; __entry->status = be32_to_cpu(status); - __assign_str(name, name); + __assign_str(name); ), TP_printk("op=%u/%u %s status=%d", __entry->resp_opcnt, __entry->args_opcnt, @@ -318,7 +318,7 @@ TRACE_EVENT(nfsd_exp_find_key, TP_fast_assign( __entry->fsidtype = key->ek_fsidtype; memcpy(__entry->fsid, key->ek_fsid, 4*6); - __assign_str(auth_domain, key->ek_client->name); + __assign_str(auth_domain); __entry->status = status; ), TP_printk("fsid=%x::%s domain=%s status=%d", @@ -342,8 +342,8 @@ TRACE_EVENT(nfsd_expkey_update, TP_fast_assign( __entry->fsidtype = key->ek_fsidtype; memcpy(__entry->fsid, key->ek_fsid, 4*6); - __assign_str(auth_domain, key->ek_client->name); - __assign_str(path, exp_path); + __assign_str(auth_domain); + __assign_str(path); __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); ), TP_printk("fsid=%x::%s domain=%s path=%s cache=%s", @@ -365,8 +365,8 @@ TRACE_EVENT(nfsd_exp_get_by_name, __field(int, status) ), TP_fast_assign( - __assign_str(path, key->ex_path.dentry->d_name.name); - __assign_str(auth_domain, key->ex_client->name); + __assign_str(path); + __assign_str(auth_domain); __entry->status = status; ), TP_printk("path=%s domain=%s status=%d", @@ -385,8 +385,8 @@ TRACE_EVENT(nfsd_export_update, __field(bool, cache) ), TP_fast_assign( - __assign_str(path, key->ex_path.dentry->d_name.name); - __assign_str(auth_domain, key->ex_client->name); + __assign_str(path); + __assign_str(auth_domain); __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); ), TP_printk("path=%s domain=%s cache=%s", @@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent, TP_fast_assign( __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __assign_str(name, name); + __assign_str(name); ), TP_printk("fh_hash=0x%08x ino=%llu name=%s", __entry->fh_hash, __entry->ino, __get_str(name) @@ -1000,7 +1000,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - __assign_str(name, clp->cl_name.data); + __assign_str(name); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), @@ -1519,7 +1519,7 @@ TRACE_EVENT(nfsd_cb_setup, TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; - __assign_str(netid, netid); + __assign_str(netid); __entry->authflavor = authflavor; __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, clp->cl_cb_conn.cb_addrlen) @@ -1864,7 +1864,7 @@ TRACE_EVENT(nfsd_ctl_unlock_ip, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(address, address); + __assign_str(address); ), TP_printk("address=%s", __get_str(address) @@ -1883,7 +1883,7 @@ TRACE_EVENT(nfsd_ctl_unlock_fs, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(path, path); + __assign_str(path); ), TP_printk("path=%s", __get_str(path) @@ -1907,8 +1907,8 @@ TRACE_EVENT(nfsd_ctl_filehandle, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->maxsize = maxsize; - __assign_str(domain, domain); - __assign_str(path, path); + __assign_str(domain); + __assign_str(path); ), TP_printk("domain=%s path=%s maxsize=%d", __get_str(domain), __get_str(path), __entry->maxsize @@ -1968,7 +1968,7 @@ TRACE_EVENT(nfsd_ctl_version, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(mesg, mesg); + __assign_str(mesg); ), TP_printk("%s", __get_str(mesg) @@ -2009,7 +2009,7 @@ TRACE_EVENT(nfsd_ctl_ports_addxprt, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->port = port; - __assign_str(transport, transport); + __assign_str(transport); ), TP_printk("transport=%s port=%d", __get_str(transport), __entry->port @@ -2070,7 +2070,7 @@ TRACE_EVENT(nfsd_ctl_time, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->time = time; - __assign_str(name, name); + __assign_str(name); ), TP_printk("file=%s time=%d", __get_str(name), __entry->time @@ -2089,7 +2089,7 @@ TRACE_EVENT(nfsd_ctl_recoverydir, ), TP_fast_assign( __entry->netns_ino = net->ns.inum; - __assign_str(recdir, recdir); + __assign_str(recdir); ), TP_printk("recdir=%s", __get_str(recdir) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 9898c11bdfa1..60e208b01c8d 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string, __string(name,name) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) ); @@ -1289,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->para = para; ), TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, @@ -1425,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->ia_valid = ia_valid; __entry->ia_mode = ia_mode; __entry->ia_uid = ia_uid; @@ -1683,7 +1683,7 @@ TRACE_EVENT(ocfs2_parse_options, ), TP_fast_assign( __entry->is_remount = is_remount; - __assign_str(options, options); + __assign_str(options); ), TP_printk("%d %s", __entry->is_remount, __get_str(options)) ); @@ -1718,8 +1718,8 @@ TRACE_EVENT(ocfs2_initialize_super, __field(int, cluster_bits) ), TP_fast_assign( - __assign_str(label, label); - __assign_str(uuid_str, uuid_str); + __assign_str(label); + __assign_str(uuid_str); __entry->root_dir = root_dir; __entry->system_dir = system_dir; __entry->cluster_bits = cluster_bits; @@ -1746,7 +1746,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt, __field(int, credits) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->meta = meta; __entry->clusters = clusters; __entry->credits = credits; @@ -1770,7 +1770,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find, ), TP_fast_assign( __entry->ino = ino; - __assign_str(name, name); + __assign_str(name); __entry->name_index = name_index; __entry->hash = hash; __entry->location = location; @@ -2019,7 +2019,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper, __entry->dq_id = dq_id; __entry->dq_type = dq_type; __entry->type = type; - __assign_str(s_id, s_id); + __assign_str(s_id); ), TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, __entry->type, __get_str(s_id)) @@ -2060,7 +2060,7 @@ TRACE_EVENT(ocfs2_dx_dir_search, TP_fast_assign( __entry->ino = ino; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->blkno = blkno; @@ -2088,7 +2088,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk, ), TP_fast_assign( __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->blkno = blkno; __entry->dir = dir; ), @@ -2107,7 +2107,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry, TP_fast_assign( __entry->dir = dir; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s", __entry->dir, __entry->namelen, __get_str(name)) @@ -2135,7 +2135,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block, __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->num_used = num_used; ), TP_printk("%llu %x %x %.*s %u", __entry->dir, @@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->extra = extra; ), @@ -2217,7 +2217,7 @@ TRACE_EVENT(ocfs2_mknod, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->dev = dev; __entry->mode = mode; @@ -2241,9 +2241,9 @@ TRACE_EVENT(ocfs2_link, TP_fast_assign( __entry->ino = ino; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s %.*s", __entry->ino, __entry->old_len, __get_str(old_name), @@ -2279,9 +2279,9 @@ TRACE_EVENT(ocfs2_rename, __entry->new_dir = new_dir; __entry->new_dentry = new_dentry; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%p %p %p %p %.*s %.*s", __entry->old_dir, __entry->old_dentry, @@ -2301,7 +2301,7 @@ TRACE_EVENT(ocfs2_rename_target_exists, ), TP_fast_assign( __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%.*s", __entry->new_len, __get_str(new_name)) ); @@ -2344,7 +2344,7 @@ TRACE_EVENT(ocfs2_symlink_begin, __entry->dentry = dentry; __entry->symname = symname; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, __entry->symname, __entry->len, __get_str(name)) @@ -2360,7 +2360,7 @@ TRACE_EVENT(ocfs2_blkno_stringify, ), TP_fast_assign( __entry->blkno = blkno; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->blkno, __get_str(name), @@ -2381,7 +2381,7 @@ TRACE_EVENT(ocfs2_orphan_del, ), TP_fast_assign( __entry->dir = dir; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->dir, __get_str(name), @@ -2403,7 +2403,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate, TP_fast_assign( __entry->dentry = dentry; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) ); @@ -2420,7 +2420,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->pgen = pgen; __entry->gen = gen; ), @@ -2445,7 +2445,7 @@ TRACE_EVENT(ocfs2_find_local_alias, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%.*s", __entry->len, __get_str(name)) ); @@ -2462,7 +2462,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->fsdata = fsdata; ), @@ -2480,7 +2480,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found, __field(unsigned long long, ino) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->ino = ino; ), @@ -2527,7 +2527,7 @@ TRACE_EVENT(ocfs2_get_parent, TP_fast_assign( __entry->child = child; __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->ino = ino; ), TP_printk("%p %.*s %llu", __entry->child, __entry->len, @@ -2551,7 +2551,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin, TP_fast_assign( __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->fh = fh; __entry->len = len; __entry->connectable = connectable; diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index af97389e983e..36d47ce59631 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -518,7 +518,7 @@ DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __assign_str(path, full_path); + __assign_str(path); ), TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s", __entry->xid, __entry->sesid, __entry->tid, @@ -762,7 +762,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, ), TP_fast_assign( __entry->xid = xid; - __assign_str(func_name, func_name); + __assign_str(func_name); __entry->rc = rc; ), TP_printk("\t%s: xid=%u rc=%d", @@ -815,7 +815,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, ), TP_fast_assign( __entry->xid = xid; - __assign_str(func_name, func_name); + __assign_str(func_name); ), TP_printk("\t%s: xid=%u", __get_str(func_name), __entry->xid) @@ -852,7 +852,7 @@ DECLARE_EVENT_CLASS(smb3_tcon_class, __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __assign_str(name, unc_name); + __assign_str(name); __entry->rc = rc; ), TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d", @@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_open_enter_class, __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __assign_str(path, full_path); + __assign_str(path); __entry->create_options = create_options; __entry->desired_access = desired_access; ), @@ -1098,7 +1098,7 @@ DECLARE_EVENT_CLASS(smb3_connect_class, __entry->conn_id = conn_id; pss = (struct sockaddr_storage *)__entry->dst_addr; *pss = *dst_addr; - __assign_str(hostname, hostname); + __assign_str(hostname); ), TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc", __entry->conn_id, @@ -1134,7 +1134,7 @@ DECLARE_EVENT_CLASS(smb3_connect_err_class, __entry->rc = rc; pss = (struct sockaddr_storage *)__entry->dst_addr; *pss = *dst_addr; - __assign_str(hostname, hostname); + __assign_str(hostname); ), TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc", __entry->rc, @@ -1166,7 +1166,7 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __assign_str(hostname, hostname); + __assign_str(hostname); ), TP_printk("conn_id=0x%llx server=%s current_mid=%llu", __entry->conn_id, @@ -1255,7 +1255,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __assign_str(hostname, hostname); + __assign_str(hostname); __entry->credits = credits; __entry->credits_to_add = credits_to_add; __entry->in_flight = in_flight; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index e27daa51cab6..92ef4cdc486e 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -561,7 +561,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -604,7 +604,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -644,7 +644,7 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -684,7 +684,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -723,7 +723,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 05cb59bd0b80..25ff6fe1eb6c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -168,7 +168,7 @@ TRACE_EVENT(xlog_intent_recovery_failed, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __assign_str(name, ops->name); + __assign_str(name); __entry->error = error; ), TP_printk("dev %d:%d optype %s error %d", @@ -1913,7 +1913,7 @@ TRACE_EVENT(xfs_alloc_cur_check, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->bno = bno; __entry->len = len; __entry->diff = diff; @@ -2473,7 +2473,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; @@ -2523,7 +2523,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->ino = 0; break; } - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->error = error; if (!error && stat) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { @@ -2567,7 +2567,7 @@ TRACE_EVENT(xfs_btree_free_block, __entry->ino = cur->bc_ino.ip->i_ino; else __entry->ino = 0; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); ), @@ -2643,7 +2643,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __assign_str(name, dfp->dfp_ops->name); + __assign_str(name); __entry->intent = dfp->dfp_intent; __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; @@ -2732,7 +2732,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __assign_str(name, dfp->dfp_ops->name); + __assign_str(name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; @@ -4244,7 +4244,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; @@ -4273,7 +4273,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, cur->bc_ino.ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, @@ -4312,7 +4312,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->nr_this_level = nr_this_level; @@ -4350,7 +4350,7 @@ TRACE_EVENT(xfs_btree_bload_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; @@ -4573,7 +4573,7 @@ TRACE_EVENT(xfs_force_shutdown, __entry->dev = mp->m_super->s_dev; __entry->ptag = ptag; __entry->flags = flags; - __assign_str(fname, fname); + __assign_str(fname); __entry->line_num = line_num; ), TP_printk("dev %d:%d tag %s flags %s file %s line_num %d", @@ -4755,7 +4755,7 @@ DECLARE_EVENT_CLASS(xfbtree_freesp_class, ), TP_fast_assign( __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; - __assign_str(btname, cur->bc_ops->name); + __assign_str(btname); __entry->nlevels = cur->bc_nlevels; __entry->fileoff = fileoff; ), @@ -5122,7 +5122,7 @@ DECLARE_EVENT_CLASS(xfs_getparents_rec_class, __entry->bufsize = ppi->gp_bufsize; __entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino; __entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen; - __assign_str(name, pptr->gpr_name); + __assign_str(name); ), TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'", MAJOR(__entry->dev), MINOR(__entry->dev), diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c011ea236e9b..7c47151d5c72 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -61,7 +61,7 @@ TRACE_EVENT(extlog_mem_event, else __entry->pa_mask_lsb = ~0; __entry->fru_id = *fru_id; - __assign_str(fru_text, fru_text); + __assign_str(fru_text); cper_mem_err_pack(mem, &__entry->data); ), @@ -131,8 +131,8 @@ TRACE_EVENT(mc_event, TP_fast_assign( __entry->error_type = err_type; - __assign_str(msg, error_msg); - __assign_str(label, label); + __assign_str(msg); + __assign_str(label); __entry->error_count = error_count; __entry->mc_index = mc_index; __entry->top_layer = top_layer; @@ -141,7 +141,7 @@ TRACE_EVENT(mc_event, __entry->address = address; __entry->grain_bits = grain_bits; __entry->syndrome = syndrome; - __assign_str(driver_detail, driver_detail); + __assign_str(driver_detail); ), TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", @@ -239,7 +239,7 @@ TRACE_EVENT(non_standard_event, TP_fast_assign( memcpy(__entry->sec_type, sec_type, UUID_SIZE); memcpy(__entry->fru_id, fru_id, UUID_SIZE); - __assign_str(fru_text, fru_text); + __assign_str(fru_text); __entry->sev = sev; __entry->len = len; memcpy(__get_dynamic_array(buf), err, len); @@ -313,7 +313,7 @@ TRACE_EVENT(aer_event, ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name); __entry->status = status; __entry->severity = severity; __entry->tlp_header_valid = tlp_header_valid; diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index 517015ef36a8..202fc3680c36 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -32,8 +32,8 @@ DECLARE_EVENT_CLASS(snd_soc_dapm, ), TP_fast_assign( - __assign_str(card_name, dapm->card->name); - __assign_str(comp_name, dapm->component ? dapm->component->name : "(none)"); + __assign_str(card_name); + __assign_str(comp_name); __entry->val = val; ), @@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(snd_soc_dapm_basic, ), TP_fast_assign( - __assign_str(name, card->name); + __assign_str(name); __entry->event = event; ), @@ -104,7 +104,7 @@ DECLARE_EVENT_CLASS(snd_soc_dapm_widget, ), TP_fast_assign( - __assign_str(name, w->name); + __assign_str(name); __entry->val = val; ), @@ -150,7 +150,7 @@ TRACE_EVENT(snd_soc_dapm_walk_done, ), TP_fast_assign( - __assign_str(name, card->name); + __assign_str(name); __entry->power_checks = card->dapm_stats.power_checks; __entry->path_checks = card->dapm_stats.path_checks; __entry->neighbour_checks = card->dapm_stats.neighbour_checks; @@ -179,9 +179,9 @@ TRACE_EVENT(snd_soc_dapm_path, ), TP_fast_assign( - __assign_str(wname, widget->name); - __assign_str(pname, path->name ? path->name : DAPM_DIRECT); - __assign_str(pnname, path->node[dir]->name); + __assign_str(wname); + __assign_str(pname); + __assign_str(pnname); __entry->path_connect = path->connect; __entry->path_node = (long)path->node[dir]; __entry->path_dir = dir; @@ -226,7 +226,7 @@ TRACE_EVENT(snd_soc_jack_irq, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) @@ -245,7 +245,7 @@ TRACE_EVENT(snd_soc_jack_report, ), TP_fast_assign( - __assign_str(name, jack->jack->id); + __assign_str(name); __entry->mask = mask; __entry->val = val; ), @@ -266,7 +266,7 @@ TRACE_EVENT(snd_soc_jack_notify, ), TP_fast_assign( - __assign_str(name, jack->jack->id); + __assign_str(name); __entry->val = val; ), diff --git a/include/trace/events/avc.h b/include/trace/events/avc.h index b55fda2e0773..fed0f141d5f6 100644 --- a/include/trace/events/avc.h +++ b/include/trace/events/avc.h @@ -36,9 +36,9 @@ TRACE_EVENT(selinux_audited, __entry->denied = sad->denied; __entry->audited = sad->audited; __entry->result = sad->result; - __assign_str(tcontext, tcontext); - __assign_str(scontext, scontext); - __assign_str(tclass, tclass); + __assign_str(tcontext); + __assign_str(scontext); + __assign_str(tclass); ), TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s", diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h index a6b3a4e409f0..3fe4725c83ff 100644 --- a/include/trace/events/bridge.h +++ b/include/trace/events/bridge.h @@ -25,7 +25,7 @@ TRACE_EVENT(br_fdb_add, ), TP_fast_assign( - __assign_str(dev, dev->name); + __assign_str(dev); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; __entry->nlh_flags = nlh_flags; @@ -54,8 +54,8 @@ TRACE_EVENT(br_fdb_external_learn_add, ), TP_fast_assign( - __assign_str(br_dev, br->dev->name); - __assign_str(dev, p ? p->dev->name : "null"); + __assign_str(br_dev); + __assign_str(dev); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; ), @@ -80,8 +80,8 @@ TRACE_EVENT(fdb_delete, ), TP_fast_assign( - __assign_str(br_dev, br->dev->name); - __assign_str(dev, f->dst ? f->dst->dev->name : "null"); + __assign_str(br_dev); + __assign_str(dev); memcpy(__entry->addr, f->key.addr.addr, ETH_ALEN); __entry->vid = f->key.vlan_id; ), @@ -108,8 +108,8 @@ TRACE_EVENT(br_fdb_update, ), TP_fast_assign( - __assign_str(br_dev, br->dev->name); - __assign_str(dev, source->dev->name); + __assign_str(br_dev); + __assign_str(dev); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; __entry->flags = flags; @@ -141,7 +141,7 @@ TRACE_EVENT(br_mdb_full, TP_fast_assign( struct in6_addr *in6; - __assign_str(dev, dev->name); + __assign_str(dev); __entry->vid = group->vid; if (!group->proto) { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index d2d94d7c3fb5..fadf406b5260 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1140,7 +1140,7 @@ TRACE_EVENT(btrfs_space_reservation, ), TP_fast_assign_btrfs(fs_info, - __assign_str(type, type); + __assign_str(type); __entry->val = val; __entry->bytes = bytes; __entry->reserve = reserve; @@ -1169,7 +1169,7 @@ TRACE_EVENT(btrfs_trigger_flush, __entry->flags = flags; __entry->bytes = bytes; __entry->flush = flush; - __assign_str(reason, reason); + __assign_str(reason); ), TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu", @@ -1622,7 +1622,7 @@ DECLARE_EVENT_CLASS(btrfs_workqueue, TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), __entry->wq = wq; - __assign_str(name, name); + __assign_str(name); ), TP_printk_btrfs("name=%s wq=%p", __get_str(name), diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index 0b95865a90f3..af2755bda6eb 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(cgroup_root, TP_fast_assign( __entry->root = root->hierarchy_id; __entry->ss_mask = root->subsys_mask; - __assign_str(name, root->name); + __assign_str(name); ), TP_printk("root=%d ss_mask=%#x name=%s", @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(cgroup, __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; - __assign_str(path, path); + __assign_str(path); ), TP_printk("root=%d id=%llu level=%d path=%s", @@ -137,9 +137,9 @@ DECLARE_EVENT_CLASS(cgroup_migrate, __entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; - __assign_str(dst_path, path); + __assign_str(dst_path); __entry->pid = task->pid; - __assign_str(comm, task->comm); + __assign_str(comm); ), TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s", @@ -181,7 +181,7 @@ DECLARE_EVENT_CLASS(cgroup_event, __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; - __assign_str(path, path); + __assign_str(path); __entry->val = val; ), diff --git a/include/trace/events/clk.h b/include/trace/events/clk.h index daed3c7a48c1..759f7371a6dc 100644 --- a/include/trace/events/clk.h +++ b/include/trace/events/clk.h @@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(clk, ), TP_fast_assign( - __assign_str(name, core->name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) @@ -97,7 +97,7 @@ DECLARE_EVENT_CLASS(clk_rate, ), TP_fast_assign( - __assign_str(name, core->name); + __assign_str(name); __entry->rate = rate; ), @@ -145,7 +145,7 @@ DECLARE_EVENT_CLASS(clk_rate_range, ), TP_fast_assign( - __assign_str(name, core->name); + __assign_str(name); __entry->min = min; __entry->max = max; ), @@ -174,8 +174,8 @@ DECLARE_EVENT_CLASS(clk_parent, ), TP_fast_assign( - __assign_str(name, core->name); - __assign_str(pname, parent ? parent->name : "none"); + __assign_str(name); + __assign_str(pname); ), TP_printk("%s %s", __get_str(name), __get_str(pname)) @@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(clk_phase, ), TP_fast_assign( - __assign_str(name, core->name); + __assign_str(name); __entry->phase = phase; ), @@ -241,7 +241,7 @@ DECLARE_EVENT_CLASS(clk_duty_cycle, ), TP_fast_assign( - __assign_str(name, core->name); + __assign_str(name); __entry->num = duty->num; __entry->den = duty->den; ), @@ -279,8 +279,8 @@ DECLARE_EVENT_CLASS(clk_rate_request, ), TP_fast_assign( - __assign_str(name, req->core ? req->core->name : "none"); - __assign_str(pname, req->best_parent_hw ? clk_hw_get_name(req->best_parent_hw) : "none"); + __assign_str(name); + __assign_str(pname); __entry->min = req->min_rate; __entry->max = req->max_rate; __entry->prate = req->best_parent_rate; diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 25103e67737c..383c09f583ac 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -23,7 +23,7 @@ TRACE_EVENT(cma_release, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; @@ -49,7 +49,7 @@ TRACE_EVENT(cma_alloc_start, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->count = count; __entry->align = align; ), @@ -77,7 +77,7 @@ TRACE_EVENT(cma_alloc_finish, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; @@ -110,7 +110,7 @@ TRACE_EVENT(cma_alloc_busy_retry, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; diff --git a/include/trace/events/devfreq.h b/include/trace/events/devfreq.h index 7627c620bbda..6cbc4d59fd96 100644 --- a/include/trace/events/devfreq.h +++ b/include/trace/events/devfreq.h @@ -23,7 +23,7 @@ TRACE_EVENT(devfreq_frequency, ), TP_fast_assign( - __assign_str(dev_name, dev_name(&devfreq->dev)); + __assign_str(dev_name); __entry->freq = freq; __entry->prev_freq = prev_freq; __entry->busy_time = devfreq->last_status.busy_time; @@ -54,7 +54,7 @@ TRACE_EVENT(devfreq_monitor, __entry->busy_time = devfreq->last_status.busy_time; __entry->total_time = devfreq->last_status.total_time; __entry->polling_ms = devfreq->profile->polling_ms; - __assign_str(dev_name, dev_name(&devfreq->dev)); + __assign_str(dev_name); ), TP_printk("dev_name=%-30s freq=%-12lu polling_ms=%-3u load=%-2lu", diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 77ff7cfc6049..f241e204fe6b 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -31,9 +31,9 @@ TRACE_EVENT(devlink_hwmsg, ), TP_fast_assign( - __assign_str(bus_name, devlink_to_dev(devlink)->bus->name); - __assign_str(dev_name, dev_name(devlink_to_dev(devlink))); - __assign_str(driver_name, devlink_to_dev(devlink)->driver->name); + __assign_str(bus_name); + __assign_str(dev_name); + __assign_str(driver_name); __entry->incoming = incoming; __entry->type = type; memcpy(__get_dynamic_array(buf), buf, len); @@ -63,11 +63,11 @@ TRACE_EVENT(devlink_hwerr, ), TP_fast_assign( - __assign_str(bus_name, devlink_to_dev(devlink)->bus->name); - __assign_str(dev_name, dev_name(devlink_to_dev(devlink))); - __assign_str(driver_name, devlink_to_dev(devlink)->driver->name); + __assign_str(bus_name); + __assign_str(dev_name); + __assign_str(driver_name); __entry->err = err; - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("bus_name=%s dev_name=%s driver_name=%s err=%d %s", @@ -93,11 +93,11 @@ TRACE_EVENT(devlink_health_report, ), TP_fast_assign( - __assign_str(bus_name, devlink_to_dev(devlink)->bus->name); - __assign_str(dev_name, dev_name(devlink_to_dev(devlink))); - __assign_str(driver_name, devlink_to_dev(devlink)->driver->name); - __assign_str(reporter_name, reporter_name); - __assign_str(msg, msg); + __assign_str(bus_name); + __assign_str(dev_name); + __assign_str(driver_name); + __assign_str(reporter_name); + __assign_str(msg); ), TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s", @@ -125,10 +125,10 @@ TRACE_EVENT(devlink_health_recover_aborted, ), TP_fast_assign( - __assign_str(bus_name, devlink_to_dev(devlink)->bus->name); - __assign_str(dev_name, dev_name(devlink_to_dev(devlink))); - __assign_str(driver_name, devlink_to_dev(devlink)->driver->name); - __assign_str(reporter_name, reporter_name); + __assign_str(bus_name); + __assign_str(dev_name); + __assign_str(driver_name); + __assign_str(reporter_name); __entry->health_state = health_state; __entry->time_since_last_recover = time_since_last_recover; ), @@ -158,10 +158,10 @@ TRACE_EVENT(devlink_health_reporter_state_update, ), TP_fast_assign( - __assign_str(bus_name, devlink_to_dev(devlink)->bus->name); - __assign_str(dev_name, dev_name(devlink_to_dev(devlink))); - __assign_str(driver_name, devlink_to_dev(devlink)->driver->name); - __assign_str(reporter_name, reporter_name); + __assign_str(bus_name); + __assign_str(dev_name); + __assign_str(driver_name); + __assign_str(reporter_name); __entry->new_state = new_state; ), @@ -192,11 +192,11 @@ TRACE_EVENT(devlink_trap_report, TP_fast_assign( struct net_device *input_dev = metadata->input_dev; - __assign_str(bus_name, devlink_to_dev(devlink)->bus->name); - __assign_str(dev_name, dev_name(devlink_to_dev(devlink))); - __assign_str(driver_name, devlink_to_dev(devlink)->driver->name); - __assign_str(trap_name, metadata->trap_name); - __assign_str(trap_group_name, metadata->trap_group_name); + __assign_str(bus_name); + __assign_str(dev_name); + __assign_str(driver_name); + __assign_str(trap_name); + __assign_str(trap_group_name); strscpy(__entry->input_dev_name, input_dev ? input_dev->name : "NULL", IFNAMSIZ); ), diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h index 3963e79ca7b4..a4de3df8500b 100644 --- a/include/trace/events/dma_fence.h +++ b/include/trace/events/dma_fence.h @@ -23,8 +23,8 @@ DECLARE_EVENT_CLASS(dma_fence, ), TP_fast_assign( - __assign_str(driver, fence->ops->get_driver_name(fence)); - __assign_str(timeline, fence->ops->get_timeline_name(fence)); + __assign_str(driver); + __assign_str(timeline); __entry->context = fence->context; __entry->seqno = fence->seqno; ), diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index e18684b02c3d..b9bbfd855f2a 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -47,7 +47,7 @@ TRACE_EVENT(erofs_lookup, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; - __assign_str(name, dentry->d_name.name); + __assign_str(name); __entry->flags = flags; ), diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 371ba28415f5..ed794b5fefbe 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -354,7 +354,7 @@ TRACE_EVENT(f2fs_unlink_enter, __entry->ino = dir->i_ino; __entry->size = dir->i_size; __entry->blocks = dir->i_blocks; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " @@ -843,7 +843,7 @@ TRACE_EVENT(f2fs_lookup_start, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __assign_str(name, dentry->d_name.name); + __assign_str(name); __entry->flags = flags; ), @@ -871,7 +871,7 @@ TRACE_EVENT(f2fs_lookup_end, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __assign_str(name, dentry->d_name.name); + __assign_str(name); __entry->cino = ino; __entry->err = err; ), @@ -903,9 +903,9 @@ TRACE_EVENT(f2fs_rename_start, TP_fast_assign( __entry->dev = old_dir->i_sb->s_dev; __entry->ino = old_dir->i_ino; - __assign_str(old_name, old_dentry->d_name.name); + __assign_str(old_name); __entry->new_pino = new_dir->i_ino; - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(new_name); __entry->flags = flags; ), @@ -937,8 +937,8 @@ TRACE_EVENT(f2fs_rename_end, TP_fast_assign( __entry->dev = old_dentry->d_sb->s_dev; __entry->ino = old_dentry->d_inode->i_ino; - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); __entry->flags = flags; __entry->ret = ret; ), @@ -1557,7 +1557,7 @@ TRACE_EVENT(f2fs_write_checkpoint, TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; - __assign_str(dest_msg, msg); + __assign_str(dest_msg); ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", @@ -2333,12 +2333,12 @@ DECLARE_EVENT_CLASS(f2fs__rw_start, * because this screws up the tooling that parses * the traces. */ - __assign_str(pathbuf, pathname); + __assign_str(pathbuf); (void)strreplace(__get_str(pathbuf), ' ', '_'); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); - __assign_str(cmdline, command); + __assign_str(cmdline); (void)strreplace(__get_str(cmdline), ' ', '_'); __entry->pid = pid; __entry->ino = inode->i_ino; diff --git a/include/trace/events/habanalabs.h b/include/trace/events/habanalabs.h index a78d21fa9f29..4a2bb2c896d1 100644 --- a/include/trace/events/habanalabs.h +++ b/include/trace/events/habanalabs.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(habanalabs_mmu_template, ), TP_fast_assign( - __assign_str(dname, dev_name(dev)); + __assign_str(dname); __entry->virt_addr = virt_addr; __entry->phys_addr = phys_addr; __entry->page_size = page_size; @@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(habanalabs_dma_alloc_template, ), TP_fast_assign( - __assign_str(dname, dev_name(dev)); + __assign_str(dname); __entry->cpu_addr = cpu_addr; __entry->dma_addr = dma_addr; __entry->size = size; @@ -103,7 +103,7 @@ DECLARE_EVENT_CLASS(habanalabs_dma_map_template, ), TP_fast_assign( - __assign_str(dname, dev_name(dev)); + __assign_str(dname); __entry->phys_addr = phys_addr; __entry->dma_addr = dma_addr; __entry->len = len; @@ -141,7 +141,7 @@ DECLARE_EVENT_CLASS(habanalabs_comms_template, ), TP_fast_assign( - __assign_str(dname, dev_name(dev)); + __assign_str(dname); __entry->op_str = op_str; ), @@ -178,7 +178,7 @@ DECLARE_EVENT_CLASS(habanalabs_reg_access_template, ), TP_fast_assign( - __assign_str(dname, dev_name(dev)); + __assign_str(dname); __entry->addr = addr; __entry->val = val; ), diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index ab576898a126..b5f5369b6300 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -191,7 +191,7 @@ TRACE_EVENT(mm_khugepaged_scan_file, TP_fast_assign( __entry->mm = mm; __entry->pfn = folio ? folio_pfn(folio) : -1; - __assign_str(filename, file->f_path.dentry->d_iname); + __assign_str(filename); __entry->present = present; __entry->swap = swap; __entry->result = result; @@ -228,7 +228,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file, __entry->index = index; __entry->addr = addr; __entry->is_shmem = is_shmem; - __assign_str(filename, file->f_path.dentry->d_iname); + __assign_str(filename); __entry->nr = nr; __entry->result = result; ), diff --git a/include/trace/events/hwmon.h b/include/trace/events/hwmon.h index d7a1d0ffb679..d1ff560cd9b5 100644 --- a/include/trace/events/hwmon.h +++ b/include/trace/events/hwmon.h @@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, TP_fast_assign( __entry->index = index; - __assign_str(attr_name, attr_name); + __assign_str(attr_name); __entry->val = val; ), @@ -57,8 +57,8 @@ TRACE_EVENT(hwmon_attr_show_string, TP_fast_assign( __entry->index = index; - __assign_str(attr_name, attr_name); - __assign_str(label, s); + __assign_str(attr_name); + __assign_str(label); ), TP_printk("index=%d, attr_name=%s, val=%s", diff --git a/include/trace/events/initcall.h b/include/trace/events/initcall.h index eb903c3f195f..5282afdf3ddf 100644 --- a/include/trace/events/initcall.h +++ b/include/trace/events/initcall.h @@ -18,7 +18,7 @@ TRACE_EVENT(initcall_level, ), TP_fast_assign( - __assign_str(level, level); + __assign_str(level); ), TP_printk("level=%s", __get_str(level)) diff --git a/include/trace/events/intel_ish.h b/include/trace/events/intel_ish.h index e6d7ff55ee8c..64b6612c41bc 100644 --- a/include/trace/events/intel_ish.h +++ b/include/trace/events/intel_ish.h @@ -18,7 +18,7 @@ TRACE_EVENT(ishtp_dump, ), TP_fast_assign( - __assign_str(message, message); + __assign_str(message); ), TP_printk("%s", __get_str(message)) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index e948df7ce625..412c9c210a32 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -164,7 +164,7 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->work = &req->work; __entry->rw = rw; - __assign_str(op_str, io_uring_get_opcode(req->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", @@ -202,7 +202,7 @@ TRACE_EVENT(io_uring_defer, __entry->data = req->cqe.user_data; __entry->opcode = req->opcode; - __assign_str(op_str, io_uring_get_opcode(req->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", @@ -303,7 +303,7 @@ TRACE_EVENT(io_uring_fail_link, __entry->opcode = req->opcode; __entry->link = link; - __assign_str(op_str, io_uring_get_opcode(req->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", @@ -392,7 +392,7 @@ TRACE_EVENT(io_uring_submit_req, __entry->flags = (__force unsigned long long) req->flags; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; - __assign_str(op_str, io_uring_get_opcode(req->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, " @@ -436,7 +436,7 @@ TRACE_EVENT(io_uring_poll_arm, __entry->mask = mask; __entry->events = events; - __assign_str(op_str, io_uring_get_opcode(req->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", @@ -475,7 +475,7 @@ TRACE_EVENT(io_uring_task_add, __entry->opcode = req->opcode; __entry->mask = mask; - __assign_str(op_str, io_uring_get_opcode(req->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", @@ -538,7 +538,7 @@ TRACE_EVENT(io_uring_req_failed, __entry->addr3 = sqe->addr3; __entry->error = error; - __assign_str(op_str, io_uring_get_opcode(sqe->opcode)); + __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, " diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index af8bfed528fc..e772b1bc60d6 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -34,8 +34,8 @@ DECLARE_EVENT_CLASS(iocost_iocg_state, ), TP_fast_assign( - __assign_str(devname, ioc_name(iocg->ioc)); - __assign_str(cgroup, path); + __assign_str(devname); + __assign_str(cgroup); __entry->now = now->now; __entry->vnow = now->vnow; __entry->vrate = iocg->ioc->vtime_base_rate; @@ -93,8 +93,8 @@ DECLARE_EVENT_CLASS(iocg_inuse_update, ), TP_fast_assign( - __assign_str(devname, ioc_name(iocg->ioc)); - __assign_str(cgroup, path); + __assign_str(devname); + __assign_str(cgroup); __entry->now = now->now; __entry->old_inuse = old_inuse; __entry->new_inuse = new_inuse; @@ -159,7 +159,7 @@ TRACE_EVENT(iocost_ioc_vrate_adj, ), TP_fast_assign( - __assign_str(devname, ioc_name(ioc)); + __assign_str(devname); __entry->old_vrate = ioc->vtime_base_rate; __entry->new_vrate = new_vrate; __entry->busy_level = ioc->busy_level; @@ -200,8 +200,8 @@ TRACE_EVENT(iocost_iocg_forgive_debt, ), TP_fast_assign( - __assign_str(devname, ioc_name(iocg->ioc)); - __assign_str(cgroup, path); + __assign_str(devname); + __assign_str(cgroup); __entry->now = now->now; __entry->vnow = now->vnow; __entry->usage_pct = usage_pct; diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 70743db1fb75..373007e567cb 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(iommu_group_event, TP_fast_assign( __entry->gid = group_id; - __assign_str(device, dev_name(dev)); + __assign_str(device); ), TP_printk("IOMMU: groupID=%d device=%s", @@ -62,7 +62,7 @@ DECLARE_EVENT_CLASS(iommu_device_event, ), TP_fast_assign( - __assign_str(device, dev_name(dev)); + __assign_str(device); ), TP_printk("IOMMU: device=%s", __get_str(device) @@ -138,8 +138,8 @@ DECLARE_EVENT_CLASS(iommu_error, ), TP_fast_assign( - __assign_str(device, dev_name(dev)); - __assign_str(driver, dev_driver_string(dev)); + __assign_str(device); + __assign_str(driver); __entry->iova = iova; __entry->flags = flags; ), diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index a07b4607b663..837c1740d0d0 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -63,7 +63,7 @@ TRACE_EVENT(irq_handler_entry, TP_fast_assign( __entry->irq = irq; - __assign_str(name, action->name); + __assign_str(name); ), TP_printk("irq=%d name=%s", __entry->irq, __get_str(name)) diff --git a/include/trace/events/iscsi.h b/include/trace/events/iscsi.h index 8ff2a3ca5d75..990fd154f586 100644 --- a/include/trace/events/iscsi.h +++ b/include/trace/events/iscsi.h @@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(iscsi_log_msg, ), TP_fast_assign( - __assign_str(dname, dev_name(dev)); + __assign_str(dname); __assign_vstr(msg, vaf->fmt, vaf->va); ), diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 6e62cc64cd92..8a829e0f6e55 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -126,7 +126,7 @@ TRACE_EVENT(kmem_cache_free, TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __assign_str(name, s->name); + __assign_str(name); ), TP_printk("call_site=%pS ptr=%p name=%s", diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 9ebd081e057e..8e89baa3775f 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -37,7 +37,7 @@ TRACE_EVENT(lock_acquire, TP_fast_assign( __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0); - __assign_str(name, lock->name); + __assign_str(name); __entry->lockdep_addr = lock; ), @@ -59,7 +59,7 @@ DECLARE_EVENT_CLASS(lock, ), TP_fast_assign( - __assign_str(name, lock->name); + __assign_str(name); __entry->lockdep_addr = lock; ), diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index 14db8044c1ff..f2827f98a44f 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(mmap_lock, TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path, memcg_path); + __assign_str(memcg_path); __entry->write = write; ), @@ -65,7 +65,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path, memcg_path); + __assign_str(memcg_path); __entry->write = write; __entry->success = success; ), diff --git a/include/trace/events/mmc.h b/include/trace/events/mmc.h index 7b706ff21335..f1c2e94f7f68 100644 --- a/include/trace/events/mmc.h +++ b/include/trace/events/mmc.h @@ -68,7 +68,7 @@ TRACE_EVENT(mmc_request_start, __entry->need_retune = host->need_retune; __entry->hold_retune = host->hold_retune; __entry->retune_period = host->retune_period; - __assign_str(name, mmc_hostname(host)); + __assign_str(name); __entry->mrq = mrq; ), @@ -156,7 +156,7 @@ TRACE_EVENT(mmc_request_done, __entry->need_retune = host->need_retune; __entry->hold_retune = host->hold_retune; __entry->retune_period = host->retune_period; - __assign_str(name, mmc_hostname(host)); + __assign_str(name); __entry->mrq = mrq; ), diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 097485c73c01..e5a006be9dc6 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -41,7 +41,7 @@ TRACE_EVENT(module_load, TP_fast_assign( __entry->taints = mod->taints; - __assign_str(name, mod->name); + __assign_str(name); ), TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) @@ -58,7 +58,7 @@ TRACE_EVENT(module_free, ), TP_fast_assign( - __assign_str(name, mod->name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) @@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(module_refcnt, TP_fast_assign( __entry->ip = ip; __entry->refcnt = atomic_read(&mod->refcnt); - __assign_str(name, mod->name); + __assign_str(name); ), TP_printk("%s call_site=%ps refcnt=%d", @@ -119,7 +119,7 @@ TRACE_EVENT(module_request, TP_fast_assign( __entry->ip = ip; __entry->wait = wait; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%s wait=%d call_site=%ps", diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h index dc03cf8e0369..b567b9ffedc1 100644 --- a/include/trace/events/napi.h +++ b/include/trace/events/napi.h @@ -26,7 +26,7 @@ TRACE_EVENT(napi_poll, TP_fast_assign( __entry->napi = napi; - __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV); + __assign_str(dev_name); __entry->work = work; __entry->budget = budget; ), diff --git a/include/trace/events/neigh.h b/include/trace/events/neigh.h index 833143d0992e..12362c35dbc0 100644 --- a/include/trace/events/neigh.h +++ b/include/trace/events/neigh.h @@ -42,7 +42,7 @@ TRACE_EVENT(neigh_create, __be32 *p32; __entry->family = tbl->family; - __assign_str(dev, (dev ? dev->name : "NULL")); + __assign_str(dev); __entry->entries = atomic_read(&tbl->gc_entries); __entry->created = n != NULL; __entry->gc_exempt = exempt_from_gc; @@ -103,7 +103,7 @@ TRACE_EVENT(neigh_update, __be32 *p32; __entry->family = n->tbl->family; - __assign_str(dev, (n->dev ? n->dev->name : "NULL")); + __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; @@ -180,7 +180,7 @@ DECLARE_EVENT_CLASS(neigh__update, __be32 *p32; __entry->family = n->tbl->family; - __assign_str(dev, (n->dev ? n->dev->name : "NULL")); + __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; diff --git a/include/trace/events/net.h b/include/trace/events/net.h index f667c76a3b02..d55162c12f90 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -38,7 +38,7 @@ TRACE_EVENT(net_dev_start_xmit, ), TP_fast_assign( - __assign_str(name, dev->name); + __assign_str(name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); @@ -89,7 +89,7 @@ TRACE_EVENT(net_dev_xmit, __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; - __assign_str(name, dev->name); + __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", @@ -110,8 +110,8 @@ TRACE_EVENT(net_dev_xmit_timeout, ), TP_fast_assign( - __assign_str(name, dev->name); - __assign_str(driver, netdev_drivername(dev)); + __assign_str(name); + __assign_str(driver); __entry->queue_index = queue_index; ), @@ -134,7 +134,7 @@ DECLARE_EVENT_CLASS(net_dev_template, TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; - __assign_str(name, skb->dev->name); + __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u", @@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, ), TP_fast_assign( - __assign_str(name, skb->dev->name); + __assign_str(name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else diff --git a/include/trace/events/netlink.h b/include/trace/events/netlink.h index 3b7be3b386a4..f036b8a20505 100644 --- a/include/trace/events/netlink.h +++ b/include/trace/events/netlink.h @@ -17,7 +17,7 @@ TRACE_EVENT(netlink_extack, ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("msg=%s", __get_str(msg)) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index b799f3bcba82..a42be4c8563b 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -92,7 +92,7 @@ TRACE_EVENT(mark_victim, TP_fast_assign( __entry->pid = task->pid; - __assign_str(comm, task->comm); + __assign_str(comm); __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm); __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES)); __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES)); diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h index 82f741ec0f57..a2379a4f0684 100644 --- a/include/trace/events/osnoise.h +++ b/include/trace/events/osnoise.h @@ -75,7 +75,7 @@ TRACE_EVENT(irq_noise, ), TP_fast_assign( - __assign_str(desc, desc); + __assign_str(desc); __entry->vector = vector; __entry->start = start; __entry->duration = duration; diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 77f14f7a11d4..d2349b6b531a 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -76,7 +76,7 @@ TRACE_EVENT(powernv_throttle, TP_fast_assign( __entry->chip_id = chip_id; - __assign_str(reason, reason); + __assign_str(reason); __entry->pmax = pmax; ), @@ -210,11 +210,10 @@ TRACE_EVENT(device_pm_callback_start, ), TP_fast_assign( - __assign_str(device, dev_name(dev)); - __assign_str(driver, dev_driver_string(dev)); - __assign_str(parent, - dev->parent ? dev_name(dev->parent) : "none"); - __assign_str(pm_ops, pm_ops ? pm_ops : "none "); + __assign_str(device); + __assign_str(driver); + __assign_str(parent); + __assign_str(pm_ops); __entry->event = event; ), @@ -236,8 +235,8 @@ TRACE_EVENT(device_pm_callback_end, ), TP_fast_assign( - __assign_str(device, dev_name(dev)); - __assign_str(driver, dev_driver_string(dev)); + __assign_str(device); + __assign_str(driver); __entry->error = error; ), @@ -279,7 +278,7 @@ DECLARE_EVENT_CLASS(wakeup_source, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->state = state; ), @@ -318,7 +317,7 @@ DECLARE_EVENT_CLASS(clock, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->state = state; __entry->cpu_id = cpu_id; ), @@ -364,7 +363,7 @@ DECLARE_EVENT_CLASS(power_domain, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->state = state; __entry->cpu_id = cpu_id; ), @@ -486,7 +485,7 @@ DECLARE_EVENT_CLASS(dev_pm_qos_request, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->type = type; __entry->new_value = new_value; ), diff --git a/include/trace/events/pwc.h b/include/trace/events/pwc.h index a2da764a3b41..0543702542d9 100644 --- a/include/trace/events/pwc.h +++ b/include/trace/events/pwc.h @@ -26,7 +26,7 @@ TRACE_EVENT(pwc_handler_enter, __entry->urb__actual_length = urb->actual_length; __entry->fbuf__filled = (pdev->fill_buf ? pdev->fill_buf->filled : 0); - __assign_str(name, pdev->v4l2_dev.name); + __assign_str(name); ), TP_printk("dev=%s (fbuf=%p filled=%d) urb=%p (status=%d actual_length=%u)", __get_str(name), @@ -50,7 +50,7 @@ TRACE_EVENT(pwc_handler_exit, __entry->urb = urb; __entry->fbuf = pdev->fill_buf; __entry->fbuf__filled = pdev->fill_buf->filled; - __assign_str(name, pdev->v4l2_dev.name); + __assign_str(name); ), TP_printk(" dev=%s (fbuf=%p filled=%d) urb=%p", __get_str(name), diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 1f4258308b96..f1b5e816e7e5 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -88,8 +88,8 @@ TRACE_EVENT(qdisc_reset, ), TP_fast_assign( - __assign_str(dev, qdisc_dev(q)->name); - __assign_str(kind, q->ops->id); + __assign_str(dev); + __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), @@ -113,8 +113,8 @@ TRACE_EVENT(qdisc_destroy, ), TP_fast_assign( - __assign_str(dev, qdisc_dev(q)->name); - __assign_str(kind, q->ops->id); + __assign_str(dev); + __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), @@ -137,8 +137,8 @@ TRACE_EVENT(qdisc_create, ), TP_fast_assign( - __assign_str(dev, dev->name); - __assign_str(kind, ops->id); + __assign_str(dev); + __assign_str(kind); __entry->parent = parent; ), diff --git a/include/trace/events/qla.h b/include/trace/events/qla.h index e7fd55e7dc3d..8800c35525a1 100644 --- a/include/trace/events/qla.h +++ b/include/trace/events/qla.h @@ -25,7 +25,7 @@ DECLARE_EVENT_CLASS(qla_log_event, __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - __assign_str(buf, buf); + __assign_str(buf); __assign_vstr(msg, vaf->fmt, vaf->va); ), diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h index 441132c67133..14f822983741 100644 --- a/include/trace/events/qrtr.h +++ b/include/trace/events/qrtr.h @@ -102,7 +102,7 @@ TRACE_EVENT(qrtr_ns_message, ), TP_fast_assign( - __assign_str(ctrl_pkt_str, ctrl_pkt_str); + __assign_str(ctrl_pkt_str); __entry->sq_node = sq_node; __entry->sq_port = sq_port; ), diff --git a/include/trace/events/regulator.h b/include/trace/events/regulator.h index 72b3ba93b0a5..c58481a5d955 100644 --- a/include/trace/events/regulator.h +++ b/include/trace/events/regulator.h @@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(regulator_basic, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); ), TP_printk("name=%s", __get_str(name)) @@ -119,7 +119,7 @@ DECLARE_EVENT_CLASS(regulator_range, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->min = min; __entry->max = max; ), @@ -152,7 +152,7 @@ DECLARE_EVENT_CLASS(regulator_value, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->val = val; ), diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index f50fcafc69de..7f0c1ceae726 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -154,7 +154,7 @@ DECLARE_EVENT_CLASS(rpcgss_ctx_class, TP_fast_assign( __entry->cred = gc; __entry->service = gc->gc_service; - __assign_str(principal, gc->gc_principal); + __assign_str(principal); ), TP_printk("cred=%p service=%s principal='%s'", @@ -189,7 +189,7 @@ DECLARE_EVENT_CLASS(rpcgss_svc_gssapi_class, TP_fast_assign( __entry->xid = __be32_to_cpu(rqstp->rq_xid); __entry->maj_stat = maj_stat; - __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x maj_stat=%s", @@ -225,7 +225,7 @@ TRACE_EVENT(rpcgss_svc_wrap_failed, TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); - __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid) @@ -245,7 +245,7 @@ TRACE_EVENT(rpcgss_svc_unwrap_failed, TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); - __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid) @@ -271,7 +271,7 @@ TRACE_EVENT(rpcgss_svc_seqno_bad, __entry->expected = expected; __entry->received = received; __entry->xid = __be32_to_cpu(rqstp->rq_xid); - __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x expected seqno %u, received seqno %u", @@ -299,7 +299,7 @@ TRACE_EVENT(rpcgss_svc_accept_upcall, __entry->minor_status = minor_status; __entry->major_status = major_status; __entry->xid = be32_to_cpu(rqstp->rq_xid); - __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x major_status=%s (0x%08lx) minor_status=%u", @@ -327,7 +327,7 @@ TRACE_EVENT(rpcgss_svc_authenticate, TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->seqno = gc->gc_seq; - __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x seqno=%u", __get_str(addr), @@ -563,7 +563,7 @@ TRACE_EVENT(rpcgss_upcall_msg, ), TP_fast_assign( - __assign_str(msg, buf); + __assign_str(msg); ), TP_printk("msg='%s'", __get_str(msg)) @@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context, __entry->timeout = timeout; __entry->window_size = window_size; __entry->len = len; - __assign_str(acceptor, data); + __assign_str(acceptor); ), TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s", @@ -677,7 +677,7 @@ TRACE_EVENT(rpcgss_oid_to_mech, ), TP_fast_assign( - __assign_str(oid, oid); + __assign_str(oid); ), TP_printk("mech for oid %s was not found", __get_str(oid)) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 027ac3ab457d..14392652273a 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -304,8 +304,8 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_class, __entry->xid = be32_to_cpu(rep->rr_xid); __entry->version = be32_to_cpu(rep->rr_vers); __entry->proc = be32_to_cpu(rep->rr_proc); - __assign_str(addr, rpcrdma_addrstr(rep->rr_rxprt)); - __assign_str(port, rpcrdma_portstr(rep->rr_rxprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s xid=0x%08x version=%u proc=%u", @@ -335,8 +335,8 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt, ), TP_fast_assign( - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s", @@ -369,8 +369,8 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_fast_assign( __entry->rc = rc; __entry->connect_status = r_xprt->rx_ep->re_connect_status; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s rc=%d connection status=%d", @@ -608,8 +608,8 @@ DECLARE_EVENT_CLASS(xprtrdma_callback_class, TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s xid=0x%08x", @@ -687,8 +687,8 @@ TRACE_EVENT(xprtrdma_op_connect, TP_fast_assign( __entry->delay = delay; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s delay=%lu", @@ -716,8 +716,8 @@ TRACE_EVENT(xprtrdma_op_set_cto, TP_fast_assign( __entry->connect = connect; __entry->reconnect = reconnect; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s connect=%lu reconnect=%lu", @@ -746,8 +746,8 @@ TRACE_EVENT(xprtrdma_createmrs, TP_fast_assign( __entry->count = count; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s created %u MRs", @@ -775,8 +775,8 @@ TRACE_EVENT(xprtrdma_nomrs_err, __entry->task_id = rqst->rq_task->tk_pid; __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " peer=[%s]:%s", @@ -1001,8 +1001,8 @@ TRACE_EVENT(xprtrdma_post_recvs, __entry->cq_id = ep->re_attr.recv_cq->res.id; __entry->count = count; __entry->posted = ep->re_receive_count; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active", @@ -1031,8 +1031,8 @@ TRACE_EVENT(xprtrdma_post_recvs_err, __entry->cq_id = ep->re_attr.recv_cq->res.id; __entry->status = status; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s cq.id=%d rc=%d", @@ -1445,8 +1445,8 @@ TRACE_EVENT(xprtrdma_cb_setup, TP_fast_assign( __entry->reqs = reqs; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s %u reqs", @@ -1476,7 +1476,7 @@ DECLARE_EVENT_CLASS(svcrdma_accept_class, TP_fast_assign( __entry->status = status; - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s status=%ld", @@ -1962,7 +1962,7 @@ TRACE_EVENT(svcrdma_send_err, TP_fast_assign( __entry->status = status; __entry->xid = __be32_to_cpu(rqst->rq_xid); - __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x status=%d", __get_str(addr), @@ -2025,7 +2025,7 @@ TRACE_EVENT(svcrdma_rq_post_err, TP_fast_assign( __entry->status = status; - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s status=%d", @@ -2138,7 +2138,7 @@ TRACE_EVENT(svcrdma_qp_error, TP_fast_assign( __entry->event = event->event; - __assign_str(device, event->device->name); + __assign_str(device); snprintf(__entry->addr, sizeof(__entry->addr) - 1, "%pISpc", sap); ), diff --git a/include/trace/events/rpm.h b/include/trace/events/rpm.h index bd120e23ce12..2b0b4b6ef862 100644 --- a/include/trace/events/rpm.h +++ b/include/trace/events/rpm.h @@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(rpm_internal, ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name); __entry->flags = flags; __entry->usage_count = atomic_read( &dev->power.usage_count); @@ -92,7 +92,7 @@ TRACE_EVENT(rpm_return_int, ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name); __entry->ip = ip; __entry->ret = ret; ), @@ -135,7 +135,7 @@ TRACE_EVENT(rpm_status, ), TP_fast_assign( - __assign_str(name, dev_name(dev)); + __assign_str(name); __entry->status = status; ), diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 68973f650c26..6df2b4685b08 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -411,7 +411,7 @@ TRACE_EVENT(sched_process_exec, ), TP_fast_assign( - __assign_str(filename, bprm->filename); + __assign_str(filename); __entry->pid = p->pid; __entry->old_pid = old_pid; ), @@ -445,10 +445,10 @@ TRACE_EVENT(sched_prepare_exec, ), TP_fast_assign( - __assign_str(interp, bprm->interp); - __assign_str(filename, bprm->filename); + __assign_str(interp); + __assign_str(filename); __entry->pid = task->pid; - __assign_str(comm, task->comm); + __assign_str(comm); ), TP_printk("interp=%s filename=%s pid=%d comm=%s", diff --git a/include/trace/events/sof.h b/include/trace/events/sof.h index cd8e6844cca2..3061423c0667 100644 --- a/include/trace/events/sof.h +++ b/include/trace/events/sof.h @@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(sof_widget_template, __field(int, use_count) ), TP_fast_assign( - __assign_str(name, swidget->widget->name); + __assign_str(name); __entry->use_count = swidget->use_count; ), TP_printk("name=%s use_count=%d", __get_str(name), __entry->use_count) @@ -49,7 +49,7 @@ TRACE_EVENT(sof_ipc3_period_elapsed_position, __field(u64, wallclock) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->host_posn = posn->host_posn; __entry->dai_posn = posn->dai_posn; __entry->wallclock = posn->wallclock; @@ -75,7 +75,7 @@ TRACE_EVENT(sof_pcm_pointer_position, __field(unsigned long, dai_posn) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->pcm_id = le32_to_cpu(spcm->pcm.pcm_id); __entry->stream = substream->stream; __entry->dma_posn = dma_posn; @@ -93,7 +93,7 @@ TRACE_EVENT(sof_stream_position_ipc_rx, __string(device_name, dev_name(dev)) ), TP_fast_assign( - __assign_str(device_name, dev_name(dev)); + __assign_str(device_name); ), TP_printk("device_name=%s", __get_str(device_name)) ); @@ -107,8 +107,8 @@ TRACE_EVENT(sof_ipc4_fw_config, __field(u32, value) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); - __assign_str(key, key); + __assign_str(device_name); + __assign_str(key); __entry->value = value; ), TP_printk("device_name=%s key=%s value=%d", diff --git a/include/trace/events/sof_intel.h b/include/trace/events/sof_intel.h index 4cac5fef99a2..9e579e57b15c 100644 --- a/include/trace/events/sof_intel.h +++ b/include/trace/events/sof_intel.h @@ -22,8 +22,8 @@ TRACE_EVENT(sof_intel_hda_irq, __string(source, source) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); - __assign_str(source, source); + __assign_str(device_name); + __assign_str(source); ), TP_printk("device_name=%s source=%s", __get_str(device_name), __get_str(source)) @@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(sof_intel_ipc_firmware_template, __field(u32, msg_ext) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->msg = msg; __entry->msg_ext = msg_ext; ), @@ -64,7 +64,7 @@ TRACE_EVENT(sof_intel_D0I3C_updated, __field(u8, reg) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->reg = reg; ), TP_printk("device_name=%s register=%#x", @@ -79,7 +79,7 @@ TRACE_EVENT(sof_intel_hda_irq_ipc_check, __field(u32, irq_status) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->irq_status = irq_status; ), TP_printk("device_name=%s irq_status=%#x", @@ -100,7 +100,7 @@ TRACE_EVENT(sof_intel_hda_dsp_pcm, __field(unsigned long, pos) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->hstream_index = hstream->index; __entry->substream = substream->stream; __entry->pos = pos; @@ -119,7 +119,7 @@ TRACE_EVENT(sof_intel_hda_dsp_stream_status, __field(u32, status) ), TP_fast_assign( - __assign_str(device_name, dev_name(dev)); + __assign_str(device_name); __entry->stream = s->index; __entry->status = status; ), @@ -135,7 +135,7 @@ TRACE_EVENT(sof_intel_hda_dsp_check_stream_irq, __field(u32, status) ), TP_fast_assign( - __assign_str(device_name, dev_name(sdev->dev)); + __assign_str(device_name); __entry->status = status; ), TP_printk("device_name=%s status=%#x", diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index ac05ed06a071..5e8495216689 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -188,10 +188,10 @@ TRACE_EVENT(rpc_clnt_new, __entry->client_id = clnt->cl_clid; __entry->xprtsec = args->xprtsec.policy; __entry->flags = args->flags; - __assign_str(program, clnt->cl_program->name); - __assign_str(server, xprt->servername); - __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + __assign_str(program); + __assign_str(server); + __assign_str(addr); + __assign_str(port); ), TP_printk("client=" SUNRPC_TRACE_CLID_SPECIFIER " peer=[%s]:%s" @@ -220,8 +220,8 @@ TRACE_EVENT(rpc_clnt_new_err, TP_fast_assign( __entry->error = error; - __assign_str(program, program); - __assign_str(server, server); + __assign_str(program); + __assign_str(server); ), TP_printk("program=%s server=%s error=%d", @@ -325,8 +325,8 @@ TRACE_EVENT(rpc_request, __entry->client_id = task->tk_client->cl_clid; __entry->version = task->tk_client->cl_vers; __entry->async = RPC_IS_ASYNC(task); - __assign_str(progname, task->tk_client->cl_program->name); - __assign_str(procname, rpc_proc_name(task)); + __assign_str(progname); + __assign_str(procname); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " %sv%d %s (%ssync)", @@ -439,7 +439,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued, __entry->runstate = task->tk_runstate; __entry->status = task->tk_status; __entry->flags = task->tk_flags; - __assign_str(q_name, rpc_qname(q)); + __assign_str(q_name); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -515,10 +515,10 @@ DECLARE_EVENT_CLASS(rpc_reply_event, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); - __assign_str(progname, task->tk_client->cl_program->name); + __assign_str(progname); __entry->version = task->tk_client->cl_vers; - __assign_str(procname, rpc_proc_name(task)); - __assign_str(servername, task->tk_xprt->servername); + __assign_str(procname); + __assign_str(servername); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -647,8 +647,8 @@ TRACE_EVENT(rpc_stats_latency, __entry->task_id = task->tk_pid; __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); __entry->version = task->tk_client->cl_vers; - __assign_str(progname, task->tk_client->cl_program->name); - __assign_str(procname, rpc_proc_name(task)); + __assign_str(progname); + __assign_str(procname); __entry->backlog = ktime_to_us(backlog); __entry->rtt = ktime_to_us(rtt); __entry->execute = ktime_to_us(execute); @@ -697,16 +697,15 @@ TRACE_EVENT(rpc_xdr_overflow, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __assign_str(progname, - task->tk_client->cl_program->name); + __assign_str(progname); __entry->version = task->tk_client->cl_vers; - __assign_str(procedure, task->tk_msg.rpc_proc->p_name); + __assign_str(procedure); } else { __entry->task_id = -1; __entry->client_id = -1; - __assign_str(progname, "unknown"); + __assign_str(progname); __entry->version = 0; - __assign_str(procedure, "unknown"); + __assign_str(procedure); } __entry->requested = requested; __entry->end = xdr->end; @@ -763,10 +762,9 @@ TRACE_EVENT(rpc_xdr_alignment, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __assign_str(progname, - task->tk_client->cl_program->name); + __assign_str(progname); __entry->version = task->tk_client->cl_vers; - __assign_str(procedure, task->tk_msg.rpc_proc->p_name); + __assign_str(procedure); __entry->offset = offset; __entry->copied = copied; @@ -1018,8 +1016,8 @@ DECLARE_EVENT_CLASS(rpc_xprt_lifetime_class, TP_fast_assign( __entry->state = xprt->state; - __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s state=%s", @@ -1061,8 +1059,8 @@ DECLARE_EVENT_CLASS(rpc_xprt_event, TP_fast_assign( __entry->xid = be32_to_cpu(xid); __entry->status = status; - __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s xid=0x%08x status=%d", __get_str(addr), @@ -1140,10 +1138,9 @@ TRACE_EVENT(xprt_retransmit, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->ntrans = rqst->rq_ntrans; __entry->timeout = task->tk_timeout; - __assign_str(progname, - task->tk_client->cl_program->name); + __assign_str(progname); __entry->version = task->tk_client->cl_vers; - __assign_str(procname, rpc_proc_name(task)); + __assign_str(procname); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -1167,8 +1164,8 @@ TRACE_EVENT(xprt_ping, TP_fast_assign( __entry->status = status; - __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s status=%d", @@ -1315,8 +1312,8 @@ TRACE_EVENT(xs_data_ready, ), TP_fast_assign( - __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s", __get_str(addr), __get_str(port)) @@ -1339,10 +1336,8 @@ TRACE_EVENT(xs_stream_read_data, TP_fast_assign( __entry->err = err; __entry->total = total; - __assign_str(addr, xprt ? - xprt->address_strings[RPC_DISPLAY_ADDR] : EVENT_NULL_STR); - __assign_str(port, xprt ? - xprt->address_strings[RPC_DISPLAY_PORT] : EVENT_NULL_STR); + __assign_str(addr); + __assign_str(port); ), TP_printk("peer=[%s]:%s err=%zd total=%zu", __get_str(addr), @@ -1364,8 +1359,8 @@ TRACE_EVENT(xs_stream_read_request, ), TP_fast_assign( - __assign_str(addr, xs->xprt.address_strings[RPC_DISPLAY_ADDR]); - __assign_str(port, xs->xprt.address_strings[RPC_DISPLAY_PORT]); + __assign_str(addr); + __assign_str(port); __entry->xid = be32_to_cpu(xs->recv.xid); __entry->copied = xs->recv.copied; __entry->reclen = xs->recv.len; @@ -1403,7 +1398,7 @@ TRACE_EVENT(rpcb_getport, __entry->version = clnt->cl_vers; __entry->protocol = task->tk_xprt->prot; __entry->bind_version = bind_version; - __assign_str(servername, task->tk_xprt->servername); + __assign_str(servername); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -1493,8 +1488,8 @@ TRACE_EVENT(rpcb_register, TP_fast_assign( __entry->program = program; __entry->version = version; - __assign_str(addr, addr); - __assign_str(netid, netid); + __assign_str(addr); + __assign_str(netid); ), TP_printk("program=%u version=%u addr=%s netid=%s", @@ -1521,7 +1516,7 @@ TRACE_EVENT(rpcb_unregister, TP_fast_assign( __entry->program = program; __entry->version = version; - __assign_str(netid, netid); + __assign_str(netid); ), TP_printk("program=%u version=%u netid=%s", @@ -1551,8 +1546,8 @@ DECLARE_EVENT_CLASS(rpc_tls_class, TP_fast_assign( __entry->requested_policy = clnt->cl_xprtsec.policy; __entry->version = clnt->cl_vers; - __assign_str(servername, xprt->servername); - __assign_str(progname, clnt->cl_program->name) + __assign_str(servername); + __assign_str(progname); ), TP_printk("server=%s %sv%u requested_policy=%s", @@ -1794,10 +1789,9 @@ TRACE_EVENT(svc_process, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->vers = rqst->rq_vers; __entry->proc = rqst->rq_proc; - __assign_str(service, name); - __assign_str(procedure, svc_proc_name(rqst)); - __assign_str(addr, rqst->rq_xprt ? - rqst->rq_xprt->xpt_remotebuf : EVENT_NULL_STR); + __assign_str(service); + __assign_str(procedure); + __assign_str(addr); ), TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s", @@ -1915,7 +1909,7 @@ TRACE_EVENT(svc_stats_latency, __entry->execute = ktime_to_us(ktime_sub(ktime_get(), rqst->rq_stime)); - __assign_str(procedure, svc_proc_name(rqst)); + __assign_str(procedure); ), TP_printk(SVC_RQST_ENDPOINT_FORMAT " proc=%s execute-us=%lu", @@ -1980,8 +1974,8 @@ TRACE_EVENT(svc_xprt_create_err, TP_fast_assign( __entry->error = PTR_ERR(xprt); - __assign_str(program, program); - __assign_str(protocol, protocol); + __assign_str(program); + __assign_str(protocol); __assign_sockaddr(addr, sap, salen); ), @@ -2120,8 +2114,8 @@ TRACE_EVENT(svc_xprt_accept, TP_fast_assign( SVC_XPRT_ENDPOINT_ASSIGNMENTS(xprt); - __assign_str(protocol, xprt->xpt_class->xcl_name); - __assign_str(service, service); + __assign_str(protocol); + __assign_str(service); ), TP_printk(SVC_XPRT_ENDPOINT_FORMAT " protocol=%s service=%s", @@ -2260,7 +2254,7 @@ TRACE_EVENT(svcsock_marker, TP_fast_assign( __entry->length = be32_to_cpu(marker) & RPC_FRAGMENT_SIZE_MASK; __entry->last = be32_to_cpu(marker) & RPC_LAST_STREAM_FRAGMENT; - __assign_str(addr, xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s length=%u%s", __get_str(addr), @@ -2284,7 +2278,7 @@ DECLARE_EVENT_CLASS(svcsock_class, TP_fast_assign( __entry->result = result; __entry->flags = xprt->xpt_flags; - __assign_str(addr, xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s result=%zd flags=%s", __get_str(addr), @@ -2330,7 +2324,7 @@ TRACE_EVENT(svcsock_tcp_recv_short, __entry->expected = expected; __entry->received = received; __entry->flags = xprt->xpt_flags; - __assign_str(addr, xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s flags=%s expected=%u received=%u", @@ -2358,7 +2352,7 @@ TRACE_EVENT(svcsock_tcp_state, __entry->socket_state = socket->state; __entry->sock_state = socket->sk->sk_state; __entry->flags = xprt->xpt_flags; - __assign_str(addr, xprt->xpt_remotebuf); + __assign_str(addr); ), TP_printk("addr=%s state=%s sk_state=%s flags=%s", __get_str(addr), @@ -2385,7 +2379,7 @@ DECLARE_EVENT_CLASS(svcsock_accept_class, TP_fast_assign( __entry->status = status; - __assign_str(service, service); + __assign_str(service); __entry->netns_ino = xprt->xpt_net->ns.inum; ), @@ -2421,7 +2415,7 @@ DECLARE_EVENT_CLASS(cache_event, TP_fast_assign( __entry->h = h; - __assign_str(name, cd->name); + __assign_str(name); ), TP_printk("cache=%s entry=%p", __get_str(name), __entry->h) @@ -2466,7 +2460,7 @@ DECLARE_EVENT_CLASS(register_class, __entry->protocol = protocol; __entry->port = port; __entry->error = error; - __assign_str(program, program); + __assign_str(program); ), TP_printk("program=%sv%u proto=%s port=%u family=%s error=%d", @@ -2511,7 +2505,7 @@ TRACE_EVENT(svc_unregister, TP_fast_assign( __entry->version = version; __entry->error = error; - __assign_str(program, program); + __assign_str(program); ), TP_printk("program=%sv%u error=%d", diff --git a/include/trace/events/swiotlb.h b/include/trace/events/swiotlb.h index da05c9ebd224..3b6ddb136e4e 100644 --- a/include/trace/events/swiotlb.h +++ b/include/trace/events/swiotlb.h @@ -20,7 +20,7 @@ TRACE_EVENT(swiotlb_bounced, ), TP_fast_assign( - __assign_str(dev_name, dev_name(dev)); + __assign_str(dev_name); __entry->dma_mask = (dev->dma_mask ? *dev->dma_mask : 0); __entry->dev_addr = dev_addr; __entry->size = size; diff --git a/include/trace/events/target.h b/include/trace/events/target.h index 67fad2677ed5..a13cbf2b3405 100644 --- a/include/trace/events/target.h +++ b/include/trace/events/target.h @@ -154,7 +154,7 @@ TRACE_EVENT(target_sequencer_start, __entry->task_attribute = cmd->sam_task_attr; __entry->control = scsi_command_control(cmd->t_task_cdb); memcpy(__entry->cdb, cmd->t_task_cdb, TCM_MAX_COMMAND_SIZE); - __assign_str(initiator, cmd->se_sess->se_node_acl->initiatorname); + __assign_str(initiator); ), TP_printk("%s -> LUN %03u tag %#llx %s data_length %6u CDB %s (TA:%s C:%02x)", @@ -198,7 +198,7 @@ TRACE_EVENT(target_cmd_complete, min(18, ((u8 *) cmd->sense_buffer)[SPC_ADD_SENSE_LEN_OFFSET] + 8) : 0; memcpy(__entry->cdb, cmd->t_task_cdb, TCM_MAX_COMMAND_SIZE); memcpy(__entry->sense_data, cmd->sense_buffer, __entry->sense_length); - __assign_str(initiator, cmd->se_sess->se_node_acl->initiatorname); + __assign_str(initiator); ), TP_printk("%s <- LUN %03u tag %#llx status %s (sense len %d%s%s) %s data_length %6u CDB %s (TA:%s C:%02x)", diff --git a/include/trace/events/tegra_apb_dma.h b/include/trace/events/tegra_apb_dma.h index 971cd02d2daf..6d9f5075baa3 100644 --- a/include/trace/events/tegra_apb_dma.h +++ b/include/trace/events/tegra_apb_dma.h @@ -16,7 +16,7 @@ TRACE_EVENT(tegra_dma_tx_status, __field(__u32, residue) ), TP_fast_assign( - __assign_str(chan, dev_name(&dc->dev->device)); + __assign_str(chan); __entry->cookie = cookie; __entry->residue = state ? state->residue : (u32)-1; ), @@ -33,7 +33,7 @@ TRACE_EVENT(tegra_dma_complete_cb, __field(void *, ptr) ), TP_fast_assign( - __assign_str(chan, dev_name(&dc->dev->device)); + __assign_str(chan); __entry->count = count; __entry->ptr = ptr; ), @@ -49,7 +49,7 @@ TRACE_EVENT(tegra_dma_isr, __field(int, irq) ), TP_fast_assign( - __assign_str(chan, dev_name(&dc->dev->device)); + __assign_str(chan); __entry->irq = irq; ), TP_printk("%s: irq %d\n", __get_str(chan), __entry->irq) diff --git a/include/trace/events/ufs.h b/include/trace/events/ufs.h index b930669bd1f0..c4e209fbdfbb 100644 --- a/include/trace/events/ufs.h +++ b/include/trace/events/ufs.h @@ -92,7 +92,7 @@ TRACE_EVENT(ufshcd_clk_gating, ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name); __entry->state = state; ), @@ -117,9 +117,9 @@ TRACE_EVENT(ufshcd_clk_scaling, ), TP_fast_assign( - __assign_str(dev_name, dev_name); - __assign_str(state, state); - __assign_str(clk, clk); + __assign_str(dev_name); + __assign_str(state); + __assign_str(clk); __entry->prev_state = prev_state; __entry->curr_state = curr_state; ), @@ -141,8 +141,8 @@ TRACE_EVENT(ufshcd_auto_bkops_state, ), TP_fast_assign( - __assign_str(dev_name, dev_name); - __assign_str(state, state); + __assign_str(dev_name); + __assign_str(state); ), TP_printk("%s: auto bkops - %s", @@ -163,8 +163,8 @@ DECLARE_EVENT_CLASS(ufshcd_profiling_template, ), TP_fast_assign( - __assign_str(dev_name, dev_name); - __assign_str(profile_info, profile_info); + __assign_str(dev_name); + __assign_str(profile_info); __entry->time_us = time_us; __entry->err = err; ), @@ -206,7 +206,7 @@ DECLARE_EVENT_CLASS(ufshcd_template, TP_fast_assign( __entry->usecs = usecs; __entry->err = err; - __assign_str(dev_name, dev_name); + __assign_str(dev_name); __entry->dev_state = dev_state; __entry->link_state = link_state; ), @@ -326,7 +326,7 @@ TRACE_EVENT(ufshcd_uic_command, ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name); __entry->str_t = str_t; __entry->cmd = cmd; __entry->arg1 = arg1; @@ -356,7 +356,7 @@ TRACE_EVENT(ufshcd_upiu, ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name); __entry->str_t = str_t; memcpy(__entry->hdr, hdr, sizeof(__entry->hdr)); memcpy(__entry->tsf, tsf, sizeof(__entry->tsf)); @@ -384,7 +384,7 @@ TRACE_EVENT(ufshcd_exception_event, ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name); __entry->status = status; ), diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 6ef5b7254070..b0de2bc9ed52 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -38,7 +38,7 @@ TRACE_EVENT(workqueue_queue_work, TP_fast_assign( __entry->work = work; __entry->function = work->func; - __assign_str(workqueue, pwq->wq->name); + __assign_str(workqueue); __entry->req_cpu = req_cpu; __entry->cpu = pwq->pool->cpu; ), diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 9adc2bdf2f94..a7e5452b5d21 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -416,7 +416,7 @@ TRACE_EVENT(bpf_xdp_link_attach_failed, ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("errmsg=%s", __get_str(msg)) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 3690e677263f..1691676fd858 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -31,12 +31,10 @@ #define __vstring(item, fmt, ap) __dynamic_array(char, item, -1) #undef __assign_str -#define __assign_str(dst, src) \ +#define __assign_str(dst) \ do { \ char *__str__ = __get_str(dst); \ int __len__ = __get_dynamic_array_len(dst) - 1; \ - WARN_ON_ONCE(!(void *)(src) != !(void *)__data_offsets.dst##_ptr_); \ - WARN_ON_ONCE((src) && strcmp((src), __data_offsets.dst##_ptr_)); \ memcpy(__str__, __data_offsets.dst##_ptr_ ? : \ EVENT_NULL_STR, __len__); \ __str__[__len__] = '\0'; \ diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h index 9acbc11ac7bb..c4075b56becc 100644 --- a/kernel/trace/bpf_trace.h +++ b/kernel/trace/bpf_trace.h @@ -19,7 +19,7 @@ TRACE_EVENT(bpf_trace_printk, ), TP_fast_assign( - __assign_str(bpf_string, bpf_string); + __assign_str(bpf_string); ), TP_printk("%s", __get_str(bpf_string)) diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 5dd52bc5cabb..6b816cf1a953 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -40,8 +40,8 @@ TRACE_EVENT(batadv_dbg, ), TP_fast_assign( - __assign_str(device, bat_priv->soft_iface->name); - __assign_str(driver, KBUILD_MODNAME); + __assign_str(device); + __assign_str(driver); __assign_vstr(msg, vaf->fmt, vaf->va); ), diff --git a/net/dsa/trace.h b/net/dsa/trace.h index 567f29a39707..83f3e5f78491 100644 --- a/net/dsa/trace.h +++ b/net/dsa/trace.h @@ -39,8 +39,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_op_hw, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; ether_addr_copy(__entry->addr, addr); __entry->vid = vid; @@ -98,8 +98,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_op_refcount, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; ether_addr_copy(__entry->addr, addr); __entry->vid = vid; @@ -157,8 +157,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_del_not_found, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; ether_addr_copy(__entry->addr, addr); __entry->vid = vid; @@ -199,7 +199,7 @@ TRACE_EVENT(dsa_lag_fdb_add_hw, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -227,7 +227,7 @@ TRACE_EVENT(dsa_lag_fdb_add_bump, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -255,7 +255,7 @@ TRACE_EVENT(dsa_lag_fdb_del_hw, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -283,7 +283,7 @@ TRACE_EVENT(dsa_lag_fdb_del_drop, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -310,7 +310,7 @@ TRACE_EVENT(dsa_lag_fdb_del_not_found, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -338,8 +338,8 @@ DECLARE_EVENT_CLASS(dsa_vlan_op_hw, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; __entry->vid = vlan->vid; __entry->flags = vlan->flags; @@ -383,8 +383,8 @@ DECLARE_EVENT_CLASS(dsa_vlan_op_refcount, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; __entry->vid = vlan->vid; __entry->flags = vlan->flags; @@ -426,8 +426,8 @@ TRACE_EVENT(dsa_vlan_del_not_found, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; __entry->vid = vlan->vid; ), diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 62aa6465253a..591ce0a16fc0 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -75,7 +75,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf, ), TP_fast_assign( WPAN_PHY_ASSIGN; - __assign_str(vir_intf_name, name ? name : ""); + __assign_str(vir_intf_name); __entry->type = type; __entry->extended_addr = extended_addr; ), diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 8e758b5074bd..b26aacfbc622 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -33,7 +33,7 @@ __string(vif_name, sdata->name) #define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ __entry->p2p = sdata->vif.p2p; \ - __assign_str(vif_name, sdata->name) + __assign_str(vif_name) #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h index 3eb35d9eb700..74d75aaebef4 100644 --- a/net/openvswitch/openvswitch_trace.h +++ b/net/openvswitch/openvswitch_trace.h @@ -43,8 +43,8 @@ TRACE_EVENT(ovs_do_execute_action, TP_fast_assign( __entry->dpaddr = dp; - __assign_str(dp_name, ovs_dp_name(dp)); - __assign_str(dev_name, skb->dev->name); + __assign_str(dp_name); + __assign_str(dev_name); __entry->skbaddr = skb; __entry->len = skb->len; __entry->data_len = skb->data_len; @@ -113,8 +113,8 @@ TRACE_EVENT(ovs_dp_upcall, TP_fast_assign( __entry->dpaddr = dp; - __assign_str(dp_name, ovs_dp_name(dp)); - __assign_str(dev_name, skb->dev->name); + __assign_str(dp_name); + __assign_str(dev_name); __entry->skbaddr = skb; __entry->len = skb->len; __entry->data_len = skb->data_len; diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index 9fc5e586d24a..a9a6e3c1113a 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; - __assign_str(name, smc->conn.lnk->ibname); + __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", @@ -104,7 +104,7 @@ TRACE_EVENT(smcr_link_down, __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; - __assign_str(name, lnk->ibname); + __assign_str(name); __entry->location = location; ), diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 04af83f0500c..865142ed0ab4 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -145,7 +145,7 @@ DECLARE_EVENT_CLASS(tipc_skb_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); tipc_skb_dump(skb, more, __get_str(buf)); ), @@ -172,7 +172,7 @@ DECLARE_EVENT_CLASS(tipc_list_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); tipc_list_dump(list, more, __get_str(buf)); ), @@ -200,7 +200,7 @@ DECLARE_EVENT_CLASS(tipc_sk_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); __entry->portid = tipc_sock_get_portid(sk); tipc_sk_dump(sk, dqueues, __get_str(buf)); if (skb) @@ -254,7 +254,7 @@ DECLARE_EVENT_CLASS(tipc_link_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), @@ -337,7 +337,7 @@ DECLARE_EVENT_CLASS(tipc_node_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); __entry->addr = tipc_node_get_addr(n); tipc_node_dump(n, more, __get_str(buf)); ), @@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(tipc_fsm_class, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->os = os; __entry->ns = ns; __entry->evt = evt; @@ -409,8 +409,8 @@ TRACE_EVENT(tipc_l2_device_event, ), TP_fast_assign( - __assign_str(dev_name, dev->name); - __assign_str(b_name, b->name); + __assign_str(dev_name); + __assign_str(b_name); __entry->evt = evt; __entry->b_up = test_bit(0, &b->up); __entry->carrier = netif_carrier_ok(dev); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9bf987519811..87986170d1b1 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -372,7 +372,7 @@ TRACE_EVENT(rdev_add_virtual_intf, ), TP_fast_assign( WIPHY_ASSIGN; - __assign_str(vir_intf_name, name ? name : ""); + __assign_str(vir_intf_name); __entry->type = type; ), TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d", diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 500981eca74d..55f9a3da92d5 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -136,10 +136,11 @@ * * To assign a string, use the helper macro __assign_str(). * - * __assign_str(foo, bar); + * __assign_str(foo); * - * In most cases, the __assign_str() macro will take the same - * parameters as the __string() macro had to declare the string. + * The __string() macro saves off the string that is passed into + * the second parameter, and the __assign_str() will store than + * saved string into the "foo" field. * * __vstring: This is similar to __string() but instead of taking a * dynamic length, it takes a variable list va_list 'va' variable. @@ -177,7 +178,7 @@ * The length is saved via the __string_len() and is retrieved in * __assign_str(). * - * __assign_str(foo, bar); + * __assign_str(foo); * * Then len + 1 is allocated to the ring buffer, and a nul terminating * byte is added. This is similar to: @@ -311,8 +312,8 @@ TRACE_EVENT(foo_bar, __entry->bar = bar; memcpy(__get_dynamic_array(list), lst, __length_of(lst) * sizeof(int)); - __assign_str(str, string); - __assign_str(lstr, foo); + __assign_str(str); + __assign_str(lstr); __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); __assign_cpumask(cpum, cpumask_bits(mask)); @@ -418,7 +419,7 @@ TRACE_EVENT_CONDITION(foo_bar_with_cond, ), TP_fast_assign( - __assign_str(foo, foo); + __assign_str(foo); __entry->bar = bar; ), @@ -459,7 +460,7 @@ TRACE_EVENT_FN(foo_bar_with_fn, ), TP_fast_assign( - __assign_str(foo, foo); + __assign_str(foo); __entry->bar = bar; ), @@ -506,7 +507,7 @@ DECLARE_EVENT_CLASS(foo_template, ), TP_fast_assign( - __assign_str(foo, foo); + __assign_str(foo); __entry->bar = bar; ), diff --git a/sound/core/pcm_trace.h b/sound/core/pcm_trace.h index 350b40b906ca..adb9b1f3bbfa 100644 --- a/sound/core/pcm_trace.h +++ b/sound/core/pcm_trace.h @@ -95,7 +95,7 @@ TRACE_EVENT(hw_ptr_error, __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; - __assign_str(reason, why); + __assign_str(reason); ), TP_printk("pcmC%dD%d%s/sub%d: ERROR: %s", __entry->card, __entry->device, diff --git a/sound/hda/trace.h b/sound/hda/trace.h index 2cc493434a8f..280c42f3eb75 100644 --- a/sound/hda/trace.h +++ b/sound/hda/trace.h @@ -24,7 +24,7 @@ TRACE_EVENT(hda_send_cmd, __field(u32, cmd) ), TP_fast_assign( - __assign_str(name, dev_name((bus)->dev)); + __assign_str(name); __entry->cmd = cmd; ), TP_printk("[%s:%d] val=0x%08x", __get_str(name), __entry->cmd >> 28, __entry->cmd) @@ -39,7 +39,7 @@ TRACE_EVENT(hda_get_response, __field(u32, res) ), TP_fast_assign( - __assign_str(name, dev_name((bus)->dev)); + __assign_str(name); __entry->addr = addr; __entry->res = res; ), @@ -55,7 +55,7 @@ TRACE_EVENT(hda_unsol_event, __field(u32, res_ex) ), TP_fast_assign( - __assign_str(name, dev_name((bus)->dev)); + __assign_str(name); __entry->res = res; __entry->res_ex = res_ex; ), diff --git a/sound/soc/intel/avs/trace.h b/sound/soc/intel/avs/trace.h index 855b06bb14b0..c9eaa5a60ed3 100644 --- a/sound/soc/intel/avs/trace.h +++ b/sound/soc/intel/avs/trace.h @@ -24,7 +24,7 @@ TRACE_EVENT(avs_dsp_core_op, TP_fast_assign( __entry->reg = reg; __entry->mask = mask; - __assign_str(op, op); + __assign_str(op); __entry->flag = flag; ), @@ -135,7 +135,7 @@ TRACE_EVENT(avs_d0ix, ), TP_fast_assign( - __assign_str(op, op); + __assign_str(op); __entry->proceed = proceed; __entry->header = header; ), -- cgit v1.2.3 From a6c11c0a5235fb144a65e0cb2ffd360ddc1f6c32 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 22 May 2024 15:02:18 -0700 Subject: genirq/cpuhotplug, x86/vector: Prevent vector leak during CPU offline The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of interrupt affinity reconfiguration via procfs. Instead, the change is deferred until the next instance of the interrupt being triggered on the original CPU. When the interrupt next triggers on the original CPU, the new affinity is enforced within __irq_move_irq(). A vector is allocated from the new CPU, but the old vector on the original CPU remains and is not immediately reclaimed. Instead, apicd->move_in_progress is flagged, and the reclaiming process is delayed until the next trigger of the interrupt on the new CPU. Upon the subsequent triggering of the interrupt on the new CPU, irq_complete_move() adds a task to the old CPU's vector_cleanup list if it remains online. Subsequently, the timer on the old CPU iterates over its vector_cleanup list, reclaiming old vectors. However, a rare scenario arises if the old CPU is outgoing before the interrupt triggers again on the new CPU. In that case irq_force_complete_move() is not invoked on the outgoing CPU to reclaim the old apicd->prev_vector because the interrupt isn't currently affine to the outgoing CPU, and irq_needs_fixup() returns false. Even though __vector_schedule_cleanup() is later called on the new CPU, it doesn't reclaim apicd->prev_vector; instead, it simply resets both apicd->move_in_progress and apicd->prev_vector to 0. As a result, the vector remains unreclaimed in vector_matrix, leading to a CPU vector leak. To address this issue, move the invocation of irq_force_complete_move() before the irq_needs_fixup() call to reclaim apicd->prev_vector, if the interrupt is currently or used to be affine to the outgoing CPU. Additionally, reclaim the vector in __vector_schedule_cleanup() as well, following a warning message, although theoretically it should never see apicd->move_in_progress with apicd->prev_cpu pointing to an offline CPU. Fixes: f0383c24b485 ("genirq/cpuhotplug: Add support for cleaning up move in progress") Signed-off-by: Dongli Zhang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240522220218.162423-1-dongli.zhang@oracle.com --- arch/x86/kernel/apic/vector.c | 9 ++++++--- kernel/irq/cpuhotplug.c | 16 ++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 9eec52925fa3..557318145038 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1035,7 +1035,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd) add_timer_on(&cl->timer, cpu); } } else { - apicd->prev_vector = 0; + pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu); + free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); } @@ -1072,6 +1073,7 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { + unsigned int cpu = smp_processor_id(); struct apic_chip_data *apicd; struct irq_data *irqd; unsigned int vector; @@ -1096,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc) goto unlock; /* - * If prev_vector is empty, no action required. + * If prev_vector is empty or the descriptor is neither currently + * nor previously on the outgoing CPU no action required. */ vector = apicd->prev_vector; - if (!vector) + if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) goto unlock; /* diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 75cadbc3c232..eb8628390156 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -69,6 +69,14 @@ static bool migrate_one_irq(struct irq_desc *desc) return false; } + /* + * Complete an eventually pending irq move cleanup. If this + * interrupt was moved in hard irq context, then the vectors need + * to be cleaned up. It can't wait until this interrupt actually + * happens and this CPU was involved. + */ + irq_force_complete_move(desc); + /* * No move required, if: * - Interrupt is per cpu @@ -87,14 +95,6 @@ static bool migrate_one_irq(struct irq_desc *desc) return false; } - /* - * Complete an eventually pending irq move cleanup. If this - * interrupt was moved in hard irq context, then the vectors need - * to be cleaned up. It can't wait until this interrupt actually - * happens and this CPU was involved. - */ - irq_force_complete_move(desc); - /* * If there is a setaffinity pending, then try to reuse the pending * mask, so the last change of the affinity does not get lost. If -- cgit v1.2.3 From ff388fe5c481d39cc0a5940d1ad46f7920f1d646 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Mon, 15 Apr 2024 16:35:20 +0000 Subject: mseal: wire up mseal syscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce mseal", v10. This patchset proposes a new mseal() syscall for the Linux kernel. In a nutshell, mseal() protects the VMAs of a given virtual memory range against modifications, such as changes to their permission bits. Modern CPUs support memory permissions, such as the read/write (RW) and no-execute (NX) bits. Linux has supported NX since the release of kernel version 2.6.8 in August 2004 [1]. The memory permission feature improves the security stance on memory corruption bugs, as an attacker cannot simply write to arbitrary memory and point the code to it. The memory must be marked with the X bit, or else an exception will occur. Internally, the kernel maintains the memory permissions in a data structure called VMA (vm_area_struct). mseal() additionally protects the VMA itself against modifications of the selected seal type. Memory sealing is useful to mitigate memory corruption issues where a corrupted pointer is passed to a memory management system. For example, such an attacker primitive can break control-flow integrity guarantees since read-only memory that is supposed to be trusted can become writable or .text pages can get remapped. Memory sealing can automatically be applied by the runtime loader to seal .text and .rodata pages and applications can additionally seal security critical data at runtime. A similar feature already exists in the XNU kernel with the VM_FLAGS_PERMANENT [3] flag and on OpenBSD with the mimmutable syscall [4]. Also, Chrome wants to adopt this feature for their CFI work [2] and this patchset has been designed to be compatible with the Chrome use case. Two system calls are involved in sealing the map: mmap() and mseal(). The new mseal() is an syscall on 64 bit CPU, and with following signature: int mseal(void addr, size_t len, unsigned long flags) addr/len: memory range. flags: reserved. mseal() blocks following operations for the given memory range. 1> Unmapping, moving to another location, and shrinking the size, via munmap() and mremap(), can leave an empty space, therefore can be replaced with a VMA with a new set of attributes. 2> Moving or expanding a different VMA into the current location, via mremap(). 3> Modifying a VMA via mmap(MAP_FIXED). 4> Size expansion, via mremap(), does not appear to pose any specific risks to sealed VMAs. It is included anyway because the use case is unclear. In any case, users can rely on merging to expand a sealed VMA. 5> mprotect() and pkey_mprotect(). 6> Some destructive madvice() behaviors (e.g. MADV_DONTNEED) for anonymous memory, when users don't have write permission to the memory. Those behaviors can alter region contents by discarding pages, effectively a memset(0) for anonymous memory. The idea that inspired this patch comes from Stephen Röttger’s work in V8 CFI [5]. Chrome browser in ChromeOS will be the first user of this API. Indeed, the Chrome browser has very specific requirements for sealing, which are distinct from those of most applications. For example, in the case of libc, sealing is only applied to read-only (RO) or read-execute (RX) memory segments (such as .text and .RELRO) to prevent them from becoming writable, the lifetime of those mappings are tied to the lifetime of the process. Chrome wants to seal two large address space reservations that are managed by different allocators. The memory is mapped RW- and RWX respectively but write access to it is restricted using pkeys (or in the future ARM permission overlay extensions). The lifetime of those mappings are not tied to the lifetime of the process, therefore, while the memory is sealed, the allocators still need to free or discard the unused memory. For example, with madvise(DONTNEED). However, always allowing madvise(DONTNEED) on this range poses a security risk. For example if a jump instruction crosses a page boundary and the second page gets discarded, it will overwrite the target bytes with zeros and change the control flow. Checking write-permission before the discard operation allows us to control when the operation is valid. In this case, the madvise will only succeed if the executing thread has PKEY write permissions and PKRU changes are protected in software by control-flow integrity. Although the initial version of this patch series is targeting the Chrome browser as its first user, it became evident during upstream discussions that we would also want to ensure that the patch set eventually is a complete solution for memory sealing and compatible with other use cases. The specific scenario currently in mind is glibc's use case of loading and sealing ELF executables. To this end, Stephen is working on a change to glibc to add sealing support to the dynamic linker, which will seal all non-writable segments at startup. Once this work is completed, all applications will be able to automatically benefit from these new protections. In closing, I would like to formally acknowledge the valuable contributions received during the RFC process, which were instrumental in shaping this patch: Jann Horn: raising awareness and providing valuable insights on the destructive madvise operations. Liam R. Howlett: perf optimization. Linus Torvalds: assisting in defining system call signature and scope. Theo de Raadt: sharing the experiences and insight gained from implementing mimmutable() in OpenBSD. MM perf benchmarks ================== This patch adds a loop in the mprotect/munmap/madvise(DONTNEED) to check the VMAs’ sealing flag, so that no partial update can be made, when any segment within the given memory range is sealed. To measure the performance impact of this loop, two tests are developed. [8] The first is measuring the time taken for a particular system call, by using clock_gettime(CLOCK_MONOTONIC). The second is using PERF_COUNT_HW_REF_CPU_CYCLES (exclude user space). Both tests have similar results. The tests have roughly below sequence: for (i = 0; i < 1000, i++) create 1000 mappings (1 page per VMA) start the sampling for (j = 0; j < 1000, j++) mprotect one mapping stop and save the sample delete 1000 mappings calculates all samples. Below tests are performed on Intel(R) Pentium(R) Gold 7505 @ 2.00GHz, 4G memory, Chromebook. Based on the latest upstream code: The first test (measuring time) syscall__ vmas t t_mseal delta_ns per_vma % munmap__ 1 909 944 35 35 104% munmap__ 2 1398 1502 104 52 107% munmap__ 4 2444 2594 149 37 106% munmap__ 8 4029 4323 293 37 107% munmap__ 16 6647 6935 288 18 104% munmap__ 32 11811 12398 587 18 105% mprotect 1 439 465 26 26 106% mprotect 2 1659 1745 86 43 105% mprotect 4 3747 3889 142 36 104% mprotect 8 6755 6969 215 27 103% mprotect 16 13748 14144 396 25 103% mprotect 32 27827 28969 1142 36 104% madvise_ 1 240 262 22 22 109% madvise_ 2 366 442 76 38 121% madvise_ 4 623 751 128 32 121% madvise_ 8 1110 1324 215 27 119% madvise_ 16 2127 2451 324 20 115% madvise_ 32 4109 4642 534 17 113% The second test (measuring cpu cycle) syscall__ vmas cpu cmseal delta_cpu per_vma % munmap__ 1 1790 1890 100 100 106% munmap__ 2 2819 3033 214 107 108% munmap__ 4 4959 5271 312 78 106% munmap__ 8 8262 8745 483 60 106% munmap__ 16 13099 14116 1017 64 108% munmap__ 32 23221 24785 1565 49 107% mprotect 1 906 967 62 62 107% mprotect 2 3019 3203 184 92 106% mprotect 4 6149 6569 420 105 107% mprotect 8 9978 10524 545 68 105% mprotect 16 20448 21427 979 61 105% mprotect 32 40972 42935 1963 61 105% madvise_ 1 434 497 63 63 115% madvise_ 2 752 899 147 74 120% madvise_ 4 1313 1513 200 50 115% madvise_ 8 2271 2627 356 44 116% madvise_ 16 4312 4883 571 36 113% madvise_ 32 8376 9319 943 29 111% Based on the result, for 6.8 kernel, sealing check adds 20-40 nano seconds, or around 50-100 CPU cycles, per VMA. In addition, I applied the sealing to 5.10 kernel: The first test (measuring time) syscall__ vmas t tmseal delta_ns per_vma % munmap__ 1 357 390 33 33 109% munmap__ 2 442 463 21 11 105% munmap__ 4 614 634 20 5 103% munmap__ 8 1017 1137 120 15 112% munmap__ 16 1889 2153 263 16 114% munmap__ 32 4109 4088 -21 -1 99% mprotect 1 235 227 -7 -7 97% mprotect 2 495 464 -30 -15 94% mprotect 4 741 764 24 6 103% mprotect 8 1434 1437 2 0 100% mprotect 16 2958 2991 33 2 101% mprotect 32 6431 6608 177 6 103% madvise_ 1 191 208 16 16 109% madvise_ 2 300 324 24 12 108% madvise_ 4 450 473 23 6 105% madvise_ 8 753 806 53 7 107% madvise_ 16 1467 1592 125 8 108% madvise_ 32 2795 3405 610 19 122% The second test (measuring cpu cycle) syscall__ nbr_vma cpu cmseal delta_cpu per_vma % munmap__ 1 684 715 31 31 105% munmap__ 2 861 898 38 19 104% munmap__ 4 1183 1235 51 13 104% munmap__ 8 1999 2045 46 6 102% munmap__ 16 3839 3816 -23 -1 99% munmap__ 32 7672 7887 216 7 103% mprotect 1 397 443 46 46 112% mprotect 2 738 788 50 25 107% mprotect 4 1221 1256 35 9 103% mprotect 8 2356 2429 72 9 103% mprotect 16 4961 4935 -26 -2 99% mprotect 32 9882 10172 291 9 103% madvise_ 1 351 380 29 29 108% madvise_ 2 565 615 49 25 109% madvise_ 4 872 933 61 15 107% madvise_ 8 1508 1640 132 16 109% madvise_ 16 3078 3323 245 15 108% madvise_ 32 5893 6704 811 25 114% For 5.10 kernel, sealing check adds 0-15 ns in time, or 10-30 CPU cycles, there is even decrease in some cases. It might be interesting to compare 5.10 and 6.8 kernel The first test (measuring time) syscall__ vmas t_5_10 t_6_8 delta_ns per_vma % munmap__ 1 357 909 552 552 254% munmap__ 2 442 1398 956 478 316% munmap__ 4 614 2444 1830 458 398% munmap__ 8 1017 4029 3012 377 396% munmap__ 16 1889 6647 4758 297 352% munmap__ 32 4109 11811 7702 241 287% mprotect 1 235 439 204 204 187% mprotect 2 495 1659 1164 582 335% mprotect 4 741 3747 3006 752 506% mprotect 8 1434 6755 5320 665 471% mprotect 16 2958 13748 10790 674 465% mprotect 32 6431 27827 21397 669 433% madvise_ 1 191 240 49 49 125% madvise_ 2 300 366 67 33 122% madvise_ 4 450 623 173 43 138% madvise_ 8 753 1110 357 45 147% madvise_ 16 1467 2127 660 41 145% madvise_ 32 2795 4109 1314 41 147% The second test (measuring cpu cycle) syscall__ vmas cpu_5_10 c_6_8 delta_cpu per_vma % munmap__ 1 684 1790 1106 1106 262% munmap__ 2 861 2819 1958 979 327% munmap__ 4 1183 4959 3776 944 419% munmap__ 8 1999 8262 6263 783 413% munmap__ 16 3839 13099 9260 579 341% munmap__ 32 7672 23221 15549 486 303% mprotect 1 397 906 509 509 228% mprotect 2 738 3019 2281 1140 409% mprotect 4 1221 6149 4929 1232 504% mprotect 8 2356 9978 7622 953 423% mprotect 16 4961 20448 15487 968 412% mprotect 32 9882 40972 31091 972 415% madvise_ 1 351 434 82 82 123% madvise_ 2 565 752 186 93 133% madvise_ 4 872 1313 442 110 151% madvise_ 8 1508 2271 763 95 151% madvise_ 16 3078 4312 1234 77 140% madvise_ 32 5893 8376 2483 78 142% From 5.10 to 6.8 munmap: added 250-550 ns in time, or 500-1100 in cpu cycle, per vma. mprotect: added 200-750 ns in time, or 500-1200 in cpu cycle, per vma. madvise: added 33-50 ns in time, or 70-110 in cpu cycle, per vma. In comparison to mseal, which adds 20-40 ns or 50-100 CPU cycles, the increase from 5.10 to 6.8 is significantly larger, approximately ten times greater for munmap and mprotect. When I discuss the mm performance with Brian Makin, an engineer who worked on performance, it was brought to my attention that such performance benchmarks, which measuring millions of mm syscall in a tight loop, may not accurately reflect real-world scenarios, such as that of a database service. Also this is tested using a single HW and ChromeOS, the data from another HW or distribution might be different. It might be best to take this data with a grain of salt. This patch (of 5): Wire up mseal syscall for all architectures. Link: https://lkml.kernel.org/r/20240415163527.626541-1-jeffxu@chromium.org Link: https://lkml.kernel.org/r/20240415163527.626541-2-jeffxu@chromium.org Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Reviewed-by: Liam R. Howlett Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Jann Horn [Bug #2] Cc: Jeff Xu Cc: Jonathan Corbet Cc: Jorge Lucangeli Obes Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Cc: Muhammad Usama Anjum Cc: Pedro Falcato Cc: Stephen Röttger Cc: Suren Baghdasaryan Cc: Amer Al Shanawany Cc: Javier Carrasco Cc: Shuah Khan Signed-off-by: Andrew Morton --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/uapi/asm-generic/unistd.h | 5 ++++- kernel/sys_ni.c | 1 + 19 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 26cce7e7f70b..74720667fe09 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -501,3 +501,4 @@ 569 common lsm_get_self_attr sys_lsm_get_self_attr 570 common lsm_set_self_attr sys_lsm_set_self_attr 571 common lsm_list_modules sys_lsm_list_modules +572 common mseal sys_mseal diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index b6c9e01e14f5..2ed7d229c8f9 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -475,3 +475,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 491b2b9bd553..1346579f802f 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 462 +#define __NR_compat_syscalls 463 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 7118282d1c79..266b96acc014 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -929,6 +929,8 @@ __SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr) __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) #define __NR_lsm_list_modules 461 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) +#define __NR_mseal 462 +__SYSCALL(__NR_mseal, sys_mseal) /* * Please add new compat syscalls above this comment and update diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 7fd43fd4c9f2..22a3cbd4c602 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -461,3 +461,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index b00ab2cabab9..2b81a6bd78b2 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -467,3 +467,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 83cfc9eb6b88..cc869f5d5693 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -400,3 +400,4 @@ 459 n32 lsm_get_self_attr sys_lsm_get_self_attr 460 n32 lsm_set_self_attr sys_lsm_set_self_attr 461 n32 lsm_list_modules sys_lsm_list_modules +462 n32 mseal sys_mseal diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 532b855df589..1464c6be6eb3 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -376,3 +376,4 @@ 459 n64 lsm_get_self_attr sys_lsm_get_self_attr 460 n64 lsm_set_self_attr sys_lsm_set_self_attr 461 n64 lsm_list_modules sys_lsm_list_modules +462 n64 mseal sys_mseal diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index f45c9530ea93..008ebe60263e 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -449,3 +449,4 @@ 459 o32 lsm_get_self_attr sys_lsm_get_self_attr 460 o32 lsm_set_self_attr sys_lsm_set_self_attr 461 o32 lsm_list_modules sys_lsm_list_modules +462 o32 mseal sys_mseal diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b236a84c4e12..b13c21373974 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -460,3 +460,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 17173b82ca21..3656f1ca7a21 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -548,3 +548,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 095bb86339a7..bd0fee24ad10 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -464,3 +464,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal sys_mseal diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 86fe269f0220..bbf83a2db986 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -464,3 +464,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b23d59313589..ac6c281ccfe0 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -507,3 +507,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 5f8591ce7f25..7fd1f57ad3d3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -466,3 +466,4 @@ 459 i386 lsm_get_self_attr sys_lsm_get_self_attr 460 i386 lsm_set_self_attr sys_lsm_set_self_attr 461 i386 lsm_list_modules sys_lsm_list_modules +462 i386 mseal sys_mseal diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cc78226ffc35..a396f6e6ab5b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -383,6 +383,7 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index dd116598fb25..67083fc1b2f5 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -432,3 +432,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 75f00965ab15..d983c48a3b6a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) #define __NR_lsm_list_modules 461 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) +#define __NR_mseal 462 +__SYSCALL(__NR_mseal, sys_mseal) + #undef __NR_syscalls -#define __NR_syscalls 462 +#define __NR_syscalls 463 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index faad00cce269..d7eee421d4bc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -196,6 +196,7 @@ COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(cachestat); +COND_SYSCALL(mseal); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit v1.2.3 From b84a8aba806261d2f759ccedf4a2a6a80a5e55ba Mon Sep 17 00:00:00 2001 From: "dicken.ding" Date: Fri, 24 May 2024 17:17:39 +0800 Subject: genirq/irqdesc: Prevent use-after-free in irq_find_at_or_after() irq_find_at_or_after() dereferences the interrupt descriptor which is returned by mt_find() while neither holding sparse_irq_lock nor RCU read lock, which means the descriptor can be freed between mt_find() and the dereference: CPU0 CPU1 desc = mt_find() delayed_free_desc(desc) irq_desc_get_irq(desc) The use-after-free is reported by KASAN: Call trace: irq_get_next_irq+0x58/0x84 show_stat+0x638/0x824 seq_read_iter+0x158/0x4ec proc_reg_read_iter+0x94/0x12c vfs_read+0x1e0/0x2c8 Freed by task 4471: slab_free_freelist_hook+0x174/0x1e0 __kmem_cache_free+0xa4/0x1dc kfree+0x64/0x128 irq_kobj_release+0x28/0x3c kobject_put+0xcc/0x1e0 delayed_free_desc+0x14/0x2c rcu_do_batch+0x214/0x720 Guard the access with a RCU read lock section. Fixes: 721255b9826b ("genirq: Use a maple tree for interrupt descriptor management") Signed-off-by: dicken.ding Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240524091739.31611-1-dicken.ding@mediatek.com --- kernel/irq/irqdesc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 88ac3652fcf2..07e99c936ba5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -160,7 +160,10 @@ static int irq_find_free_area(unsigned int from, unsigned int cnt) static unsigned int irq_find_at_or_after(unsigned int offset) { unsigned long index = offset; - struct irq_desc *desc = mt_find(&sparse_irqs, &index, nr_irqs); + struct irq_desc *desc; + + guard(rcu)(); + desc = mt_find(&sparse_irqs, &index, nr_irqs); return desc ? irq_desc_get_irq(desc) : nr_irqs; } -- cgit v1.2.3