From 214c1b7f13954559cf09d5d04b934bf32ba4d618 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:52 +0100 Subject: sched/balancing: Switch the 'DEFINE_SPINLOCK(balancing)' spinlock into an 'atomic_t sched_balance_running' flag The 'balancing' spinlock added in: 08c183f31bdb ("[PATCH] sched: add option to serialize load balancing") ... is taken when the SD_SERIALIZE flag is set in a domain, but in reality it is a glorified global atomic flag serializing the load-balancing of those domains. It doesn't have any explicit locking semantics per se: we just spin_trylock() it. Turn it into a ... global atomic flag. This makes it more clear what is going on here, and reduces overhead and code size a bit: # kernel/sched/fair.o: [x86-64 defconfig] text data bss dec hex filename 60730 2721 104 63555 f843 fair.o.before 60718 2721 104 63543 f837 fair.o.after Also document the flag a bit. No change in functionality intended. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Cc: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308105901.1096078-2-mingo@kernel.org --- kernel/sched/fair.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a16129f9a5c..2ef89b36aed1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11633,7 +11633,20 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* * Scale the max load_balance interval with the number of CPUs in the system. @@ -11711,7 +11724,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } @@ -11729,7 +11742,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; -- cgit v1.2.3 From 02a61f325a8e62a7c76479c5f2f7ddcba16877e8 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 8 Mar 2024 11:58:53 +0100 Subject: sched/balancing: Remove reliance on 'enum cpu_idle_type' ordering when iterating [CPU_MAX_IDLE_TYPES] arrays in show_schedstat() show_schedstat() output breaks and doesn't print all entries if the ordering of the definitions in 'enum cpu_idle_type' is changed, because show_schedstat() assumes that 'CPU_IDLE' is 0. Fix it before we change the definition order & values. [ mingo: Added changelog. 
] Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-3-mingo@kernel.org --- kernel/sched/stats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 857f837f52cb..85277953cc72 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %*pb", dcount++, cpumask_pr_args(sched_domain_span(sd))); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], -- cgit v1.2.3 From 38d707c54df4ca58cd9ceae2ddcbd6f606b99e9f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:54 +0100 Subject: sched/balancing: Change 'enum cpu_idle_type' to have more natural definitions The cpu_idle_type enum has the confusingly inverted property that 'not idle' is 1, and 'idle' is '0'. This resulted in a number of unnecessary complications in the code. Reverse the order, remove the CPU_NOT_IDLE type, and convert all code to a natural boolean form. It's much more readable: - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; -------------------------------- - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) -------------------------------- And gets rid of the double negation in these usages: - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) Furthermore, this makes code much more obvious where there's differentiation between CPU_IDLE and CPU_NEWLY_IDLE. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Cc: "Gautham R. Shenoy" Link: https://lore.kernel.org/r/20240308105901.1096078-4-mingo@kernel.org --- include/linux/sched/idle.h | 2 +- kernel/sched/fair.c | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 478084f9105e..e670ac282333 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -5,8 +5,8 @@ #include enum cpu_idle_type { + __CPU_NOT_IDLE = 0, CPU_IDLE, - CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_MAX_IDLE_TYPES }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ef89b36aed1..3a510cf1fb00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9070,7 +9070,7 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. 
*/ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; env->loop++; @@ -9803,7 +9803,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) return false; /* @@ -9827,7 +9827,7 @@ static inline long sibling_imbalance(struct lb_env *env, int ncores_busiest, ncores_local; long imbalance; - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) return 0; ncores_busiest = sds->busiest->cores; @@ -9927,8 +9927,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_misfit_task_load = rq->misfit_task_load; *sg_status |= SG_OVERLOAD; } - } else if ((env->idle != CPU_NOT_IDLE) && - sched_reduced_capacity(rq, env->sd)) { + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ if (sgs->group_misfit_task_load < load) sgs->group_misfit_task_load = load; @@ -9940,7 +9939,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + if (!local_group && env->idle && sgs->sum_h_nr_running && sched_group_asym(env, sgs, group)) sgs->group_asym_packing = 1; @@ -10698,7 +10697,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -10913,7 +10912,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) { + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already @@ -11121,7 +11120,7 @@ asym_active_balance(struct lb_env *env) * the lower priority @env::dst_cpu help it. Do not follow * CPU priority. */ - return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) && + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && (sched_asym_prefer(env->dst_cpu, env->src_cpu) || !sched_use_asym_prio(env->sd, env->src_cpu)); } @@ -11159,7 +11158,7 @@ static int need_active_balance(struct lb_env *env) * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && + if (env->idle && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) @@ -11735,8 +11734,8 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); @@ -12416,9 +12415,7 @@ out: static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* * If this CPU has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle CPUs whose ticks are -- cgit v1.2.3 From 11b0bfa5d463b17cac5bf6b94fea4921713530c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:55 +0100 Subject: sched/debug: Increase SCHEDSTAT_VERSION to 16 We changed the order of definitions within 'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] columns in show_schedstat(). Suggested-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Cc: "Gautham R. Shenoy" Link: https://lore.kernel.org/r/20240308105901.1096078-5-mingo@kernel.org --- Documentation/scheduler/sched-stats.rst | 5 +++++ kernel/sched/stats.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index 03c062915998..73c412666655 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -2,6 +2,11 @@ Scheduler Statistics ==================== +Version 16 of schedstats changed the order of definitions within +'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] +columns in show_schedstat(). In particular the position of CPU_IDLE +and __CPU_NOT_IDLE changed places. The size of the array is unchanged. + Version 15 of schedstats dropped counters for some sched_yield: yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is identical to version 14. diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 85277953cc72..78e48f5426ee 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 16 static int show_schedstat(struct seq_file *seq, void *v) { -- cgit v1.2.3 From be8858dba9a2c3aec454a6b382671101fd0dc3b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:57 +0100 Subject: sched/balancing: Change comment formatting to not overlap Git conflict marker lines So the scheduler has two such comment blocks, with '=' used as a double underline: /* * VRUNTIME * ======== * '========' also happens to be a Git conflict marker, throwing off a simple search in an editor for this pattern. Change them to '-------' type of underline instead - it looks just as good. 
Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-7-mingo@kernel.org --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a510cf1fb00..84d4791cf628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3679,7 +3679,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, /* * VRUNTIME - * ======== + * -------- * * COROLLARY #1: The virtual runtime of the entity needs to be * adjusted if re-weight at !0-lag point. @@ -3762,7 +3762,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, /* * DEADLINE - * ======== + * -------- * * When the weight changes, the virtual time slope changes and * we should adjust the relative virtual deadline accordingly. -- cgit v1.2.3 From 3a5fe9305719c680ccf63216781a4d4068c8e3f3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:58 +0100 Subject: sched/balancing: Fix comments (trying to) refer to NOHZ_BALANCE_KICK Fix two typos: - There's no such thing as 'nohz_balancing_kick', the flag is named 'BALANCE' and is capitalized: NOHZ_BALANCE_KICK. - Likewise there's no such thing as a 'pending nohz_balance_kick' either, the NOHZ_BALANCE_KICK flag is all upper-case. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-8-mingo@kernel.org --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84d4791cf628..f3c03c6db3c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12409,15 +12409,16 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + * This softirq may be triggered from the scheduler tick, or by + * any of the flags in NOHZ_KICK_MASK: NOHZ_BALANCE_KICK, + * NOHZ_STATS_KICK or NOHZ_NEXT_KICK. */ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to * give the idle CPUs a chance to load balance. Else we may -- cgit v1.2.3 From 3dc6f6c8efe2d9148865664fffbe9dd89fb0d157 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:58:59 +0100 Subject: sched/balancing: Update run_rebalance_domains() comments The first sentence of the comment explaining run_rebalance_domains() is historic and not true anymore: * run_rebalance_domains is triggered when needed from the scheduler tick. ... contradicted/modified by the second sentence: * Also triggered for NOHZ idle balancing (with NOHZ_BALANCE_KICK set). Avoid that kind of confusion straight away and explain from what places sched_balance_softirq() is triggered. 
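[ Editorial illustration, not part of the patch: the two trigger paths named above share a single SCHED_SOFTIRQ vector entry, so one handler serves both the periodic tick and the NOHZ kick. Below is a minimal, runnable userspace model of that shape only -- the open/raise helpers are stand-ins for the kernel's open_softirq()/raise_softirq() machinery, and every name is illustrative rather than a kernel symbol. ]

/* Editorial sketch (not kernel code): one handler, two ways to raise it. */
#include <stdio.h>

#define SCHED_SOFTIRQ_MODEL 7                 /* index value is irrelevant for this model */

static void (*softirq_vec[16])(void);
static unsigned int pending;

static void open_softirq_model(int nr, void (*handler)(void)) { softirq_vec[nr] = handler; }
static void raise_softirq_model(int nr)                       { pending |= 1u << nr; }

/* In the kernel, pending softirqs are processed on IRQ exit or in ksoftirqd; here, just a loop. */
static void do_softirqs_model(void)
{
	for (int nr = 0; nr < 16; nr++) {
		if (pending & (1u << nr)) {
			pending &= ~(1u << nr);
			softirq_vec[nr]();
		}
	}
}

static void sched_balance_softirq_model(void) { puts("balance domains / NOHZ idle balance"); }

int main(void)
{
	open_softirq_model(SCHED_SOFTIRQ_MODEL, sched_balance_softirq_model);

	raise_softirq_model(SCHED_SOFTIRQ_MODEL); /* path 1: local scheduler_tick() decides it is time */
	do_softirqs_model();

	raise_softirq_model(SCHED_SOFTIRQ_MODEL); /* path 2: NOHZ kick via the nohz_csd_func() cross-call */
	do_softirqs_model();
	return 0;
}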
Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Acked-by: Valentin Schneider Link: https://lore.kernel.org/r/20240308105901.1096078-9-mingo@kernel.org --- kernel/sched/fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3c03c6db3c8..b567c0790f44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12409,9 +12409,12 @@ out: } /* - * This softirq may be triggered from the scheduler tick, or by - * any of the flags in NOHZ_KICK_MASK: NOHZ_BALANCE_KICK, - * NOHZ_STATS_KICK or NOHZ_NEXT_KICK. + * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local scheduler_tick() for periodic load balancing + * + * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { -- cgit v1.2.3 From e492e1b0e0721f3929ef9d9708d029144b396dd7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:59:00 +0100 Subject: sched/balancing: Vertically align the comments of 'struct sg_lb_stats' and 'struct sd_lb_stats' Make them easier to read. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-10-mingo@kernel.org --- kernel/sched/fair.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b567c0790f44..40b98e43d794 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9436,19 +9436,19 @@ static void update_blocked_averages(int cpu) * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long avg_load; /* Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -9460,15 +9460,15 @@ struct sg_lb_stats { * during load balancing. 
*/ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) -- cgit v1.2.3 From 33928ed8bde066316dc3cb6ccf7f74400073652f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 11:59:01 +0100 Subject: sched/balancing: Update comments in 'struct sg_lb_stats' and 'struct sd_lb_stats' - Align for readability - Capitalize consistently Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240308105901.1096078-11-mingo@kernel.org --- kernel/sched/fair.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40b98e43d794..116a640534b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9433,17 +9433,17 @@ static void update_blocked_averages(int cpu) /********** Helpers for find_busiest_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /* Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ @@ -9456,8 +9456,7 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. 
+ * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ @@ -9465,7 +9464,7 @@ struct sd_lb_stats { unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ -- cgit v1.2.3 From 70a27d6d1b19392a23bb4a41de7788fbc539f18d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:07 +0100 Subject: sched/balancing: Rename run_rebalance_domains() => sched_balance_softirq() run_rebalance_domains() is a misnomer, as it doesn't only run rebalance_domains(), but since the introduction of the NOHZ code it also runs nohz_idle_balance(). Rename it to sched_balance_softirq(), reflecting its more generic purpose and that it's a softirq handler. Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-2-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 2 +- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 2 +- kernel/sched/fair.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index e57ad28301bd..6577b068f921 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -34,7 +34,7 @@ out of balance are tasks moved between groups. In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU through scheduler_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load -balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run +balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run in softirq context (SCHED_SOFTIRQ). 
The latter function takes two arguments: the runqueue of current CPU and whether diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index e814d4c01141..fbc326668e37 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -36,7 +36,7 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过scheduler_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 -的工作由run_rebalance_domains()->rebalance_domains()完成,在软中断上下文中执行 +的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 后一个函数有两个入参:当前CPU的运行队列、它在scheduler_tick()调用时是否空闲。函数会从 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 116a640534b9..953f39deb68e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12415,7 +12415,7 @@ out: * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; @@ -13216,7 +13216,7 @@ __init void init_sched_fair_class(void) #endif } - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; -- cgit v1.2.3 From 86dd6c04ef9f213e14d60c9f64bce1cc019f816e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:08 +0100 Subject: sched/balancing: Rename scheduler_tick() => sched_tick() - Standardize on prefixing scheduler-internal functions defined in with sched_*() prefix. scheduler_tick() was the only function using the scheduler_ prefix. Harmonize it. - The other reason to rename it is the NOHZ scheduler tick handling functions are already named sched_tick_*(). Make the 'git grep sched_tick' more meaningful. Signed-off-by: Ingo Molnar Acked-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-3-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 4 ++-- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 4 ++-- include/linux/sched.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/loadavg.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 2 +- tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 6577b068f921..541d6c617971 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -32,13 +32,13 @@ load of each of its member CPUs, and only when the load of a group becomes out of balance are tasks moved between groups. In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU -through scheduler_tick(). It raises a softirq after the next regularly scheduled +through sched_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run in softirq context (SCHED_SOFTIRQ). 
The latter function takes two arguments: the runqueue of current CPU and whether -the CPU was idle at the time the scheduler_tick() happened and iterates over all +the CPU was idle at the time the sched_tick() happened and iterates over all sched domains our CPU is on, starting from its base domain and going up the ->parent chain. While doing that, it checks to see if the current domain has exhausted its rebalance interval. If so, it runs load_balance() on that domain. It then checks diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index fbc326668e37..fa0c0bcc6ba5 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -34,12 +34,12 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 调度域中的负载均衡发生在调度组中。也就是说,每个组被视为一个实体。组的负载被定义为它 管辖的每个CPU的负载之和。仅当组的负载不均衡后,任务才在组之间发生迁移。 -在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过scheduler_tick() +在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过sched_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 -后一个函数有两个入参:当前CPU的运行队列、它在scheduler_tick()调用时是否空闲。函数会从 +后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从 当前CPU所在的基调度域开始迭代执行,并沿着parent指针链向上进入更高层级的调度域。在迭代 过程中,函数会检查当前调度域是否已经耗尽了再平衡的时间间隔,如果是,它在该调度域运行 load_balance()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。 diff --git a/include/linux/sched.h b/include/linux/sched.h index 17cb0761ff65..7eb7f31af796 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -301,7 +301,7 @@ enum { TASK_COMM_LEN = 16, }; -extern void scheduler_tick(void); +extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d44efa0d0611..71b7a08a6502 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5662,7 +5662,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ -void scheduler_tick(void) +void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * paths. For example, see arch/x86/entry_64.S. * * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). + * interrupt handler sched_tick(). * * 3. Wakeups don't really cause entry into schedule(). They add a * task to the run-queue and that's it. diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 52c8f8226b0d..ca9da66cc894 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -379,7 +379,7 @@ void calc_global_load(void) } /* - * Called from scheduler_tick() to periodically update this CPU's + * Called from sched_tick() to periodically update this CPU's * active count. 
*/ void calc_global_load_tick(struct rq *this_rq) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e69e75d3858c..ff49ddcc9800 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2478,7 +2478,7 @@ void update_process_times(int user_tick) if (in_irq()) irq_work_tick(); #endif - scheduler_tick(); + sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bf2bdac46843..8fbb0ec39079 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1464,7 +1464,7 @@ void wq_worker_sleeping(struct task_struct *task) * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * - * Called from scheduler_tick(). We're in the IRQ context and the current + * Called from sched_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 25432b8cd5bd..073a748b9380 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,7 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="scheduler_tick" +FUNC2="sched_tick" ALL_FUNCS="#### all functions enabled ####" -- cgit v1.2.3 From 983be0628c061989b6cc175d2f5e429b40699fbb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:09 +0100 Subject: sched/balancing: Rename trigger_load_balance() => sched_balance_trigger() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-4-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 2 +- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 2 +- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 541d6c617971..c7ea05f4107b 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -31,7 +31,7 @@ is treated as one entity. The load of a group is defined as the sum of the load of each of its member CPUs, and only when the load of a group becomes out of balance are tasks moved between groups. -In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU +In kernel/sched/core.c, sched_balance_trigger() is run periodically on each CPU through sched_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. 
The actual load balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index fa0c0bcc6ba5..1a8587a971f9 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -34,7 +34,7 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 调度域中的负载均衡发生在调度组中。也就是说,每个组被视为一个实体。组的负载被定义为它 管辖的每个CPU的负载之和。仅当组的负载不均衡后,任务才在组之间发生迁移。 -在kernel/sched/core.c中,trigger_load_balance()在每个CPU上通过sched_tick() +在kernel/sched/core.c中,sched_balance_trigger()在每个CPU上通过sched_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 71b7a08a6502..929fce69f555 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5700,7 +5700,7 @@ void sched_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + sched_balance_trigger(rq); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 953f39deb68e..e377b675920a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12438,7 +12438,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { /* * Don't need to rebalance while attached to NULL domain or diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2242679239e..5b0ddb0e6017 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2397,7 +2397,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); -- cgit v1.2.3 From 14ff4dbd34f46cc6b6105f549983321241ccbba9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:10 +0100 Subject: sched/balancing: Rename rebalance_domains() => sched_balance_domains() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-5-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 2 +- Documentation/translations/zh_CN/scheduler/sched-domains.rst | 2 +- arch/arm/kernel/topology.c | 2 +- kernel/sched/fair.c | 8 ++++---- kernel/sched/sched.h | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index c7ea05f4107b..5d8e8b8b269e 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -34,7 +34,7 @@ out of balance are tasks moved between groups. In kernel/sched/core.c, sched_balance_trigger() is run periodically on each CPU through sched_tick(). It raises a softirq after the next regularly scheduled rebalancing event for the current runqueue has arrived. The actual load -balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run +balancing workhorse, sched_balance_softirq()->sched_balance_domains(), is then run in softirq context (SCHED_SOFTIRQ). 
The latter function takes two arguments: the runqueue of current CPU and whether diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index 1a8587a971f9..e6590fd80640 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -36,7 +36,7 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 在kernel/sched/core.c中,sched_balance_trigger()在每个CPU上通过sched_tick() 周期执行。在当前运行队列下一个定期调度再平衡事件到达后,它引发一个软中断。负载均衡真正 -的工作由sched_balance_softirq()->rebalance_domains()完成,在软中断上下文中执行 +的工作由sched_balance_softirq()->sched_balance_domains()完成,在软中断上下文中执行 (SCHED_SOFTIRQ)。 后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从 diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index ef0058de432b..2336ee2aa44a 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ * can take this difference into account during load balance. A per cpu * structure is preferred because each CPU updates its own cpu_capacity field * during the load balance except for idle cores. One idle core is selected - * to run the rebalance_domains for all idle cores and the cpu_capacity can be + * to run the sched_balance_domains for all idle cores and the cpu_capacity can be * updated during this sequence. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e377b675920a..330788b0c617 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11685,7 +11685,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -12161,7 +12161,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -12422,7 +12422,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) /* * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. 
@@ -12432,7 +12432,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) /* normal load balance */ update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_domains(this_rq, idle); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b0ddb0e6017..41024c1c49b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2904,7 +2904,7 @@ extern void cfs_bandwidth_usage_dec(void); #define NOHZ_NEWILB_KICK_BIT 2 #define NOHZ_NEXT_KICK_BIT 3 -/* Run rebalance_domains() */ +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) /* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -- cgit v1.2.3 From 4c3e509ea9f249458e8692f8298cceac73105948 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:11 +0100 Subject: sched/balancing: Rename load_balance() => sched_balance_rq() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also load_balance() has become somewhat of a misnomer: historically it was the first and primary load-balancing function that was called, but with the introduction of sched domains, it's become a lower layer function that balances runqueues. Rename it to sched_balance_rq() accordingly. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-6-mingo@kernel.org --- Documentation/scheduler/sched-domains.rst | 4 +-- Documentation/scheduler/sched-stats.rst | 32 +++++++++++----------- .../translations/zh_CN/scheduler/sched-domains.rst | 4 +-- .../translations/zh_CN/scheduler/sched-stats.rst | 30 ++++++++++---------- include/linux/sched/topology.h | 2 +- kernel/sched/fair.c | 10 +++---- 6 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 5d8e8b8b269e..5e996fe973b1 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -41,11 +41,11 @@ The latter function takes two arguments: the runqueue of current CPU and whether the CPU was idle at the time the sched_tick() happened and iterates over all sched domains our CPU is on, starting from its base domain and going up the ->parent chain. While doing that, it checks to see if the current domain has exhausted its -rebalance interval. If so, it runs load_balance() on that domain. It then checks +rebalance interval. If so, it runs sched_balance_rq() on that domain. It then checks the parent sched_domain (if it exists), and the parent of the parent and so forth. -Initially, load_balance() finds the busiest group in the current sched domain. +Initially, sched_balance_rq() finds the busiest group in the current sched domain. If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in that group. If it manages to find such a runqueue, it locks both our initial CPU's runqueue and the newly found busiest one and starts moving tasks from it diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index 73c412666655..7c2b16c4729d 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -77,53 +77,53 @@ domain 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 The first field is a bit mask indicating what cpus this domain operates over. 
-The next 24 are a variety of load_balance() statistics in grouped into types +The next 24 are a variety of sched_balance_rq() statistics in grouped into types of idleness (idle, busy, and newly idle): - 1) # of times in this domain load_balance() was called when the + 1) # of times in this domain sched_balance_rq() was called when the cpu was idle - 2) # of times in this domain load_balance() checked but found + 2) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when the cpu was idle - 3) # of times in this domain load_balance() tried to move one or + 3) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was idle 4) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was idle + sched_balance_rq() in this domain when the cpu was idle 5) # of times in this domain pull_task() was called when the cpu was idle 6) # of times in this domain pull_task() was called even though the target task was cache-hot when idle - 7) # of times in this domain load_balance() was called but did + 7) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was idle 8) # of times in this domain a busier queue was found while the cpu was idle but no busier group was found - 9) # of times in this domain load_balance() was called when the + 9) # of times in this domain sched_balance_rq() was called when the cpu was busy - 10) # of times in this domain load_balance() checked but found the + 10) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when busy - 11) # of times in this domain load_balance() tried to move one or + 11) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was busy 12) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was busy + sched_balance_rq() in this domain when the cpu was busy 13) # of times in this domain pull_task() was called when busy 14) # of times in this domain pull_task() was called even though the target task was cache-hot when busy - 15) # of times in this domain load_balance() was called but did not + 15) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was busy 16) # of times in this domain a busier queue was found while the cpu was busy but no busier group was found - 17) # of times in this domain load_balance() was called when the + 17) # of times in this domain sched_balance_rq() was called when the cpu was just becoming idle - 18) # of times in this domain load_balance() checked but found the + 18) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when the cpu was just becoming idle - 19) # of times in this domain load_balance() tried to move one or more + 19) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was just becoming idle 20) sum of imbalances discovered (if any) with each call to - load_balance() in this domain when the cpu was just becoming idle + sched_balance_rq() in this domain when the cpu was just becoming idle 21) # of times in this domain pull_task() was called when newly idle 22) # of times in this domain pull_task() was called even though the target task was cache-hot when just becoming idle - 23) # of times in this domain load_balance() was called but did not + 23) 
# of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was just becoming idle 24) # of times in this domain a busier queue was found while the cpu was just becoming idle but no busier group was found diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst index e6590fd80640..06363169c56b 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst @@ -42,9 +42,9 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空,如果是这 后一个函数有两个入参:当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从 当前CPU所在的基调度域开始迭代执行,并沿着parent指针链向上进入更高层级的调度域。在迭代 过程中,函数会检查当前调度域是否已经耗尽了再平衡的时间间隔,如果是,它在该调度域运行 -load_balance()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。 +sched_balance_rq()。接下来它检查父调度域(如果存在),再后来父调度域的父调度域,以此类推。 -起初,load_balance()查找当前调度域中最繁忙的调度组。如果成功,在该调度组管辖的全部CPU +起初,sched_balance_rq()查找当前调度域中最繁忙的调度组。如果成功,在该调度组管辖的全部CPU 的运行队列中找出最繁忙的运行队列。如能找到,对当前的CPU运行队列和新找到的最繁忙运行 队列均加锁,并把任务从最繁忙队列中迁移到当前CPU上。被迁移的任务数量等于在先前迭代执行 中计算出的该调度域的调度组的不均衡值。 diff --git a/Documentation/translations/zh_CN/scheduler/sched-stats.rst b/Documentation/translations/zh_CN/scheduler/sched-stats.rst index c5e0be663837..09eee2517610 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-stats.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-stats.rst @@ -75,42 +75,42 @@ domain 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 繁忙,新空闲): - 1) 当CPU空闲时,load_balance()在这个调度域中被调用了#次 - 2) 当CPU空闲时,load_balance()在这个调度域中被调用,但是发现负载无需 + 1) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用了#次 + 2) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需 均衡#次 - 3) 当CPU空闲时,load_balance()在这个调度域中被调用,试图迁移1个或更多 + 3) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多 任务且失败了#次 - 4) 当CPU空闲时,load_balance()在这个调度域中被调用,发现不均衡(如果有) + 4) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有) #次 5) 当CPU空闲时,pull_task()在这个调度域中被调用#次 6) 当CPU空闲时,尽管目标任务是热缓存状态,pull_task()依然被调用#次 - 7) 当CPU空闲时,load_balance()在这个调度域中被调用,未能找到更繁忙的 + 7) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的 队列#次 8) 当CPU空闲时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组 #次 - 9) 当CPU繁忙时,load_balance()在这个调度域中被调用了#次 - 10) 当CPU繁忙时,load_balance()在这个调度域中被调用,但是发现负载无需 + 9) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用了#次 + 10) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需 均衡#次 - 11) 当CPU繁忙时,load_balance()在这个调度域中被调用,试图迁移1个或更多 + 11) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多 任务且失败了#次 - 12) 当CPU繁忙时,load_balance()在这个调度域中被调用,发现不均衡(如果有) + 12) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有) #次 13) 当CPU繁忙时,pull_task()在这个调度域中被调用#次 14) 当CPU繁忙时,尽管目标任务是热缓存状态,pull_task()依然被调用#次 - 15) 当CPU繁忙时,load_balance()在这个调度域中被调用,未能找到更繁忙的 + 15) 当CPU繁忙时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的 队列#次 16) 当CPU繁忙时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组 #次 - 17) 当CPU新空闲时,load_balance()在这个调度域中被调用了#次 - 18) 当CPU新空闲时,load_balance()在这个调度域中被调用,但是发现负载无需 + 17) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用了#次 + 18) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,但是发现负载无需 均衡#次 - 19) 当CPU新空闲时,load_balance()在这个调度域中被调用,试图迁移1个或更多 + 19) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,试图迁移1个或更多 任务且失败了#次 - 20) 当CPU新空闲时,load_balance()在这个调度域中被调用,发现不均衡(如果有) + 20) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,发现不均衡(如果有) #次 21) 当CPU新空闲时,pull_task()在这个调度域中被调用#次 22) 当CPU新空闲时,尽管目标任务是热缓存状态,pull_task()依然被调用#次 - 23) 当CPU新空闲时,load_balance()在这个调度域中被调用,未能找到更繁忙的 + 23) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更繁忙的 队列#次 24) 当CPU新空闲时,在调度域中找到了更繁忙的队列,但未找到更繁忙的调度组 #次 diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 18572c9ea724..c8fe9bab981b 100644 --- a/include/linux/sched/topology.h +++ 
b/include/linux/sched/topology.h @@ -110,7 +110,7 @@ struct sched_domain { unsigned long last_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ + /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 330788b0c617..0d2753c50be9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6866,7 +6866,7 @@ dequeue_throttle: #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq, load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); @@ -11242,7 +11242,7 @@ static int should_we_balance(struct lb_env *env) * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -11647,7 +11647,7 @@ out_unlock: static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) @@ -11727,7 +11727,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle @@ -12353,7 +12353,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); -- cgit v1.2.3 From f1cd2e2e79d283e315356bd403c7f928e994f057 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Oct 2023 13:04:12 +0200 Subject: sched/balancing: Rename find_busiest_queue() => sched_balance_find_src_rq() The find_busiest_queue() naming has two small quirks: - Scheduler functions that deal with runqueues usually have a rq_ prefix or _rq postfix, but this function has neither. - Plus the 'busiest' qualifier to this function was historically correct, but has become somewhat of a misnomer: in quite a few cases we will not pick the busiest runqueue - but the best (possible) runqueue we can balance tasks from. So name it a bit more neutrally, similar to the 'src/dst' nomenclature we are already using when moving tasks between runqueues. To fix both quirks, and to standardize scheduler load-balancing function names on the sched_balance_() prefix, rename the function to sched_balance_find_src_rq(). 
Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-7-mingo@kernel.org --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d2753c50be9..1cd9a18b35e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10959,9 +10959,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -11280,7 +11280,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; -- cgit v1.2.3 From 82cf921432fc184adbbb9c1bced182564876ec5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:14 +0100 Subject: sched/balancing: Rename find_busiest_group() => sched_balance_find_src_group() Make two naming changes: 1) Standardize scheduler load-balancing function names on the sched_balance_() prefix. 2) Similar to find_busiest_queue(), the find_busiest_group() naming has become a bit of a misnomer: the 'busiest' qualifier to this function was historically correct but in the current code in quite a few cases we will not pick the 'busiest' group - but the best (possible) group we can balance from based on a complex set of constraints. So name it a bit more neutrally, similar to the 'src/dst' nomenclature we are already using when moving tasks between runqueues, and also use the sched_balance_ prefix: sched_balance_find_src_group(). Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-9-mingo@kernel.org --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cd9a18b35e0..96a81b2fa281 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9430,7 +9430,7 @@ static void update_blocked_averages(int cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* * sg_lb_stats - stats of a sched_group required for load-balancing: @@ -9637,7 +9637,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. 
* * This is a somewhat tricky proposition since the next run might not find the @@ -10788,7 +10788,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -10811,7 +10811,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * @@ -10820,7 +10820,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * * Return: - The busiest group if imbalance exists. */ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -11274,7 +11274,7 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; @@ -11298,7 +11298,7 @@ redo: env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. -- cgit v1.2.3 From 391b7a5335c45b2bafe535cb440836ccd17515aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:15 +0100 Subject: sched/balancing: Rename update_blocked_averages() => sched_balance_update_blocked_averages() Standardize scheduler load-balancing function names on the sched_balance_() prefix. 
Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-10-mingo@kernel.org --- kernel/sched/fair.c | 8 ++++---- kernel/sched/pelt.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96a81b2fa281..95f7092043f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9411,7 +9411,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); @@ -12079,7 +12079,7 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; - update_blocked_averages(cpu); + sched_balance_update_blocked_averages(cpu); return rq->has_blocked_load; } @@ -12339,7 +12339,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); - update_blocked_averages(this_cpu); + sched_balance_update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -12431,7 +12431,7 @@ static __latent_entropy void sched_balance_softirq(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); + sched_balance_update_blocked_averages(this_rq->cpu); sched_balance_domains(this_rq, idle); } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 63b6cf898220..f80955ecdce6 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -209,7 +209,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages(). + * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). */ -- cgit v1.2.3 From 7d058285cd77cc1411c91efd1b1673530bb1bee8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:16 +0100 Subject: sched/balancing: Rename newidle_balance() => sched_balance_newidle() Standardize scheduler load-balancing function names on the sched_balance_() prefix. 
Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-11-mingo@kernel.org --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95f7092043f3..aa5ff0efcca8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4816,7 +4816,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { @@ -5136,7 +5136,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -8253,7 +8253,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (rq->nr_running) return 1; - return newidle_balance(rq, rf) != 0; + return sched_balance_newidle(rq, rf) != 0; } #endif /* CONFIG_SMP */ @@ -8505,10 +8505,10 @@ idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -11493,7 +11493,7 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. @@ -12277,7 +12277,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * newidle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -12285,7 +12285,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; -- cgit v1.2.3 From 646ebaf51c64c6416ca89765c20041363fc1b518 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:17 +0100 Subject: sched/balancing: Rename find_idlest_group_cpu() => sched_balance_find_dst_group_cpu() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also use 'dst' instead of 'idlest': while historically correct, today it's not really true anymore that we return the 'idlest' group or CPU, we sort by idle-exit latency and only return the idlest CPUs from the lowest-latency set of CPUs. The true 'idlest' CPUs often remain idle for a long time and are never returned as long as the system is under-loaded. 
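As a rough illustration of that "lowest idle-exit latency first" ordering, here is a hypothetical standalone sketch (toy types, not the kernel's sched_balance_find_dst_group_cpu()): an idle CPU in the shallowest idle state is preferred, and only if no CPU is idle does the least loaded one win; the deepest-idle, truly "idlest" CPUs therefore tend never to be returned on an under-loaded system.

  /*
   * Toy sketch only -- NOT the kernel implementation.
   */
  struct toy_cpu {
          int           idle;         /* 1 if currently idle */
          unsigned int  exit_latency; /* cost of leaving the idle state */
          unsigned long load;         /* load if the CPU is busy */
  };

  static int toy_find_dst_cpu(struct toy_cpu *cpus, int nr)
  {
          unsigned int  min_exit_latency = ~0U;
          unsigned long min_load = ~0UL;
          int shallowest_idle = -1, least_loaded = -1;

          for (int i = 0; i < nr; i++) {
                  if (cpus[i].idle) {
                          /* Cheapest CPU to wake up wins. */
                          if (cpus[i].exit_latency < min_exit_latency) {
                                  min_exit_latency = cpus[i].exit_latency;
                                  shallowest_idle = i;
                          }
                  } else if (shallowest_idle == -1 && cpus[i].load < min_load) {
                          /* Fall back to the least loaded busy CPU. */
                          min_load = cpus[i].load;
                          least_loaded = i;
                  }
          }
          return shallowest_idle != -1 ? shallowest_idle : least_loaded;
  }
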
Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-12-mingo@kernel.org --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa5ff0efcca8..02ff0272b2e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7101,10 +7101,10 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -7191,7 +7191,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; -- cgit v1.2.3 From a88b17080294f735c4124acccfa2d803a6a7d46f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:18 +0100 Subject: sched/balancing: Rename find_idlest_group() => sched_balance_find_dst_group() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also use 'dst' instead of 'idlest', because it's not really true that we return the 'idlest' group or CPU, we sort by idle-exit latency and only return the idlest CPUs from the lowest-latency set of CPUs. The true 'idlest' CPUs often remain idle for a long time and are never returned as long as the system is under-loaded. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-13-mingo@kernel.org --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02ff0272b2e4..d0c3a091d7d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7098,7 +7098,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. @@ -7185,7 +7185,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -10296,13 +10296,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. 
*/ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; -- cgit v1.2.3 From 686d148cbb5a1c2891914b8d11147d3c5556a29a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:19 +0100 Subject: sched/balancing: Rename find_idlest_cpu() => sched_balance_find_dst_cpu() Standardize scheduler load-balancing function names on the sched_balance_() prefix. Also use 'dst' instead of 'idlest', because it's not really true that we return the 'idlest' group or CPU, we sort by idle-exit latency and only return the idlest CPUs from the lowest-latency set of CPUs. The true 'idlest' CPUs often remain idle for a long time and are never returned as long as the system is under-loaded. Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-14-mingo@kernel.org --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d0c3a091d7d1..4b3c4a181a91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7160,7 +7160,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct * return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -7936,7 +7936,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -8201,7 +8201,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.2.3 From d72cf62438d67b911212f8d4cf65d6167c1541ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Mar 2024 11:33:50 +0100 Subject: sched/balancing: Fix a couple of outdated function names in comments The 'idle_balance()' function hasn't existed for years, and there's no load_balance_newidle() either - both are sched_balance_newidle() today. 
Reported-by: Honglei Wang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ZfAwNufbiyt/5biu@gmail.com --- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b3c4a181a91..a19ea290b790 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6866,7 +6866,7 @@ dequeue_throttle: #ifdef CONFIG_SMP -/* Working cpumask for: sched_balance_rq, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index f80955ecdce6..3a96da25b67c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -208,7 +208,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * se has been already dequeued but cfs_rq->curr still points to it. * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls + * this happens during sched_balance_newidle() which calls * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). -- cgit v1.2.3 From b9e6e28663928cab836a19abbdec3d036a07db3b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Mar 2024 12:06:03 +0100 Subject: sched/fair: Fix typos in comments So I made all speling mistakes / typos red in my editor. Big mistake... Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org --- kernel/sched/fair.c | 68 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a19ea290b790..c8e50fbac345 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -388,8 +388,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -406,7 +406,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq) SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -595,13 +595,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the - * virtual time has non-continguous motion equivalent to: + * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. 
]] * - * However, since v_i is u64, and the multiplcation could easily overflow + * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 @@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) } if (load) { - /* sign flips effective floor / ceil */ + /* sign flips effective floor / ceiling */ if (avg < 0) avg -= (load - 1); avg = div_s64(avg, load); @@ -721,7 +721,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * - * Note: using 'avg_vruntime() > se->vruntime' is inacurate due + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. */ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) @@ -1024,7 +1024,7 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } /* @@ -1616,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -3284,7 +3284,7 @@ retry_pids: /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || @@ -3295,7 +3295,7 @@ retry_pids: /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ if (!vma_is_accessible(vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); @@ -3327,7 +3327,7 @@ retry_pids: } /* - * Scanning the VMA's of short lived tasks add more overhead. So + * Scanning the VMAs of short lived tasks add more overhead. So * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, @@ -3371,7 +3371,7 @@ retry_pids: /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. 
@@ -4733,7 +4733,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -5014,14 +5014,14 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * In the above example if a task is capped to a specific performance * point, y, then when: * - * * util = 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util = 80% of y then it is forced to fit on cpu1 to honour + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour * uclamp_max request. * * which is what we're enforcing here. A task always fits if @@ -5052,7 +5052,7 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | (region c, boosted, util < uclamp_min) * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * a) If util > uclamp_max, then we're capped, we don't care about * actual fitness value here. We only care if uclamp_max fits @@ -5242,7 +5242,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; /* - * When joining the competition; the exisiting tasks will be, + * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition. */ @@ -5391,7 +5391,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); @@ -5427,7 +5427,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && @@ -7503,7 +7503,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * On asymmetric system, update task utilization because we will check - * that the task fits with cpu's capacity. + * that the task fits with CPU's capacity. */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); @@ -8027,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { /* * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation + * the clamp() part. I.e.: apply max aggregation * only. util_fits_cpu() logic requires to * operate on non clamped util but must use the * max-aggregated uclamp_{min, max}. 
@@ -8586,7 +8586,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); yield_task_fair(rq); @@ -8924,7 +8924,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - /* Disregard pcpu kthreads; they are where they need to be. */ + /* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0; @@ -10076,7 +10076,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -10715,7 +10715,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; env->imbalance = max_t(long, 0, @@ -11900,7 +11900,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. * - * When balancing betwen cores, all the SMT siblings of the + * When balancing between cores, all the SMT siblings of the * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { @@ -12061,7 +12061,7 @@ void nohz_balance_enter_idle(int cpu) out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } @@ -12085,7 +12085,7 @@ static bool update_nohz_stats(struct rq *rq) } /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ @@ -12190,7 +12190,7 @@ abort: /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -12221,7 +12221,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * called from this function on (this) CPU that's not yet in the mask. That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of - * those idle CPUs and outside the preempt disable / irq off phase of the local + * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) @@ -12232,7 +12232,7 @@ void nohz_run_idle_balance(int cpu) /* * Update the blocked load only if no SCHED_SOFTIRQ is about to happen - * (ie NOHZ_STATS_KICK set) and will do the same. + * (i.e. NOHZ_STATS_KICK set) and will do the same. 
*/ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); -- cgit v1.2.3 From 77222b0d12e8ae6f082261842174cc2e981bf99c Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:49 +0000 Subject: sched/topology: Export asym_cap_list So that we can use it to iterate through available capacities in the system. Sort asym_cap_list in descending order as expected users are likely to be interested on the highest capacity first. Make the list RCU protected to allow for cheap access in hot paths. Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-2-qyousef@layalina.io --- kernel/sched/sched.h | 14 ++++++++++++++ kernel/sched/topology.c | 43 +++++++++++++++++++++++++------------------ 2 files changed, 39 insertions(+), 18 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 41024c1c49b4..f77c00dddfe1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -109,6 +109,20 @@ extern int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; +/* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + /* * Helpers for converting nanosecond timing to jiffy resolution */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..44ed3d0812ab 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1329,24 +1329,13 @@ next: update_group_capacity(sd, cpu); } -/* - * Asymmetric CPU capacity bits - */ -struct asym_cap_data { - struct list_head link; - unsigned long capacity; - unsigned long cpus[]; -}; - /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same * capacity. * The lifespan of data is unlimited. */ -static LIST_HEAD(asym_cap_list); - -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) +LIST_HEAD(asym_cap_list); /* * Verify whether there is any CPU capacity asymmetry in a given sched domain. @@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span, } +static void free_asym_cap_entry(struct rcu_head *head) +{ + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); + kfree(entry); +} + static inline void asym_cpu_capacity_update_data(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - struct asym_cap_data *entry = NULL; + struct asym_cap_data *insert_entry = NULL; + struct asym_cap_data *entry; + /* + * Search if capacity already exits. If not, track which the entry + * where we should insert to keep the list ordered descendingly. + */ list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) goto done; + else if (!insert_entry && capacity > entry->capacity) + insert_entry = list_prev_entry(entry, link); } entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) return; entry->capacity = capacity; - list_add(&entry->link, &asym_cap_list); + + /* If NULL then the new capacity is the smallest, add last. 
*/ + if (!insert_entry) + list_add_tail_rcu(&entry->link, &asym_cap_list); + else + list_add_rcu(&entry->link, &insert_entry->link); done: __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); } @@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry_safe(entry, next, &asym_cap_list, link) { if (cpumask_empty(cpu_capacity_span(entry))) { - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void) */ if (list_is_singular(&asym_cap_list)) { entry = list_first_entry(&asym_cap_list, typeof(*entry), link); - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } -- cgit v1.2.3 From 22d5607400c62c72da9b60e3324744be83e147a4 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:50 +0000 Subject: sched/fair: Check if a task has a fitting CPU when updating misfit If a misfit task is affined to a subset of the possible CPUs, we need to verify that one of these CPUs can fit it. Otherwise the load balancer code will continuously trigger needlessly leading the balance_interval to increase in return and eventually end up with a situation where real imbalances take a long time to address because of this impossible imbalance situation. This can happen in Android world where it's common for background tasks to be restricted to little cores. Similarly if we can't fit the biggest core, triggering misfit is pointless as it is the best we can ever get on this system. To be able to detect that; we use asym_cap_list to iterate through capacities in the system to see if the task is able to run at a higher capacity level based on its p->cpus_ptr. We do that when the affinity change, a fair task is forked, or when a task switched to fair policy. We store the max_allowed_capacity in task_struct to allow for cheap comparison in the fast path. Improve check_misfit_status() function by removing redundant checks. misfit_task_load will be 0 if the task can't move to a bigger CPU. And nohz_balancer_kick() already checks for cpu_check_capacity() before calling check_misfit_status(). Test: ===== Add trace_printk("balance_interval = %lu\n", interval) in get_sd_balance_interval(). run if [ "$MASK" != "0" ]; then adb shell "taskset -a $MASK cat /dev/zero > /dev/null" fi sleep 10 // parse ftrace buffer counting the occurrence of each valaue Where MASK is either: * 0: no busy task running * 1: busy task is pinned to 1 cpu; handled today to not cause misfit * f: busy task pinned to little cores, simulates busy background task, demonstrates the problem to be fixed Results: ======== Note how occurrence of balance_interval = 128 overshoots for MASK = f. 
BEFORE ------ MASK=0 1 balance_interval = 175 120 balance_interval = 128 846 balance_interval = 64 55 balance_interval = 63 215 balance_interval = 32 2 balance_interval = 31 2 balance_interval = 16 4 balance_interval = 8 1870 balance_interval = 4 65 balance_interval = 2 MASK=1 27 balance_interval = 175 37 balance_interval = 127 840 balance_interval = 64 167 balance_interval = 63 449 balance_interval = 32 84 balance_interval = 31 304 balance_interval = 16 1156 balance_interval = 8 2781 balance_interval = 4 428 balance_interval = 2 MASK=f 1 balance_interval = 175 1328 balance_interval = 128 44 balance_interval = 64 101 balance_interval = 63 25 balance_interval = 32 5 balance_interval = 31 23 balance_interval = 16 23 balance_interval = 8 4306 balance_interval = 4 177 balance_interval = 2 AFTER ----- Note how the high values almost disappear for all MASK values. The system has background tasks that could trigger the problem without simulate it even with MASK=0. MASK=0 103 balance_interval = 63 19 balance_interval = 31 194 balance_interval = 8 4827 balance_interval = 4 179 balance_interval = 2 MASK=1 131 balance_interval = 63 1 balance_interval = 31 87 balance_interval = 8 3600 balance_interval = 4 7 balance_interval = 2 MASK=f 8 balance_interval = 127 182 balance_interval = 63 3 balance_interval = 31 9 balance_interval = 16 415 balance_interval = 8 3415 balance_interval = 4 21 balance_interval = 2 Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-3-qyousef@layalina.io --- include/linux/sched.h | 1 + init/init_task.c | 1 + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 16 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ed40e9f6155..c75fd46506df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct task_struct { #endif unsigned int policy; + unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; diff --git a/init/init_task.c b/init/init_task.c index 4daee6d761c8..2558b719e053 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, + .max_allowed_capacity = SCHED_CAPACITY_SCALE, .nr_cpus_allowed= NR_CPUS, .mm = NULL, .active_mm = &init_mm, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8270e2e15cb..c47c4f2e28f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5098,15 +5098,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + int cpu = cpu_of(rq); + if (!sched_asym_cpucap_active()) return; - if (!p || p->nr_cpus_allowed == 1) { - rq->misfit_task_load = 0; - return; - } + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { - if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -8253,6 +8257,36 @@ static void task_dead_fair(struct task_struct *p) remove_entity_load_avg(&p->se); } +/* + * Set the max capacity the task is allowed to run at for misfit detection. 
+ */ +static void set_task_max_allowed_capacity(struct task_struct *p) +{ + struct asym_cap_data *entry; + + if (!sched_asym_cpucap_active()) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; + + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); +} + +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) +{ + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} + static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -8261,6 +8295,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) @@ -9610,16 +9646,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -11923,7 +11953,7 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { + if (check_misfit_status(rq)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12648,6 +12678,8 @@ static void task_fork_fair(struct task_struct *p) rq_lock(rq, &rf); update_rq_clock(rq); + set_task_max_allowed_capacity(p); + cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; if (curr) @@ -12771,6 +12803,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) { attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so @@ -13142,7 +13176,7 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, -- cgit v1.2.3 From fa427e8e53d8db15090af7e952a55870dc2a453f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:51 +0000 Subject: sched/topology: Remove root_domain::max_cpu_capacity The value is no longer used as we now keep track of max_allowed_capacity for each task instead. 
Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-4-qyousef@layalina.io --- kernel/sched/sched.h | 2 -- kernel/sched/topology.c | 13 ++----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f77c00dddfe1..4f9e952d4fad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -917,8 +917,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 44ed3d0812ab..63aecd2a7a9f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2514,16 +2514,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - unsigned long capacity; - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - capacity = arch_scale_cpu_capacity(i); - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, capacity); - cpu_attach_domain(sd, d.rd, i); if (lowest_flag_domain(i, SD_CLUSTER)) @@ -2537,10 +2530,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_cluster) static_branch_inc_cpuslocked(&sched_cluster_active); - if (rq && sched_debug_verbose) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); ret = 0; error: -- cgit v1.2.3 From 58eeb2d79b542c678c46e245dba6b66936368a99 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:52 +0000 Subject: sched/fair: Don't double balance_interval for migrate_misfit It is not necessarily an indication of the system being busy and requires a backoff of the load balancer activities. But pushing it high could mean generally delaying other misfit activities or other type of imbalances. Also don't pollute nr_balance_failed because of misfit failures. The value is used for enabling cache hot migration and in migrate_util/load types. None of which should be impacted (skewed) by misfit failures. Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-5-qyousef@layalina.io --- kernel/sched/fair.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c47c4f2e28f7..dbf4f1c44259 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11449,8 +11449,12 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. + * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -11533,8 +11537,13 @@ out_one_pinned: * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. 
Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ -- cgit v1.2.3 From be3a51e68f2f1b17250ce40d8872c7645b7a2991 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 7 Mar 2024 14:27:23 +0530 Subject: sched/fair: Add EAS checks before updating root_domain::overutilized root_domain::overutilized is only used for EAS(energy aware scheduler) to decide whether to do load balance or not. It is not used if EAS not possible. Currently enqueue_task_fair and task_tick_fair accesses, sometime updates this field. In update_sd_lb_stats it is updated often. This causes cache contention due to true sharing and burns a lot of cycles. ::overload and ::overutilized are part of the same cacheline. Updating it often invalidates the cacheline. That causes access to ::overload to slow down due to false sharing. Hence add EAS check before accessing/updating this field. EAS check is optimized at compile time or it is a static branch. Hence it shouldn't cost much. With the patch, both enqueue_task_fair and newidle_balance don't show up as hot routines in perf profile. 6.8-rc4: 7.18% swapper [kernel.vmlinux] [k] enqueue_task_fair 6.78% s [kernel.vmlinux] [k] newidle_balance +patch: 0.14% swapper [kernel.vmlinux] [k] enqueue_task_fair 0.00% swapper [kernel.vmlinux] [k] newidle_balance While at it: trace_sched_overutilized_tp expect that second argument to be bool. So do a int to bool conversion for that. Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator") Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Srikar Dronamraju Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240307085725.444486-2-sshegde@linux.ibm.com --- kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 19 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dbf4f1c44259..1afa4f87fb25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6673,22 +6673,42 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + unsigned long rq_util_min, rq_util_max; + + if (!sched_energy_enabled()) + return false; + + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } -static inline void update_overutilized_status(struct rq *rq) +static inline void set_rd_overutilized_status(struct root_domain *rd, + unsigned int status) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); - } + if (!sched_energy_enabled()) + return; + + WRITE_ONCE(rd->overutilized, status); + trace_sched_overutilized_tp(rd, !!status); +} + +static inline void check_update_overutilized_status(struct rq *rq) +{ + /* + * overutilized field is used for 
load balancing decisions only + * if energy aware scheduler is being used + */ + if (!sched_energy_enabled()) + return; + + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); } #else -static inline void update_overutilized_status(struct rq *rq) { } +static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ @@ -6789,7 +6809,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * and the following generally works well enough in practice. */ if (!task_new) - update_overutilized_status(rq); + check_update_overutilized_status(rq); enqueue_throttle: assert_list_leaf_cfs_rq(rq); @@ -10630,19 +10650,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { - struct root_domain *rd = env->dst_rq->rd; - /* update overload indicator if we are at root domain */ - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); + set_rd_overutilized_status(env->dst_rq->rd, + sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - struct root_domain *rd = env->dst_rq->rd; - - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); } update_idle_cpu_scan(env, sum_util); @@ -12667,7 +12682,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); - update_overutilized_status(task_rq(curr)); + check_update_overutilized_status(task_rq(curr)); task_tick_core(rq, curr); } -- cgit v1.2.3 From d0f5d3cefc259f498456338d319098dc84393b24 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 7 Mar 2024 14:27:24 +0530 Subject: sched/fair: Introduce is_rd_overutilized() helper function to access root_domain::overutilized The root_domain::overutilized field is READ_ONCE() accessed in multiple places, which could be simplified with a helper function. This might also make it more apparent that it needs to be used only in case of EAS. No change in functionality intended. Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240307085725.444486-3-sshegde@linux.ibm.com --- kernel/sched/fair.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1afa4f87fb25..24a7530a7d3f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6685,6 +6685,15 @@ static inline bool cpu_overutilized(int cpu) return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } +/* + * Ensure that caller can do EAS. 
overutilized value + * make sense only if EAS is enabled + */ +static inline int is_rd_overutilized(struct root_domain *rd) +{ + return READ_ONCE(rd->overutilized); +} + static inline void set_rd_overutilized_status(struct root_domain *rd, unsigned int status) { @@ -6704,7 +6713,7 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!sched_energy_enabled()) return; - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); } #else @@ -7990,7 +7999,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) + if (!pd || is_rd_overutilized(rd)) goto unlock; /* @@ -10897,7 +10906,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + if (rcu_dereference(rd->pd) && !is_rd_overutilized(rd)) goto out_balanced; } -- cgit v1.2.3 From c829d6818b60c591f70c060b2bb75d76cf0cec6d Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 25 Mar 2024 21:09:26 +0530 Subject: sched/fair: Simplify the continue_balancing logic in sched_balance_newidle() newidle(CPU_NEWLY_IDLE) balancing doesn't stop the load-balancing if the continue_balancing flag is reset, but the other two balancing (IDLE, BUSY) cases do that. newidle balance stops the load balancing if rq has a task or there is wakeup pending. The same checks are present in should_we_balance for newidle. Hence use the return value and simplify continue_balancing mechanism for newidle. Update the comment surrounding it as well. No change in functionality intended. Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20240325153926.274284-1-sshegde@linux.ibm.com --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24a7530a7d3f..1856e58595c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12358,6 +12358,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; @@ -12372,8 +12373,9 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) return 0; /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); @@ -12412,7 +12414,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; u64 domain_cost; update_next_balance(sd, &next_balance); @@ -12438,8 +12439,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. 
*/ - if (pulled_task || this_rq->nr_running > 0 || - this_rq->ttwu_pending) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); -- cgit v1.2.3 From 902e786c4a54a2c4f7462b9026bb56610888db3d Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Tue, 26 Mar 2024 20:56:16 +0530 Subject: sched/fair: Combine EAS check with root_domain::overutilized access Access to root_domainoverutilized is always used with sched_energy_enabled in the pattern: if (sched_energy_enabled && !overutilized) do something So modify the helper function to utilize this pattern. This is more readable code as it would say, do something when root domain is not overutilized. This function always return true when EAS is disabled. No change in functionality intended. Suggested-by: Vincent Guittot Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240326152616.380999-1-sshegde@linux.ibm.com --- kernel/sched/fair.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1856e58595c3..38462305bc39 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6686,12 +6686,11 @@ static inline bool cpu_overutilized(int cpu) } /* - * Ensure that caller can do EAS. overutilized value - * make sense only if EAS is enabled + * overutilized value make sense only if EAS is enabled */ static inline int is_rd_overutilized(struct root_domain *rd) { - return READ_ONCE(rd->overutilized); + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } static inline void set_rd_overutilized_status(struct root_domain *rd, @@ -6710,8 +6709,6 @@ static inline void check_update_overutilized_status(struct rq *rq) * overutilized field is used for load balancing decisions only * if energy aware scheduler is being used */ - if (!sched_energy_enabled()) - return; if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); @@ -7999,7 +7996,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || is_rd_overutilized(rd)) + if (!pd) goto unlock; /* @@ -8202,7 +8199,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) cpumask_test_cpu(cpu, p->cpus_ptr)) return cpu; - if (sched_energy_enabled()) { + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -10903,12 +10900,9 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) if (busiest->group_type == group_misfit_task) goto force_balance; - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !is_rd_overutilized(rd)) - goto out_balanced; - } + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) -- cgit v1.2.3 From c628db0a6831f80e89873ee44f1b40e3ab3216c6 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 25 Mar 2024 11:15:04 +0530 Subject: sched/fair: Check root_domain::overload value before update The root_domain::overload flag is 1 when there's any rq in the root domain that has 2 or more running tasks. (Ie. it's overloaded.) The root_domain structure itself is a global structure per cpuset island. 
The ::overload flag is maintained the following way:

 - Set when adding a second task to the runqueue.

 - It is cleared in update_sd_lb_stats() during load balance, if none of the rqs have 2 or more running tasks.

This flag is used during newidle balance to see if it's worth doing a full load balance pass, which can be an expensive operation. If it is set, then newidle balance will try to aggressively pull a task. Since commit: 630246a06ae2 ("sched/fair: Clean-up update_sg_lb_stats parameters") ::overload is being written unconditionally, even if it has the same value. The change in value of this depends on the workload, but on typical workloads, it doesn't change all that often: a system is either dominantly overloaded for substantial amounts of time, or not. Extra writes to this semi-global structure cause unnecessary overhead, extra bus traffic, etc. - so avoid it as much as possible. Perf probe stats show that it's worth making this change (numbers are with patch applied):

     1M probe:sched_balance_newidle_L38
    139 probe:update_sd_lb_stats_L53   <====== 1->0 writes
   129K probe:add_nr_running_L12
     74 probe:add_nr_running_L13       <====== 0->1 writes
    54K probe:update_sd_lb_stats_L50   <====== reads

These numbers show that an actual change in the ::overload value is (much) less frequent than an access: the reads at L50 (~54,000) far outnumber the 1->0 and 0->1 transitions at L53+L13 (139+74). [ mingo: Rewrote the changelog. ] Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Vincent Guittot Cc: Mel Gorman Link: https://lore.kernel.org/r/20240325054505.201995-2-sshegde@linux.ibm.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 38462305bc39..600fdde1e7ac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10657,7 +10657,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + if (READ_ONCE(env->dst_rq->rd->overload) != (sg_status & SG_OVERLOAD)) + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, -- cgit v1.2.3 From caac6291728ed5493d8a53f4b086c270849ce0c4 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 25 Mar 2024 11:15:05 +0530 Subject: sched/fair: Use helper functions to access root_domain::overload Introduce two helper functions to access & set the root_domain::overload flag: get_rd_overload() set_rd_overload() To make sure the code always follows the READ_ONCE()/WRITE_ONCE() access methods. No change in functionality intended. [ mingo: Renamed the accessors to get_/set_rd_overload(), tidied up the changelog.
] Suggested-by: Qais Yousef Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Qais Yousef Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240325054505.201995-3-sshegde@linux.ibm.com --- kernel/sched/fair.c | 5 ++--- kernel/sched/sched.h | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 600fdde1e7ac..0cc058265308 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10657,8 +10657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (READ_ONCE(env->dst_rq->rd->overload) != (sg_status & SG_OVERLOAD)) - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + set_rd_overload(env->dst_rq->rd, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, @@ -12391,7 +12390,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!READ_ONCE(this_rq->rd->overload) || + if (!get_rd_overload(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4f9e952d4fad..e86ee262d2f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -930,6 +930,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overload(struct root_domain *rd) +{ + return READ_ONCE(rd->overload); +} + +static inline void set_rd_overload(struct root_domain *rd, int status) +{ + if (get_rd_overload(rd) != status) + WRITE_ONCE(rd->overload, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -2530,8 +2541,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); + set_rd_overload(rq->rd, SG_OVERLOAD); } #endif -- cgit v1.2.3 From dfb83ef7b8b064c15be19cf7fcbde0996712de8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:33:20 +0100 Subject: sched/fair: Rename root_domain::overload to ::overloaded It is silly to use an ambiguous noun instead of a clear adjective when naming such a flag ... Note how root_domain::overutilized already used a proper adjective. rd->overloaded is now set to 1 when the root domain is overloaded. 
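As a rough stand-alone illustration of the accessor pattern introduced above (not part of the patch itself; the fake_root_domain type is invented and C11 atomics stand in for the kernel's READ_ONCE()/WRITE_ONCE()), the idea is to read the shared flag first and only write it when the value actually changes, so repeated identical updates do not keep dirtying a cache line that every CPU reads:

#include <stdatomic.h>
#include <stdio.h>

struct fake_root_domain {
	atomic_int overloaded;	/* stands in for root_domain::overloaded */
};

static int get_rd_overloaded(struct fake_root_domain *rd)
{
	return atomic_load_explicit(&rd->overloaded, memory_order_relaxed);
}

static void set_rd_overloaded(struct fake_root_domain *rd, int status)
{
	/* Skip the store entirely when nothing changes. */
	if (get_rd_overloaded(rd) != status)
		atomic_store_explicit(&rd->overloaded, status, memory_order_relaxed);
}

int main(void)
{
	struct fake_root_domain rd = { .overloaded = 0 };

	set_rd_overloaded(&rd, 1);	/* 0 -> 1: the store happens */
	set_rd_overloaded(&rd, 1);	/* 1 -> 1: the store is skipped */
	printf("overloaded=%d\n", get_rd_overloaded(&rd));
	return 0;
}

On a busy system the flag is almost always already set, so the conditional turns most of these updates into plain reads of a shared cache line.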
Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e86ee262d2f3..cddc4795f72d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -874,7 +874,7 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + int overloaded; /* Indicate one or more cpus over-utilized (tipping point) */ int overutilized; @@ -932,13 +932,13 @@ extern void sched_put_rd(struct root_domain *rd); static inline int get_rd_overload(struct root_domain *rd) { - return READ_ONCE(rd->overload); + return READ_ONCE(rd->overloaded); } static inline void set_rd_overload(struct root_domain *rd, int status) { if (get_rd_overload(rd) != status) - WRITE_ONCE(rd->overload, status); + WRITE_ONCE(rd->overloaded, status); } #ifdef HAVE_RT_PUSH_IPI -- cgit v1.2.3 From 76cc4f91488af0a808bec97794bfe434dece7d67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:41:31 +0100 Subject: sched/fair: Rename {set|get}_rd_overload() to {set|get}_rd_overloaded() Follow the rename of the root_domain::overloaded flag. Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0cc058265308..bf10665b6f4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10657,7 +10657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - set_rd_overload(env->dst_rq->rd, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOAD); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, @@ -12390,7 +12390,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!get_rd_overload(this_rq->rd) || + if (!get_rd_overloaded(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cddc4795f72d..c7e7ae17c340 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -930,14 +930,14 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); -static inline int get_rd_overload(struct root_domain *rd) +static inline int get_rd_overloaded(struct root_domain *rd) { return READ_ONCE(rd->overloaded); } -static inline void set_rd_overload(struct root_domain *rd, int status) +static inline void set_rd_overloaded(struct root_domain *rd, int status) { - if (get_rd_overload(rd) != status) + if (get_rd_overloaded(rd) != status) WRITE_ONCE(rd->overloaded, status); } @@ -2541,7 +2541,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { - set_rd_overload(rq->rd, SG_OVERLOAD); + set_rd_overloaded(rq->rd, SG_OVERLOAD); } #endif -- cgit v1.2.3 From 
7bda10ba7f453729f210264dd07d38989fb858d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:44:16 +0100 Subject: sched/fair: Rename SG_OVERLOAD to SG_OVERLOADED Follow the rename of the root_domain::overloaded flag. Note that this also matches the SG_OVERUTILIZED flag better. Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf10665b6f4f..839a97a4ba2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9961,7 +9961,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (nr_running > 1) - *sg_status |= SG_OVERLOAD; + *sg_status |= SG_OVERLOADED; if (cpu_overutilized(i)) *sg_status |= SG_OVERUTILIZED; @@ -9986,7 +9986,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + *sg_status |= SG_OVERLOADED; } } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ @@ -10657,7 +10657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7e7ae17c340..07c6669b8250 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -851,7 +851,7 @@ struct perf_domain { }; /* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */ #define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ /* @@ -2541,7 +2541,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { - set_rd_overloaded(rq->rd, SG_OVERLOAD); + set_rd_overloaded(rq->rd, SG_OVERLOADED); } #endif -- cgit v1.2.3 From 4d0a63e5b841c759c9a306aff158420421ef016f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 11:54:42 +0100 Subject: sched/fair: Rename set_rd_overutilized_status() to set_rd_overutilized() The _status() postfix has no real meaning, simplify the naming and harmonize it with set_rd_overloaded(). 
Signed-off-by: Ingo Molnar Cc: Qais Yousef Cc: Shrikanth Hegde Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVHq65XKsOZpfgK@gmail.com --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 839a97a4ba2a..f29efd5f19f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6693,7 +6693,7 @@ static inline int is_rd_overutilized(struct root_domain *rd) return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } -static inline void set_rd_overutilized_status(struct root_domain *rd, +static inline void set_rd_overutilized(struct root_domain *rd, unsigned int status) { if (!sched_energy_enabled()) @@ -6711,7 +6711,7 @@ static inline void check_update_overutilized_status(struct rq *rq) */ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) - set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(rq->rd, SG_OVERUTILIZED); } #else static inline void check_update_overutilized_status(struct rq *rq) { } @@ -10660,10 +10660,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized_status(env->dst_rq->rd, + set_rd_overutilized(env->dst_rq->rd, sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, SG_OVERUTILIZED); } update_idle_cpu_scan(env, sum_util); -- cgit v1.2.3 From 4475cd8bfd9bcb898953fcadb2f51b3432eb68a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Mar 2024 12:07:48 +0100 Subject: sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags SG_OVERLOADED and SG_OVERUTILIZED flags plus the sg_status bitmask are an unnecessary complication that only make the code harder to read and slower. We only ever set them separately: thule:~/tip> git grep SG_OVER kernel/sched/ kernel/sched/fair.c: set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); kernel/sched/fair.c: *sg_status |= SG_OVERLOADED; kernel/sched/fair.c: *sg_status |= SG_OVERUTILIZED; kernel/sched/fair.c: *sg_status |= SG_OVERLOADED; kernel/sched/fair.c: set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); kernel/sched/fair.c: sg_status & SG_OVERUTILIZED); kernel/sched/fair.c: } else if (sg_status & SG_OVERUTILIZED) { kernel/sched/fair.c: set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); kernel/sched/sched.h:#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */ kernel/sched/sched.h:#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ kernel/sched/sched.h: set_rd_overloaded(rq->rd, SG_OVERLOADED); And use them separately, which results in suboptimal code: /* update overload indicator if we are at root domain */ set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); /* Update over-utilization (tipping point, U >= 0) indicator */ set_rd_overutilized_status(env->dst_rq->rd, Introduce separate sg_overloaded and sg_overutilized flags in update_sd_lb_stats() and its lower level functions, and change all of them to 'bool'. Remove the now unused SG_OVERLOADED and SG_OVERUTILIZED flags. 
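A minimal sketch of the resulting interface shape (illustrative only, not kernel code; the group_stats type and the ~80% utilization threshold are invented): each condition gets its own bool out-parameter instead of a bit in a shared status word:

#include <stdbool.h>
#include <stdio.h>

struct group_stats {
	unsigned int nr_running;	/* runnable tasks in the group */
	unsigned int util;		/* aggregated utilization */
	unsigned int capacity;		/* aggregated capacity */
};

/* One bool out-parameter per condition instead of a shared bitmask. */
static void update_group_stats(const struct group_stats *gs,
			       bool *sg_overloaded, bool *sg_overutilized)
{
	if (gs->nr_running > 1)
		*sg_overloaded = true;
	if (gs->util * 5 > gs->capacity * 4)	/* above ~80% of capacity */
		*sg_overutilized = true;
}

int main(void)
{
	struct group_stats groups[] = {
		{ .nr_running = 1, .util = 300, .capacity = 1024 },
		{ .nr_running = 3, .util = 900, .capacity = 1024 },
	};
	bool overloaded = false, overutilized = false;

	for (unsigned int i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
		update_group_stats(&groups[i], &overloaded, &overutilized);

	printf("overloaded=%d overutilized=%d\n", overloaded, overutilized);
	return 0;
}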
Signed-off-by: Ingo Molnar Acked-by: Shrikanth Hegde Tested-by: Shrikanth Hegde Cc: Qais Yousef Cc: Vincent Guittot Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZgVPhODZ8/nbsqbP@gmail.com --- kernel/sched/fair.c | 36 ++++++++++++++++++------------------ kernel/sched/sched.h | 17 ++++++----------- 2 files changed, 24 insertions(+), 29 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f29efd5f19f6..1dd37168da50 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6688,19 +6688,18 @@ static inline bool cpu_overutilized(int cpu) /* * overutilized value make sense only if EAS is enabled */ -static inline int is_rd_overutilized(struct root_domain *rd) +static inline bool is_rd_overutilized(struct root_domain *rd) { return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } -static inline void set_rd_overutilized(struct root_domain *rd, - unsigned int status) +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { if (!sched_energy_enabled()) return; - WRITE_ONCE(rd->overutilized, status); - trace_sched_overutilized_tp(rd, !!status); + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); } static inline void check_update_overutilized_status(struct rq *rq) @@ -6711,7 +6710,7 @@ static inline void check_update_overutilized_status(struct rq *rq) */ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) - set_rd_overutilized(rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(rq->rd, 1); } #else static inline void check_update_overutilized_status(struct rq *rq) { } @@ -9934,13 +9933,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. 
- * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { int i, nr_running, local_group; @@ -9961,10 +9962,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (nr_running > 1) - *sg_status |= SG_OVERLOADED; + *sg_overloaded = 1; if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -9986,7 +9987,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOADED; + *sg_overloaded = 1; } } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ @@ -10612,7 +10613,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; unsigned long sum_util = 0; - int sg_status = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -10628,7 +10629,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -10657,13 +10658,12 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized(env->dst_rq->rd, - sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - set_rd_overutilized(env->dst_rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overloaded); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } update_idle_cpu_scan(env, sum_util); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 07c6669b8250..7c39dbf31f75 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -713,7 +713,7 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ @@ -757,7 +757,7 @@ struct dl_rq { u64 next; } earliest_dl; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -850,10 +850,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. 
Each exclusive cpuset essentially defines an island domain by @@ -874,10 +870,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overloaded; + bool overloaded; /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -2540,9 +2536,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - set_rd_overloaded(rq->rd, SG_OVERLOADED); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); -- cgit v1.2.3 From 89d6910cc562ab34d1f1c08f3cf0a9700b8bf2c4 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 10 Apr 2024 17:09:45 +0200 Subject: sched/vtime: Get rid of generic vtime_task_switch() implementation The generic vtime_task_switch() implementation gets built only if __ARCH_HAS_VTIME_TASK_SWITCH is not defined, but requires an architecture to implement arch_vtime_task_switch() callback at the same time, which is confusing. Further, arch_vtime_task_switch() is implemented for 32-bit PowerPC architecture only and vtime_task_switch() generic variant is rather superfluous. Simplify the whole vtime_task_switch() wiring by moving the existing generic implementation to PowerPC. Signed-off-by: Alexander Gordeev Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Reviewed-by: Nicholas Piggin Acked-by: Michael Ellerman Link: https://lore.kernel.org/r/2cb6e3caada93623f6d4f78ad938ac6cd0e2fda8.1712760275.git.agordeev@linux.ibm.com --- arch/powerpc/include/asm/cputime.h | 13 ------------- arch/powerpc/kernel/time.c | 22 ++++++++++++++++++++++ kernel/sched/cputime.c | 13 ------------- 3 files changed, 22 insertions(+), 26 deletions(-) (limited to 'kernel/sched') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 4961fb38e438..aff858ca99c0 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,23 +32,10 @@ #ifdef CONFIG_PPC64 #define get_accounting(tsk) (&get_paca()->accounting) #define raw_get_accounting(tsk) (&local_paca->accounting) -static inline void arch_vtime_task_switch(struct task_struct *tsk) { } #else #define get_accounting(tsk) (&task_thread_info(tsk)->accounting) #define raw_get_accounting(tsk) get_accounting(tsk) -/* - * Called from the context switch with interrupts disabled, to charge all - * accumulated times to the current process, and to prepare accounting on - * the next process. - */ -static inline void arch_vtime_task_switch(struct task_struct *prev) -{ - struct cpu_accounting_data *acct = get_accounting(current); - struct cpu_accounting_data *acct0 = get_accounting(prev); - - acct->starttime = acct0->starttime; -} #endif /* diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index df20cf201f74..c0fdc6d94fee 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -354,6 +354,28 @@ void vtime_flush(struct task_struct *tsk) acct->hardirq_time = 0; acct->softirq_time = 0; } + +/* + * Called from the context switch with interrupts disabled, to charge all + * accumulated times to the current process, and to prepare accounting on + * the next process. 
+ */ +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_kernel(prev); + + vtime_flush(prev); + + if (!IS_ENABLED(CONFIG_PPC64)) { + struct cpu_accounting_data *acct = get_accounting(current); + struct cpu_accounting_data *acct0 = get_accounting(prev); + + acct->starttime = acct0->starttime; + } +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ void __no_kcsan __delay(unsigned long loops) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index af7952f12e6c..aa48b2ec879d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -# ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_kernel(prev); - - vtime_flush(prev); - arch_vtime_task_switch(prev); -} -# endif - void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { unsigned int pc = irq_count() - offset; -- cgit v1.2.3 From f532376e881fb370bc2901a9e1cf0b7e461638ad Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:21 +0200 Subject: scheduler: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) rm sentinel element from ctl_table arrays Acked-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Reviewed-by: Valentin Schneider Signed-off-by: Joel Granados --- kernel/sched/autogroup.c | 1 - kernel/sched/core.c | 1 - kernel/sched/deadline.c | 1 - kernel/sched/fair.c | 1 - kernel/sched/rt.c | 1 - kernel/sched/topology.c | 1 - 6 files changed, 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 991fc9002535..db68a964e34e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -19,7 +19,6 @@ static struct ctl_table sched_autogroup_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init sched_autogroup_sysctl_init(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7019a40457a6..7ce76620a308 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4741,7 +4741,6 @@ static struct ctl_table sched_core_sysctls[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_core_sysctl_init(void) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc..c75d1307d86d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = { .proc_handler = proc_douintvec_minmax, .extra2 = (void *)&sysctl_sched_dl_period_max, }, - {} }; static int __init sched_dl_sysctl_init(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03be0d1330a6..4ac2cf7a918e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -157,7 +157,6 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_fair_sysctl_init(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 
3261b067b67e..aa4c1c874fa4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -56,7 +56,6 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, - {} }; static int __init sched_rt_sysctl_init(void) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..42c22648d124 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -322,7 +322,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sched_energy_aware_sysctl_init(void) -- cgit v1.2.3 From cd18bec668bb6221a54f03d0b645b7aed841f825 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 4 Apr 2024 17:57:38 +0200 Subject: sched/fair: Fix update of rd->sg_overutilized sg_overloaded is used instead of sg_overutilized to update rd->sg_overutilized. Fixes: 4475cd8bfd9b ("sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240404155738.2866102-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1dd37168da50..bb1ae4ed0d91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10661,7 +10661,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized(env->dst_rq->rd, sg_overloaded); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } else if (sg_overutilized) { set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } -- cgit v1.2.3 From f1f8d0a224227e4f19a8a8881dc6355bdfc51230 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Mar 2024 10:16:13 +0100 Subject: sched/cpufreq: Take cpufreq feedback into account Aggregate the different pressures applied on the capacity of CPUs and create a new function that returns the actual capacity of the CPU: get_actual_cpu_capacity(). Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20240326091616.3696851-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb1ae4ed0d91..19199c119829 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4965,13 +4965,22 @@ done: trace_sched_util_est_se_tp(&p->se); } +static inline unsigned long get_actual_cpu_capacity(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + static inline int util_fits_cpu(unsigned long util, unsigned long uclamp_min, unsigned long uclamp_max, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; bool fits, uclamp_max_fits; /* @@ -5003,7 +5012,6 @@ static inline int util_fits_cpu(unsigned long util, * goal is to cap the task. So it's okay if it's getting less. 
*/ capacity_orig = arch_scale_cpu_capacity(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -5078,7 +5086,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) return -1; return fits; @@ -7494,7 +7503,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = get_actual_cpu_capacity(cpu); /* * First, select CPU which fits better (-1 being better than 0). @@ -7987,8 +7996,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; int prev_fits = -1, best_fits = -1; - unsigned long best_thermal_cap = 0; - unsigned long prev_thermal_cap = 0; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -8018,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; - unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util; long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; unsigned long cur_delta, base_energy; @@ -8030,18 +8039,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpumask_empty(cpus)) continue; - /* Account thermal pressure for the energy estimation */ + /* Account external pressure for the energy estimation */ cpu = cpumask_first(cpus); - cpu_thermal_cap = arch_scale_cpu_capacity(cpu); - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + cpu_actual_cap = get_actual_cpu_capacity(cpu); - eenv.cpu_cap = cpu_thermal_cap; + eenv.cpu_cap = cpu_actual_cap; eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { struct rq *rq = cpu_rq(cpu); - eenv.pd_cap += cpu_thermal_cap; + eenv.pd_cap += cpu_actual_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; @@ -8112,7 +8120,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; - prev_thermal_cap = cpu_thermal_cap; + prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); } @@ -8127,7 +8135,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * but best energy cpu has better capacity. 
*/ if ((max_fits < 0) && - (cpu_thermal_cap <= best_thermal_cap)) + (cpu_actual_cap <= best_actual_cap)) continue; cur_delta = compute_energy(&eenv, pd, cpus, p, @@ -8148,14 +8156,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; best_fits = max_fits; - best_thermal_cap = cpu_thermal_cap; + best_actual_cap = cpu_actual_cap; } } rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) target = best_energy_cpu; return target; @@ -9560,8 +9568,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -9573,12 +9581,9 @@ static unsigned long scale_rt_capacity(int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. - * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ used = cpu_util_rt(rq); used += cpu_util_dl(rq); - used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; -- cgit v1.2.3 From d4dbc991714eefcbd8d54a3204bd77a0a52bd32d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Mar 2024 10:16:15 +0100 Subject: sched/cpufreq: Rename arch_update_thermal_pressure() => arch_update_hw_pressure() Now that cpufreq provides a pressure value to the scheduler, rename arch_update_thermal_pressure into HW pressure to reflect that it returns a pressure applied by HW (i.e. with a high frequency change) and not always related to thermal mitigation but also generated by max current limitation as an example. Such high frequency signal needs filtering to be smoothed and provide an value that reflects the average available capacity into the scheduler time scale. 
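A small illustrative sketch of the 'actual capacity' idea behind get_actual_cpu_capacity() from the previous patch (not the kernel implementation; all numbers are made up): the architectural CPU capacity is reduced by the larger of the averaged HW pressure and the cpufreq pressure:

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

/* Architectural capacity minus the larger of the two pressure signals. */
static unsigned long actual_cpu_capacity(unsigned long arch_capacity,
					 unsigned long hw_pressure,
					 unsigned long cpufreq_pressure)
{
	return arch_capacity - max_ul(hw_pressure, cpufreq_pressure);
}

int main(void)
{
	/* A big CPU (capacity 1024), HW-throttled by ~20%, with a smaller
	 * cpufreq cap active at the same time: only the larger reduction
	 * is applied. */
	printf("actual capacity: %lu\n", actual_cpu_capacity(1024, 205, 128));
	return 0;
}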
Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Qais Yousef Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20240326091616.3696851-5-vincent.guittot@linaro.org --- arch/arm/include/asm/topology.h | 6 +++--- arch/arm64/include/asm/topology.h | 6 +++--- drivers/base/arch_topology.c | 26 +++++++++++++------------- drivers/cpufreq/qcom-cpufreq-hw.c | 4 ++-- include/linux/arch_topology.h | 8 ++++---- include/linux/sched/topology.h | 8 ++++---- include/trace/events/hw_pressure.h | 29 +++++++++++++++++++++++++++++ include/trace/events/sched.h | 2 +- include/trace/events/thermal_pressure.h | 29 ----------------------------- init/Kconfig | 12 ++++++------ kernel/sched/core.c | 8 ++++---- kernel/sched/fair.c | 16 ++++++++-------- kernel/sched/pelt.c | 18 +++++++++--------- kernel/sched/pelt.h | 16 ++++++++-------- kernel/sched/sched.h | 10 +++++----- 15 files changed, 99 insertions(+), 99 deletions(-) create mode 100644 include/trace/events/hw_pressure.h delete mode 100644 include/trace/events/thermal_pressure.h (limited to 'kernel/sched') diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 853c4f81ba4a..ad36b6570067 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -22,9 +22,9 @@ /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure API */ -#define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_update_thermal_pressure topology_update_thermal_pressure +/* Replace task scheduler's default HW pressure API */ +#define arch_scale_hw_pressure topology_get_hw_pressure +#define arch_update_hw_pressure topology_update_hw_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a323b109b9c4..0f6ef432fb84 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -35,9 +35,9 @@ void update_freq_counters_refs(void); /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure API */ -#define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_update_thermal_pressure topology_update_thermal_pressure +/* Replace task scheduler's default HW pressure API */ +#define arch_scale_hw_pressure topology_get_hw_pressure +#define arch_update_hw_pressure topology_update_hw_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 024b78a0cfc1..0248912ff687 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,7 +22,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) per_cpu(cpu_scale, cpu) = capacity; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); +DEFINE_PER_CPU(unsigned long, hw_pressure); /** - * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * topology_update_hw_pressure() - Update HW pressure for CPUs * @cpus : The related CPUs for which capacity has been reduced * @capped_freq : The maximum allowed frequency that CPUs can run at * - * Update the value of thermal pressure for all @cpus in the mask. The + * Update the value of HW pressure for all @cpus in the mask. 
The * cpumask should include all (online+offline) affected CPUs, to avoid * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to - * thermal capping. It might be also a boost frequency value, which is bigger + * HW capping. It might be also a boost frequency value, which is bigger * than the internal 'capacity_freq_ref' max frequency. In such case the * pressure value should simply be removed, since this is an indication that - * there is no thermal throttling. The @capped_freq must be provided in kHz. + * there is no HW throttling. The @capped_freq must be provided in kHz. */ -void topology_update_thermal_pressure(const struct cpumask *cpus, +void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq) { - unsigned long max_capacity, capacity, th_pressure; + unsigned long max_capacity, capacity, hw_pressure; u32 max_freq; int cpu; @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, /* * Handle properly the boost frequencies, which should simply clean - * the thermal pressure value. + * the HW pressure value. */ if (max_freq <= capped_freq) capacity = max_capacity; else capacity = mult_frac(max_capacity, capped_freq, max_freq); - th_pressure = max_capacity - capacity; + hw_pressure = max_capacity - capacity; - trace_thermal_pressure_update(cpu, th_pressure); + trace_hw_pressure_update(cpu, hw_pressure); for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); } -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); +EXPORT_SYMBOL_GPL(topology_update_hw_pressure); static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 70b0f21968a0..ec8df5496a0c 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) throttled_freq = freq_hz / HZ_PER_KHZ; - /* Update thermal pressure (the boost frequencies are accepted) */ - arch_update_thermal_pressure(policy->related_cpus, throttled_freq); + /* Update HW pressure (the boost frequencies are accepted) */ + arch_update_hw_pressure(policy->related_cpus, throttled_freq); /* * In the unlikely case policy is unregistered do not enable diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a63d61ca55af..b721f360d759 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -60,14 +60,14 @@ void topology_scale_freq_tick(void); void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); -DECLARE_PER_CPU(unsigned long, thermal_pressure); +DECLARE_PER_CPU(unsigned long, hw_pressure); -static inline unsigned long topology_get_thermal_pressure(int cpu) +static inline unsigned long topology_get_hw_pressure(int cpu) { - return per_cpu(thermal_pressure, cpu); + return per_cpu(hw_pressure, cpu); } -void topology_update_thermal_pressure(const struct cpumask *cpus, +void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq); struct cpu_topology { diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c8fe9bab981b..4237daa5ac7a 100644 --- a/include/linux/sched/topology.h +++ 
b/include/linux/sched/topology.h @@ -270,17 +270,17 @@ unsigned long arch_scale_cpu_capacity(int cpu) } #endif -#ifndef arch_scale_thermal_pressure +#ifndef arch_scale_hw_pressure static __always_inline -unsigned long arch_scale_thermal_pressure(int cpu) +unsigned long arch_scale_hw_pressure(int cpu) { return 0; } #endif -#ifndef arch_update_thermal_pressure +#ifndef arch_update_hw_pressure static __always_inline -void arch_update_thermal_pressure(const struct cpumask *cpus, +void arch_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_frequency) { } #endif diff --git a/include/trace/events/hw_pressure.h b/include/trace/events/hw_pressure.h new file mode 100644 index 000000000000..b9cd68854128 --- /dev/null +++ b/include/trace/events/hw_pressure.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hw_pressure + +#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_THERMAL_PRESSURE_H + +#include + +TRACE_EVENT(hw_pressure_update, + TP_PROTO(int cpu, unsigned long hw_pressure), + TP_ARGS(cpu, hw_pressure), + + TP_STRUCT__entry( + __field(unsigned long, hw_pressure) + __field(int, cpu) + ), + + TP_fast_assign( + __entry->hw_pressure = hw_pressure; + __entry->cpu = cpu; + ), + + TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure) +); +#endif /* _TRACE_THERMAL_PRESSURE_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index dbb01b4b7451..d115d64c4011 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_thermal_tp, +DECLARE_TRACE(pelt_hw_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/thermal_pressure.h deleted file mode 100644 index b68680201360..000000000000 --- a/include/trace/events/thermal_pressure.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM thermal_pressure - -#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_THERMAL_PRESSURE_H - -#include - -TRACE_EVENT(thermal_pressure_update, - TP_PROTO(int cpu, unsigned long thermal_pressure), - TP_ARGS(cpu, thermal_pressure), - - TP_STRUCT__entry( - __field(unsigned long, thermal_pressure) - __field(int, cpu) - ), - - TP_fast_assign( - __entry->thermal_pressure = thermal_pressure; - __entry->cpu = cpu; - ), - - TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure) -); -#endif /* _TRACE_THERMAL_PRESSURE_H */ - -/* This part must be outside protection */ -#include diff --git a/init/Kconfig b/init/Kconfig index aa02aec6aa7d..f0c9117962ec 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -547,24 +547,24 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP -config SCHED_THERMAL_PRESSURE +config SCHED_HW_PRESSURE bool default y if ARM && ARM_CPU_TOPOLOGY default y if ARM64 depends on SMP depends on CPU_FREQ_THERMAL help - Select this option to enable thermal pressure accounting in the - scheduler. Thermal pressure is the value conveyed to the scheduler + Select this option to enable HW pressure accounting in the + scheduler. HW pressure is the value conveyed to the scheduler that reflects the reduction in CPU compute capacity resulted from - thermal throttling. 
Thermal throttling occurs when the performance of - a CPU is capped due to high operating temperatures. + HW throttling. HW throttling occurs when the performance of + a CPU is capped due to high operating temperatures as an example. If selected, the scheduler will be able to balance tasks accordingly, i.e. put less load on throttled CPUs than on non/less throttled ones. This requires the architecture to implement - arch_update_thermal_pressure() and arch_scale_thermal_pressure(). + arch_update_hw_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT bool "BSD Process Accounting" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0621e4ee31de..67a8302c3131 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -5668,7 +5668,7 @@ void sched_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; - unsigned long thermal_pressure; + unsigned long hw_pressure; u64 resched_latency; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) @@ -5679,8 +5679,8 @@ void sched_tick(void) rq_lock(rq, &rf); update_rq_clock(rq); - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19199c119829..eef39ae3efcf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,7 +78,7 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; +int sched_hw_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { int _shift = 0; @@ -86,7 +86,7 @@ static int __init setup_sched_thermal_decay_shift(char *str) if (kstrtoint(str, 0, &_shift)) pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - sched_thermal_decay_shift = clamp(_shift, 0, 10); + sched_hw_decay_shift = clamp(_shift, 0, 10); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -4969,7 +4969,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); return capacity; } @@ -5002,7 +5002,7 @@ static inline int util_fits_cpu(unsigned long util, * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * - * Only exception is for thermal pressure since it has a direct impact + * Only exception is for HW or cpufreq pressure since it has a direct impact * on available OPP of the system. 
* * We honour it for uclamp_min only as a drop in performance level @@ -9324,7 +9324,7 @@ static inline bool others_have_blocked(struct rq *rq) if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; if (cpu_util_irq(rq)) @@ -9354,7 +9354,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; + unsigned long hw_pressure; bool decayed; /* @@ -9363,11 +9363,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 3a96da25b67c..ef00382de595 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#ifdef CONFIG_SCHED_THERMAL_PRESSURE +#ifdef CONFIG_SCHED_HW_PRESSURE /* - * thermal: + * hardware: * * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked * * util_avg and runnable_load_avg are not supported and meaningless. * * Unlike rt/dl utilization tracking that track time spent by a cpu - * running a rt/dl task through util_avg, the average thermal pressure is - * tracked through load_avg. This is because thermal pressure signal is + * running a rt/dl task through util_avg, the average HW pressure is + * tracked through load_avg. This is because HW pressure signal is * time weighted "delta" capacity unlike util_avg which is binary. * "delta capacity" = actual capacity - - * capped capacity a cpu due to a thermal event. + * capped capacity a cpu due to a HW event. 
*/ -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { - if (___update_load_sum(now, &rq->avg_thermal, + if (___update_load_sum(now, &rq->avg_hw, capacity, capacity, capacity)) { - ___update_load_avg(&rq->avg_thermal, 1); - trace_pelt_thermal_tp(rq); + ___update_load_avg(&rq->avg_hw, 1); + trace_pelt_hw_tp(rq); return 1; } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9e1083465fbc..2150062949d4 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#ifdef CONFIG_SCHED_THERMAL_PRESSURE -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +#ifdef CONFIG_SCHED_HW_PRESSURE +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { - return READ_ONCE(rq->avg_thermal.load_avg); + return READ_ONCE(rq->avg_hw.load_avg); } #else static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7c39dbf31f75..993edb02fb0d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1108,8 +1108,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; @@ -1561,11 +1561,11 @@ static inline u64 rq_clock_task(struct rq *rq) * 3 256 * 4 512 */ -extern int sched_thermal_decay_shift; +extern int sched_hw_decay_shift; -static inline u64 rq_clock_thermal(struct rq *rq) +static inline u64 rq_clock_hw(struct rq *rq) { - return rq_clock_task(rq) >> sched_thermal_decay_shift; + return rq_clock_task(rq) >> sched_hw_decay_shift; } static inline void rq_clock_skip_update(struct rq *rq) -- cgit v1.2.3 From 97450eb909658573dcacc1063b06d3d08642c0c1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Mar 2024 10:16:16 +0100 Subject: sched/pelt: Remove shift of thermal clock The optional shift of the clock used by thermal/hw load avg has been introduced to handle case where the signal was not always a high frequency hw signal. Now that cpufreq provides a signal for firmware and SW pressure, we can remove this exception and always keep this PELT signal aligned with other signals. 
Mark sysctl_sched_migration_cost boot parameter as deprecated Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Qais Yousef Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20240326091616.3696851-6-vincent.guittot@linaro.org --- Documentation/admin-guide/kernel-parameters.txt | 1 + kernel/sched/core.c | 2 +- kernel/sched/fair.c | 10 ++-------- kernel/sched/sched.h | 18 ------------------ 4 files changed, 4 insertions(+), 27 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..3f390cc5f77e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5807,6 +5807,7 @@ but is useful for debugging and performance tuning. sched_thermal_decay_shift= + [Deprecated] [KNL, SMP] Set a decay shift for scheduler thermal pressure signal. Thermal pressure signal follows the default decay period of other scheduler pelt diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67a8302c3131..1a914388144a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5680,7 +5680,7 @@ void sched_tick(void) update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); - update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eef39ae3efcf..9eb63573110c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_hw_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_hw_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -9367,7 +9361,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) | + update_hw_load_avg(now, rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 993edb02fb0d..cb3792c04eea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1550,24 +1550,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. 
- * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_hw_decay_shift; - -static inline u64 rq_clock_hw(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_hw_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_rq_held(rq); -- cgit v1.2.3 From 05037e5f0f17935a86861f9610f941ebf346a95e Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Wed, 10 Apr 2024 16:33:11 -0500 Subject: sched/topology: Optimize topology_span_sane() Optimize topology_span_sane() by removing duplicate comparisons. Since topology_span_sane() is called inside of for_each_cpu(), each previous CPU has already been compared against every other CPU. The current CPU only needs to be compared against higher-numbered CPUs. The total number of comparisons is reduced from N * (N - 1) to N * (N - 1) / 2 on each non-NUMA scheduling domain level. Signed-off-by: Kyle Meyer Reviewed-by: Yury Norov Acked-by: Vincent Guittot Signed-off-by: Yury Norov --- kernel/sched/topology.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..b6bcafc09969 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2347,7 +2347,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve static bool topology_span_sane(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, int cpu) { - int i; + int i = cpu + 1; /* NUMA levels are allowed to overlap */ if (tl->flags & SDTL_OVERLAP) @@ -2359,9 +2359,7 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl, * breaking the sched_group lists - i.e. a later get_group() pass * breaks the linking done for an earlier span. */ - for_each_cpu(i, cpu_map) { - if (i == cpu) - continue; + for_each_cpu_from(i, cpu_map) { /* * We should 'and' all those masks with 'cpu_map' to exactly * match the topology we're about to build, but that can only -- cgit v1.2.3 From a1fd0b9d751f840df23ef0e75b691fc00cfd4743 Mon Sep 17 00:00:00 2001 From: Vitalii Bursov Date: Tue, 30 Apr 2024 18:05:23 +0300 Subject: sched/fair: Allow disabling sched_balance_newidle with sched_relax_domain_level Change relax_domain_level checks so that it would be possible to include or exclude all domains from newidle balancing. This matches the behavior described in the documentation: -1 no request. use system default or follow request of others. 0 no search. 1 search siblings (hyperthreads in a core). "2" enables levels 0 and 1, level_max excludes the last (level_max) level, and level_max+1 includes all levels. 
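A rough usage sketch (hypothetical machine with three scheduling-domain levels, SMT=0, MC=1 and NUMA=2, so sched_domain_level_max == 2; legacy cpuset mount at /sys/fs/cgroup/cpuset assumed):

  # echo 2 > /sys/fs/cgroup/cpuset/cpuset.sched_relax_domain_level   # newidle balancing kept on SMT and MC only
  # echo 3 > /sys/fs/cgroup/cpuset/cpuset.sched_relax_domain_level   # level_max+1: newidle balancing kept on all levels

Before this change both writes above were rejected with -EINVAL, since only values below level_max were accepted.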
Fixes: 1d3504fcf560 ("sched, cpuset: customize sched domains, core")
Signed-off-by: Vitalii Bursov
Signed-off-by: Ingo Molnar
Tested-by: Dietmar Eggemann
Reviewed-by: Vincent Guittot
Reviewed-by: Valentin Schneider
Link: https://lore.kernel.org/r/bd6de28e80073c79466ec6401cdeae78f0d4423d.1714488502.git.vitaly@bursov.com
---
 kernel/cgroup/cpuset.c  | 2 +-
 kernel/sched/topology.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4237c8748715..da24187c4e02 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2948,7 +2948,7 @@ bool current_cpuset_is_being_rebound(void)
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
-	if (val < -1 || val >= sched_domain_level_max)
+	if (val < -1 || val > sched_domain_level_max + 1)
 		return -EINVAL;
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 63aecd2a7a9f..67a777b31743 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1475,7 +1475,7 @@ static void set_domain_attribute(struct sched_domain *sd,
 	} else
 		request = attr->relax_domain_level;
-	if (sd->level > request) {
+	if (sd->level >= request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
-- cgit v1.2.3


From 287372fa39f579a61e17b000aa74c8418d230528 Mon Sep 17 00:00:00 2001
From: Vitalii Bursov
Date: Tue, 30 Apr 2024 18:05:24 +0300
Subject: sched/debug: Dump domains' level

Knowing a domain's level exactly can be useful when setting
relax_domain_level or cpuset.sched_relax_domain_level.

Usage:

  cat /debug/sched/domains/cpu0/domain1/level

to dump cpu0 domain1's level.

The SDM macro is not used because sd->level is 'int' and it would hide
the type mismatch between 'int' and 'u32'.

Signed-off-by: Vitalii Bursov
Signed-off-by: Ingo Molnar
Tested-by: Dietmar Eggemann
Acked-by: Vincent Guittot
Reviewed-by: Valentin Schneider
Link: https://lore.kernel.org/r/9489b6475f6dd6fbc67c617752d4216fa094da53.1714488502.git.vitaly@bursov.com
---
 kernel/sched/debug.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..c1eb9a1afd13 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -425,6 +425,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
 	debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
 	debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
+	debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
 }
 void update_sched_domain_debugfs(void)
-- cgit v1.2.3


From 72bffbf57c5247ac6146d1103ef42e9f8d094bc8 Mon Sep 17 00:00:00 2001
From: Dawei Li
Date: Thu, 14 Mar 2024 18:59:16 -0700
Subject: sched/fair: Fix initial util_avg calculation

Change se->load.weight to se_weight(se) in the calculation for the
initial util_avg, to avoid unnecessarily inflating the util_avg by 1024
times.

The reason is that se->load.weight has the same unit/scale as the
scaled-up load, while cfs_rq->avg.load_avg has the same unit/scale as
the true task weight (as mapped directly from the task's nice/priority
value). With CONFIG_32BIT, the scaled-up load is equal to the true task
weight. With CONFIG_64BIT, the scaled-up load is 1024 times the true
task weight. Thus, the current code may inflate the util_avg by 1024
times. The follow-up capping will not allow the util_avg value to go
wild, but the calculation should have the correct logic.
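A rough worked example of the scale mismatch (assuming a nice-0 task on a 64-bit kernel, where scale_load() multiplies the nice-0 weight of 1024 by 1024, and a cfs_rq with avg.util_avg = 512 and avg.load_avg = 1023):

  se->load.weight = 1024 * 1024 = 1048576      /* scaled-up load */
  se_weight(se)   = 1024                       /* true task weight */

  before: util_avg = 512 * 1048576 / (1023 + 1) = 524288   /* only brought back down by the cap */
  after:  util_avg = 512 * 1024    / (1023 + 1) = 512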
Signed-off-by: Dawei Li Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Vishal Chourasia Link: https://lore.kernel.org/r/20240315015916.21545-1-daweilics@gmail.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 146ecf9cc3af..900978741a81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1031,7 +1031,8 @@ void init_entity_runnable_average(struct sched_entity *se) * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -1078,7 +1079,7 @@ void post_init_entity_util_avg(struct task_struct *p) if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); sa->util_avg /= (cfs_rq->avg.load_avg + 1); if (sa->util_avg > cap) -- cgit v1.2.3 From 7cb7fb5b49399fc59f1c44686d82c0df0776c8c6 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 5 Mar 2024 15:18:20 +0000 Subject: sched/fair: Remove stale FREQUENCY_UTIL comment On 05/03/2024 15:05, Vincent Guittot wrote: I'm fine with either and that was my first thought here, too, but it did seem like the comment was mostly placed there to justify the 'unexpected' high utilization when explicitly passing FREQUENCY_UTIL and the need to clamp it then. So removing did feel slightly more natural to me anyway. So alternatively: From: Christian Loehle Date: Tue, 5 Mar 2024 09:34:41 +0000 Subject: [PATCH] sched/fair: Remove stale FREQUENCY_UTIL mention effective_cpu_util() flags were removed, so remove mentioning of the flag. commit 9c0b4bb7f6303 ("sched/cpufreq: Rework schedutil governor performance estimation") reworked effective_cpu_util() removing enum cpu_util_type. Modify the comment accordingly. Signed-off-by: Christian Loehle Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/0e2833ee-0939-44e0-82a2-520a585a0153@arm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 900978741a81..9744b5036dd8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7900,8 +7900,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. 
*/ eff_util = effective_cpu_util(cpu, util, &min, &max); -- cgit v1.2.3 From 49217ea147df7647cb89161b805c797487783fc0 Mon Sep 17 00:00:00 2001 From: Cheng Yu Date: Wed, 24 Apr 2024 21:24:38 +0800 Subject: sched/core: Fix incorrect initialization of the 'burst' parameter in cpu_max_write() In the cgroup v2 CPU subsystem, assuming we have a cgroup named 'test', and we set cpu.max and cpu.max.burst: # echo 1000000 > /sys/fs/cgroup/test/cpu.max # echo 1000000 > /sys/fs/cgroup/test/cpu.max.burst then we check cpu.max and cpu.max.burst: # cat /sys/fs/cgroup/test/cpu.max 1000000 100000 # cat /sys/fs/cgroup/test/cpu.max.burst 1000000 Next we set cpu.max again and check cpu.max and cpu.max.burst: # echo 2000000 > /sys/fs/cgroup/test/cpu.max # cat /sys/fs/cgroup/test/cpu.max 2000000 100000 # cat /sys/fs/cgroup/test/cpu.max.burst 1000 ... we find that the cpu.max.burst value changed unexpectedly. In cpu_max_write(), the unit of the burst value returned by tg_get_cfs_burst() is microseconds, while in cpu_max_write(), the burst unit used for calculation should be nanoseconds, which leads to the bug. To fix it, get the burst value directly from tg->cfs_bandwidth.burst. Fixes: f4183717b370 ("sched/fair: Introduce the burstable CFS controller") Reported-by: Qixin Liao Signed-off-by: Cheng Yu Signed-off-by: Zhang Qiao Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Tested-by: Vincent Guittot Link: https://lore.kernel.org/r/20240424132438.514720-1-serein.chengyu@huawei.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a914388144a..f88f50552acb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11402,7 +11402,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); - u64 burst = tg_get_cfs_burst(tg); + u64 burst = tg->cfs_bandwidth.burst; u64 quota; int ret; -- cgit v1.2.3
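To see the unit mix-up in isolation, here is a minimal user-space sketch (illustrative names only, not the actual kernel API; the real code keeps the burst in tg->cfs_bandwidth.burst in nanoseconds):

  /* cc -o burst-demo burst-demo.c */
  #include <stdio.h>

  #define NSEC_PER_USEC 1000ULL

  /* internal bookkeeping in nanoseconds, like tg->cfs_bandwidth.burst */
  static unsigned long long cfs_burst_ns;

  /* like tg_get_cfs_burst(): reports the burst in microseconds */
  static unsigned long long get_cfs_burst_us(void)
  {
          return cfs_burst_ns / NSEC_PER_USEC;
  }

  /* like the tg_set_cfs_bandwidth() path: expects the burst in nanoseconds */
  static void set_cfs_bandwidth_burst(unsigned long long burst_ns)
  {
          cfs_burst_ns = burst_ns;
  }

  int main(void)
  {
          /* user writes 1000000 (us) to cpu.max.burst */
          cfs_burst_ns = 1000000ULL * NSEC_PER_USEC;

          /* buggy cpu_max_write(): feeds the microsecond value back in as nanoseconds */
          set_cfs_bandwidth_burst(get_cfs_burst_us());
          printf("buggy: cpu.max.burst = %llu\n", get_cfs_burst_us());   /* prints 1000 */

          /* fixed cpu_max_write(): passes the nanosecond value through unchanged */
          cfs_burst_ns = 1000000ULL * NSEC_PER_USEC;
          set_cfs_bandwidth_burst(cfs_burst_ns);
          printf("fixed: cpu.max.burst = %llu\n", get_cfs_burst_us());   /* prints 1000000 */

          return 0;
  }

With the fix, updating cpu.max leaves the previously configured cpu.max.burst untouched instead of shrinking it by a factor of 1000.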