aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-27 14:03:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-27 14:03:21 -0700
commited3b7923a816ded62dccef377c9ee346c7d3b1b4 (patch)
tree41d46fc399c231088a370e7b3e488a93342fa681 /kernel/sched/core.c
parentMerge tag 'x86_sgx_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parentsched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() (diff)
downloadlinux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.tar.gz
linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.zip
Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Scheduler SMP load-balancer improvements: - Avoid unnecessary migrations within SMT domains on hybrid systems. Problem: On hybrid CPU systems, (processors with a mixture of higher-frequency SMT cores and lower-frequency non-SMT cores), under the old code lower-priority CPUs pulled tasks from the higher-priority cores if more than one SMT sibling was busy - resulting in many unnecessary task migrations. Solution: The new code improves the load balancer to recognize SMT cores with more than one busy sibling and allows lower-priority CPUs to pull tasks, which avoids superfluous migrations and lets lower-priority cores inspect all SMT siblings for the busiest queue. - Implement the 'runnable boosting' feature in the EAS balancer: consider CPU contention in frequency, EAS max util & load-balance busiest CPU selection. This improves CPU utilization for certain workloads, while leaves other key workloads unchanged. Scheduler infrastructure improvements: - Rewrite the scheduler topology setup code by consolidating it into the build_sched_topology() helper function and building it dynamically on the fly. - Resolve the local_clock() vs. noinstr complications by rewriting the code: provide separate sched_clock_noinstr() and local_clock_noinstr() functions to be used in instrumentation code, and make sure it is all instrumentation-safe. Fixes: - Fix a kthread_park() race with wait_woken() - Fix misc wait_task_inactive() bugs unearthed by the -rt merge: - Fix UP PREEMPT bug by unifying the SMP and UP implementations - Fix task_struct::saved_state handling - Fix various rq clock update bugs, unearthed by turning on the rq clock debugging code. - Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger by creating enough cgroups, by removing the warnign and restricting window size triggers to PSI file write-permission or CAP_SYS_RESOURCE. - Propagate SMT flags in the topology when removing degenerate domain - Fix grub_reclaim() calculation bug in the deadline scheduler code - Avoid resetting the min update period when it is unnecessary, in psi_trigger_destroy(). - Don't balance a task to its current running CPU in load_balance(), which was possible on certain NUMA topologies with overlapping groups. - Fix the sched-debug printing of rq->nr_uninterruptible Cleanups: - Address various -Wmissing-prototype warnings, as a preparation to (maybe) enable this warning in the future. - Remove unused code - Mark more functions __init - Fix shadow-variable warnings" * tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits) sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop() sched/core: Fixed missing rq clock update before calling set_rq_offline() sched/deadline: Update GRUB description in the documentation sched/deadline: Fix bandwidth reclaim equation in GRUB sched/wait: Fix a kthread_park race with wait_woken() sched/topology: Mark set_sched_topology() __init sched/fair: Rename variable cpu_util eff_util arm64/arch_timer: Fix MMIO byteswap sched/fair, cpufreq: Introduce 'runnable boosting' sched/fair: Refactor CPU utilization functions cpuidle: Use local_clock_noinstr() sched/clock: Provide local_clock_noinstr() x86/tsc: Provide sched_clock_noinstr() clocksource: hyper-v: Provide noinstr sched_clock() clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special casing U64_MAX x86/vdso: Fix gettimeofday masking math64: Always inline u128 version of mul_u64_u64_shr() s390/time: Provide sched_clock_noinstr() loongarch: Provide noinstr sched_clock_read() ...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c278
1 files changed, 158 insertions, 120 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..7eb6e2927390 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq);
}
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+#ifdef CONFIG_PREEMPT_RT
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+#endif
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+#ifdef CONFIG_PREEMPT_RT
+ int match;
+
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state() and
+ * current_restore_rtlock_saved_state().
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ match = __task_state_match(p, state);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+#else
+ return __task_state_match(p, state);
+#endif
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+ int running, queued, match;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_on_cpu()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_on_cpu(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
#ifdef CONFIG_SMP
static void
@@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
@@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data)
goto out;
}
- if (task_on_rq_queued(p))
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }
/*
* XXX __migrate_task() can fail, at which point we might end
@@ -3341,114 +3490,6 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * Wait for the thread to block in any of the states set in @match_state.
- * If it changes, i.e. @p might have woken up, then return zero. When we
- * succeed in waiting for @p to be off its CPU, we return a positive number
- * (its total switch count). If a second call a short while later returns the
- * same number, the caller can be sure that @p has remained unscheduled the
- * whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
- int running, queued;
- struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_on_cpu()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_on_cpu(rq, p)) {
- if (!(READ_ONCE(p->__state) & match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_on_cpu(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (READ_ONCE(p->__state) & match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &rf);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(queued)) {
- ktime_t to = NSEC_PER_SEC / HZ;
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
+ int match;
+
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
- if (READ_ONCE(p->__state) & state) {
- *success = 1;
- return true;
- }
+ *success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
/*
@@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* p::saved_state to TASK_RUNNING so any further tests will
* not result in false positives vs. @success
*/
- if (p->saved_state & state) {
+ if (match < 0)
p->saved_state = TASK_RUNNING;
- *success = 1;
- }
#endif
- return false;
+ return match > 0;
}
/*
@@ -9548,6 +9586,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -9689,7 +9728,6 @@ int sched_cpu_deactivate(unsigned int cpu)
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
- update_rq_clock(rq);
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}