author    Paolo Bonzini <pbonzini@redhat.com>    2025-07-28 11:03:04 -0400
committer Paolo Bonzini <pbonzini@redhat.com>    2025-07-29 08:35:46 -0400
commit    f02b1bcc73a17602903480562571069f0dff9f24 (patch)
tree      325828b3a747b77a069cde7e0df2f949152ac081 /kernel/sched
parent    Merge tag 'kvm-riscv-6.17-2' of https://github.com/kvm-riscv/linux into HEAD (diff)
parent    KVM: selftests: Add CONFIG_EVENTFD for irqfd selftest (diff)
Merge tag 'kvm-x86-irqs-6.17' of https://github.com/kvm-x86/linux into HEAD
KVM IRQ changes for 6.17

 - Rework irqbypass to track/match producers and consumers via an xarray instead of a linked list. Using a linked list leads to O(n^2) insertion times, which is hugely problematic for use cases that create large numbers of VMs. Such use cases typically don't actually use irqbypass, but eliminating the pointless registration is a future problem to solve as it likely requires new uAPI.
 - Track irqbypass's "token" as "struct eventfd_ctx *" instead of a "void *", to avoid making a simple concept unnecessarily difficult to understand.
 - Add CONFIG_KVM_IOAPIC for x86 to allow disabling support for I/O APIC, PIC, and PIT emulation at compile time.
 - Drop x86's irq_comm.c, and move a pile of IRQ related code into irq.c.
 - Fix a variety of flaws and bugs in the AVIC device posted IRQ code.
 - Inhibit AVIC if a vCPU's ID is too big (relative to what hardware supports) instead of rejecting vCPU creation.
 - Extend enable_ipiv module param support to SVM, by simply leaving IsRunning clear in the vCPU's physical ID table entry.
 - Disable IPI virtualization, via enable_ipiv, if the CPU is affected by erratum #1235, to allow (safely) enabling AVIC on such CPUs.
 - Dedup x86's device posted IRQ code, as the vast majority of functionality can be shared verbatim between SVM and VMX.
 - Harden the device posted IRQ code against bugs and runtime errors.
 - Use vcpu_idx, not vcpu_id, for GA log tag/metadata, to make lookups O(1) instead of O(n).
 - Generate GA Log interrupts if and only if the target vCPU is blocking, i.e. only if KVM needs a notification in order to wake the vCPU.
 - Decouple device posted IRQs from VFIO device assignment, as binding a VM to a VFIO group is not a requirement for enabling device posted IRQs.
 - Clean up and document/comment the irqfd assignment code.
 - Disallow binding multiple irqfds to an eventfd with a priority waiter, i.e. ensure an eventfd is bound to at most one irqfd throughout the entire host, and add a selftest to verify eventfd:irqfd bindings are globally unique.
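To make the first bullet's xarray rework concrete, here is a minimal sketch of keying consumers by their eventfd token in an xarray instead of walking a global list. The structure and function names, and the use of the token pointer as the xarray index, are assumptions for illustration only, not the upstream irqbypass code.

/*
 * Sketch only: register/match/unregister consumers by eventfd token using an
 * xarray, so each operation is a single indexed insert or lookup rather than
 * an O(n) list walk (O(n^2) across n registrations).
 */
#include <linux/xarray.h>
#include <linux/eventfd.h>

static DEFINE_XARRAY(demo_consumers);		/* hypothetical global registry */

struct demo_consumer {
	struct eventfd_ctx *token;		/* the irqbypass "token" */
	/* ... callbacks, opaque data ... */
};

static int demo_register_consumer(struct demo_consumer *con)
{
	/* xa_insert() returns -EBUSY if the token is already registered. */
	return xa_insert(&demo_consumers, (unsigned long)con->token, con,
			 GFP_KERNEL);
}

static struct demo_consumer *demo_match(struct eventfd_ctx *token)
{
	/* Indexed lookup replaces walking a global linked list. */
	return xa_load(&demo_consumers, (unsigned long)token);
}

static void demo_unregister_consumer(struct demo_consumer *con)
{
	xa_erase(&demo_consumers, (unsigned long)con->token);
}

Note that xa_insert() rejects duplicate indices with -EBUSY, which also yields an "at most one registration per token" property as a side effect.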
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/wait.c  22
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..15632c89c4f2 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -40,13 +40,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
{
unsigned long flags;
- wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ wq_entry->flags |= WQ_FLAG_PRIORITY;
spin_lock_irqsave(&wq_head->lock, flags);
__add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry)
+{
+ struct list_head *head = &wq_head->head;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+ guard(spinlock_irqsave)(&wq_head->lock);
+
+ if (!list_empty(head) &&
+ (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+ return -EBUSY;
+
+ list_add(&wq_entry->entry, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -64,7 +82,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* the non-exclusive tasks. Normally, exclusive tasks will be at the end of
* the list and any non-exclusive tasks will be woken first. A priority task
* may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
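The new add_wait_queue_priority_exclusive() helper added above returns -EBUSY when the wait queue already has a priority entry at its head. Below is a minimal sketch of how a caller (for example, an irqfd-style consumer attaching to an eventfd's wait queue) might use it; the surrounding function name and error handling are illustrative assumptions, not the actual KVM code.

#include <linux/wait.h>
#include <linux/printk.h>

static int demo_attach_priority_waiter(wait_queue_head_t *wqh,
				       wait_queue_entry_t *wait)
{
	int ret;

	/*
	 * add_wait_queue_priority_exclusive() inserts the entry at the head
	 * of the queue and refuses to add a second priority entry, so at
	 * most one such waiter can exist per wait queue (and thus per
	 * eventfd).
	 */
	ret = add_wait_queue_priority_exclusive(wqh, wait);
	if (ret == -EBUSY)
		pr_warn("wait queue already has a priority waiter\n");
	return ret;
}

This is what backs the last bullet of the merge description: binding a second irqfd to an eventfd that already has a priority waiter fails instead of silently stacking waiters.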