diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-14 19:01:29 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-14 19:01:29 +0200 |
commit | 9776dd36095be19f5a0ad9f07a4fc221d2a0609a (patch) | |
tree | bd2bf5c11cf57fc187569670911a51f0b16f2796 /arch/x86/kernel | |
parent | Merge tag 'irq-core-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/... (diff) | |
parent | x86/irq: Use existing helper for pending vector check (diff) | |
download | linux-9776dd36095be19f5a0ad9f07a4fc221d2a0609a.tar.xz linux-9776dd36095be19f5a0ad9f07a4fc221d2a0609a.zip |
Merge tag 'x86-irq-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 interrupt handling updates from Thomas Gleixner:
"Add support for posted interrupts on bare metal.
Posted interrupts is a virtualization feature which allows to inject
interrupts directly into a guest without host interaction. The VT-d
interrupt remapping hardware sets the bit which corresponds to the
interrupt vector in a vector bitmap which is either used to inject the
interrupt directly into the guest via a virtualized APIC or in case
that the guest is scheduled out provides a host side notification
interrupt which informs the host that an interrupt has been marked
pending in the bitmap.
This can be utilized on bare metal for scenarios where multiple
devices, e.g. NVME storage, raise interrupts with a high frequency. In
the default mode these interrupts are handles independently and
therefore require a full roundtrip of interrupt entry/exit.
Utilizing posted interrupts this roundtrip overhead can be avoided by
coalescing these interrupt entries to a single entry for the posted
interrupt notification. The notification interrupt then demultiplexes
the pending bits in a memory based bitmap and invokes the
corresponding device specific handlers.
Depending on the usage scenario and device utilization throughput
improvements between 10% and 130% have been measured.
As this is only relevant for high end servers with multiple device
queues per CPU attached and counterproductive for situations where
interrupts are arriving at distinct times, the functionality is opt-in
via a kernel command line parameter"
* tag 'x86-irq-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/irq: Use existing helper for pending vector check
iommu/vt-d: Enable posted mode for device MSIs
iommu/vt-d: Make posted MSI an opt-in command line option
x86/irq: Extend checks for pending vectors to posted interrupts
x86/irq: Factor out common code for checking pending interrupts
x86/irq: Install posted MSI notification handler
x86/irq: Factor out handler invocation from common_interrupt()
x86/irq: Set up per host CPU posted interrupt descriptors
x86/irq: Reserve a per CPU IDT vector for posted MSIs
x86/irq: Add a Kconfig option for posted MSI
x86/irq: Remove bitfields in posted interrupt descriptor
x86/irq: Unionize PID.PIR for 64bit access w/o casting
KVM: VMX: Move posted interrupt descriptor out of VMX code
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/apic/vector.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/idt.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/irq.c | 172 |
4 files changed, 166 insertions, 17 deletions
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 185738c72766..9eec52925fa3 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) lockdep_assert_held(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { - unsigned int irr, vector = apicd->prev_vector; + unsigned int vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) * fixup_irqs() was just called to scan IRR for set bits and * forward them to new destination CPUs via IPIs. */ - irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; - if (irr & (1U << (vector % 32))) { + if (check_irr && is_vector_pending(vector)) { pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); rearm = true; continue; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cdaa795a9371..2b170da84f97 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -68,6 +68,7 @@ #include <asm/traps.h> #include <asm/sev.h> #include <asm/tdx.h> +#include <asm/posted_intr.h> #include "cpu.h" @@ -2222,6 +2223,8 @@ void cpu_init(void) barrier(); x2apic_setup(); + + intel_posted_msi_init(); } mmgrab(&init_mm); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index fc37c8d83daf..f445bec516a0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = { # endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_POSTED_MSI + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), +# endif #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 35fde0107901..385e3a5fc304 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,8 @@ #include <asm/desc.h> #include <asm/traps.h> #include <asm/thermal.h> +#include <asm/posted_intr.h> +#include <asm/irq_remapping.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -182,6 +184,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_X86_POSTED_MSI + seq_printf(p, "%*s: ", prec, "PMN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->posted_msi_notification_count); + seq_puts(p, " Posted MSI notification event\n"); +#endif return 0; } @@ -240,24 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -/* - * common_interrupt() handles all normal device IRQ's (the special SMP - * cross-CPU interrupts have their own entry points). - */ -DEFINE_IDTENTRY_IRQ(common_interrupt) +static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - - /* entry code tells RCU that we're not quiescent. Check it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + int ret = 0; desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - apic_eoi(); - + ret = -EINVAL; if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), @@ -267,6 +268,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) } } + return ret; +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + if (unlikely(call_irq_handler(vector, regs))) + apic_eoi(); + set_irq_regs(old_regs); } @@ -334,12 +352,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) } #endif +#ifdef CONFIG_X86_POSTED_MSI + +/* Posted Interrupt Descriptors for coalesced MSIs to be posted */ +DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); + +void intel_posted_msi_init(void) +{ + u32 destination; + u32 apic_id; + + this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR); + + /* + * APIC destination ID is stored in bit 8:15 while in XAPIC mode. + * VT-d spec. CH 9.11 + */ + apic_id = this_cpu_read(x86_cpu_to_apicid); + destination = x2apic_enabled() ? apic_id : apic_id << 8; + this_cpu_write(posted_msi_pi_desc.ndst, destination); +} + +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg + * for checking and clearing posted interrupt request (PIR), a 256 bit field + * within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * to dispatch interrupt handlers. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +{ + int i, vec = FIRST_EXTERNAL_VECTOR; + unsigned long pir_copy[4]; + bool handled = false; + + for (i = 0; i < 4; i++) + pir_copy[i] = pir[i]; + + for (i = 0; i < 4; i++) { + if (!pir_copy[i]) + continue; + + pir_copy[i] = arch_xchg(&pir[i], 0); + handled = true; + } + + if (handled) { + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); + } + + return handled; +} + +/* + * Performance data shows that 3 is good enough to harvest 90+% of the benefit + * on high IRQ rate workload. + */ +#define MAX_POSTED_MSI_COALESCING_LOOP 3 + +/* + * For MSIs that are delivered as posted interrupts, the CPU notifications + * can be coalesced if the MSIs arrive in high frequency bursts. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct pi_desc *pid; + int i = 0; + + pid = this_cpu_ptr(&posted_msi_pi_desc); + + inc_irq_stat(posted_msi_notification_count); + irq_enter(); + + /* + * Max coalescing count includes the extra round of handle_pending_pir + * after clearing the outstanding notification bit. Hence, at most + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. + */ + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { + if (!handle_pending_pir(pid->pir64, regs)) + break; + } + + /* + * Clear outstanding notification bit to allow new IRQ notifications, + * do this last to maximize the window of interrupt coalescing. + */ + pi_clear_on(pid); + + /* + * There could be a race of PI notification and the clearing of ON bit, + * process PIR bits one last time such that handling the new interrupts + * are not delayed until the next IRQ. + */ + handle_pending_pir(pid->pir64, regs); + + apic_eoi(); + irq_exit(); + set_irq_regs(old_regs); +} +#endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irr, vector; + unsigned int vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; @@ -366,8 +511,7 @@ void fixup_irqs(void) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) { + if (is_vector_pending(vector)) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); |