	genirq: Introduce common irq_force_complete_move() implementation
CONFIG_GENERIC_PENDING_IRQ requires an architecture-specific implementation of irq_force_complete_move() for CPU hotplug. At the moment, only x86 implements this unconditionally, but for RISC-V irq_force_complete_move() is only needed when the RISC-V IMSIC driver is in use, and not otherwise.

To allow runtime configuration of this mechanism, introduce a common irq_force_complete_move() implementation in the interrupt core code, which invokes the completion function only when an interrupt chip in the hierarchy implements it.

Switch x86 over to the new mechanism. No functional change intended.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250217085657.789309-5-apatel@ventanamicro.com
This commit is contained in:

parent fe35ecee8e
commit 751dc837da

4 changed files with 123 additions and 125 deletions
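The key property of the new scheme is that participation is opt-in per interrupt chip: the core walks the domain hierarchy and only calls into a chip that actually fills in the new callback, so architectures and drivers that never move interrupts pay nothing. As a minimal, hypothetical sketch (none of the example_* names below exist in this commit; a real driver such as the RISC-V IMSIC one mentioned above would supply hardware-specific bodies), an irq_chip opts in like this:

#include <linux/irq.h>

/* Hypothetical chip callbacks; a real driver programs its hardware here. */
static void example_mask(struct irq_data *d) { }
static void example_unmask(struct irq_data *d) { }

/*
 * Invoked by the core's irq_force_complete_move() during CPU hot-unplug
 * when this chip is the first one in the hierarchy implementing the
 * callback. A real driver would clean up any half-finished move of @d
 * here; chips which leave the pointer NULL are simply skipped.
 */
static void example_force_complete_move(struct irq_data *d)
{
}

static struct irq_chip example_chip = {
	.name				= "EXAMPLE",
	.irq_mask			= example_mask,
	.irq_unmask			= example_unmask,
	.irq_force_complete_move	= example_force_complete_move,
};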
				
			
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
 	return err ? err : IRQ_SET_MASK_OK;
 }
 
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+	unsigned int vector = apicd->prev_vector;
+	unsigned int cpu = apicd->prev_cpu;
+	bool managed = apicd->is_managed;
+
+	/*
+	 * Managed interrupts are usually not migrated away
+	 * from an online CPU, but CPU isolation 'managed_irq'
+	 * can make that happen.
+	 * 1) Activation does not take the isolation into account
+	 *    to keep the code simple
+	 * 2) Migration away from an isolated CPU can happen when
+	 *    a non-isolated CPU which is in the calculated
+	 *    affinity mask comes online.
+	 */
+	trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+	irq_matrix_free(vector_matrix, cpu, vector, managed);
+	per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+	hlist_del_init(&apicd->clist);
+	apicd->prev_vector = 0;
+	apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+	unsigned int cpu = smp_processor_id();
+	struct apic_chip_data *apicd;
+	unsigned int vector;
+
+	guard(raw_spinlock)(&vector_lock);
+	apicd = apic_chip_data(irqd);
+	if (!apicd)
+		return;
+
+	/*
+	 * If prev_vector is empty or the descriptor is neither currently
+	 * nor previously on the outgoing CPU no action required.
+	 */
+	vector = apicd->prev_vector;
+	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+		return;
+
+	/*
+	 * This is tricky. If the cleanup of the old vector has not been
+	 * done yet, then the following setaffinity call will fail with
+	 * -EBUSY. This can leave the interrupt in a stale state.
+	 *
+	 * All CPUs are stuck in stop machine with interrupts disabled so
+	 * calling __irq_complete_move() would be completely pointless.
+	 *
+	 * 1) The interrupt is in move_in_progress state. That means that we
+	 *    have not seen an interrupt since the io_apic was reprogrammed to
+	 *    the new vector.
+	 *
+	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+	 *    have not been processed yet.
+	 */
+	if (apicd->move_in_progress) {
+		/*
+		 * In theory there is a race:
+		 *
+		 * set_ioapic(new_vector) <-- Interrupt is raised before update
+		 *			      is effective, i.e. it's raised on
+		 *			      the old vector.
+		 *
+		 * So if the target cpu cannot handle that interrupt before
+		 * the old vector is cleaned up, we get a spurious interrupt
+		 * and in the worst case the ioapic irq line becomes stale.
+		 *
+		 * But in case of cpu hotplug this should be a non issue
+		 * because if the affinity update happens right before all
+		 * cpus rendezvous in stop machine, there is no way that the
+		 * interrupt can be blocked on the target cpu because all cpus
+		 * loops first with interrupts enabled in stop machine, so the
+		 * old vector is not yet cleaned up when the interrupt fires.
+		 *
+		 * So the only way to run into this issue is if the delivery
+		 * of the interrupt on the apic/system bus would be delayed
+		 * beyond the point where the target cpu disables interrupts
+		 * in stop machine. I doubt that it can happen, but at least
+		 * there is a theoretical chance. Virtualization might be
+		 * able to expose this, but AFAICT the IOAPIC emulation is not
+		 * as stupid as the real hardware.
+		 *
+		 * Anyway, there is nothing we can do about that at this point
+		 * w/o refactoring the whole fixup_irq() business completely.
+		 * We print at least the irq number and the old vector number,
+		 * so we have the necessary information when a problem in that
+		 * area arises.
+		 */
+		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+			irqd->irq, vector);
+	}
+	free_moved_vector(apicd);
+}
+
 #else
-# define apic_set_affinity	NULL
+# define apic_set_affinity		NULL
+# define apic_force_complete_move	NULL
 #endif
 
 static int apic_retrigger_irq(struct irq_data *irqd)
@@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data,
 }
 
 static struct irq_chip lapic_controller = {
-	.name			= "APIC",
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= apic_set_affinity,
-	.irq_compose_msi_msg	= x86_vector_msi_compose_msg,
-	.irq_retrigger		= apic_retrigger_irq,
+	.name				= "APIC",
+	.irq_ack			= apic_ack_edge,
+	.irq_set_affinity		= apic_set_affinity,
+	.irq_compose_msi_msg		= x86_vector_msi_compose_msg,
+	.irq_force_complete_move	= apic_force_complete_move,
+	.irq_retrigger			= apic_retrigger_irq,
 };
 
 #ifdef CONFIG_SMP
 
-static void free_moved_vector(struct apic_chip_data *apicd)
-{
-	unsigned int vector = apicd->prev_vector;
-	unsigned int cpu = apicd->prev_cpu;
-	bool managed = apicd->is_managed;
-
-	/*
-	 * Managed interrupts are usually not migrated away
-	 * from an online CPU, but CPU isolation 'managed_irq'
-	 * can make that happen.
-	 * 1) Activation does not take the isolation into account
-	 *    to keep the code simple
-	 * 2) Migration away from an isolated CPU can happen when
-	 *    a non-isolated CPU which is in the calculated
-	 *    affinity mask comes online.
-	 */
-	trace_vector_free_moved(apicd->irq, cpu, vector, managed);
-	irq_matrix_free(vector_matrix, cpu, vector, managed);
-	per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-	hlist_del_init(&apicd->clist);
-	apicd->prev_vector = 0;
-	apicd->move_in_progress = 0;
-}
-
 static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
 {
 	struct apic_chip_data *apicd;
@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
 		__vector_schedule_cleanup(apicd);
 }
 
-/*
- * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
- */
-void irq_force_complete_move(struct irq_desc *desc)
-{
-	unsigned int cpu = smp_processor_id();
-	struct apic_chip_data *apicd;
-	struct irq_data *irqd;
-	unsigned int vector;
-
-	/*
-	 * The function is called for all descriptors regardless of which
-	 * irqdomain they belong to. For example if an IRQ is provided by
-	 * an irq_chip as part of a GPIO driver, the chip data for that
-	 * descriptor is specific to the irq_chip in question.
-	 *
-	 * Check first that the chip_data is what we expect
-	 * (apic_chip_data) before touching it any further.
-	 */
-	irqd = irq_domain_get_irq_data(x86_vector_domain,
-				       irq_desc_get_irq(desc));
-	if (!irqd)
-		return;
-
-	raw_spin_lock(&vector_lock);
-	apicd = apic_chip_data(irqd);
-	if (!apicd)
-		goto unlock;
-
-	/*
-	 * If prev_vector is empty or the descriptor is neither currently
-	 * nor previously on the outgoing CPU no action required.
-	 */
-	vector = apicd->prev_vector;
-	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
-		goto unlock;
-
-	/*
-	 * This is tricky. If the cleanup of the old vector has not been
-	 * done yet, then the following setaffinity call will fail with
-	 * -EBUSY. This can leave the interrupt in a stale state.
-	 *
-	 * All CPUs are stuck in stop machine with interrupts disabled so
-	 * calling __irq_complete_move() would be completely pointless.
-	 *
-	 * 1) The interrupt is in move_in_progress state. That means that we
-	 *    have not seen an interrupt since the io_apic was reprogrammed to
-	 *    the new vector.
-	 *
-	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
-	 *    have not been processed yet.
-	 */
-	if (apicd->move_in_progress) {
-		/*
-		 * In theory there is a race:
-		 *
-		 * set_ioapic(new_vector) <-- Interrupt is raised before update
-		 *			      is effective, i.e. it's raised on
-		 *			      the old vector.
-		 *
-		 * So if the target cpu cannot handle that interrupt before
-		 * the old vector is cleaned up, we get a spurious interrupt
-		 * and in the worst case the ioapic irq line becomes stale.
-		 *
-		 * But in case of cpu hotplug this should be a non issue
-		 * because if the affinity update happens right before all
-		 * cpus rendezvous in stop machine, there is no way that the
-		 * interrupt can be blocked on the target cpu because all cpus
-		 * loops first with interrupts enabled in stop machine, so the
-		 * old vector is not yet cleaned up when the interrupt fires.
-		 *
-		 * So the only way to run into this issue is if the delivery
-		 * of the interrupt on the apic/system bus would be delayed
-		 * beyond the point where the target cpu disables interrupts
-		 * in stop machine. I doubt that it can happen, but at least
-		 * there is a theoretical chance. Virtualization might be
-		 * able to expose this, but AFAICT the IOAPIC emulation is not
-		 * as stupid as the real hardware.
-		 *
-		 * Anyway, there is nothing we can do about that at this point
-		 * w/o refactoring the whole fixup_irq() business completely.
-		 * We print at least the irq number and the old vector number,
-		 * so we have the necessary information when a problem in that
-		 * area arises.
-		 */
-		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
-			irqd->irq, vector);
-	}
-	free_moved_vector(apicd);
-unlock:
-	raw_spin_unlock(&vector_lock);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Note, this is not accurate accounting, but at least good enough to
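One detail of the x86 conversion worth noting: the old irq_force_complete_move() took vector_lock with raw_spin_lock() and funneled every exit through a goto unlock label, whereas the new apic_force_complete_move() uses the scope-based guard(raw_spinlock)(...) helper from <linux/cleanup.h>, so the early exits become plain return statements and the lock is released automatically when the scope ends. A minimal sketch of that pattern, with hypothetical names:

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* hypothetical lock */
static unsigned int example_state;		/* hypothetical guarded state */

static void example_update(unsigned int value)
{
	/* Acquires example_lock; held until the enclosing scope is left. */
	guard(raw_spinlock)(&example_lock);

	if (!value)
		return;	/* lock dropped automatically, no goto needed */

	example_state = value;
}	/* lock dropped here on the normal path */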
diff --git a/include/linux/irq.h b/include/linux/irq.h
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @ipi_send_mask:	send an IPI to destination cpus in cpumask
  * @irq_nmi_setup:	function called from core code before enabling an NMI
  * @irq_nmi_teardown:	function called from core code after disabling an NMI
+ * @irq_force_complete_move:	optional function to force complete pending irq move
  * @flags:		chip specific flags
  */
 struct irq_chip {
@@ -537,6 +538,8 @@ struct irq_chip {
 	int		(*irq_nmi_setup)(struct irq_data *data);
 	void		(*irq_nmi_teardown)(struct irq_data *data);
 
+	void		(*irq_force_complete_move)(struct irq_data *data);
+
 	unsigned long	flags;
 };
 
@@ -619,11 +622,9 @@ static inline void irq_move_irq(struct irq_data *data)
 		__irq_move_irq(data);
 }
 void irq_move_masked_irq(struct irq_data *data);
-void irq_force_complete_move(struct irq_desc *desc);
 #else
 static inline void irq_move_irq(struct irq_data *data) { }
 static inline void irq_move_masked_irq(struct irq_data *data) { }
-static inline void irq_force_complete_move(struct irq_desc *desc) { }
 #endif
 
 extern int no_irq_affinity;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -442,6 +442,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
 	return desc->pending_mask;
 }
 bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
+void irq_force_complete_move(struct irq_desc *desc);
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 static inline bool irq_can_move_pcntxt(struct irq_data *data)
 {
@@ -467,6 +468,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 {
 	return false;
 }
+static inline void irq_force_complete_move(struct irq_desc *desc) { }
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
 	return true;
 }
 
+void irq_force_complete_move(struct irq_desc *desc)
+{
+	for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) {
+		if (d->chip && d->chip->irq_force_complete_move) {
+			d->chip->irq_force_complete_move(d);
+			return;
+		}
+	}
+}
+
 void irq_move_masked_irq(struct irq_data *idata)
 {
 	struct irq_desc *desc = irq_data_to_desc(idata);
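Note that the declaration of irq_force_complete_move() moved from include/linux/irq.h into kernel/irq/internals.h, so after this change only the interrupt core itself can call it. The new helper walks the hierarchy from the outermost irq_data through d->parent_data and stops at the first chip providing the callback, which is how a stacked domain setup (for example PCI/MSI on top of the x86 vector domain) still reaches apic_force_complete_move(). A hedged sketch of a caller, assuming, as the x86 comment above states, that desc->lock is held and interrupts are disabled; the function below is illustrative, not the kernel's actual fixup code:

#include <linux/irq.h>
#include "internals.h"	/* in-tree location of the new declaration */

/* Illustrative stand-in for the core's CPU-offline interrupt fixup. */
static void example_fixup_one_irq(struct irq_desc *desc)
{
	/*
	 * Force-complete any pending vector move before retargeting.
	 * This is a no-op when no chip in the hierarchy implements
	 * .irq_force_complete_move, and compiles away entirely under
	 * !CONFIG_GENERIC_PENDING_IRQ via the internals.h stub above.
	 */
	irq_force_complete_move(desc);

	/* ... then move the interrupt to an online CPU ... */
}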