From 12c3156f10c5d8c5f1fb3f0bbdb8c1ddb1d1f65c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 18 Nov 2013 10:59:59 -0700 Subject: PCI: Avoid unnecessary CPU switch when calling driver .probe() method If we are already on a CPU local to the device, call the driver .probe() method directly without using work_on_cpu(). This is a workaround for a lockdep warning in the following scenario: pci_call_probe work_on_cpu(cpu, local_pci_probe, ...) driver .probe pci_enable_sriov ... pci_bus_add_device ... pci_call_probe work_on_cpu(cpu, local_pci_probe, ...) It would be better to fix PCI so we don't call VF driver .probe() methods from inside a PF driver .probe() method, but that's a bigger project. [bhelgaas: open bugzilla, rework comments & changelog] Link: https://bugzilla.kernel.org/show_bug.cgi?id=65071 Link: http://lkml.kernel.org/r/CAE9FiQXYQEAZ=0sG6+2OdffBqfLS9MpoN1xviRR9aDbxPxcKxQ@mail.gmail.com Link: http://lkml.kernel.org/r/20130624195942.40795.27292.stgit@ahduyck-cp1.jf.intel.com Tested-by: Yinghai Lu Signed-off-by: Alexander Duyck Signed-off-by: Bjorn Helgaas Acked-by: Tejun Heo Acked-by: Yinghai Lu --- drivers/pci/pci-driver.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 9042fdbd7244..7edd5c307446 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -288,12 +288,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, int error, node; struct drv_dev_and_id ddi = { drv, dev, id }; - /* Execute driver initialization on node where the device's - bus is attached to. This way the driver likely allocates - its local memory on the right node without any need to - change it. */ + /* + * Execute driver initialization on node where the device is + * attached. This way the driver likely allocates its local memory + * on the right node. + */ node = dev_to_node(&dev->dev); - if (node >= 0) { + + /* + * On NUMA systems, we are likely to call a PF probe function using + * work_on_cpu(). If that probe calls pci_enable_sriov() (which + * adds the VF devices via pci_bus_add_device()), we may re-enter + * this function to call the VF probe function. Calling + * work_on_cpu() again will cause a lockdep warning. Since VFs are + * always on the same node as the PF, we can work around this by + * avoiding work_on_cpu() when we're already on the correct node. + * + * Preemption is enabled, so it's theoretically unsafe to use + * numa_node_id(), but even if we run the probe function on the + * wrong node, it should be functionally correct. + */ + if (node >= 0 && node != numa_node_id()) { int cpu; get_online_cpus(); @@ -305,6 +320,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, put_online_cpus(); } else error = local_pci_probe(&ddi); + return error; } -- cgit v1.2.3 From 4bff6749905d3abe7436d3b2d20b626886a04475 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 24 Nov 2013 01:17:52 +0100 Subject: PCI: Move device_del() from pci_stop_dev() to pci_destroy_dev() After commit bcdde7e221a8 (sysfs: make __sysfs_remove_dir() recursive) I'm seeing traces analogous to the one below in Thunderbolt testing: WARNING: CPU: 3 PID: 76 at /scratch/rafael/work/linux-pm/fs/sysfs/group.c:214 sysfs_remove_group+0x59/0xe0() sysfs group ffffffff81c6c500 not found for kobject '0000:08' Modules linked in: ... CPU: 3 PID: 76 Comm: kworker/u16:7 Not tainted 3.13.0-rc1+ #76 Hardware name: Acer Aspire S5-391/Venus , BIOS V1.02 05/29/2012 Workqueue: kacpi_hotplug acpi_hotplug_work_fn 0000000000000009 ffff8801644b9ac8 ffffffff816b23bf 0000000000000007 ffff8801644b9b18 ffff8801644b9b08 ffffffff81046607 ffff88016925b800 0000000000000000 ffffffff81c6c500 ffff88016924f928 ffff88016924f800 Call Trace: [] dump_stack+0x4e/0x71 [] warn_slowpath_common+0x87/0xb0 [] warn_slowpath_fmt+0x41/0x50 [] ? sysfs_get_dirent_ns+0x6f/0x80 [] sysfs_remove_group+0x59/0xe0 [] dpm_sysfs_remove+0x3b/0x50 [] device_del+0x58/0x1c0 [] device_unregister+0x48/0x60 [] pci_remove_bus+0x6e/0x80 [] pci_remove_bus_device+0x38/0x110 [] pci_remove_bus_device+0x4d/0x110 [] pci_stop_and_remove_bus_device+0x19/0x20 [] disable_slot+0x20/0xe0 [] acpiphp_check_bridge+0xa8/0xd0 [] hotplug_event+0x17d/0x220 [] hotplug_event_work+0x30/0x70 [] acpi_hotplug_work_fn+0x18/0x24 [] process_one_work+0x261/0x450 [] worker_thread+0x21e/0x370 [] ? rescuer_thread+0x300/0x300 [] kthread+0xd2/0xe0 [] ? flush_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0x70/0x70 (Mika Westerberg sees them too in his tests). Some investigation documented in kernel bug #65281 led me to the conclusion that the source of the problem is the device_del() in pci_stop_dev() as it now causes the sysfs directory of the device to be removed recursively along with all of its subdirectories. That includes the sysfs directory of the device's subordinate bus (dev->subordinate) and its "power" group. Consequently, when pci_remove_bus() is called for dev->subordinate in pci_remove_bus_device(), it calls device_unregister(&bus->dev), but at this point the sysfs directory of bus->dev doesn't exist any more and its "power" group doesn't exist either. Thus, when dpm_sysfs_remove() called from device_del() tries to remove that group, it triggers the above warning. That indicates a logical mistake in the design of pci_stop_and_remove_bus_device(), which causes bus device objects to be left behind their parents (bridge device objects) and can be fixed by moving the device_del() from pci_stop_dev() into pci_destroy_dev(), so pci_remove_bus() can be called for the device's subordinate bus before the device itself is unregistered from the hierarchy. Still, the driver, if any, should be detached from the device in pci_stop_dev(), so use device_release_driver() directly from there. References: https://bugzilla.kernel.org/show_bug.cgi?id=65281#c6 Reported-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki Signed-off-by: Bjorn Helgaas --- drivers/pci/remove.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/pci') diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 1576851028db..cc9337a71529 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -24,7 +24,7 @@ static void pci_stop_dev(struct pci_dev *dev) if (dev->is_added) { pci_proc_detach_device(dev); pci_remove_sysfs_dev_files(dev); - device_del(&dev->dev); + device_release_driver(&dev->dev); dev->is_added = 0; } @@ -34,6 +34,8 @@ static void pci_stop_dev(struct pci_dev *dev) static void pci_destroy_dev(struct pci_dev *dev) { + device_del(&dev->dev); + down_write(&pci_bus_sem); list_del(&dev->bus_list); up_write(&pci_bus_sem); -- cgit v1.2.3 From f407dae76040c9529c2c83b1488dda4ffc54522c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 26 Nov 2013 11:27:28 -0700 Subject: PCI: mvebu: Return 'unsupported' for Interrupt Line and Interrupt Pin The emulated bridge does not support interrupts, so it should return the value 0 for Interrupt Line and Interrupt Pin. This indicates that interrupts are not supported. Since Max_Lat and Min_Gnt are also in the same 32-bit word, we return 0 for them, which means "do not care." This corrects an error message from the kernel: pci 0000:00:01.0: of_irq_parse_pci() failed with rc=135 Which is due to the default return of 0xFFFFFFFF indicating that interrupts are supported. The error message regression was caused by 16b84e5a505 ("of/irq: Create of_irq_parse_and_map_pci() to consolidate arch code.") Signed-off-by: Jason Gunthorpe Signed-off-by: Bjorn Helgaas Acked-by: Jason Cooper --- drivers/pci/host/pci-mvebu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c index c269e430c760..2aa7b77c7c88 100644 --- a/drivers/pci/host/pci-mvebu.c +++ b/drivers/pci/host/pci-mvebu.c @@ -447,6 +447,11 @@ static int mvebu_sw_pci_bridge_read(struct mvebu_pcie_port *port, *value = 0; break; + case PCI_INTERRUPT_LINE: + /* LINE PIN MIN_GNT MAX_LAT */ + *value = 0; + break; + default: *value = 0xffffffff; return PCIBIOS_BAD_REGISTER_NUMBER; -- cgit v1.2.3 From 4fc9bbf98fd66f879e628d8537ba7c240be2b58e Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Wed, 27 Nov 2013 15:19:25 -0700 Subject: PCI: Disable Bus Master only on kexec reboot Add a flag to tell the PCI subsystem that kernel is shutting down in preparation to kexec a kernel. Add code in PCI subsystem to use this flag to clear Bus Master bit on PCI devices only in case of kexec reboot. This fixes a power-off problem on Acer Aspire V5-573G and likely other machines and avoids any other issues caused by clearing Bus Master bit on PCI devices in normal shutdown path. The problem was introduced by b566a22c2332 ("PCI: disable Bus Master on PCI device shutdown"). This patch is based on discussion at http://marc.info/?l=linux-pci&m=138425645204355&w=2 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63861 Reported-by: Chang Liu Signed-off-by: Khalid Aziz Signed-off-by: Bjorn Helgaas Acked-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org # v3.5+ --- drivers/pci/pci-driver.c | 12 +++++++++--- include/linux/kexec.h | 3 +++ kernel/kexec.c | 4 ++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 7edd5c307446..25f0bc659164 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "pci.h" struct pci_dynid { @@ -415,12 +416,17 @@ static void pci_device_shutdown(struct device *dev) pci_msi_shutdown(pci_dev); pci_msix_shutdown(pci_dev); +#ifdef CONFIG_KEXEC /* - * Turn off Bus Master bit on the device to tell it to not - * continue to do DMA. Don't touch devices in D3cold or unknown states. + * If this is a kexec reboot, turn off Bus Master bit on the + * device to tell it to not continue to do DMA. Don't touch + * devices in D3cold or unknown states. + * If it is not a kexec reboot, firmware will hit the PCI + * devices with big hammer and stop their DMA any way. */ - if (pci_dev->current_state <= PCI_D3hot) + if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) pci_clear_master(pci_dev); +#endif } #ifdef CONFIG_PM diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d78d28a733b1..5fd33dc1fe3a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -198,6 +198,9 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +/* flag to track if kexec reboot is in progress */ +extern bool kexec_in_progress; + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, diff --git a/kernel/kexec.c b/kernel/kexec.c index 490afc03627e..d0d8fca54065 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; size_t vmcoreinfo_size; size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -1675,6 +1678,7 @@ int kernel_kexec(void) } else #endif { + kexec_in_progress = true; kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.2.3