From 12c3156f10c5d8c5f1fb3f0bbdb8c1ddb1d1f65c Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 18 Nov 2013 10:59:59 -0700
Subject: PCI: Avoid unnecessary CPU switch when calling driver .probe() method

If we are already on a CPU local to the device, call the driver .probe()
method directly without using work_on_cpu().

This is a workaround for a lockdep warning in the following scenario:

  pci_call_probe
    work_on_cpu(cpu, local_pci_probe, ...)
      driver .probe
        pci_enable_sriov
          ...
            pci_bus_add_device
              ...
                pci_call_probe
                  work_on_cpu(cpu, local_pci_probe, ...)

It would be better to fix PCI so we don't call VF driver .probe() methods
from inside a PF driver .probe() method, but that's a bigger project.

[bhelgaas: open bugzilla, rework comments & changelog]
Link: https://bugzilla.kernel.org/show_bug.cgi?id=65071
Link: http://lkml.kernel.org/r/CAE9FiQXYQEAZ=0sG6+2OdffBqfLS9MpoN1xviRR9aDbxPxcKxQ@mail.gmail.com
Link: http://lkml.kernel.org/r/20130624195942.40795.27292.stgit@ahduyck-cp1.jf.intel.com
Tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
---
 drivers/pci/pci-driver.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'drivers/pci')

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 9042fdbd7244..7edd5c307446 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -288,12 +288,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
 	int error, node;
 	struct drv_dev_and_id ddi = { drv, dev, id };
 
-	/* Execute driver initialization on node where the device's
-	   bus is attached to.  This way the driver likely allocates
-	   its local memory on the right node without any need to
-	   change it. */
+	/*
+	 * Execute driver initialization on node where the device is
+	 * attached.  This way the driver likely allocates its local memory
+	 * on the right node.
+	 */
 	node = dev_to_node(&dev->dev);
-	if (node >= 0) {
+
+	/*
+	 * On NUMA systems, we are likely to call a PF probe function using
+	 * work_on_cpu().  If that probe calls pci_enable_sriov() (which
+	 * adds the VF devices via pci_bus_add_device()), we may re-enter
+	 * this function to call the VF probe function.  Calling
+	 * work_on_cpu() again will cause a lockdep warning.  Since VFs are
+	 * always on the same node as the PF, we can work around this by
+	 * avoiding work_on_cpu() when we're already on the correct node.
+	 *
+	 * Preemption is enabled, so it's theoretically unsafe to use
+	 * numa_node_id(), but even if we run the probe function on the
+	 * wrong node, it should be functionally correct.
+	 */
+	if (node >= 0 && node != numa_node_id()) {
 		int cpu;
 
 		get_online_cpus();
@@ -305,6 +320,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
 		put_online_cpus();
 	} else
 		error = local_pci_probe(&ddi);
+
 	return error;
 }
 
-- 
cgit v1.2.3


From 4bff6749905d3abe7436d3b2d20b626886a04475 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 24 Nov 2013 01:17:52 +0100
Subject: PCI: Move device_del() from pci_stop_dev() to pci_destroy_dev()

After commit bcdde7e221a8 (sysfs: make __sysfs_remove_dir() recursive)
I'm seeing traces analogous to the one below in Thunderbolt testing:

WARNING: CPU: 3 PID: 76 at /scratch/rafael/work/linux-pm/fs/sysfs/group.c:214 sysfs_remove_group+0x59/0xe0()
 sysfs group ffffffff81c6c500 not found for kobject '0000:08'
 Modules linked in: ...
 CPU: 3 PID: 76 Comm: kworker/u16:7 Not tainted 3.13.0-rc1+ #76
 Hardware name: Acer Aspire S5-391/Venus    , BIOS V1.02 05/29/2012
 Workqueue: kacpi_hotplug acpi_hotplug_work_fn
  0000000000000009 ffff8801644b9ac8 ffffffff816b23bf 0000000000000007
  ffff8801644b9b18 ffff8801644b9b08 ffffffff81046607 ffff88016925b800
  0000000000000000 ffffffff81c6c500 ffff88016924f928 ffff88016924f800
 Call Trace:
  [<ffffffff816b23bf>] dump_stack+0x4e/0x71
  [<ffffffff81046607>] warn_slowpath_common+0x87/0xb0
  [<ffffffff810466d1>] warn_slowpath_fmt+0x41/0x50
  [<ffffffff811e42ef>] ? sysfs_get_dirent_ns+0x6f/0x80
  [<ffffffff811e5389>] sysfs_remove_group+0x59/0xe0
  [<ffffffff8149f00b>] dpm_sysfs_remove+0x3b/0x50
  [<ffffffff81495818>] device_del+0x58/0x1c0
  [<ffffffff814959c8>] device_unregister+0x48/0x60
  [<ffffffff813254fe>] pci_remove_bus+0x6e/0x80
  [<ffffffff81325548>] pci_remove_bus_device+0x38/0x110
  [<ffffffff8132555d>] pci_remove_bus_device+0x4d/0x110
  [<ffffffff81325639>] pci_stop_and_remove_bus_device+0x19/0x20
  [<ffffffff813418d0>] disable_slot+0x20/0xe0
  [<ffffffff81341a38>] acpiphp_check_bridge+0xa8/0xd0
  [<ffffffff813427ad>] hotplug_event+0x17d/0x220
  [<ffffffff81342880>] hotplug_event_work+0x30/0x70
  [<ffffffff8136d665>] acpi_hotplug_work_fn+0x18/0x24
  [<ffffffff81061331>] process_one_work+0x261/0x450
  [<ffffffff81061a7e>] worker_thread+0x21e/0x370
  [<ffffffff81061860>] ? rescuer_thread+0x300/0x300
  [<ffffffff81068342>] kthread+0xd2/0xe0
  [<ffffffff81068270>] ? flush_kthread_worker+0x70/0x70
  [<ffffffff816c19bc>] ret_from_fork+0x7c/0xb0
  [<ffffffff81068270>] ? flush_kthread_worker+0x70/0x70

(Mika Westerberg sees them too in his tests).

Some investigation documented in kernel bug #65281 led me to the
conclusion that the source of the problem is the device_del() in
pci_stop_dev() as it now causes the sysfs directory of the device to be
removed recursively along with all of its subdirectories.  That includes
the sysfs directory of the device's subordinate bus (dev->subordinate) and
its "power" group.

Consequently, when pci_remove_bus() is called for dev->subordinate in
pci_remove_bus_device(), it calls device_unregister(&bus->dev), but at this
point the sysfs directory of bus->dev doesn't exist any more and its
"power" group doesn't exist either.  Thus, when dpm_sysfs_remove() called
from device_del() tries to remove that group, it triggers the above
warning.

That indicates a logical mistake in the design of
pci_stop_and_remove_bus_device(), which causes bus device objects to be
left behind their parents (bridge device objects) and can be fixed by
moving the device_del() from pci_stop_dev() into pci_destroy_dev(), so
pci_remove_bus() can be called for the device's subordinate bus before the
device itself is unregistered from the hierarchy.  Still, the driver, if
any, should be detached from the device in pci_stop_dev(), so use
device_release_driver() directly from there.

References: https://bugzilla.kernel.org/show_bug.cgi?id=65281#c6
Reported-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/remove.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/pci')

diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 1576851028db..cc9337a71529 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -24,7 +24,7 @@ static void pci_stop_dev(struct pci_dev *dev)
 	if (dev->is_added) {
 		pci_proc_detach_device(dev);
 		pci_remove_sysfs_dev_files(dev);
-		device_del(&dev->dev);
+		device_release_driver(&dev->dev);
 		dev->is_added = 0;
 	}
 
@@ -34,6 +34,8 @@ static void pci_stop_dev(struct pci_dev *dev)
 
 static void pci_destroy_dev(struct pci_dev *dev)
 {
+	device_del(&dev->dev);
+
 	down_write(&pci_bus_sem);
 	list_del(&dev->bus_list);
 	up_write(&pci_bus_sem);
-- 
cgit v1.2.3


From f407dae76040c9529c2c83b1488dda4ffc54522c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Tue, 26 Nov 2013 11:27:28 -0700
Subject: PCI: mvebu: Return 'unsupported' for Interrupt Line and Interrupt Pin

The emulated bridge does not support interrupts, so it should return the
value 0 for Interrupt Line and Interrupt Pin.  This indicates that
interrupts are not supported.

Since Max_Lat and Min_Gnt are also in the same 32-bit word, we return
0 for them, which means "do not care."

This corrects an error message from the kernel:

  pci 0000:00:01.0: of_irq_parse_pci() failed with rc=135

Which is due to the default return of 0xFFFFFFFF indicating that
interrupts are supported.

The error message regression was caused by 16b84e5a505 ("of/irq: Create
of_irq_parse_and_map_pci() to consolidate arch code.")

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Jason Cooper <jason@lakedaemon.net>
---
 drivers/pci/host/pci-mvebu.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/pci')

diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c
index c269e430c760..2aa7b77c7c88 100644
--- a/drivers/pci/host/pci-mvebu.c
+++ b/drivers/pci/host/pci-mvebu.c
@@ -447,6 +447,11 @@ static int mvebu_sw_pci_bridge_read(struct mvebu_pcie_port *port,
 		*value = 0;
 		break;
 
+	case PCI_INTERRUPT_LINE:
+		/* LINE PIN MIN_GNT MAX_LAT */
+		*value = 0;
+		break;
+
 	default:
 		*value = 0xffffffff;
 		return PCIBIOS_BAD_REGISTER_NUMBER;
-- 
cgit v1.2.3


From 4fc9bbf98fd66f879e628d8537ba7c240be2b58e Mon Sep 17 00:00:00 2001
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Wed, 27 Nov 2013 15:19:25 -0700
Subject: PCI: Disable Bus Master only on kexec reboot

Add a flag to tell the PCI subsystem that kernel is shutting down in
preparation to kexec a kernel.  Add code in PCI subsystem to use this flag
to clear Bus Master bit on PCI devices only in case of kexec reboot.

This fixes a power-off problem on Acer Aspire V5-573G and likely other
machines and avoids any other issues caused by clearing Bus Master bit on
PCI devices in normal shutdown path.  The problem was introduced by
b566a22c2332 ("PCI: disable Bus Master on PCI device shutdown").

This patch is based on discussion at
http://marc.info/?l=linux-pci&m=138425645204355&w=2

Link: https://bugzilla.kernel.org/show_bug.cgi?id=63861
Reported-by: Chang Liu <cl91tp@gmail.com>
Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: stable@vger.kernel.org	# v3.5+
---
 drivers/pci/pci-driver.c | 12 +++++++++---
 include/linux/kexec.h    |  3 +++
 kernel/kexec.c           |  4 ++++
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'drivers/pci')

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 7edd5c307446..25f0bc659164 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -19,6 +19,7 @@
 #include <linux/cpu.h>
 #include <linux/pm_runtime.h>
 #include <linux/suspend.h>
+#include <linux/kexec.h>
 #include "pci.h"
 
 struct pci_dynid {
@@ -415,12 +416,17 @@ static void pci_device_shutdown(struct device *dev)
 	pci_msi_shutdown(pci_dev);
 	pci_msix_shutdown(pci_dev);
 
+#ifdef CONFIG_KEXEC
 	/*
-	 * Turn off Bus Master bit on the device to tell it to not
-	 * continue to do DMA. Don't touch devices in D3cold or unknown states.
+	 * If this is a kexec reboot, turn off Bus Master bit on the
+	 * device to tell it to not continue to do DMA. Don't touch
+	 * devices in D3cold or unknown states.
+	 * If it is not a kexec reboot, firmware will hit the PCI
+	 * devices with big hammer and stop their DMA any way.
 	 */
-	if (pci_dev->current_state <= PCI_D3hot)
+	if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot))
 		pci_clear_master(pci_dev);
+#endif
 }
 
 #ifdef CONFIG_PM
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d78d28a733b1..5fd33dc1fe3a 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -198,6 +198,9 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 extern size_t vmcoreinfo_size;
 extern size_t vmcoreinfo_max_size;
 
+/* flag to track if kexec reboot is in progress */
+extern bool kexec_in_progress;
+
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 490afc03627e..d0d8fca54065 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
 
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
 /* Location of the reserved area for the crash kernel */
 struct resource crashk_res = {
 	.name  = "Crash kernel",
@@ -1675,6 +1678,7 @@ int kernel_kexec(void)
 	} else
 #endif
 	{
+		kexec_in_progress = true;
 		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
-- 
cgit v1.2.3