Diffstat
34 files changed, 1792 insertions, 1462 deletions
diff --git a/arch/powerpc/platforms/52xx/lite5200_sleep.S b/arch/powerpc/platforms/52xx/lite5200_sleep.S index 70083649c9ea..11475c58ea43 100644 --- a/arch/powerpc/platforms/52xx/lite5200_sleep.S +++ b/arch/powerpc/platforms/52xx/lite5200_sleep.S @@ -56,7 +56,7 @@ lite5200_low_power: /* * save stuff BDI overwrites * 0xf0 (0xe0->0x100 gets overwritten when BDI connected; - * even when CONFIG_BDI* is disabled and MMU XLAT commented; heisenbug?)) + * even when CONFIG_BDI_SWITCH is disabled and MMU XLAT commented; heisenbug?)) * WARNING: self-refresh doesn't seem to work when BDI2000 is connected, * possibly because BDI sets SDRAM registers before wakeup code does */ diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index fa3d29dcb57e..b77cbb0a50e1 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -160,7 +160,7 @@ config XES_MPC85xx computers from Extreme Engineering Solutions (X-ES) based on Freescale MPC85xx processors. Manufacturer: Extreme Engineering Solutions, Inc. - URL: <http://www.xes-inc.com/> + URL: <https://www.xes-inc.com/> config STX_GP3 bool "Silicon Turnkey Express GP3" diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index 0f7c8241912b..f2ff359041ee 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -44,6 +44,7 @@ config SPU_FS tristate "SPU file system" default m depends on PPC_CELL + depends on COREDUMP select SPU_BASE help The SPU file system is used to access Synergistic Processing diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 3b75e8f60609..026c181a98c5 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -66,13 +66,20 @@ static int match_context(const void *v, struct file *file, unsigned fd) */ static struct spu_context *coredump_next_context(int *fd) { + struct spu_context *ctx; struct file *file; int n = iterate_fd(current->files, *fd, match_context, NULL); if (!n) return NULL; *fd = n - 1; + + rcu_read_lock(); file = fcheck(*fd); - return SPUFS_I(file_inode(file))->i_ctx; + ctx = SPUFS_I(file_inode(file))->i_ctx; + get_spu_context(ctx); + rcu_read_unlock(); + + return ctx; } int spufs_coredump_extra_notes_size(void) @@ -83,17 +90,23 @@ int spufs_coredump_extra_notes_size(void) fd = 0; while ((ctx = coredump_next_context(&fd)) != NULL) { rc = spu_acquire_saved(ctx); - if (rc) + if (rc) { + put_spu_context(ctx); break; + } + rc = spufs_ctx_note_size(ctx, fd); spu_release_saved(ctx); - if (rc < 0) + if (rc < 0) { + put_spu_context(ctx); break; + } size += rc; /* start searching the next fd next time */ fd++; + put_spu_context(ctx); } return size; @@ -105,7 +118,7 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i, size_t sz = spufs_coredump_read[i].size; char fullname[80]; struct elf_note en; - size_t ret; + int ret; sprintf(fullname, "SPU/%d/%s", dfd, spufs_coredump_read[i].name); en.n_namesz = strlen(fullname) + 1; diff --git a/arch/powerpc/platforms/pasemi/misc.c b/arch/powerpc/platforms/pasemi/misc.c index 1cd4ca14b08d..1bf65d02d3ba 100644 --- a/arch/powerpc/platforms/pasemi/misc.c +++ b/arch/powerpc/platforms/pasemi/misc.c @@ -56,8 +56,7 @@ static int __init pasemi_register_i2c_devices(void) if (!adap_node) continue; - node = NULL; - while ((node = of_get_next_child(adap_node, node))) { + for_each_child_of_node(adap_node, node) { struct i2c_board_info info = {}; const 
u32 *addr; int len; diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index 181caa3f6717..5c77b9a24c0e 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -1465,7 +1465,7 @@ static long g5_i2s_enable(struct device_node *node, long param, long value) case 2: if (macio->type == macio_shasta) break; - /* fall through */ + fallthrough; default: return -ENODEV; } diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index bf4be4b53b44..f77a59b5c2e1 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -629,8 +629,7 @@ static void __init kw_i2c_probe(void) for (i = 0; i < chans; i++) kw_i2c_add(host, np, np, i); } else { - for (child = NULL; - (child = of_get_next_child(np, child)) != NULL;) { + for_each_child_of_node(np, child) { const u32 *reg = of_get_property(child, "reg", NULL); if (reg == NULL) @@ -1193,8 +1192,7 @@ static void pmac_i2c_devscan(void (*callback)(struct device_node *dev, * platform function instance */ list_for_each_entry(bus, &pmac_i2c_busses, link) { - for (np = NULL; - (np = of_get_next_child(bus->busnode, np)) != NULL;) { + for_each_child_of_node(bus->busnode, np) { struct whitelist_ent *p; /* If multibus, check if device is on that bus */ if (bus->flags & pmac_i2c_multibus) diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index 62311e84a423..f5422506d4b0 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -114,7 +114,7 @@ static void macio_gpio_init_one(struct macio_chip *macio) * Ok, got one, we dont need anything special to track them down, so * we just create them all */ - for (gp = NULL; (gp = of_get_next_child(gparent, gp)) != NULL;) { + for_each_child_of_node(gparent, gp) { const u32 *reg = of_get_property(gp, "reg", NULL); unsigned long offset; if (reg == NULL) @@ -133,7 +133,7 @@ static void macio_gpio_init_one(struct macio_chip *macio) macio->of_node); /* And now we run all the init ones */ - for (gp = NULL; (gp = of_get_next_child(gparent, gp)) != NULL;) + for_each_child_of_node(gparent, gp) pmf_do_functions(gp, NULL, 0, PMF_FLAGS_ON_INIT, NULL); /* Note: We do not at this point implement the "at sleep" or "at wake" diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c index 6b61a18e8db3..f286bdfe8346 100644 --- a/arch/powerpc/platforms/powermac/udbg_scc.c +++ b/arch/powerpc/platforms/powermac/udbg_scc.c @@ -80,7 +80,7 @@ void udbg_scc_init(int force_scc) path = of_get_property(of_chosen, "linux,stdout-path", NULL); if (path != NULL) stdout = of_find_node_by_path(path); - for (ch = NULL; (ch = of_get_next_child(escc, ch)) != NULL;) { + for_each_child_of_node(escc, ch) { if (ch == stdout) ch_def = of_node_get(ch); if (of_node_name_eq(ch, "ch-a")) diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index fe3f0fb5aeca..2eb6ae150d1f 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o obj-$(CONFIG_OPAL_CORE) += opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o +obj-$(CONFIG_PCI_IOV) += pci-sriov.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o 
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 79409e005fcd..9af8c3b98853 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -338,6 +338,28 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) return 0; } +static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev) +{ + struct pci_controller *hose = pdev->bus->sysdata; + struct pnv_phb *phb = hose->private_data; + struct pci_dev *parent = pdev->bus->self; + +#ifdef CONFIG_PCI_IOV + /* for VFs we use the PF's PE as the upstream PE */ + if (pdev->is_virtfn) + parent = pdev->physfn; +#endif + + /* otherwise use the PE of our parent bridge */ + if (parent) { + struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent); + + return eeh_pe_get(phb->hose, ioda_pe->pe_number, 0); + } + + return NULL; +} + /** * pnv_eeh_probe - Do probe on PCI device * @pdev: pci_dev to probe @@ -350,6 +372,7 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) struct pci_controller *hose = pdn->phb; struct pnv_phb *phb = hose->private_data; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + struct eeh_pe *upstream_pe; uint32_t pcie_flags; int ret; int config_addr = (pdn->busno << 8) | (pdn->devfn); @@ -372,19 +395,18 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) } /* Skip for PCI-ISA bridge */ - if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; eeh_edev_dbg(edev, "Probing device\n"); /* Initialize eeh device */ - edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP); edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF); edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); - if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS, @@ -399,8 +421,10 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) edev->pe_config_addr = phb->ioda.pe_rmap[config_addr]; + upstream_pe = pnv_eeh_get_upstream_pe(pdev); + /* Create PE */ - ret = eeh_add_to_parent_pe(edev); + ret = eeh_pe_tree_insert(edev, upstream_pe); if (ret) { eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret); return NULL; @@ -535,18 +559,6 @@ static int pnv_eeh_set_option(struct eeh_pe *pe, int option) return 0; } -/** - * pnv_eeh_get_pe_addr - Retrieve PE address - * @pe: EEH PE - * - * Retrieve the PE address according to the given tranditional - * PCI BDF (Bus/Device/Function) address. 
- */ -static int pnv_eeh_get_pe_addr(struct eeh_pe *pe) -{ - return pe->addr; -} - static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) { struct pnv_phb *phb = pe->phb->private_data; @@ -850,32 +862,32 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) case EEH_RESET_HOT: /* Don't report linkDown event */ if (aer) { - eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, &ctrl); ctrl |= PCI_ERR_UNC_SURPDN; - eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, ctrl); } - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl |= PCI_BRIDGE_CTL_BUS_RESET; - eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; - eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl); msleep(EEH_PE_RST_SETTLE_TIME); /* Continue reporting linkDown event */ if (aer) { - eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, &ctrl); ctrl &= ~PCI_ERR_UNC_SURPDN; - eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, ctrl); } @@ -944,11 +956,12 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev) static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, int pos, u16 mask) { + struct eeh_dev *edev = pdn->edev; int i, status = 0; /* Wait for Transaction Pending bit to be cleared */ for (i = 0; i < 4; i++) { - eeh_ops->read_config(pdn, pos, 2, &status); + eeh_ops->read_config(edev, pos, 2, &status); if (!(status & mask)) return; @@ -969,7 +982,7 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option) if (WARN_ON(!edev->pcie_cap)) return -ENOTTY; - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg); + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg); if (!(reg & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; @@ -979,18 +992,18 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option) pnv_eeh_wait_for_pending(pdn, "", edev->pcie_cap + PCI_EXP_DEVSTA, PCI_EXP_DEVSTA_TRPND); - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, &reg); reg |= PCI_EXP_DEVCTL_BCR_FLR; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, reg); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, &reg); reg &= ~PCI_EXP_DEVCTL_BCR_FLR; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, reg); msleep(EEH_PE_RST_SETTLE_TIME); break; @@ -1007,7 +1020,7 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option) if (WARN_ON(!edev->af_cap)) return -ENOTTY; - eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap); + eeh_ops->read_config(edev, edev->af_cap + PCI_AF_CAP, 1, &cap); if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) return -ENOTTY; @@ -1022,12 +1035,12 @@ static int
pnv_eeh_do_af_flr(struct pci_dn *pdn, int option) pnv_eeh_wait_for_pending(pdn, "AF", edev->af_cap + PCI_AF_CTRL, PCI_AF_STATUS_TP << 8); - eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, + eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, PCI_AF_CTRL_FLR); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0); + eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, 0); msleep(EEH_PE_RST_SETTLE_TIME); break; } @@ -1261,9 +1274,11 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn) return false; } -static int pnv_eeh_read_config(struct pci_dn *pdn, +static int pnv_eeh_read_config(struct eeh_dev *edev, int where, int size, u32 *val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; @@ -1275,9 +1290,11 @@ static int pnv_eeh_read_config(struct pci_dn *pdn, return pnv_pci_cfg_read(pdn, where, size, val); } -static int pnv_eeh_write_config(struct pci_dn *pdn, +static int pnv_eeh_write_config(struct eeh_dev *edev, int where, int size, u32 val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; @@ -1631,34 +1648,24 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) return ret; } -static int pnv_eeh_restore_config(struct pci_dn *pdn) +static int pnv_eeh_restore_config(struct eeh_dev *edev) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); struct pnv_phb *phb; s64 ret = 0; - int config_addr = (pdn->busno << 8) | (pdn->devfn); if (!edev) return -EEXIST; - /* - * We have to restore the PCI config space after reset since the - * firmware can't see SRIOV VFs. - * - * FIXME: The MPS, error routing rules, timeout setting are worthy - * to be exported by firmware in extendible way. - */ - if (edev->physfn) { - ret = eeh_restore_vf_config(pdn); - } else { - phb = pdn->phb->private_data; - ret = opal_pci_reinit(phb->opal_id, - OPAL_REINIT_PCI_DEV, config_addr); - } + if (edev->physfn) + return 0; + + phb = edev->controller->private_data; + ret = opal_pci_reinit(phb->opal_id, + OPAL_REINIT_PCI_DEV, edev->bdfn); if (ret) { pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, config_addr, ret); + __func__, edev->bdfn, ret); return -EIO; } @@ -1670,7 +1677,6 @@ static struct eeh_ops pnv_eeh_ops = { .init = pnv_eeh_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, - .get_pe_addr = pnv_eeh_get_pe_addr, .get_state = pnv_eeh_get_state, .reset = pnv_eeh_reset, .get_log = pnv_eeh_get_log, diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 2dd467383a88..77513a80cef9 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -48,7 +48,7 @@ static bool default_stop_found; * First stop state levels when SPR and TB loss can occur. */ static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; -static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; +static u64 deep_spr_loss_state = MAX_STOP_STATE + 1; /* * psscr value and mask of the deepest stop idle state. 
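The eeh-powernv.c hunks above convert the eeh_ops config accessors from taking a pci_dn to taking an eeh_dev. A minimal sketch of the resulting calling convention, assuming the converted ops (the helper below is hypothetical; the ops, fields, and register constants are the ones used in the hunks above):

static int example_read_devctl(struct eeh_dev *edev, u32 *devctl)
{
	if (WARN_ON(!edev->pcie_cap))
		return -ENOTTY;

	/* The eeh_dev, not the pci_dn, is now the config-space handle. */
	return eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
				    4, devctl);
}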
@@ -73,9 +73,6 @@ static int pnv_save_sprs_for_deep_states(void) */ uint64_t lpcr_val = mfspr(SPRN_LPCR); uint64_t hid0_val = mfspr(SPRN_HID0); - uint64_t hid1_val = mfspr(SPRN_HID1); - uint64_t hid4_val = mfspr(SPRN_HID4); - uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; uint64_t psscr_val = pnv_deepest_stop_psscr_val; @@ -117,6 +114,9 @@ static int pnv_save_sprs_for_deep_states(void) /* Only p8 needs to set extra HID registers */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); if (rc != 0) @@ -611,6 +611,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) unsigned long srr1; unsigned long pls; unsigned long mmcr0 = 0; + unsigned long mmcra = 0; struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ bool sprs_saved = false; @@ -657,7 +658,22 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) */ mmcr0 = mfspr(SPRN_MMCR0); } - if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) { + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + /* + * POWER10 uses MMCRA (BHRBRD) as BHRB disable bit. + * If the user hasn't asked for the BHRB to be + * written, the value of MMCRA[BHRBRD] is 1. + * On wakeup from stop, MMCRA[BHRBRD] will be 0, + * since it is a privileged resource and will be lost. + * Thus, if we do not save and restore MMCRA[BHRBRD], + * hardware will be needlessly writing to the BHRB + * in problem mode. + */ + mmcra = mfspr(SPRN_MMCRA); + } + + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) { sprs.lpcr = mfspr(SPRN_LPCR); sprs.hfscr = mfspr(SPRN_HFSCR); sprs.fscr = mfspr(SPRN_FSCR); @@ -700,8 +716,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { - unsigned long mmcra; - /* * We don't need an isync after the mtsprs here because the * upcoming mtmsrd is execution synchronizing. */ @@ -721,6 +735,10 @@ mtspr(SPRN_MMCR0, mmcr0); } + /* Reload MMCRA to restore BHRB disable bit for POWER10 */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + mtspr(SPRN_MMCRA, mmcra); + /* * DD2.2 and earlier need to set then clear bit 60 in MMCRA * to ensure the PMU starts running. @@ -741,7 +759,7 @@ * just always test PSSCR for SPR/TB state loss. */ pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; - if (likely(pls < pnv_first_spr_loss_level)) { + if (likely(pls < deep_spr_loss_state)) { if (sprs_saved) atomic_stop_thread_idle(); goto out; @@ -1088,7 +1106,7 @@ static void __init pnv_power9_idle_init(void) * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
*/ pnv_first_tb_loss_level = MAX_STOP_STATE + 1; - pnv_first_spr_loss_level = MAX_STOP_STATE + 1; + deep_spr_loss_state = MAX_STOP_STATE + 1; for (i = 0; i < nr_pnv_idle_states; i++) { int err; struct pnv_idle_states_t *state = &pnv_idle_states[i]; @@ -1099,8 +1117,8 @@ static void __init pnv_power9_idle_init(void) pnv_first_tb_loss_level = psscr_rl; if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) && - (pnv_first_spr_loss_level > psscr_rl)) - pnv_first_spr_loss_level = psscr_rl; + (deep_spr_loss_state > psscr_rl)) + deep_spr_loss_state = psscr_rl; /* * The idle code does not deal with TB loss occurring @@ -1111,8 +1129,8 @@ static void __init pnv_power9_idle_init(void) * compatibility. */ if ((state->flags & OPAL_PM_TIMEBASE_STOP) && - (pnv_first_spr_loss_level > psscr_rl)) - pnv_first_spr_loss_level = psscr_rl; + (deep_spr_loss_state > psscr_rl)) + deep_spr_loss_state = psscr_rl; err = validate_psscr_val_mask(&state->psscr_val, &state->psscr_mask, @@ -1158,7 +1176,7 @@ static void __init pnv_power9_idle_init(void) } pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n", - pnv_first_spr_loss_level); + deep_spr_loss_state); pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n", pnv_first_tb_loss_level); @@ -1205,7 +1223,7 @@ static void __init pnv_probe_idle_states(void) return; } - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (pvr_version_is(PVR_POWER9)) pnv_power9_idle_init(); for (i = 0; i < nr_pnv_idle_states; i++) diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 1656e8965d6b..c094fdf5825c 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -104,7 +104,7 @@ static int __opal_async_release_token(int token) */ case ASYNC_TOKEN_DISPATCHED: opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; - /* Fall through */ + fallthrough; default: rc = 1; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index f923359d8afc..5218f5da2737 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -166,7 +166,7 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, if (!ptce) { ptce = pnv_tce(tbl, false, idx, alloc); if (!ptce) - return alloc ? H_HARDWARE : H_TOO_HARD; + return -ENOMEM; } if (newtce & TCE_PCI_WRITE) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 73a63efcf855..c9c25fb0783c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -115,32 +115,13 @@ static int __init pci_reset_phbs_setup(char *str) early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup); -static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) -{ - /* - * WARNING: We cannot rely on the resource flags. The Linux PCI - * allocation code sometimes decides to put a 64-bit prefetchable - * BAR in the 32-bit window, so we have to compare the addresses. - * - * For simplicity we only test resource start. 
- */ - return (r->start >= phb->ioda.m64_base && - r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); -} - -static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) -{ - unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); - - return (resource_flags & flags) == flags; -} - static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) { s64 rc; phb->ioda.pe_array[pe_no].phb = phb; phb->ioda.pe_array[pe_no].pe_number = pe_no; + phb->ioda.pe_array[pe_no].dma_setup_done = false; /* * Clear the PE frozen state as it might be put into frozen state @@ -164,26 +145,48 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) return; } + mutex_lock(&phb->ioda.pe_alloc_mutex); if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) pr_debug("%s: PE %x was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); + mutex_unlock(&phb->ioda.pe_alloc_mutex); pnv_ioda_init_pe(phb, pe_no); } -static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count) { - long pe; + struct pnv_ioda_pe *ret = NULL; + int run = 0, pe, i; + + mutex_lock(&phb->ioda.pe_alloc_mutex); + /* scan backwards for a run of @count cleared bits */ for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) { - if (!test_and_set_bit(pe, phb->ioda.pe_alloc)) - return pnv_ioda_init_pe(phb, pe); + if (test_bit(pe, phb->ioda.pe_alloc)) { + run = 0; + continue; + } + + run++; + if (run == count) + break; + } + if (run != count) + goto out; + + for (i = pe; i < pe + count; i++) { + set_bit(i, phb->ioda.pe_alloc); + pnv_ioda_init_pe(phb, i); } + ret = &phb->ioda.pe_array[pe]; - return NULL; +out: + mutex_unlock(&phb->ioda.pe_alloc_mutex); + return ret; } -static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { struct pnv_phb *phb = pe->phb; unsigned int pe_num = pe->pe_number; @@ -192,7 +195,10 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */ kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); + + mutex_lock(&phb->ioda.pe_alloc_mutex); clear_bit(pe_num, phb->ioda.pe_alloc); + mutex_unlock(&phb->ioda.pe_alloc_mutex); } /* The default M64 BAR is shared by all PEs */ @@ -252,8 +258,7 @@ fail: static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct resource *r; resource_size_t base, sgsz, start, end; int segno, i; @@ -311,6 +316,28 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb) } } + for (index = 0; index < phb->ioda.total_pe_num; index++) { + int64_t rc; + + /* + * P7IOC supports M64DT, which helps mapping M64 segment + * to one particular PE#. However, PHB3 has fixed mapping + * between M64 segment and PE#. In order to have same logic + * for P7IOC and PHB3, we enforce fixed mapping between M64 + * segment and PE# on P7IOC. + */ + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + index, OPAL_M64_WINDOW_TYPE, + index / PNV_IODA1_M64_SEGS, + index % PNV_IODA1_M64_SEGS); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n", + __func__, rc, phb->hose->global_number, + index); + goto fail; + } + } + /* * Exclude the segments for reserved and root bus PE, which * are first or last two PEs. 
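The reworked pnv_ioda_alloc_pe() above takes a count and returns the first PE of a contiguous run of free PE numbers, with the pe_alloc bitmap now guarded by pe_alloc_mutex. A minimal usage sketch, assuming a caller that wants one PE per VF (the function below is hypothetical; pnv_ioda_alloc_pe() and pnv_ioda_free_pe() are the helpers from the hunk above):

static int example_reserve_vf_pes(struct pnv_phb *phb, u16 num_vfs)
{
	struct pnv_ioda_pe *base_pe;
	int i;

	/* Scans pe_alloc backwards for a run of num_vfs clear bits. */
	base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
	if (!base_pe)
		return -ENOSPC;

	/* The run is consecutive: VF i lands in PE base_pe->pe_number + i. */
	for (i = 0; i < num_vfs; i++)
		pr_debug("VF%d -> PE#%x\n", i, base_pe->pe_number + i);

	/* Each PE in the run is released individually. */
	for (i = 0; i < num_vfs; i++)
		pnv_ioda_free_pe(&phb->ioda.pe_array[base_pe->pe_number + i]);

	return 0;
}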
@@ -351,8 +378,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *master_pe, *pe; unsigned long size, *pe_alloc; int i; @@ -403,26 +429,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } - - /* - * P7IOC supports M64DT, which helps mapping M64 segment - * to one particular PE#. However, PHB3 has fixed mapping - * between M64 segment and PE#. In order to have same logic - * for P7IOC and PHB3, we enforce fixed mapping between M64 - * segment and PE# on P7IOC. - */ - if (phb->type == PNV_PHB_IODA1) { - int64_t rc; - - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M64_WINDOW_TYPE, - pe->pe_number / PNV_IODA1_M64_SEGS, - pe->pe_number % PNV_IODA1_M64_SEGS); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n", - __func__, rc, phb->hose->global_number, - pe->pe_number); - } } kfree(pe_alloc); @@ -673,8 +679,7 @@ struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn) struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); if (!pdn) @@ -816,7 +821,7 @@ static void pnv_ioda_unset_peltv(struct pnv_phb *phb, pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); } -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; @@ -887,7 +892,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) return 0; } -static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; @@ -982,95 +987,9 @@ out: return 0; } -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) -{ - struct pci_dn *pdn = pci_get_pdn(dev); - int i; - struct resource *res, res2; - resource_size_t size; - u16 num_vfs; - - if (!dev->is_physfn) - return -EINVAL; - - /* - * "offset" is in VFs. The M64 windows are sized so that when they - * are segmented, each segment is the same size as the IOV BAR. - * Each segment is in a separate PE, and the high order bits of the - * address are the PE number. Therefore, each VF's BAR is in a - * separate PE, and changing the IOV BAR start address changes the - * range of PEs the VFs are in. - */ - num_vfs = pdn->num_vfs; - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - /* - * The actual IOV BAR range is determined by the start address - * and the actual size for num_vfs VFs BAR. This check is to - * make sure that after shifting, the range will not overlap - * with another device. 
- */ - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2.flags = res->flags; - res2.start = res->start + (size * offset); - res2.end = res2.start + (size * num_vfs) - 1; - - if (res2.end > res->end) { - dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", - i, &res2, res, num_vfs, offset); - return -EBUSY; - } - } - - /* - * Since M64 BAR shares segments among all possible 256 PEs, - * we have to shift the beginning of PF IOV BAR to make it start from - * the segment which belongs to the PE number assigned to the first VF. - * This creates a "hole" in the /proc/iomem which could be used for - * allocating other resources so we reserve this area below and - * release when IOV is released. - */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2 = *res; - res->start += size * offset; - - dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", - i, &res2, res, (offset > 0) ? "En" : "Dis", - num_vfs, offset); - - if (offset < 0) { - devm_release_resource(&dev->dev, &pdn->holes[i]); - memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); - } - - pci_update_resource(dev, i + PCI_IOV_RESOURCES); - - if (offset > 0) { - pdn->holes[i].start = res2.start; - pdn->holes[i].end = res2.start + size * offset - 1; - pdn->holes[i].flags = IORESOURCE_BUS; - pdn->holes[i].name = "pnv_iov_reserved"; - devm_request_resource(&dev->dev, res->parent, - &pdn->holes[i]); - } - } - return 0; -} -#endif /* CONFIG_PCI_IOV */ - static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; @@ -1082,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { pr_warn("%s: Not enough PE# available, disabling device\n", pci_name(dev)); @@ -1129,8 +1048,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) */ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *pe = NULL; unsigned int pe_num; @@ -1154,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* The PE number isn't pinned by M64 */ if (!pe) - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n", @@ -1196,8 +1114,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) struct pnv_ioda_pe *pe; struct pci_dev *gpu_pdev; struct pci_dn *npu_pdn; - struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus); /* * Intentionally leak a reference on the npu device (for @@ -1297,446 +1214,12 @@ static void pnv_pci_ioda_setup_nvlink(void) #endif } -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct 
pnv_phb *phb; - struct pci_dn *pdn; - int i, j; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < m64_bars; j++) { - if (pdn->m64_map[j][i] == IODA_INVALID_M64) - continue; - opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); - clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); - pdn->m64_map[j][i] = IODA_INVALID_M64; - } - - kfree(pdn->m64_map); - return 0; -} - -static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - unsigned int win; - struct resource *res; - int i, j; - int64_t rc; - int total_vfs; - resource_size_t size, start; - int pe_num; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - total_vfs = pci_sriov_get_totalvfs(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - pdn->m64_map = kmalloc_array(m64_bars, - sizeof(*pdn->m64_map), - GFP_KERNEL); - if (!pdn->m64_map) - return -ENOMEM; - /* Initialize the m64_map to IODA_INVALID_M64 */ - for (i = 0; i < m64_bars ; i++) - for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) - pdn->m64_map[i][j] = IODA_INVALID_M64; - - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - for (j = 0; j < m64_bars; j++) { - do { - win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, - phb->ioda.m64_bar_idx + 1, 0); - - if (win >= phb->ioda.m64_bar_idx + 1) - goto m64_failed; - } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); - - pdn->m64_map[j][i] = win; - - if (pdn->m64_single_mode) { - size = pci_iov_resource_size(pdev, - PCI_IOV_RESOURCES + i); - start = res->start + size * j; - } else { - size = resource_size(res); - start = res->start; - } - - /* Map the M64 here */ - if (pdn->m64_single_mode) { - pe_num = pdn->pe_num_map[j]; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe_num, OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], 0); - } - - rc = opal_pci_set_phb_mem_window(phb->opal_id, - OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], - start, - 0, /* unused */ - size); - - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", - win, rc); - goto m64_failed; - } - - if (pdn->m64_single_mode) - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); - else - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", - win, rc); - goto m64_failed; - } - } - } - return 0; - -m64_failed: - pnv_pci_vf_release_m64(pdev, num_vfs); - return -EBUSY; -} - -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, - int num); - -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) -{ - struct iommu_table *tbl; - int64_t rc; - - tbl = pe->table_group.tables[0]; - rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); - if (rc) - pe_warn(pe, "OPAL error %lld release DMA window\n", rc); - - pnv_pci_ioda2_set_bypass(pe, false); - if (pe->table_group.group) { - iommu_group_put(pe->table_group.group); - BUG_ON(pe->table_group.group); - } - iommu_tce_table_put(tbl); 
-} - -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *pe_n; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { - if (pe->parent_dev != pdev) - continue; - - pnv_pci_ioda2_release_dma_pe(pdev, pe); - - /* Remove from list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_del(&pe->list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - pnv_ioda_deconfigure_pe(phb, pe); - - pnv_ioda_free_pe(pe); - } -} - -void pnv_pci_sriov_disable(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - u16 num_vfs, i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - num_vfs = pdn->num_vfs; - - /* Release VF PEs */ - pnv_ioda_release_vf_PE(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->m64_single_mode) - pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map); - - /* Release M64 windows */ - pnv_pci_vf_release_m64(pdev, num_vfs); - - /* Release PE numbers */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - } -} - -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); -static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - int pe_num; - u16 vf_index; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - /* Reserve PE for each VF */ - for (vf_index = 0; vf_index < num_vfs; vf_index++) { - int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); - int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); - struct pci_dn *vf_pdn; - - if (pdn->m64_single_mode) - pe_num = pdn->pe_num_map[vf_index]; - else - pe_num = *pdn->pe_num_map + vf_index; - - pe = &phb->ioda.pe_array[pe_num]; - pe->pe_number = pe_num; - pe->phb = phb; - pe->flags = PNV_IODA_PE_VF; - pe->pbus = NULL; - pe->parent_dev = pdev; - pe->mve_number = -1; - pe->rid = (vf_bus << 8) | vf_devfn; - - pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", - hose->global_number, pdev->bus->number, - PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num); - - if (pnv_ioda_configure_pe(phb, pe)) { - /* XXX What do we do here ? 
*/ - pnv_ioda_free_pe(pe); - pe->pdev = NULL; - continue; - } - - /* Put PE to the list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_add_tail(&pe->list, &phb->ioda.pe_list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - /* associate this pe to it's pdn */ - list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { - if (vf_pdn->busno == vf_bus && - vf_pdn->devfn == vf_devfn) { - vf_pdn->pe_number = pe_num; - break; - } - } - - pnv_pci_ioda2_setup_dma_pe(phb, pe); - } -} - -int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - int ret; - u16 i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->vfs_expanded) { - dev_info(&pdev->dev, "don't support this SRIOV device" - " with non 64bit-prefetchable IOV BAR\n"); - return -ENOSPC; - } - - /* - * When M64 BARs functions in Single PE mode, the number of VFs - * could be enabled must be less than the number of M64 BARs. - */ - if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { - dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); - return -EBUSY; - } - - /* Allocating pe_num_map */ - if (pdn->m64_single_mode) - pdn->pe_num_map = kmalloc_array(num_vfs, - sizeof(*pdn->pe_num_map), - GFP_KERNEL); - else - pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL); - - if (!pdn->pe_num_map) - return -ENOMEM; - - if (pdn->m64_single_mode) - for (i = 0; i < num_vfs; i++) - pdn->pe_num_map[i] = IODA_INVALID_PE; - - /* Calculate available PE for required VFs */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - pe = pnv_ioda_alloc_pe(phb); - if (!pe) { - ret = -EBUSY; - goto m64_failed; - } - - pdn->pe_num_map[i] = pe->pe_number; - } - } else { - mutex_lock(&phb->ioda.pe_alloc_mutex); - *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe_num, - 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { - mutex_unlock(&phb->ioda.pe_alloc_mutex); - dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); - kfree(pdn->pe_num_map); - return -EBUSY; - } - bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - mutex_unlock(&phb->ioda.pe_alloc_mutex); - } - pdn->num_vfs = num_vfs; - - /* Assign M64 window accordingly */ - ret = pnv_pci_vf_assign_m64(pdev, num_vfs); - if (ret) { - dev_info(&pdev->dev, "Not enough M64 window resources\n"); - goto m64_failed; - } - - /* - * When using one M64 BAR to map one IOV BAR, we need to shift - * the IOV BAR according to the PE# allocated to the VFs. - * Otherwise, the PE# for the VF will conflict with others. 
- */ - if (!pdn->m64_single_mode) { - ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map); - if (ret) - goto m64_failed; - } - } - - /* Setup VF PEs */ - pnv_ioda_setup_vf_PE(pdev, num_vfs); - - return 0; - -m64_failed: - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - - return ret; -} - -int pnv_pcibios_sriov_disable(struct pci_dev *pdev) -{ - pnv_pci_sriov_disable(pdev); - - /* Release PCI data */ - remove_sriov_vf_pdns(pdev); - return 0; -} - -int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - /* Allocate PCI data */ - add_sriov_vf_pdns(pdev); - - return pnv_pci_sriov_enable(pdev, num_vfs); -} -#endif /* CONFIG_PCI_IOV */ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; @@ -1762,6 +1245,24 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number); } + /* + * We assume that bridges *probably* don't need to do any DMA so we can + * skip allocating a TCE table, etc unless we get a non-bridge device. + */ + if (!pe->dma_setup_done && !pci_is_bridge(pdev)) { + switch (phb->type) { + case PNV_PHB_IODA1: + pnv_pci_ioda1_setup_dma_pe(phb, pe); + break; + case PNV_PHB_IODA2: + pnv_pci_ioda2_setup_dma_pe(phb, pe); + break; + default: + pr_warn("%s: No DMA for PHB#%x (type %d)\n", + __func__, phb->hose->global_number, phb->type); + } + } + if (pdn) pdn->pe_number = pe->pe_number; pe->device_count++; @@ -1847,8 +1348,7 @@ err: static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, u64 dma_mask) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; @@ -1885,19 +1385,6 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, return false; } -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - dev->dev.archdata.dma_offset = pe->tce_bypass_base; - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate); - } -} - static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb, bool real_mode) { @@ -2285,6 +1772,7 @@ found: pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node, 0, 0); + pe->dma_setup_done = true; return; fail: /* XXX Failure: Try to fallback to 64-bit only ? 
*/ @@ -2474,7 +1962,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) return 0; } -#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV) static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num) { @@ -2498,7 +1985,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, return ret; } -#endif #ifdef CONFIG_IOMMU_API unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, @@ -2547,6 +2033,19 @@ static long pnv_pci_ioda2_create_table_userspace( return ret; } +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; + + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, @@ -2583,14 +2082,11 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { }; #endif -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) { int64_t rc; - if (!pnv_pci_ioda_pe_dma_weight(pe)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2615,6 +2111,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); #endif + pe->dma_setup_done = true; } int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) @@ -2763,118 +2260,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) count, phb->msi_base); } -#ifdef CONFIG_PCI_IOV -static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - const resource_size_t gate = phb->ioda.m64_segsize >> 2; - struct resource *res; - int i; - resource_size_t size, total_vf_bar_sz; - struct pci_dn *pdn; - int mul, total_vfs; - - pdn = pci_get_pdn(pdev); - pdn->vfs_expanded = 0; - pdn->m64_single_mode = false; - - total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe_num; - total_vf_bar_sz = 0; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - if (!pnv_pci_is_m64_flags(res->flags)) { - dev_warn(&pdev->dev, "Don't support SR-IOV with" - " non M64 VF BAR%d: %pR. \n", - i, res); - goto truncate_iov; - } - - total_vf_bar_sz += pci_iov_resource_size(pdev, - i + PCI_IOV_RESOURCES); - - /* - * If bigger than quarter of M64 segment size, just round up - * power of two. - * - * Generally, one M64 BAR maps one IOV BAR. To avoid conflict - * with other devices, IOV BAR size is expanded to be - * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64 - * segment size , the expanded size would equal to half of the - * whole M64 space size, which will exhaust the M64 Space and - * limit the system flexibility. This is a design decision to - * set the boundary to quarter of the M64 segment size. 
- */ - if (total_vf_bar_sz > gate) { - mul = roundup_pow_of_two(total_vfs); - dev_info(&pdev->dev, - "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n", - total_vf_bar_sz, gate, mul); - pdn->m64_single_mode = true; - break; - } - } - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - - size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); - /* - * On PHB3, the minimum size alignment of M64 BAR in single - * mode is 32MB. - */ - if (pdn->m64_single_mode && (size < SZ_32M)) - goto truncate_iov; - dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); - res->end = res->start + size * mul - 1; - dev_dbg(&pdev->dev, " %pR\n", res); - dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", - i, res, mul); - } - pdn->vfs_expanded = mul; - - return; - -truncate_iov: - /* To save MMIO space, IOV BAR is truncated. */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - res->flags = 0; - res->end = res->start - 1; - } -} - -static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) -{ - if (WARN_ON(pci_dev_is_added(pdev))) - return; - - if (pdev->is_virtfn) { - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); - - /* - * VF PEs are single-device PEs so their pdev pointer needs to - * be set. The pdev doesn't exist when the PE is allocated (in - * (pcibios_sriov_enable()) so we fix it up here. - */ - pe->pdev = pdev; - WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); - } else if (pdev->is_physfn) { - /* - * For PFs adjust their allocated IOV resources to match what - * the PHB can support using it's M64 BAR table. - */ - pnv_pci_ioda_fixup_iov_resources(pdev); - } -} -#endif /* CONFIG_PCI_IOV */ - static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, struct resource *res) { @@ -3101,10 +2486,9 @@ static void pnv_pci_ioda_fixup(void) static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, unsigned long type) { - struct pci_dev *bridge; - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); int num_pci_bridges = 0; + struct pci_dev *bridge; bridge = bus->self; while (bridge) { @@ -3190,8 +2574,6 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, static void pnv_pci_configure_bus(struct pci_bus *bus) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; struct pci_dev *bridge = bus->self; struct pnv_ioda_pe *pe; bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); @@ -3215,17 +2597,6 @@ static void pnv_pci_configure_bus(struct pci_bus *bus) return; pnv_ioda_setup_pe_seg(pe); - switch (phb->type) { - case PNV_PHB_IODA1: - pnv_pci_ioda1_setup_dma_pe(phb, pe); - break; - case PNV_PHB_IODA2: - pnv_pci_ioda2_setup_dma_pe(phb, pe); - break; - default: - pr_warn("%s: No DMA for PHB#%x (type %d)\n", - __func__, phb->hose->global_number, phb->type); - } } static resource_size_t pnv_pci_default_alignment(void) @@ -3233,49 +2604,12 @@ static resource_size_t pnv_pci_default_alignment(void) return PAGE_SIZE; } -#ifdef CONFIG_PCI_IOV -static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, - int resno) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - resource_size_t align; - - /* - * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the - * SR-IOV. 
While from hardware perspective, the range mapped by M64 - * BAR should be size aligned. - * - * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra - * powernv-specific hardware restriction is gone. But if just use the - * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with - * in one segment of M64 #15, which introduces the PE conflict between - * PF and VF. Based on this, the minimum alignment of an IOV BAR is - * m64_segsize. - * - * This function returns the total IOV BAR size if M64 BAR is in - * Shared PE mode or just VF BAR size if not. - * If the M64 BAR is in Single PE mode, return the VF BAR size or - * M64 segment size if IOV BAR size is less. - */ - align = pci_iov_resource_size(pdev, resno); - if (!pdn->vfs_expanded) - return align; - if (pdn->m64_single_mode) - return max(align, (resource_size_t)phb->ioda.m64_segsize); - - return pdn->vfs_expanded * align; -} -#endif /* CONFIG_PCI_IOV */ - /* Prevent enabling devices for which we couldn't properly * assign a PE */ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn; /* The function is probably called while the PEs have @@ -3346,11 +2680,10 @@ static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) { - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); struct iommu_table *tbl = pe->table_group.tables[0]; int64_t rc; - if (!weight) + if (!pe->dma_setup_done) return; rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0); @@ -3367,22 +2700,17 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) iommu_tce_table_put(tbl); } -static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = pe->table_group.tables[0]; - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); -#ifdef CONFIG_IOMMU_API int64_t rc; -#endif - if (!weight) + if (pe->dma_setup_done) return; -#ifdef CONFIG_IOMMU_API rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) pe_warn(pe, "OPAL error %lld release DMA window\n", rc); -#endif pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { @@ -3405,14 +2733,8 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, if (map[idx] != pe->pe_number) continue; - if (win == OPAL_M64_WINDOW_TYPE) - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, - idx / PNV_IODA1_M64_SEGS, - idx % PNV_IODA1_M64_SEGS); - else - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, 0, idx); + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + phb->ioda.reserved_pe_idx, win, 0, idx); if (rc != OPAL_SUCCESS) pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n", @@ -3431,8 +2753,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) phb->ioda.io_segmap); pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, phb->ioda.m32_segmap); - pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE, - phb->ioda.m64_segmap); + /* M64 is pre-configured by pnv_ioda1_init_m64() */ } else if (phb->type == PNV_PHB_IODA2) { pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, phb->ioda.m32_segmap); @@ -3488,17 +2809,27 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) static void pnv_pci_release_device(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - 
struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; + /* The VF PE state is torn down when sriov_disable() is called */ if (pdev->is_virtfn) return; if (!pdn || pdn->pe_number == IODA_INVALID_PE) return; +#ifdef CONFIG_PCI_IOV + /* + * FIXME: Try to move this to sriov_disable(). It's here since we allocate + * the iov state at probe time so that we can fiddle with the IOV + * resources. + */ + if (pdev->is_physfn) + kfree(pdev->dev.archdata.iov_data); +#endif + /* * PCI hotplug can happen as part of EEH error recovery. The @pdn * isn't removed and added afterwards in this scenario. We should @@ -3534,8 +2865,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) { - struct pci_controller *hose = bus->sysdata; - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *pe; list_for_each_entry(pe, &phb->ioda.pe_list, list) { @@ -3760,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); } else { /* otherwise just allocate one */ - root_pe = pnv_ioda_alloc_pe(phb); + root_pe = pnv_ioda_alloc_pe(phb, 1); phb->ioda.root_pe_idx = root_pe->pe_number; } @@ -3873,8 +3203,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np) static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); if (!machine_is(powernv)) return; diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c new file mode 100644 index 000000000000..c4434f20f42f --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -0,0 +1,766 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/ioport.h> +#include <linux/bitmap.h> +#include <linux/pci.h> + +#include <asm/opal.h> + +#include "pci.h" + +/* for pci_dev_is_added() */ +#include "../../../../drivers/pci/pci.h" + +/* + * The majority of the complexity in supporting SR-IOV on PowerNV comes from + * the need to put the MMIO space for each VF into a separate PE. Internally + * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table". + * The MBT historically only applied to the 64bit MMIO window of the PHB + * so it's common to see it referred to as the "M64BT". + * + * An MBT entry stores the mapped range as a <base>,<mask> pair. This forces + * the address range that we want to map to be power-of-two sized and aligned. + * For conventional PCI devices this isn't really an issue since PCI device BARs + * have the same requirement. + * + * For an SR-IOV BAR things are a little more awkward since size and alignment + * are not coupled. The alignment is set based on the per-VF BAR size, but + * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs + * isn't necessarily a power of two, so neither is the total size. To fix that + * we need to finesse (read: hack) the Linux BAR allocator so that it will + * allocate the SR-IOV BARs in a way that lets us map them using the MBT. + * + * The changes to size and alignment that we need to do depend on the "mode" + * of MBT entry that we use.
We only support SR-IOV on PHB3 (IODA2) and above, + * so as a baseline we can assume that we have the following BAR modes + * available: + * + * NB: $PE_COUNT is the number of PEs that the PHB supports. + * + * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized + * segments. The n'th segment is mapped to the n'th PE. + * b) An un-segmented BAR that maps the whole address range to a specific PE. + * + * + * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR. + * For comparison b) requires one entry per-VF per-BAR, or: + * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment + * to equal the size of the per-VF BAR area. So: + * + * new_size = per-vf-size * number-of-PEs + * + * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size + * to "new_size", calculated above. Implementing this is a convoluted process + * which requires several hooks in the PCI core: + * + * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov(). + * + * At this point the device has been probed and the device's BARs are sized, + * but no resource allocations have been done. The SR-IOV BARs are sized + * based on the maximum number of VFs supported by the device and we need + * to increase that to new_size. + * + * 2. Later, when Linux actually assigns resources it tries to make the resource + * allocations for each PCI bus as compact as possible. As a part of that it + * sorts the BARs on a bus by their required alignment, which is calculated + * using pci_resource_alignment(). + * + * For IOV resources this goes: + * pci_resource_alignment() + * pci_sriov_resource_alignment() + * pcibios_sriov_resource_alignment() + * pnv_pci_iov_resource_alignment() + * + * Our hook overrides the default alignment, equal to the per-vf-size, with + * new_size computed above. + * + * 3. When userspace enables VFs for a device: + * + * sriov_enable() + * pcibios_sriov_enable() + * pnv_pcibios_sriov_enable() + * + * This is where we actually allocate PE numbers for each VF and set up the + * MBT mapping for each SR-IOV BAR. In steps 1) and 2) we set up an "arena" + * where each MBT segment is equal in size to the VF BAR so we can shift + * around the actual SR-IOV BAR location within this arena. We need this + * ability because the PE space is shared by all devices on the same PHB. + * When using mode a) described above, segment 0 maps to PE#0, which might + * already be in use by another device on the PHB. + * + * As a result we need to allocate a contiguous range of PE numbers, then shift + * the address programmed into the SR-IOV BAR of the PF so that the address + * of VF0 matches up with the segment corresponding to the first allocated + * PE number. This is handled in pnv_pci_vf_resource_shift(). + * + * Once all that is done we return to the PCI core which then enables VFs, + * scans them and creates pci_devs for each. The init process for a VF is + * largely the same as a normal device, but the VF is inserted into the IODA + * PE that we allocated for it rather than the PE associated with the bus. + * + * 4. When userspace disables VFs we unwind the above in + * pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since + * we don't need to validate anything, just tear down the mappings and + * move the SR-IOV resource back to its "proper" location. + * + * That's how mode a) works. In theory mode b) (single PE mapping) is less work + * since we can map each individual VF with a separate BAR.
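 + * + * For example (illustrative numbers, not from the original patch): a device + * with two SR-IOV BARs and 16 enabled VFs needs two MBT entries in mode a), + * but 2 * 16 = 32 entries in mode b).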
However, there's a + * few limitations: + * + * 1) For IODA2, mode b) has a minimum alignment requirement of 32MB. This makes + * it only usable for devices with very large per-VF BARs. Such devices are + * a bit like Bigfoot: they definitely exist, but I've never seen one. + * + * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only have + * 16 in total and some are needed for other uses. Most SR-IOV capable network cards can support + * more than 16 VFs on each port. + * + * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO + * window of the PHB. + * + * + * + * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It + * allowed the MBT to map 32bit MMIO space in addition to 64bit, which allows + * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since + * the Linux BAR allocation will place any BAR marked as non-prefetchable into + * the non-prefetchable bridge window, which is 32bit only. It also added two + * new modes: + * + * c) A segmented BAR similar to a), but each segment can be individually + * mapped to any PE. This matches how the 32bit MMIO window worked on + * IODA1&2. + * + * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a), + * but with fewer segments and configurable base PE. + * + * i.e. The n'th segment maps to the (n + base)'th PE. + * + * The base PE is also required to be a multiple of the window size. + * + * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us + * to exploit any of the IODA3 features. + */ + +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) +{ + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct resource *res; + int i; + resource_size_t vf_bar_sz; + struct pnv_iov_data *iov; + int mul; + + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) + goto disable_iov; + pdev->dev.archdata.iov_data = iov; + mul = phb->ioda.total_pe_num; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_m64_flags(res->flags)) { + dev_warn(&pdev->dev, "Don't support SR-IOV with non M64 VF BAR%d: %pR. \n", + i, res); + goto disable_iov; + } + + vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + + /* + * Generally, one segmented M64 BAR maps one IOV BAR. However, + * if a VF BAR is too large we end up wasting a lot of space. + * If each VF needs more than 1/4 of the default m64 segment + * then each VF BAR should be mapped in single-PE mode to reduce + * the amount of space required. This does however limit the + * number of VFs we can support. + * + * The 1/4 limit is arbitrary and can be tweaked. + */ + if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) { + /* + * On PHB3, the minimum size alignment of M64 BAR in + * single mode is 32MB. If this VF BAR is smaller than + * 32MB, but still too large for a segmented window + * then we can't map it and need to disable SR-IOV for + * this device. + */ + if (vf_bar_sz < SZ_32M) { + pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n", + i, res); + goto disable_iov; + } + + iov->m64_single_mode[i] = true; + continue; + } + + /* + * This BAR can be mapped with one segmented window, so adjust + * the resource size to accommodate.
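 + * + * For example (illustrative numbers): a 1MB per-VF BAR on a PHB + * with 256 PEs is grown to 1MB * 256 = 256MB so that segment i of + * the window lines up with the BAR of VF i.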
+ */ + pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res); + res->end = res->start + vf_bar_sz * mul - 1; + pci_dbg(pdev, " %pR\n", res); + + pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", + i, res, mul); + + iov->need_shift = true; + } + + return; + +disable_iov: + /* Save ourselves some MMIO space by disabling the unusable BARs */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + res->flags = 0; + res->end = res->start - 1; + } + + pdev->dev.archdata.iov_data = NULL; + kfree(iov); +} + +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) +{ + if (WARN_ON(pci_dev_is_added(pdev))) + return; + + if (pdev->is_virtfn) { + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); + + /* + * VF PEs are single-device PEs so their pdev pointer needs to + * be set. The pdev doesn't exist when the PE is allocated (in + * pcibios_sriov_enable()) so we fix it up here. + */ + pe->pdev = pdev; + WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); + } else if (pdev->is_physfn) { + /* + * For PFs adjust their allocated IOV resources to match what + * the PHB can support using its M64 BAR table. + */ + pnv_pci_ioda_fixup_iov_resources(pdev); + } +} + +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, + int resno) +{ + resource_size_t align = pci_iov_resource_size(pdev, resno); + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pnv_iov_data *iov = pnv_iov_get(pdev); + + /* + * iov can be null if we have an SR-IOV device with IOV BAR that can't + * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch). + * In that case we don't allow VFs to be enabled since one of their + * BARs would not be placed in the correct PE. + */ + if (!iov) + return align; + + /* + * If we're using single mode then we can just use the native VF BAR + * alignment. We validated that it's possible to use a single PE + * window above when we did the fixup. + */ + if (iov->m64_single_mode[resno - PCI_IOV_RESOURCES]) + return align; + + /* + * On the PowerNV platform an IOV BAR is mapped by an M64 BAR and, from + * the hardware's perspective, the range mapped by the M64 BAR must be + * size aligned. + * + * The single PE case returned the native VF BAR size above, so here we + * are in shared (segmented) mode and the alignment is the total IOV + * BAR size: one VF-BAR-sized segment per PE. + */ + return phb->ioda.total_pe_num * align; +} + +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int window_id; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + for_each_set_bit(window_id, iov->used_m64_bar_mask, MAX_M64_BARS) { + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + 0); + + clear_bit(window_id, &phb->ioda.m64_bar_alloc); + } + + return 0; +} + + +/* + * PHB3 and beyond support segmented windows. The window's address range + * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1 + * mapping between PEs and segments.
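 + * + * For example (illustrative numbers): a 256MB window on a 256-PE PHB is + * split into 256 x 1MB segments; an access at window_start + n * 1MB + * falls into segment n and is therefore attributed to PE#n.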
+ */ +static int64_t pnv_ioda_map_m64_segmented(struct pnv_phb *phb, + int window_id, + resource_size_t start, + resource_size_t size) +{ + int64_t rc; + + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + start, + 0, /* unused */ + size); + if (rc) + goto out; + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + OPAL_ENABLE_M64_SPLIT); +out: + if (rc) + pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc); + + return rc; +} + +static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb, + int pe_num, + int window_id, + resource_size_t start, + resource_size_t size) +{ + int64_t rc; + + /* + * The API for setting up m64 mmio windows seems to have been designed + * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed + * split of 8 equally sized segments each of which could be individually + * assigned to a PE. + * + * The problem with this is that the API doesn't have any way to + * communicate the number of segments we want on a BAR. This wasn't + * a problem for p7-ioc since you didn't have a choice, but the + * single PE windows added in PHB3 don't map cleanly to this API. + * + * As a result we've got this slightly awkward process where we + * call opal_pci_map_pe_mmio_window() to put the window into single + * PE mode, and set the PE for the window before setting the address + * bounds. We need to do it this way because single PE windows + * have different alignment requirements on PHB3. + */ + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe_num, + OPAL_M64_WINDOW_TYPE, + window_id, + 0); + if (rc) + goto out; + + /* + * NB: In single PE mode the window needs to be aligned to 32MB. + */ + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + start, + 0, /* ignored by FW, m64 is 1-1 */ + size); + if (rc) + goto out; + + /* + * Now actually enable it. We specified the BAR should be in "non-split" + * mode so FW will validate that the BAR is in single PE mode. + */ + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + OPAL_ENABLE_M64_NON_SPLIT); +out: + if (rc) + pr_err("Error mapping single PE BAR\n"); + + return rc; +} + +static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov) +{ + int win; + + do { + win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, + phb->ioda.m64_bar_idx + 1, 0); + + if (win >= phb->ioda.m64_bar_idx + 1) + return -1; + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + + set_bit(win, iov->used_m64_bar_mask); + + return win; +} + +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int win; + struct resource *res; + int i, j; + int64_t rc; + resource_size_t size, start; + int base_pe_num; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + /* don't need single mode? map everything in one go!
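 + * (one segmented window covers the whole IOV BAR, with one + * VF-sized segment per PE)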
*/ + if (!iov->m64_single_mode[i]) { + win = pnv_pci_alloc_m64_bar(phb, iov); + if (win < 0) + goto m64_failed; + + size = resource_size(res); + start = res->start; + + rc = pnv_ioda_map_m64_segmented(phb, win, start, size); + if (rc) + goto m64_failed; + + continue; + } + + /* otherwise map each VF with single PE BARs */ + size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i); + base_pe_num = iov->vf_pe_arr[0].pe_number; + + for (j = 0; j < num_vfs; j++) { + win = pnv_pci_alloc_m64_bar(phb, iov); + if (win < 0) + goto m64_failed; + + start = res->start + size * j; + rc = pnv_ioda_map_m64_single(phb, win, + base_pe_num + j, + start, + size); + if (rc) + goto m64_failed; + } + } + return 0; + +m64_failed: + pnv_pci_vf_release_m64(pdev, num_vfs); + return -EBUSY; +} + +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) +{ + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *pe_n; + + phb = pci_bus_to_pnvhb(pdev->bus); + + if (!pdev->is_physfn) + return; + + /* FIXME: Use pnv_ioda_release_pe()? */ + list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { + if (pe->parent_dev != pdev) + continue; + + pnv_pci_ioda2_release_pe_dma(pe); + + /* Remove from list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_ioda_deconfigure_pe(phb, pe); + + pnv_ioda_free_pe(pe); + } +} + +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) +{ + struct resource *res, res2; + struct pnv_iov_data *iov; + resource_size_t size; + u16 num_vfs; + int i; + + if (!dev->is_physfn) + return -EINVAL; + iov = pnv_iov_get(dev); + + /* + * "offset" is in VFs. The M64 windows are sized so that when they + * are segmented, each segment is the same size as the IOV BAR. + * Each segment is in a separate PE, and the high order bits of the + * address are the PE number. Therefore, each VF's BAR is in a + * separate PE, and changing the IOV BAR start address changes the + * range of PEs the VFs are in. + */ + num_vfs = iov->num_vfs; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + if (iov->m64_single_mode[i]) + continue; + + /* + * The actual IOV BAR range is determined by the start address + * and the size needed for num_vfs VF BARs. This check is to + * make sure that after shifting, the range will not overlap + * with another device. + */ + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2.flags = res->flags; + res2.start = res->start + (size * offset); + res2.end = res2.start + (size * num_vfs) - 1; + + if (res2.end > res->end) { + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + return -EBUSY; + } + } + + /* + * Since the M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of the PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in /proc/iomem which could be used for + * allocating other resources, so we reserve this area below and + * release it when IOV is disabled.
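 + * + * For example (illustrative numbers): with a 1MB per-VF BAR and PE#80 + * as the first allocated PE, the PF's IOV BAR start is shifted up by + * 80 * 1MB so that VF0 lands in segment 80, and the 80MB below it is + * reserved as a hole.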
+ */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + if (iov->m64_single_mode[i]) + continue; + + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2 = *res; + res->start += size * offset; + + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", + i, &res2, res, (offset > 0) ? "En" : "Dis", + num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &iov->holes[i]); + memset(&iov->holes[i], 0, sizeof(iov->holes[i])); + } + + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + iov->holes[i].start = res2.start; + iov->holes[i].end = res2.start + size * offset - 1; + iov->holes[i].flags = IORESOURCE_BUS; + iov->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &iov->holes[i]); + } + } + return 0; +} + +static void pnv_pci_sriov_disable(struct pci_dev *pdev) +{ + u16 num_vfs, base_pe; + struct pnv_iov_data *iov; + + iov = pnv_iov_get(pdev); + if (WARN_ON(!iov)) + return; + + num_vfs = iov->num_vfs; + base_pe = iov->vf_pe_arr[0].pe_number; + + /* Release VF PEs */ + pnv_ioda_release_vf_PE(pdev); + + /* Un-shift the IOV BARs if we need to */ + if (iov->need_shift) + pnv_pci_vf_resource_shift(pdev, -base_pe); + + /* Release M64 windows */ + pnv_pci_vf_release_m64(pdev, num_vfs); +} + +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + int pe_num; + u16 vf_index; + struct pnv_iov_data *iov; + struct pci_dn *pdn; + + if (!pdev->is_physfn) + return; + + phb = pci_bus_to_pnvhb(pdev->bus); + pdn = pci_get_pdn(pdev); + iov = pnv_iov_get(pdev); + + /* Reserve PE for each VF */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); + int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); + struct pci_dn *vf_pdn; + + pe = &iov->vf_pe_arr[vf_index]; + pe->phb = phb; + pe->flags = PNV_IODA_PE_VF; + pe->pbus = NULL; + pe->parent_dev = pdev; + pe->mve_number = -1; + pe->rid = (vf_bus << 8) | vf_devfn; + + pe_num = pe->pe_number; + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", + pci_domain_nr(pdev->bus), pdev->bus->number, + PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + pnv_ioda_free_pe(pe); + pe->pdev = NULL; + continue; + } + + /* Put the PE on the list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_add_tail(&pe->list, &phb->ioda.pe_list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + /* associate this pe with its pdn */ + list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { + if (vf_pdn->busno == vf_bus && + vf_pdn->devfn == vf_devfn) { + vf_pdn->pe_number = pe_num; + break; + } + } + + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } +} + +static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_ioda_pe *base_pe; + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int ret; + u16 i; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + /* + * There are calls to the IODA2 PE setup code littered throughout. We could + * probably fix that, but we'd still have problems due to the + * restrictions inherent in IODA1 PHBs. + * + * NB: We class IODA3 as IODA2 since they're very similar.
+ */ + if (phb->type != PNV_PHB_IODA2) { + pci_err(pdev, "SR-IOV is not supported on this PHB\n"); + return -ENXIO; + } + + if (!iov) { + dev_info(&pdev->dev, "don't support this SRIOV device with non 64bit-prefetchable IOV BAR\n"); + return -ENOSPC; + } + + /* allocate a contiguous block of PEs for our VFs */ + base_pe = pnv_ioda_alloc_pe(phb, num_vfs); + if (!base_pe) { + pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs); + return -EBUSY; + } + + iov->vf_pe_arr = base_pe; + iov->num_vfs = num_vfs; + + /* Assign M64 window accordingly */ + ret = pnv_pci_vf_assign_m64(pdev, num_vfs); + if (ret) { + dev_info(&pdev->dev, "Not enough M64 window resources\n"); + goto m64_failed; + } + + /* + * When using one M64 BAR to map one IOV BAR, we need to shift + * the IOV BAR according to the PE# allocated to the VFs. + * Otherwise, the PE# for the VF will conflict with others. + */ + if (iov->need_shift) { + ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number); + if (ret) + goto shift_failed; + } + + /* Setup VF PEs */ + pnv_ioda_setup_vf_PE(pdev, num_vfs); + + return 0; + +shift_failed: + pnv_pci_vf_release_m64(pdev, num_vfs); + +m64_failed: + for (i = 0; i < num_vfs; i++) + pnv_ioda_free_pe(&iov->vf_pe_arr[i]); + + return ret; +} + +int pnv_pcibios_sriov_disable(struct pci_dev *pdev) +{ + pnv_pci_sriov_disable(pdev); + + /* Release PCI data */ + remove_sriov_vf_pdns(pdev); + return 0; +} + +int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + /* Allocate PCI data */ + add_sriov_vf_pdns(pdev); + + return pnv_pci_sriov_enable(pdev, num_vfs); +} diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 091fe1cf386b..9b9bca169275 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -162,8 +162,7 @@ EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct msi_desc *entry; struct msi_msg msg; int hwirq; @@ -211,8 +210,7 @@ int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) void pnv_teardown_msi_irqs(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct msi_desc *entry; irq_hw_number_t hwirq; @@ -824,10 +822,9 @@ EXPORT_SYMBOL(pnv_pci_get_phb_node); int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable) { - __be64 val; - struct pci_controller *hose; - struct pnv_phb *phb; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); u64 tunnel_bar; + __be64 val; int rc; if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR)) @@ -835,9 +832,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable) if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR)) return -ENXIO; - hose = pci_bus_to_host(dev->bus); - phb = hose->private_data; - mutex_lock(&tunnel_mutex); rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val); if (rc != OPAL_SUCCESS) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 51c254f2f3cb..739a0b3b72e1 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -87,7 +87,14 @@ struct pnv_ioda_pe { bool tce_bypass_enabled; uint64_t tce_bypass_base; - /* MSIs.
MVE index is identical for for 32 and 64 bit MSI + /* + * Used to track whether we've done DMA setup for this PE or not. We + * want to defer allocating TCE tables, etc until we've added a + * non-bridge device to the PE. + */ + bool dma_setup_done; + + /* MSIs. MVE index is identical for 32 and 64 bit MSI * and -1 if not supported. (It's actually identical to the * PE number) */ @@ -147,6 +154,7 @@ struct pnv_phb { unsigned long m64_size; unsigned long m64_segsize; unsigned long m64_base; +#define MAX_M64_BARS 64 unsigned long m64_bar_alloc; /* IO ports */ @@ -187,6 +195,89 @@ struct pnv_phb { u8 *diag_data; }; + +/* IODA PE management */ + +static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) +{ + /* + * WARNING: We cannot rely on the resource flags. The Linux PCI + * allocation code sometimes decides to put a 64-bit prefetchable + * BAR in the 32-bit window, so we have to compare the addresses. + * + * For simplicity we only test resource start. + */ + return (r->start >= phb->ioda.m64_base && + r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); +} + +static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) +{ + unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); + + return (resource_flags & flags) == flags; +} + +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); + +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe); + +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count); +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe); + +#ifdef CONFIG_PCI_IOV +/* + * For SR-IOV we want to put each VF's MMIO resource into a separate PE. + * This requires a bit of acrobatics with the MMIO -> PE configuration + * and this structure is used to keep track of it all. + */ +struct pnv_iov_data { + /* number of VFs enabled */ + u16 num_vfs; + + /* pointer to the array of VF PEs. num_vfs long */ + struct pnv_ioda_pe *vf_pe_arr; + + /* Did we map the VF BAR with single-PE IODA BARs? */ + bool m64_single_mode[PCI_SRIOV_NUM_BARS]; + + /* + * True if we're using any segmented windows. In that case we need to + * shift the start of the IOV resource to the segment corresponding to + * the first allocated PE. + */ + bool need_shift; + + /* + * Bit mask used to track which m64 windows are used to map the + * SR-IOV BARs for this device. + */ + DECLARE_BITMAP(used_m64_bar_mask, MAX_M64_BARS); + + /* + * If we map the SR-IOV BARs with a segmented window then + * parts of that window will be "claimed" by other PEs. + * + * "holes" here is used to reserve the leading portion + * of the window that is used by other (non VF) PEs.
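 + * + * (Illustrative example: if the first VF PE is PE#80, segments 0-79 + * of the window belong to other PEs, so holes[] claims that leading + * range to keep the allocator from handing it out.)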
+ */ + struct resource holes[PCI_SRIOV_NUM_BARS]; +}; + +static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev) +{ + return pdev->dev.archdata.iov_data; +} + +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev); +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno); + +int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs); +int pnv_pcibios_sriov_disable(struct pci_dev *pdev); +#endif /* CONFIG_PCI_IOV */ + extern struct pci_ops pnv_pci_ops; void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, @@ -260,4 +351,14 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); +static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + + if (hose) + return hose->private_data; + + return NULL; +} + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 3bc188da82ba..7fcb88623081 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -399,7 +399,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + /* + * We map the kernel linear region with 1GB large pages on radix. For + * memory hot unplug to work our memory block size must be at least + * this size. + */ + if (radix_enabled()) + return radix_mem_block_size; + else + return 256UL * 1024 * 1024; } #endif diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 24c18362e5ea..5e037df2a3a1 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -25,15 +25,22 @@ config PPC_PSERIES select SWIOTLB default y +config PARAVIRT_SPINLOCKS + bool + config PPC_SPLPAR - depends on PPC_PSERIES bool "Support for shared-processor logical partitions" + depends on PPC_PSERIES + select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS + default y help Enabling this option will make the kernel run more efficiently on logically-partitioned pSeries systems which use shared processors, that is, which share physical processors between two or more partitions. + Say Y if you are unsure. + config DTL bool "Dispatch Trace Log" depends on PPC_SPLPAR && DEBUG_FS diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index ace117f99d94..cb2d9a970b7b 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -24,6 +24,7 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/spinlock.h> +#include <linux/crash_dump.h> #include <asm/eeh.h> #include <asm/eeh_event.h> @@ -32,6 +33,8 @@ #include <asm/ppc-pci.h> #include <asm/rtas.h> +static int pseries_eeh_get_pe_addr(struct pci_dn *pdn); + /* RTAS tokens */ static int ibm_set_eeh_option; static int ibm_set_slot_reset; @@ -52,8 +55,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) dev_dbg(&pdev->dev, "EEH: Setting up device\n"); #ifdef CONFIG_PCI_IOV if (pdev->is_virtfn) { - struct pci_dn *physfn_pdn; - pdn->device_id = pdev->device; pdn->vendor_id = pdev->vendor; pdn->class_code = pdev->class; @@ -63,23 +64,172 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) * completion from platform. 
*/ pdn->last_allow_rc = 0; - physfn_pdn = pci_get_pdn(pdev->physfn); - pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; } #endif pseries_eeh_init_edev(pdn); #ifdef CONFIG_PCI_IOV if (pdev->is_virtfn) { + /* + * FIXME: This really should be handled by choosing the right + * parent PE in pseries_eeh_init_edev(). + */ + struct eeh_pe *physfn_pe = pci_dev_to_eeh_dev(pdev->physfn)->pe; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); - eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ - eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + eeh_pe_tree_remove(edev); /* Remove as it is adding to bus pe */ + eeh_pe_tree_insert(edev, physfn_pe); /* Add as VF PE type */ } #endif eeh_probe_device(pdev); } + +/** + * pseries_eeh_get_config_addr - Retrieve config address + * + * Retrieve the associated config address. There are two RTAS + * function calls dedicated to this purpose; we try the newer one + * first and then fall back to the old one. The caller must make sure + * the config address has been determined from the FDT node before + * calling this function. + * + * Note that a zeroed return value means the PE config + * address is invalid. + */ +static int pseries_eeh_get_config_addr(struct pci_controller *phb, int config_addr) +{ + int ret = 0; + int rets[3]; + + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { + /* + * First of all, we need to make sure there is one PE + * associated with the device; otherwise the PE address is + * meaningless. + */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), 1); + if (ret || (rets[0] == 0)) + return 0; + + /* Retrieve the associated PE config address */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), 0); + if (ret) { + pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", + __func__, phb->global_number, config_addr); + return 0; + } + + return rets[0]; + } + + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), 0); + if (ret) { + pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", + __func__, phb->global_number, config_addr); + return 0; + } + + return rets[0]; + } + + return ret; +} + +/** + * pseries_eeh_phb_reset - Reset the specified PHB + * @phb: PCI controller + * @config_addr: the associated config address + * @option: reset option + * + * Reset the specified PHB/PE + */ +static int pseries_eeh_phb_reset(struct pci_controller *phb, int config_addr, int option) +{ + int ret; + + /* Reset PE through RTAS call */ + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), option); + + /* If fundamental-reset not supported, try hot-reset */ + if (option == EEH_RESET_FUNDAMENTAL && + ret == -8) { + option = EEH_RESET_HOT; + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid), option); + } + + /* We need reset hold or settlement delay */ + if (option == EEH_RESET_FUNDAMENTAL || + option == EEH_RESET_HOT) + msleep(EEH_PE_RST_HOLD_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); + + return ret; +} + +/** + * pseries_eeh_phb_configure_bridge - Configure PCI bridges in the indicated PE + * @phb: PCI controller + * @config_addr: the associated config address + * + * The function will be called to reconfigure the
bridges included + * in the specified PE so that the malfunctioning PE can be recovered + * again. + */ +static int pseries_eeh_phb_configure_bridge(struct pci_controller *phb, int config_addr) +{ + int ret; + /* Waiting 0.2s maximum before skipping configuration */ + int max_wait = 200; + + while (max_wait > 0) { + ret = rtas_call(ibm_configure_pe, 3, 1, NULL, + config_addr, BUID_HI(phb->buid), + BUID_LO(phb->buid)); + + if (!ret) + return ret; + if (ret < 0) + break; + + /* + * If RTAS returns a delay value that's above 100ms, cut it + * down to 100ms in case firmware made a mistake. For more + * on how these delay values work see rtas_busy_delay_time(). + */ + if (ret > RTAS_EXTENDED_DELAY_MIN+2 && + ret <= RTAS_EXTENDED_DELAY_MAX) + ret = RTAS_EXTENDED_DELAY_MIN+2; + + max_wait -= rtas_busy_delay_time(ret); + + if (max_wait < 0) + break; + + rtas_busy_delay(ret); + } + + pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n", + __func__, phb->global_number, config_addr, ret); + /* PAPR defines -3 as "Parameter Error" for this function: */ + if (ret == -3) + return -EINVAL; + else + return -EIO; +} + /* * Buffer for reporting slot-error-detail rtas calls. Its here * in BSS, and not dynamically alloced, so that it ends up in @@ -96,6 +246,10 @@ static int eeh_error_buf_size; */ static int pseries_eeh_init(void) { + struct pci_controller *phb; + struct pci_dn *pdn; + int addr, config_addr; + /* figure out EEH RTAS function call tokens */ ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); @@ -148,6 +302,22 @@ static int pseries_eeh_init(void) /* Set EEH machine dependent code */ ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; + if (is_kdump_kernel() || reset_devices) { + pr_info("Issue PHB reset ...\n"); + list_for_each_entry(phb, &hose_list, list_node) { + pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list); + addr = (pdn->busno << 16) | (pdn->devfn << 8); + config_addr = pseries_eeh_get_config_addr(phb, addr); + /* invalid PE config addr */ + if (config_addr == 0) + continue; + + pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_FUNDAMENTAL); + pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_DEACTIVATE); + pseries_eeh_phb_configure_bridge(phb, config_addr); + } + } + return 0; } @@ -221,6 +391,43 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) } /** + * pseries_eeh_pe_get_parent - Retrieve the parent PE + * @edev: EEH device + * + * The PEs in the system are organized as a hierarchy tree. This + * function retrieves the parent PE by searching up the chain of + * parent EEH devices. + */ +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev) +{ + struct eeh_dev *parent; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + + /* + * It's possible that an indirect parent EEH device already + * has an associated PE while the direct parent EEH device + * doesn't have one yet. + */ + if (edev->physfn) + pdn = pci_get_pdn(edev->physfn); + else + pdn = pdn ? pdn->parent : NULL; + while (pdn) { + /* We're poking out of PCI territory */ + parent = pdn_to_eeh_dev(pdn); + if (!parent) + return NULL; + + if (parent->pe) + return parent->pe; + + pdn = pdn->parent; + } + + return NULL; +} + +/** * pseries_eeh_init_edev - initialise the eeh_dev and eeh_pe for a pci_dn * * @pdn: PCI device node @@ -275,12 +482,11 @@ void pseries_eeh_init_edev(struct pci_dn *pdn) * correctly reflects that current device is root port * or PCIe switch downstream port.
*/ - edev->class_code = pdn->class_code; edev->pcix_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); edev->pcie_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_EXP); edev->aer_cap = pseries_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); edev->mode &= 0xFFFFFF00; - if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, @@ -304,8 +510,10 @@ void pseries_eeh_init_edev(struct pci_dn *pdn) if (ret) { eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); } else { + struct eeh_pe *parent; + /* Retrieve PE address */ - edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); + edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn); pe.addr = edev->pe_config_addr; /* Some older systems (Power4) allow the ibm,set-eeh-option @@ -316,16 +524,19 @@ void pseries_eeh_init_edev(struct pci_dn *pdn) if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) enable = 1; - if (enable) { + /* + * This device doesn't support EEH, but it may have an + * EEH parent. In this case any error on the device will + * freeze the PE of its upstream bridge, so add it to + * the upstream PE. + */ + parent = pseries_eeh_pe_get_parent(edev); + if (parent && !enable) + edev->pe_config_addr = parent->addr; + + if (enable || parent) { eeh_add_flag(EEH_ENABLED); - eeh_add_to_parent_pe(edev); - } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && - (pdn_to_eeh_dev(pdn->parent))->pe) { - /* This device doesn't support EEH, but it may have an - * EEH parent, in which case we mark it as supported. - */ - edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; - eeh_add_to_parent_pe(edev); + eeh_pe_tree_insert(edev, parent); } eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", (enable ? "enabled" : "unsupported"), ret); @@ -435,8 +646,10 @@ static int pseries_eeh_set_option(struct eeh_pe *pe, int option) * It's notable that zero'ed return value means invalid PE config * address. */ -static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) +static int pseries_eeh_get_pe_addr(struct pci_dn *pdn) { + int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + unsigned long buid = pdn->phb->buid; int ret = 0; int rets[3]; @@ -447,18 +660,16 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) * meaningless.
*/ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - pe->config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), 1); + config_addr, BUID_HI(buid), BUID_LO(buid), 1); if (ret || (rets[0] == 0)) return 0; /* Retrieve the associated PE config address */ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - pe->config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), 0); + config_addr, BUID_HI(buid), BUID_LO(buid), 0); if (ret) { pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", - __func__, pe->phb->global_number, pe->config_addr); + __func__, pdn->phb->global_number, config_addr); return 0; } @@ -467,11 +678,10 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, - pe->config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), 0); + config_addr, BUID_HI(buid), BUID_LO(buid), 0); if (ret) { pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n", - __func__, pe->phb->global_number, pe->config_addr); + __func__, pdn->phb->global_number, config_addr); return 0; } @@ -569,35 +779,13 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) static int pseries_eeh_reset(struct eeh_pe *pe, int option) { int config_addr; - int ret; /* Figure out PE address */ config_addr = pe->config_addr; if (pe->addr) config_addr = pe->addr; - /* Reset PE through RTAS call */ - ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), option); - - /* If fundamental-reset not supported, try hot-reset */ - if (option == EEH_RESET_FUNDAMENTAL && - ret == -8) { - option = EEH_RESET_HOT; - ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid), option); - } - - /* We need reset hold or settlement delay */ - if (option == EEH_RESET_FUNDAMENTAL || - option == EEH_RESET_HOT) - msleep(EEH_PE_RST_HOLD_TIME); - else - msleep(EEH_PE_RST_SETTLE_TIME); - - return ret; + return pseries_eeh_phb_reset(pe->phb, config_addr, option); } /** @@ -641,108 +829,49 @@ static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, u * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE * @pe: EEH PE * - * The function will be called to reconfigure the bridges included - * in the specified PE so that the mulfunctional PE would be recovered - * again. */ static int pseries_eeh_configure_bridge(struct eeh_pe *pe) { int config_addr; - int ret; - /* Waiting 0.2s maximum before skipping configuration */ - int max_wait = 200; /* Figure out the PE address */ config_addr = pe->config_addr; if (pe->addr) config_addr = pe->addr; - while (max_wait > 0) { - ret = rtas_call(ibm_configure_pe, 3, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid)); - - if (!ret) - return ret; - if (ret < 0) - break; - - /* - * If RTAS returns a delay value that's above 100ms, cut it - * down to 100ms in case firmware made a mistake. 
For more - * on how these delay values work see rtas_busy_delay_time - */ - if (ret > RTAS_EXTENDED_DELAY_MIN+2 && - ret <= RTAS_EXTENDED_DELAY_MAX) - ret = RTAS_EXTENDED_DELAY_MIN+2; - - max_wait -= rtas_busy_delay_time(ret); - - if (max_wait < 0) - break; - - rtas_busy_delay(ret); - } - - pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n", - __func__, pe->phb->global_number, pe->addr, ret); - /* PAPR defines -3 as "Parameter Error" for this function: */ - if (ret == -3) - return -EINVAL; - else - return -EIO; + return pseries_eeh_phb_configure_bridge(pe->phb, config_addr); } /** * pseries_eeh_read_config - Read PCI config space - * @pdn: PCI device node - * @where: PCI address + * @edev: EEH device handle + * @where: PCI config space offset * @size: size to read * @val: return value * * Read config space from the speicifed device */ -static int pseries_eeh_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +static int pseries_eeh_read_config(struct eeh_dev *edev, int where, int size, u32 *val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + return rtas_read_config(pdn, where, size, val); } /** * pseries_eeh_write_config - Write PCI config space - * @pdn: PCI device node - * @where: PCI address + * @edev: EEH device handle + * @where: PCI config space offset * @size: size to write * @val: value to be written * * Write config space to the specified device */ -static int pseries_eeh_write_config(struct pci_dn *pdn, int where, int size, u32 val) +static int pseries_eeh_write_config(struct eeh_dev *edev, int where, int size, u32 val) { - return rtas_write_config(pdn, where, size, val); -} + struct pci_dn *pdn = eeh_dev_to_pdn(edev); -static int pseries_eeh_restore_config(struct pci_dn *pdn) -{ - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - s64 ret = 0; - - if (!edev) - return -EEXIST; - - /* - * FIXME: The MPS, error routing rules, timeout setting are worthy - * to be exported by firmware in extendible way. 
- */ - if (edev->physfn) - ret = eeh_restore_vf_config(pdn); - - if (ret) { - pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, edev->pe_config_addr, ret); - return -EIO; - } - - return ret; + return rtas_write_config(pdn, where, size, val); } #ifdef CONFIG_PCI_IOV @@ -772,8 +901,8 @@ int pseries_send_allow_unfreeze(struct pci_dn *pdn, static int pseries_call_allow_unfreeze(struct eeh_dev *edev) { + int cur_vfs = 0, rc = 0, vf_index, bus, devfn, vf_pe_num; struct pci_dn *pdn, *tmp, *parent, *physfn_pdn; - int cur_vfs = 0, rc = 0, vf_index, bus, devfn; u16 *vf_pe_array; vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); @@ -806,8 +935,10 @@ static int pseries_call_allow_unfreeze(struct eeh_dev *edev) } } else { pdn = pci_get_pdn(edev->pdev); - vf_pe_array[0] = cpu_to_be16(pdn->pe_number); physfn_pdn = pci_get_pdn(edev->physfn); + + vf_pe_num = physfn_pdn->pe_num_map[edev->vf_index]; + vf_pe_array[0] = cpu_to_be16(vf_pe_num); rc = pseries_send_allow_unfreeze(physfn_pdn, vf_pe_array, 1); pdn->last_allow_rc = rc; @@ -818,10 +949,8 @@ static int pseries_call_allow_unfreeze(struct eeh_dev *edev) return rc; } -static int pseries_notify_resume(struct pci_dn *pdn) +static int pseries_notify_resume(struct eeh_dev *edev) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - if (!edev) return -EEXIST; @@ -841,7 +970,6 @@ static struct eeh_ops pseries_eeh_ops = { .init = pseries_eeh_init, .probe = pseries_eeh_probe, .set_option = pseries_eeh_set_option, - .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, .reset = pseries_eeh_reset, .get_log = pseries_eeh_get_log, @@ -850,7 +978,7 @@ static struct eeh_ops pseries_eeh_ops = { .read_config = pseries_eeh_read_config, .write_config = pseries_eeh_write_config, .next_error = NULL, - .restore_config = pseries_eeh_restore_config, + .restore_config = NULL, /* NB: configure_bridge() does this */ #ifdef CONFIG_PCI_IOV .notify_resume = pseries_notify_resume #endif diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 3e49cc23a97a..4c7b7f5a2ebc 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -65,6 +65,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"}, {FW_FEATURE_PAPR_SCM, "hcall-scm"}, + {FW_FEATURE_RPT_INVALIDATE, "hcall-rpt-invalidate"}, }; /* Build up the firmware features bitmask using the contents of diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 3e8cbfe7a80f..c6e0d8abf75e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -35,54 +35,10 @@ #include <asm/topology.h> #include "pseries.h" -#include "offline_states.h" /* This version can't take the spinlock, because it never returns */ static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; -static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = - CPU_STATE_OFFLINE; -static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; - -static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; - -static bool cede_offline_enabled __read_mostly = true; - -/* - * Enable/disable cede_offline when available. 
- */ -static int __init setup_cede_offline(char *str) -{ - return (kstrtobool(str, &cede_offline_enabled) == 0); -} - -__setup("cede_offline=", setup_cede_offline); - -enum cpu_state_vals get_cpu_current_state(int cpu) -{ - return per_cpu(current_state, cpu); -} - -void set_cpu_current_state(int cpu, enum cpu_state_vals state) -{ - per_cpu(current_state, cpu) = state; -} - -enum cpu_state_vals get_preferred_offline_state(int cpu) -{ - return per_cpu(preferred_offline_state, cpu); -} - -void set_preferred_offline_state(int cpu, enum cpu_state_vals state) -{ - per_cpu(preferred_offline_state, cpu) = state; -} - -void set_default_offline_state(int cpu) -{ - per_cpu(preferred_offline_state, cpu) = default_offline_state; -} - static void rtas_stop_self(void) { static struct rtas_args args; @@ -101,9 +57,7 @@ static void rtas_stop_self(void) static void pseries_mach_cpu_die(void) { - unsigned int cpu = smp_processor_id(); unsigned int hwcpu = hard_smp_processor_id(); - u8 cede_latency_hint = 0; local_irq_disable(); idle_task_exit(); @@ -112,49 +66,6 @@ static void pseries_mach_cpu_die(void) else xics_teardown_cpu(); - if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - set_cpu_current_state(cpu, CPU_STATE_INACTIVE); - if (ppc_md.suspend_disable_cpu) - ppc_md.suspend_disable_cpu(); - - cede_latency_hint = 2; - - get_lppaca()->idle = 1; - if (!lppaca_shared_proc(get_lppaca())) - get_lppaca()->donate_dedicated_cpu = 1; - - while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - while (!prep_irq_for_idle()) { - local_irq_enable(); - local_irq_disable(); - } - - extended_cede_processor(cede_latency_hint); - } - - local_irq_disable(); - - if (!lppaca_shared_proc(get_lppaca())) - get_lppaca()->donate_dedicated_cpu = 0; - get_lppaca()->idle = 0; - - if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { - unregister_slb_shadow(hwcpu); - - hard_irq_disable(); - /* - * Call to start_secondary_resume() will not return. - * Kernel stack will be reset and start_secondary() - * will be called to continue the online operation. 
- */ - start_secondary_resume(); - } - } - - /* Requested state is CPU_STATE_OFFLINE at this point */ - WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); - - set_cpu_current_state(cpu, CPU_STATE_OFFLINE); unregister_slb_shadow(hwcpu); rtas_stop_self(); @@ -200,24 +111,13 @@ static void pseries_cpu_die(unsigned int cpu) int cpu_status = 1; unsigned int pcpu = get_hard_smp_processor_id(cpu); - if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - cpu_status = 1; - for (tries = 0; tries < 5000; tries++) { - if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { - cpu_status = 0; - break; - } - msleep(1); - } - } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + for (tries = 0; tries < 25; tries++) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; + cpu_relax(); - for (tries = 0; tries < 25; tries++) { - cpu_status = smp_query_cpu_stopped(pcpu); - if (cpu_status == QCSS_STOPPED || - cpu_status == QCSS_HARDWARE_ERROR) - break; - cpu_relax(); - } } if (cpu_status != 0) { @@ -359,28 +259,14 @@ static int dlpar_offline_cpu(struct device_node *dn) if (get_hard_smp_processor_id(cpu) != thread) continue; - if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + if (!cpu_online(cpu)) break; - if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { - set_preferred_offline_state(cpu, - CPU_STATE_OFFLINE); - cpu_maps_update_done(); - timed_topology_update(1); - rc = device_offline(get_cpu_device(cpu)); - if (rc) - goto out; - cpu_maps_update_begin(); - break; - } - - /* - * The cpu is in CPU_STATE_INACTIVE. - * Upgrade it's state to CPU_STATE_OFFLINE. - */ - set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); - WARN_ON(plpar_hcall_norets(H_PROD, thread) != H_SUCCESS); - __cpu_die(cpu); + cpu_maps_update_done(); + rc = device_offline(get_cpu_device(cpu)); + if (rc) + goto out; + cpu_maps_update_begin(); break; } if (cpu == num_possible_cpus()) { @@ -414,10 +300,7 @@ static int dlpar_online_cpu(struct device_node *dn) for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != thread) continue; - BUG_ON(get_cpu_current_state(cpu) - != CPU_STATE_OFFLINE); cpu_maps_update_done(); - timed_topology_update(1); find_and_online_cpu_nid(cpu); rc = device_online(get_cpu_device(cpu)); if (rc) { @@ -854,7 +737,6 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add) parent = of_find_node_by_path("/cpus"); if (!parent) { pr_warn("Could not find CPU root node in device tree\n"); - kfree(cpu_drcs); return -1; } @@ -896,25 +778,6 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add) return rc; } -int dlpar_cpu_readd(int cpu) -{ - struct device_node *dn; - struct device *dev; - u32 drc_index; - int rc; - - dev = get_cpu_device(cpu); - dn = dev->of_node; - - rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); - - rc = dlpar_cpu_remove_by_index(drc_index); - if (!rc) - rc = dlpar_cpu_add(drc_index); - - return rc; -} - int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { u32 count, drc_index; @@ -1013,27 +876,8 @@ static struct notifier_block pseries_smp_nb = { .notifier_call = pseries_smp_notifier, }; -#define MAX_CEDE_LATENCY_LEVELS 4 -#define CEDE_LATENCY_PARAM_LENGTH 10 -#define CEDE_LATENCY_PARAM_MAX_LENGTH \ - (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) -#define CEDE_LATENCY_TOKEN 45 - -static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; - -static int parse_cede_parameters(void) -{ - memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); - return 
rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - CEDE_LATENCY_TOKEN, - __pa(cede_parameters), - CEDE_LATENCY_PARAM_MAX_LENGTH); -} - static int __init pseries_cpu_hotplug_init(void) { - int cpu; int qcss_tok; #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE @@ -1056,16 +900,8 @@ static int __init pseries_cpu_hotplug_init(void) smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR)) of_reconfig_notifier_register(&pseries_smp_nb); - cpu_maps_update_begin(); - if (cede_offline_enabled && parse_cede_parameters() == 0) { - default_offline_state = CPU_STATE_INACTIVE; - for_each_online_cpu(cpu) - set_default_offline_state(cpu); - } - cpu_maps_update_done(); - } return 0; } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 5ace2f9a277e..5d545b78111f 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -22,12 +22,10 @@ #include <asm/drmem.h> #include "pseries.h" -static bool rtas_hp_event; - unsigned long pseries_memory_block_size(void) { struct device_node *np; - unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE; + u64 memblock_size = MIN_MEMORY_BLOCK_SIZE; struct resource r; np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); @@ -487,40 +485,6 @@ static int dlpar_memory_remove_by_index(u32 drc_index) return rc; } -static int dlpar_memory_readd_by_index(u32 drc_index) -{ - struct drmem_lmb *lmb; - int lmb_found; - int rc; - - pr_info("Attempting to update LMB, drc index %x\n", drc_index); - - lmb_found = 0; - for_each_drmem_lmb(lmb) { - if (lmb->drc_index == drc_index) { - lmb_found = 1; - rc = dlpar_remove_lmb(lmb); - if (!rc) { - rc = dlpar_add_lmb(lmb); - if (rc) - dlpar_release_drc(lmb->drc_index); - } - break; - } - } - - if (!lmb_found) - rc = -EINVAL; - - if (rc) - pr_info("Failed to update memory at %llx\n", - lmb->base_addr); - else - pr_info("Memory at %llx was updated\n", lmb->base_addr); - - return rc; -} - static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) { struct drmem_lmb *lmb, *start_lmb, *end_lmb; @@ -617,10 +581,6 @@ static int dlpar_memory_remove_by_index(u32 drc_index) { return -EOPNOTSUPP; } -static int dlpar_memory_readd_by_index(u32 drc_index) -{ - return -EOPNOTSUPP; -} static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) { @@ -903,21 +863,14 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) } break; - case PSERIES_HP_ELOG_ACTION_READD: - drc_index = hp_elog->_drc_u.drc_index; - rc = dlpar_memory_readd_by_index(drc_index); - break; default: pr_err("Invalid action (%d) specified\n", hp_elog->action); rc = -EINVAL; break; } - if (!rc) { - rtas_hp_event = true; + if (!rc) rc = drmem_update_dt(); - rtas_hp_event = false; - } unlock_device_hotplug(); return rc; @@ -953,60 +906,6 @@ static int pseries_add_mem_node(struct device_node *np) return (ret < 0) ? 
-EINVAL : 0; } -static int pseries_update_drconf_memory(struct of_reconfig_data *pr) -{ - struct of_drconf_cell_v1 *new_drmem, *old_drmem; - unsigned long memblock_size; - u32 entries; - __be32 *p; - int i, rc = -EINVAL; - - if (rtas_hp_event) - return 0; - - memblock_size = pseries_memory_block_size(); - if (!memblock_size) - return -EINVAL; - - if (!pr->old_prop) - return 0; - - p = (__be32 *) pr->old_prop->value; - if (!p) - return -EINVAL; - - /* The first int of the property is the number of lmb's described - * by the property. This is followed by an array of of_drconf_cell - * entries. Get the number of entries and skip to the array of - * of_drconf_cell's. - */ - entries = be32_to_cpu(*p++); - old_drmem = (struct of_drconf_cell_v1 *)p; - - p = (__be32 *)pr->prop->value; - p++; - new_drmem = (struct of_drconf_cell_v1 *)p; - - for (i = 0; i < entries; i++) { - if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) && - (!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) { - rc = pseries_remove_memblock( - be64_to_cpu(old_drmem[i].base_addr), - memblock_size); - break; - } else if ((!(be32_to_cpu(old_drmem[i].flags) & - DRCONF_MEM_ASSIGNED)) && - (be32_to_cpu(new_drmem[i].flags) & - DRCONF_MEM_ASSIGNED)) { - rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr), - memblock_size); - rc = (rc < 0) ? -EINVAL : 0; - break; - } - } - return rc; -} - static int pseries_memory_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1020,10 +919,6 @@ static int pseries_memory_notifier(struct notifier_block *nb, case OF_RECONFIG_DETACH_NODE: err = pseries_remove_mem_node(rd->dn); break; - case OF_RECONFIG_UPDATE_PROPERTY: - if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) - err = pseries_update_drconf_memory(rd); - break; } return notifier_from_errno(err); } diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c index 267139b13530..96e18d3b2fcf 100644 --- a/arch/powerpc/platforms/pseries/hvcserver.c +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -45,7 +45,7 @@ static int hvcs_convert(long to_convert) case H_LONG_BUSY_ORDER_10_SEC: case H_LONG_BUSY_ORDER_100_SEC: return -EBUSY; - case H_FUNCTION: /* fall through */ + case H_FUNCTION: default: return -EPERM; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f71ff2c94efe..baf24eacd268 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1681,9 +1681,11 @@ static int pseries_lpar_register_process_table(unsigned long base, if (table_size) flags |= PROC_TABLE_NEW; - if (radix_enabled()) - flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; - else + if (radix_enabled()) { + flags |= PROC_TABLE_RADIX; + if (mmu_has_feature(MMU_FTR_GTSE)) + flags |= PROC_TABLE_GTSE; + } else flags |= PROC_TABLE_HPT_SLB; for (;;) { rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 10d982997736..d6f4162478a5 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -6,6 +6,9 @@ * Copyright (C) 2010 IBM Corporation */ + +#define pr_fmt(fmt) "mobility: " fmt + #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/kobject.h> @@ -64,6 +67,8 @@ static int delete_dt_node(__be32 phandle) if (!dn) return -ENOENT; + pr_debug("removing node %pOFfp\n", dn); + dlpar_detach_node(dn); of_node_put(dn); return 0; @@ -122,6 +127,7 @@ static int 
update_dt_property(struct device_node *dn, struct property **prop, } if (!more) { + pr_debug("updating node %pOF property %s\n", dn, name); of_update_property(dn, new_prop); *prop = NULL; } @@ -240,33 +246,12 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) if (rc) dlpar_free_cc_nodes(dn); + pr_debug("added node %pOFfp\n", dn); + of_node_put(parent_dn); return rc; } -static void prrn_update_node(__be32 phandle) -{ - struct pseries_hp_errorlog hp_elog; - struct device_node *dn; - - /* - * If a node is found from a the given phandle, the phandle does not - * represent the drc index of an LMB and we can ignore. - */ - dn = of_find_node_by_phandle(be32_to_cpu(phandle)); - if (dn) { - of_node_put(dn); - return; - } - - hp_elog.resource = PSERIES_HP_ELOG_RESOURCE_MEM; - hp_elog.action = PSERIES_HP_ELOG_ACTION_READD; - hp_elog.id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; - hp_elog._drc_u.drc_index = phandle; - - handle_dlpar_errorlog(&hp_elog); -} - int pseries_devicetree_update(s32 scope) { char *rtas_buf; @@ -305,10 +290,6 @@ int pseries_devicetree_update(s32 scope) break; case UPDATE_DT_NODE: update_dt_node(phandle, scope); - - if (scope == PRRN_SCOPE) - prrn_update_node(phandle); - break; case ADD_DT_NODE: drc_index = *data++; @@ -388,8 +369,6 @@ static ssize_t migration_store(struct class *class, if (rc) return rc; - stop_topology_update(); - do { rc = rtas_ibm_suspend_me(streamid); if (rc == -EAGAIN) @@ -401,8 +380,6 @@ static ssize_t migration_store(struct class *class, post_mobility_fixup(); - start_topology_update(); - return count; } @@ -427,11 +404,11 @@ static int __init mobility_sysfs_init(void) rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr); if (rc) - pr_err("mobility: unable to create migration sysfs file (%d)\n", rc); + pr_err("unable to create migration sysfs file (%d)\n", rc); rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr); if (rc) - pr_err("mobility: unable to create api_version sysfs file (%d)\n", rc); + pr_err("unable to create api_version sysfs file (%d)\n", rc); return 0; } diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h deleted file mode 100644 index 51414aee2862..000000000000 --- a/arch/powerpc/platforms/pseries/offline_states.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _OFFLINE_STATES_H_ -#define _OFFLINE_STATES_H_ - -/* Cpu offline states go here */ -enum cpu_state_vals { - CPU_STATE_OFFLINE, - CPU_STATE_INACTIVE, - CPU_STATE_ONLINE, - CPU_MAX_OFFLINE_STATES -}; - -#ifdef CONFIG_HOTPLUG_CPU -extern enum cpu_state_vals get_cpu_current_state(int cpu); -extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); -extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); -extern void set_default_offline_state(int cpu); -#else -static inline enum cpu_state_vals get_cpu_current_state(int cpu) -{ - return CPU_STATE_ONLINE; -} - -static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state) -{ -} - -static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state) -{ -} - -static inline void set_default_offline_state(int cpu) -{ -} -#endif - -extern enum cpu_state_vals get_preferred_offline_state(int cpu); -#endif diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 9c569078a09f..f439f0dfea7d 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -13,9 +13,11 
@@ #include <linux/platform_device.h> #include <linux/delay.h> #include <linux/seq_buf.h> +#include <linux/nd.h> #include <asm/plpar_wrappers.h> #include <asm/papr_pdsm.h> +#include <asm/mce.h> #define BIND_ANY_ADDR (~0ul) @@ -62,6 +64,26 @@ PAPR_PMEM_HEALTH_FATAL | \ PAPR_PMEM_HEALTH_UNHEALTHY)
+#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
+#define PAPR_SCM_PERF_STATS_VERSION 0x1
+
+/* Struct holding a single performance metric */
+struct papr_scm_perf_stat {
+ u8 stat_id[8];
+ __be64 stat_val;
+} __packed;
+
+/* Struct exchanged between kernel and PHYP for fetching drc perf stats */
+struct papr_scm_perf_stats {
+ u8 eye_catcher[8];
+ /* Should be PAPR_SCM_PERF_STATS_VERSION */
+ __be32 stats_version;
+ /* Number of stats following */
+ __be32 num_statistics;
+ /* Zero or more performance metrics */
+ struct papr_scm_perf_stat scm_statistic[];
+} __packed;
+ /* private struct associated with each region */ struct papr_scm_priv { struct platform_device *pdev; @@ -80,6 +102,7 @@ struct papr_scm_priv { struct resource res; struct nd_region *region; struct nd_interleave_set nd_set; + struct list_head region_list; /* Protect dimm health data from concurrent read/writes */ struct mutex health_mutex; @@ -89,8 +112,14 @@ struct papr_scm_priv { /* Health information for the dimm */ u64 health_bitmap;
+
+ /* length of the stat buffer as expected by phyp */
+ size_t stat_buffer_len;
}; +static LIST_HEAD(papr_nd_regions); +static DEFINE_MUTEX(papr_ndr_lock); + static int drc_pmem_bind(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; @@ -195,6 +224,79 @@ err_out: } /*
+ * Query the DIMM performance stats from PHYP and copy them (if returned) to
+ * the provided struct papr_scm_perf_stats instance 'buff_stats', which must
+ * be large enough to hold the header plus at least 'num_stats' entries.
+ * - If buff_stats == NULL the return value is the size in bytes of the buffer
+ * needed to hold all supported performance-statistics.
+ * - If buff_stats != NULL and num_stats == 0 then we copy all known
+ * performance-statistics to 'buff_stats' and expect it to be large enough to
+ * hold them.
+ * - If buff_stats != NULL and num_stats > 0 then copy the requested
+ * performance-statistics to buff_stats.
+ */
+static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
+ struct papr_scm_perf_stats *buff_stats,
+ unsigned int num_stats)
+{
+ unsigned long ret[PLPAR_HCALL_BUFSIZE];
+ size_t size;
+ s64 rc;
+
+ /* Set up the out buffer */
+ if (buff_stats) {
+ memcpy(buff_stats->eye_catcher,
+ PAPR_SCM_PERF_STATS_EYECATCHER, 8);
+ buff_stats->stats_version =
+ cpu_to_be32(PAPR_SCM_PERF_STATS_VERSION);
+ buff_stats->num_statistics =
+ cpu_to_be32(num_stats);
+
+ /*
+ * Calculate the buffer size based on num-stats provided
+ * or use the prefetched max buffer length
+ */
+ if (num_stats)
+ /* Calculate size from the num_stats */
+ size = sizeof(struct papr_scm_perf_stats) +
+ num_stats * sizeof(struct papr_scm_perf_stat);
+ else
+ size = p->stat_buffer_len;
+ } else {
+ /* In case of no out buffer ignore the size */
+ size = 0;
+ }
+
+ /* Do the HCALL asking PHYP for info */
+ rc = plpar_hcall(H_SCM_PERFORMANCE_STATS, ret, p->drc_index,
+ buff_stats ? virt_to_phys(buff_stats) : 0,
+ size);
+
+ /* Check if the error was due to an unknown stat-id */
+ if (rc == H_PARTIAL) {
+ dev_err(&p->pdev->dev,
+ "Unknown performance stats, Err:0x%016lX\n", ret[0]);
+ return -ENOENT;
+ } else if (rc != H_SUCCESS) {
+ dev_err(&p->pdev->dev,
+ "Failed to query performance stats, Err:%lld\n", rc);
+ return -EIO;
+
+ } else if (!size) {
+ /* Handle case where stat buffer size was requested */
+ dev_dbg(&p->pdev->dev,
+ "Performance stats size %ld\n", ret[0]);
+ return ret[0];
+ }
+
+ /* Successfully fetched the requested stats from phyp */
+ dev_dbg(&p->pdev->dev,
+ "Performance stats returned %d stats\n",
+ be32_to_cpu(buff_stats->num_statistics));
+ return 0;
+}
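The helper above follows a common two-call convention for hypervisor buffers: a first call with no buffer asks PHYP only for the required size, and a second call with a buffer of that size fetches the data. A minimal sketch of a caller, assuming a bound papr_scm_priv as used elsewhere in this file (example_fetch_all_stats itself is hypothetical):

static void example_fetch_all_stats(struct papr_scm_priv *p)
{
	struct papr_scm_perf_stats *all;
	ssize_t sz;

	/* Size probe: a NULL buffer makes the hcall report the needed size */
	sz = drc_pmem_query_stats(p, NULL, 0);
	if (sz <= 0)
		return;

	all = kzalloc(sz, GFP_KERNEL);
	if (!all)
		return;

	/* num_stats == 0 with a buffer: fetch every supported statistic */
	if (!drc_pmem_query_stats(p, all, 0))
		dev_info(&p->pdev->dev, "PHYP returned %u stats\n",
			 be32_to_cpu(all->num_statistics));

	kfree(all);
}

This is the split the rest of the patch uses: papr_scm_nvdimm_init() performs the size probe once and caches the result in stat_buffer_len, and perf_stats_show() reuses that length for the full fetch.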
+ +/* * Issue hcall to retrieve dimm health info and populate papr_scm_priv with the * health information. */ @@ -416,6 +518,51 @@ static int is_cmd_valid(struct nvdimm *nvdimm, unsigned int cmd, void *buf, return 0; }
+static int papr_pdsm_fuel_gauge(struct papr_scm_priv *p,
+ union nd_pdsm_payload *payload)
+{
+ int rc, size;
+ u64 statval;
+ struct papr_scm_perf_stat *stat;
+ struct papr_scm_perf_stats *stats;
+
+ /* Silently fail if fetching performance metrics isn't supported */
+ if (!p->stat_buffer_len)
+ return 0;
+
+ /* Allocate a request buffer large enough to hold a single performance stat */
+ size = sizeof(struct papr_scm_perf_stats) +
+ sizeof(struct papr_scm_perf_stat);
+
+ stats = kzalloc(size, GFP_KERNEL);
+ if (!stats)
+ return -ENOMEM;
+
+ stat = &stats->scm_statistic[0];
+ memcpy(&stat->stat_id, "MemLife ", sizeof(stat->stat_id));
+ stat->stat_val = 0;
+
+ /* Fetch the fuel gauge and populate it in payload */
+ rc = drc_pmem_query_stats(p, stats, 1);
+ if (rc < 0) {
+ dev_dbg(&p->pdev->dev, "Err(%d) fetching fuel gauge\n", rc);
+ goto free_stats;
+ }
+
+ statval = be64_to_cpu(stat->stat_val);
+ dev_dbg(&p->pdev->dev,
+ "Fetched fuel-gauge %llu", statval);
+ payload->health.extension_flags |=
+ PDSM_DIMM_HEALTH_RUN_GAUGE_VALID;
+ payload->health.dimm_fuel_gauge = statval;
+
+ rc = sizeof(struct nd_papr_pdsm_health);
+
+free_stats:
+ kfree(stats);
+ return rc;
+}
+ /* Fetch the DIMM health info and populate it in provided package. */ static int papr_pdsm_health(struct papr_scm_priv *p, union nd_pdsm_payload *payload) @@ -456,6 +603,10 @@ static int papr_pdsm_health(struct papr_scm_priv *p, /* struct populated hence can release the mutex now */ mutex_unlock(&p->health_mutex);
+
+ /* Populate the fuel gauge meter in the payload */
+ papr_pdsm_fuel_gauge(p, payload);
+ rc = sizeof(struct nd_papr_pdsm_health); out: @@ -631,6 +782,48 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, return 0; }
+static ssize_t perf_stats_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int index, rc;
+ struct seq_buf s;
+ struct papr_scm_perf_stat *stat;
+ struct papr_scm_perf_stats *stats;
+ struct nvdimm *dimm = to_nvdimm(dev);
+ struct papr_scm_priv *p = nvdimm_provider_data(dimm);
+
+ if (!p->stat_buffer_len)
+ return -ENOENT;
+
+ /* Allocate the buffer for phyp where stats are written */
+ stats = kzalloc(p->stat_buffer_len, GFP_KERNEL);
+ if (!stats)
+ return -ENOMEM;
+
+ /* Ask phyp to return all dimm perf stats */
+ rc = drc_pmem_query_stats(p, stats, 0);
+ if (rc)
+ goto free_stats;
+ /*
+ * Go through the returned output buffer and print stats and
+ * values. Since stat_id is essentially a char string of
+ * 8 bytes, simply use the string format specifier to print it.
+ */
+ seq_buf_init(&s, buf, PAGE_SIZE);
+ for (index = 0, stat = stats->scm_statistic;
+ index < be32_to_cpu(stats->num_statistics);
+ ++index, ++stat) {
+ seq_buf_printf(&s, "%.8s = 0x%016llX\n",
+ stat->stat_id,
+ be64_to_cpu(stat->stat_val));
+ }
+
+free_stats:
+ kfree(stats);
+ return rc ? rc : seq_buf_used(&s);
+}
+DEVICE_ATTR_RO(perf_stats);
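perf_stats_show() above leans on the kernel's seq_buf API (linux/seq_buf.h) for bounds-checked formatting into the one-page sysfs buffer: output past PAGE_SIZE is clamped instead of overflowing. A sketch of the pattern in isolation, with a hypothetical attribute handler example_show:

static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct seq_buf s;

	/* Bind the seq_buf to the sysfs page; it tracks bytes used */
	seq_buf_init(&s, buf, PAGE_SIZE);

	/* Formatted writes are silently truncated at the buffer size */
	seq_buf_printf(&s, "%.8s = 0x%016llX\n", "ExampleId", 0x2aULL);

	/* Bytes actually written, which is what show() should return */
	return seq_buf_used(&s);
}

From user space the new attribute reads as plain text, one "StatId = 0x..." line per statistic, presumably under the nmemX device's papr attribute group next to the existing flags file.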
static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -676,6 +869,7 @@ DEVICE_ATTR_RO(flags); /* papr_scm specific dimm attributes */ static struct attribute *papr_nd_attributes[] = { &dev_attr_flags.attr, + &dev_attr_perf_stats.attr, NULL, }; @@ -696,6 +890,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) struct nd_region_desc ndr_desc; unsigned long dimm_flags; int target_nid, online_nid; + ssize_t stat_size; p->bus_desc.ndctl = papr_scm_ndctl; p->bus_desc.module = THIS_MODULE; @@ -759,6 +954,20 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) dev_info(dev, "Region registered with target node %d and online node %d", target_nid, online_nid);
+ mutex_lock(&papr_ndr_lock);
+ list_add_tail(&p->region_list, &papr_nd_regions);
+ mutex_unlock(&papr_ndr_lock);
+
+ /* Try retrieving the stat buffer and see if it's supported */
+ stat_size = drc_pmem_query_stats(p, NULL, 0);
+ if (stat_size > 0) {
+ p->stat_buffer_len = stat_size;
+ dev_dbg(&p->pdev->dev, "Max perf-stat size %lu-bytes\n",
+ p->stat_buffer_len);
+ } else {
+ dev_info(&p->pdev->dev, "Dimm performance stats unavailable\n");
+ }
+ return 0; err: nvdimm_bus_unregister(p->bus); @@ -766,6 +975,68 @@ err: nvdimm_bus_unregister(p->bus); return -ENXIO; }
+static void papr_scm_add_badblock(struct nd_region *region,
+ struct nvdimm_bus *bus, u64 phys_addr)
+{
+ u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
+
+ if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) {
+ pr_err("Bad block registration for 0x%llx failed\n", phys_addr);
+ return;
+ }
+
+ pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n",
+ aligned_addr, aligned_addr + L1_CACHE_BYTES);
+
+ nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON);
+}
+
+static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct machine_check_event *evt = data;
+ struct papr_scm_priv *p;
+ u64 phys_addr;
+ bool found = false;
+
+ if (evt->error_type != MCE_ERROR_TYPE_UE)
+ return NOTIFY_DONE;
+
+ if (list_empty(&papr_nd_regions))
+ return NOTIFY_DONE;
+
+ /*
+ * The physical address obtained here is PAGE_SIZE aligned, so get the
+ * exact address from the effective address
+ */
+ phys_addr = evt->u.ue_error.physical_address +
+ (evt->u.ue_error.effective_address & ~PAGE_MASK);
+
+ if (!evt->u.ue_error.physical_address_provided ||
+ !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
+ return NOTIFY_DONE;
+
+ /* mce notifier is called from a process context, so mutex is safe */
+ mutex_lock(&papr_ndr_lock);
+ list_for_each_entry(p, &papr_nd_regions, region_list) {
+ if (phys_addr >= p->res.start && phys_addr <= p->res.end) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found)
+ papr_scm_add_badblock(p->region, p->bus, phys_addr);
+
+ mutex_unlock(&papr_ndr_lock);
+
+ return found ?
NOTIFY_OK : NOTIFY_DONE; +} + +static struct notifier_block mce_ue_nb = { + .notifier_call = handle_mce_ue +}; + static int papr_scm_probe(struct platform_device *pdev) { struct device_node *dn = pdev->dev.of_node; @@ -866,6 +1137,10 @@ static int papr_scm_remove(struct platform_device *pdev) { struct papr_scm_priv *p = platform_get_drvdata(pdev); + mutex_lock(&papr_ndr_lock); + list_del(&p->region_list); + mutex_unlock(&papr_ndr_lock); + nvdimm_bus_unregister(p->bus); drc_pmem_unbind(p); kfree(p->bus_desc.provider_name); @@ -876,6 +1151,7 @@ static int papr_scm_remove(struct platform_device *pdev) static const struct of_device_id papr_scm_match[] = { { .compatible = "ibm,pmemory" }, + { .compatible = "ibm,pmemory-v2" }, { }, }; @@ -888,7 +1164,25 @@ static struct platform_driver papr_scm_driver = { }, }; -module_platform_driver(papr_scm_driver); +static int __init papr_scm_init(void) +{ + int ret; + + ret = platform_driver_register(&papr_scm_driver); + if (!ret) + mce_register_notifier(&mce_ue_nb); + + return ret; +} +module_init(papr_scm_init); + +static void __exit papr_scm_exit(void) +{ + mce_unregister_notifier(&mce_ue_nb); + platform_driver_unregister(&papr_scm_driver); +} +module_exit(papr_scm_exit); + MODULE_DEVICE_TABLE(of, papr_scm_match); MODULE_LICENSE("GPL"); MODULE_AUTHOR("IBM Corporation"); diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index b3a38f5a6b68..f9ae17e8a0f4 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -34,7 +34,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); /* Create EEH devices for the PHB */ - eeh_dev_phb_init_dynamic(phb); + eeh_phb_pe_create(phb); if (dn->child) pseries_eeh_init_edev_recursive(PCI_DN(dn)); diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index f860a897a9e0..e1dc5d3254df 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -24,7 +24,6 @@ #include <asm/topology.h> #include "pseries.h" -#include "offline_states.h" static struct device_node *pmem_node; @@ -147,6 +146,12 @@ const struct of_device_id drc_pmem_match[] = { static int pseries_pmem_init(void) { + /* + * Only supported on POWER8 and above. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return 0; + pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory"); if (!pmem_node) return 0; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 27094c872fd6..2f4ee0a90284 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -359,7 +359,7 @@ static void pseries_lpar_idle(void) * to ever be a problem in practice we can move this into a kernel thread to * finish off the process later in boot. 
*/ -void pseries_enable_reloc_on_exc(void) +bool pseries_enable_reloc_on_exc(void) { long rc; unsigned int delay, total_delay = 0; @@ -370,11 +370,13 @@ void pseries_enable_reloc_on_exc(void) if (rc == H_P2) { pr_info("Relocation on exceptions not" " supported\n"); + return false; } else if (rc != H_SUCCESS) { pr_warn("Unable to enable relocation" " on exceptions: %ld\n", rc); + return false; } - break; + return true; } delay = get_longbusy_msecs(rc); @@ -383,7 +385,7 @@ void pseries_enable_reloc_on_exc(void) pr_warn("Warning: Giving up waiting to enable " "relocation on exceptions (%u msec)!\n", total_delay); - return; + return false; } mdelay(delay); @@ -746,6 +748,11 @@ static void __init pSeries_setup_arch(void) smp_init_pseries(); + if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE)) + if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) + panic("BUG: Radix support requires either GTSE or RPT_INVALIDATE\n"); + + /* openpic global configuration register (64-bit format). */ /* openpic Interrupt Source Unit pointer (64-bit format). */ /* python0 facility area (mmio) (64-bit format) REAL address. */ @@ -772,8 +779,10 @@ static void __init pSeries_setup_arch(void) if (firmware_has_feature(FW_FEATURE_LPAR)) { vpa_init(boot_cpuid); - if (lppaca_shared_proc(get_lppaca())) + if (lppaca_shared_proc(get_lppaca())) { static_branch_enable(&shared_processor); + pv_spinlocks_init(); + } ppc_md.power_save = pseries_lpar_idle; ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; @@ -832,12 +841,15 @@ static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); } -static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx) +static int pseries_set_dawr(int nr, unsigned long dawr, unsigned long dawrx) { /* PAPR says we can't set HYP */ dawrx &= ~DAWRX_HYP; - return plpar_set_watchpoint0(dawr, dawrx); + if (nr == 0) + return plpar_set_watchpoint0(dawr, dawrx); + else + return plpar_set_watchpoint1(dawr, dawrx); } #define CMO_CHARACTERISTICS_TOKEN 44 diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 6891710833be..92922491a81c 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -44,8 +44,6 @@ #include <asm/svm.h> #include "pseries.h" -#include "offline_states.h" - /* * The Primary thread of each non-boot processor was started from the OF client @@ -108,10 +106,7 @@ static inline int smp_startup_cpu(unsigned int lcpu) /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0; -#ifdef CONFIG_HOTPLUG_CPU - if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) - goto out; -#endif + /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. 
@@ -126,9 +121,6 @@ static inline int smp_startup_cpu(unsigned int lcpu) return 0; } -#ifdef CONFIG_HOTPLUG_CPU -out: -#endif return 1; } @@ -143,10 +135,6 @@ static void smp_setup_cpu(int cpu) vpa_init(cpu); cpumask_clear_cpu(cpu, of_spin_mask); -#ifdef CONFIG_HOTPLUG_CPU - set_cpu_current_state(cpu, CPU_STATE_ONLINE); - set_default_offline_state(cpu); -#endif } static int smp_pSeries_kick_cpu(int nr) @@ -163,20 +151,6 @@ static int smp_pSeries_kick_cpu(int nr) * the processor will continue on to secondary_start */ paca_ptrs[nr]->cpu_start = 1;
-#ifdef CONFIG_HOTPLUG_CPU
- set_preferred_offline_state(nr, CPU_STATE_ONLINE);
-
- if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
- long rc;
- unsigned long hcpuid;
-
- hcpuid = get_hard_smp_processor_id(nr);
- rc = plpar_hcall_norets(H_PROD, hcpuid);
- if (rc != H_SUCCESS)
- printk(KERN_ERR "Error: Prod to wake up processor %d "
- "Ret= %ld\n", nr, rc);
- }
-#endif
return 0; } @@ -188,13 +162,16 @@ static int pseries_smp_prepare_cpu(int cpu) return 0; }
-static void smp_pseries_cause_ipi(int cpu)
+/* Cause IPI as set up by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu) __ro_after_init;
+
+/* Use msgsndp doorbells when the target is a sibling, else use the interrupt controller */
+static void dbell_or_ic_cause_ipi(int cpu)
{
- /* POWER9 should not use this handler */
 if (doorbell_try_core_ipi(cpu))
 return;
- icp_ops->cause_ipi(cpu);
+ ic_cause_ipi(cpu);
} static int pseries_cause_nmi_ipi(int cpu) @@ -218,26 +195,49 @@ static int pseries_cause_nmi_ipi(int cpu) return 0; }
-static __init void pSeries_smp_probe_xics(void)
-{
- xics_smp_probe();
-
- if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
- smp_ops->cause_ipi = smp_pseries_cause_ipi;
- else
- smp_ops->cause_ipi = icp_ops->cause_ipi;
-}
-
 static __init void pSeries_smp_probe(void)
{
 if (xive_enabled())
- /*
- * Don't use P9 doorbells when XIVE is enabled. IPIs
- * using MMIOs should be faster
- */
 xive_smp_probe();
 else
- pSeries_smp_probe_xics();
+ xics_smp_probe();
+
+ /* No doorbell facility; must use the interrupt controller for IPIs */
+ if (!cpu_has_feature(CPU_FTR_DBELL))
+ return;
+
+ /* Doorbells can only be used for IPIs between SMT siblings */
+ if (!cpu_has_feature(CPU_FTR_SMT))
+ return;
+
+ if (is_kvm_guest()) {
+ /*
+ * KVM emulates doorbells by disabling FSCR[MSGP], so msgsndp
+ * faults to the hypervisor, which then reads the instruction
+ * from guest memory; this tends to be slower than using XIVE.
+ */
+ if (xive_enabled())
+ return;
+
+ /*
+ * XICS hcalls aren't as fast, so we can use msgsndp (which
+ * also helps exercise KVM emulation); however, KVM can't
+ * emulate secure guests because it can't read the instruction
+ * out of their memory.
+ */
+ if (is_secure_guest())
+ return;
+ }
+
+ /*
+ * Under PowerVM, FSCR[MSGP] is enabled as guest vCPU siblings are
+ * gang scheduled on the same physical core, so doorbells are always
+ * faster than the interrupt controller, and they can be used by
+ * secure guests.
+ */
+
+ ic_cause_ipi = smp_ops->cause_ipi;
+ smp_ops->cause_ipi = dbell_or_ic_cause_ipi;
}
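The two assignments that close pSeries_smp_probe() above are a compact case of function-pointer interposition: save the interrupt controller's cause_ipi, then install a wrapper that tries the doorbell fast path and falls back to the saved pointer. The same shape, reduced to a self-contained user-space sketch (all names hypothetical):

#include <stdbool.h>
#include <stdio.h>

/* Saved pointer to the original (slower) IPI handler */
static void (*orig_cause_ipi)(int cpu);

static void controller_cause_ipi(int cpu)
{
	printf("interrupt-controller IPI -> cpu %d\n", cpu);
}

/* Stand-in for doorbell_try_core_ipi(): succeeds only for "siblings" */
static bool try_doorbell(int cpu)
{
	return cpu % 2 == 0;
}

static void dbell_or_orig_cause_ipi(int cpu)
{
	if (try_doorbell(cpu))
		printf("doorbell IPI -> cpu %d\n", cpu);
	else
		orig_cause_ipi(cpu);	/* fall back to the saved handler */
}

int main(void)
{
	struct { void (*cause_ipi)(int cpu); } ops = {
		.cause_ipi = controller_cause_ipi,
	};

	orig_cause_ipi = ops.cause_ipi;			/* save */
	ops.cause_ipi = dbell_or_orig_cause_ipi;	/* interpose */

	ops.cause_ipi(2);	/* takes the doorbell fast path */
	ops.cause_ipi(3);	/* falls back to the controller */
	return 0;
}

Since ic_cause_ipi is written once at probe time and marked __ro_after_init, the wrapper in the real code never observes a half-initialized fallback pointer.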
static struct smp_ops_t pseries_smp_ops = { diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index 0a24a5a185f0..81e0ac58d620 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -132,15 +132,11 @@ static ssize_t store_hibernate(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - cpumask_var_t offline_mask; int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL)) - return -ENOMEM; - stream_id = simple_strtoul(buf, NULL, 16); do { @@ -149,33 +145,14 @@ static ssize_t store_hibernate(struct device *dev, ssleep(1); } while (rc == -EAGAIN);
- if (!rc) {
- /* All present CPUs must be online */
- cpumask_andnot(offline_mask, cpu_present_mask,
- cpu_online_mask);
- rc = rtas_online_cpus_mask(offline_mask);
- if (rc) {
- pr_err("%s: Could not bring present CPUs online.\n",
- __func__);
- goto out;
- }
-
- stop_topology_update();
+ if (!rc)
 rc = pm_suspend(PM_SUSPEND_MEM);
- start_topology_update();
-
- /* Take down CPUs not online prior to suspend */
- if (!rtas_offline_cpus_mask(offline_mask))
- pr_warn("%s: Could not restore CPUs to offline "
- "state.\n", __func__);
- }
 stream_id = 0; if (!rc) rc = count; -out: - free_cpumask_var(offline_mask); + return rc; }
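As a closing usage note for the most user-visible addition in this set: once papr_scm registers the perf_stats attribute, the statistics can be read like any other sysfs file. A minimal reader, assuming the conventional /sys/bus/nd/devices/nmemX/papr/perf_stats location (path hypothetical here; confirm against the papr-pmem ABI documentation):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path assumed from the existing papr_scm sysfs layout; nmem0 is an example */
	const char *path = "/sys/bus/nd/devices/nmem0/papr/perf_stats";
	char buf[4096];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* one "StatId = 0x..." line per stat */
	}

	close(fd);
	return 0;
}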