From 674bfa485554156aa90ce17288712fcb568a42c3 Mon Sep 17 00:00:00 2001 From: Suzuki Poulose Date: Mon, 18 Jul 2011 03:29:20 +0000 Subject: powerpc/44x: Kexec support for PPC440X chipsets This patch adds kexec support for PPC440 based chipsets. This work is based on the KEXEC patches for FSL BookE. The FSL BookE patch and the code flow could be found at the link below: http://patchwork.ozlabs.org/patch/49359/ Steps: 1) Invalidate all the TLB entries except the one this code is run from 2) Create a tmp mapping for our code in the other address space and jump to it 3) Invalidate the entry we used 4) Create a 1:1 mapping for 0-2GiB in blocks of 256M 5) Jump to the new 1:1 mapping and invalidate the tmp mapping I have tested this patches on Ebony, Sequoia boards and Virtex on QEMU. You need kexec-tools commit e8b7939b1e or newer for ppc440x support, available at: git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git Signed-off-by: Suzuki Poulose Cc: Sebastian Andrzej Siewior Signed-off-by: Josh Boyer --- arch/powerpc/kernel/misc_32.S | 171 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 998a10028608..f7d760ab5ca1 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -8,6 +8,8 @@ * kexec bits: * Copyright (C) 2002-2003 Eric Biederman * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz + * PPC44x port. Copyright (C) 2011, IBM Corporation + * Author: Suzuki Poulose * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -735,6 +737,175 @@ relocate_new_kernel: mr r4, r30 mr r5, r31 + li r0, 0 +#elif defined(CONFIG_44x) && !defined(CONFIG_47x) + +/* + * Code for setting up 1:1 mapping for PPC440x for KEXEC + * + * We cannot switch off the MMU on PPC44x. + * So we: + * 1) Invalidate all the mappings except the one we are running from. + * 2) Create a tmp mapping for our code in the other address space(TS) and + * jump to it. Invalidate the entry we started in. + * 3) Create a 1:1 mapping for 0-2GiB in chunks of 256M in original TS. + * 4) Jump to the 1:1 mapping in original TS. + * 5) Invalidate the tmp mapping. + * + * - Based on the kexec support code for FSL BookE + * - Doesn't support 47x yet. + * + */ + /* Save our parameters */ + mr r29, r3 + mr r30, r4 + mr r31, r5 + + /* Load our MSR_IS and TID to MMUCR for TLB search */ + mfspr r3,SPRN_PID + mfmsr r4 + andi. r4,r4,MSR_IS@l + beq wmmucr + oris r3,r3,PPC44x_MMUCR_STS@h +wmmucr: + mtspr SPRN_MMUCR,r3 + sync + + /* + * Invalidate all the TLB entries except the current entry + * where we are running from + */ + bl 0f /* Find our address */ +0: mflr r5 /* Make it accessible */ + tlbsx r23,0,r5 /* Find entry we are in */ + li r4,0 /* Start at TLB entry 0 */ + li r3,0 /* Set PAGEID inval value */ +1: cmpw r23,r4 /* Is this our entry? */ + beq skip /* If so, skip the inval */ + tlbwe r3,r4,PPC44x_TLB_PAGEID /* If not, inval the entry */ +skip: + addi r4,r4,1 /* Increment */ + cmpwi r4,64 /* Are we done? */ + bne 1b /* If not, repeat */ + isync + + /* Create a temp mapping and jump to it */ + andi. r6, r23, 1 /* Find the index to use */ + addi r24, r6, 1 /* r24 will contain 1 or 2 */ + + mfmsr r9 /* get the MSR */ + rlwinm r5, r9, 27, 31, 31 /* Extract the MSR[IS] */ + xori r7, r5, 1 /* Use the other address space */ + + /* Read the current mapping entries */ + tlbre r3, r23, PPC44x_TLB_PAGEID + tlbre r4, r23, PPC44x_TLB_XLAT + tlbre r5, r23, PPC44x_TLB_ATTRIB + + /* Save our current XLAT entry */ + mr r25, r4 + + /* Extract the TLB PageSize */ + li r10, 1 /* r10 will hold PageSize */ + rlwinm r11, r3, 0, 24, 27 /* bits 24-27 */ + + /* XXX: As of now we use 256M, 4K pages */ + cmpwi r11, PPC44x_TLB_256M + bne tlb_4k + rotlwi r10, r10, 28 /* r10 = 256M */ + b write_out +tlb_4k: + cmpwi r11, PPC44x_TLB_4K + bne default + rotlwi r10, r10, 12 /* r10 = 4K */ + b write_out +default: + rotlwi r10, r10, 10 /* r10 = 1K */ + +write_out: + /* + * Write out the tmp 1:1 mapping for this code in other address space + * Fixup EPN = RPN , TS=other address space + */ + insrwi r3, r7, 1, 23 /* Bit 23 is TS for PAGEID field */ + + /* Write out the tmp mapping entries */ + tlbwe r3, r24, PPC44x_TLB_PAGEID + tlbwe r4, r24, PPC44x_TLB_XLAT + tlbwe r5, r24, PPC44x_TLB_ATTRIB + + subi r11, r10, 1 /* PageOffset Mask = PageSize - 1 */ + not r10, r11 /* Mask for PageNum */ + + /* Switch to other address space in MSR */ + insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ + + bl 1f +1: mflr r8 + addi r8, r8, (2f-1b) /* Find the target offset */ + + /* Jump to the tmp mapping */ + mtspr SPRN_SRR0, r8 + mtspr SPRN_SRR1, r9 + rfi + +2: + /* Invalidate the entry we were executing from */ + li r3, 0 + tlbwe r3, r23, PPC44x_TLB_PAGEID + + /* attribute fields. rwx for SUPERVISOR mode */ + li r5, 0 + ori r5, r5, (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G) + + /* Create 1:1 mapping in 256M pages */ + xori r7, r7, 1 /* Revert back to Original TS */ + + li r8, 0 /* PageNumber */ + li r6, 3 /* TLB Index, start at 3 */ + +next_tlb: + rotlwi r3, r8, 28 /* Create EPN (bits 0-3) */ + mr r4, r3 /* RPN = EPN */ + ori r3, r3, (PPC44x_TLB_VALID | PPC44x_TLB_256M) /* SIZE = 256M, Valid */ + insrwi r3, r7, 1, 23 /* Set TS from r7 */ + + tlbwe r3, r6, PPC44x_TLB_PAGEID /* PageID field : EPN, V, SIZE */ + tlbwe r4, r6, PPC44x_TLB_XLAT /* Address translation : RPN */ + tlbwe r5, r6, PPC44x_TLB_ATTRIB /* Attributes */ + + addi r8, r8, 1 /* Increment PN */ + addi r6, r6, 1 /* Increment TLB Index */ + cmpwi r8, 8 /* Are we done ? */ + bne next_tlb + isync + + /* Jump to the new mapping 1:1 */ + li r9,0 + insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ + + bl 1f +1: mflr r8 + and r8, r8, r11 /* Get our offset within page */ + addi r8, r8, (2f-1b) + + and r5, r25, r10 /* Get our target PageNum */ + or r8, r8, r5 /* Target jump address */ + + mtspr SPRN_SRR0, r8 + mtspr SPRN_SRR1, r9 + rfi +2: + /* Invalidate the tmp entry we used */ + li r3, 0 + tlbwe r3, r24, PPC44x_TLB_PAGEID + sync + + /* Restore the parameters */ + mr r3, r29 + mr r4, r30 + mr r5, r31 + li r0, 0 #else li r0, 0 -- cgit v1.2.3 From 6a5c7be5e484bda5b2639fedf7dbe3f25c15c962 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Fri, 24 Jun 2011 09:05:22 +0000 Subject: powerpc: Override dma_get_required_mask by platform hook and ops The hook dma_get_required_mask is supposed to return the mask required by the platform to operate efficently. The generic version of dma_get_required_mask in driver/base/platform.c returns a mask based only on max_pfn. However, this is likely too big for iommu systems and could be too small for platforms that require a dma offset or have a secondary window at a high offset. Override the default, provide a hook in ppc_md used by pseries lpar and cell, and provide the default answer based on memblock_end_of_DRAM(), with hooks for get_dma_offset, and provide an implementation for iommu that looks at the defined table size. Coverting from the end address to the required bit mask is based on the generic implementation. The need for this was discovered when the qla2xxx driver switched to 64 bit dma then reverted to 32 bit when dma_get_required_mask said 32 bits was sufficient. Signed-off-by: Milton Miller Signed-off-by: Nishanth Aravamudan Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Cc: benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/dma-mapping.h | 3 +++ arch/powerpc/include/asm/machdep.h | 3 ++- arch/powerpc/kernel/dma-iommu.c | 13 ++++++++++++ arch/powerpc/kernel/dma.c | 39 ++++++++++++++++++++++++++++++++++ arch/powerpc/platforms/cell/iommu.c | 12 +++++++++++ arch/powerpc/platforms/pseries/iommu.c | 27 +++++++++++++++++++++++ 6 files changed, 96 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index dd70fac57ec8..8135e66a4bb9 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -20,6 +20,8 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0x0) +#define ARCH_HAS_DMA_GET_REQUIRED_MASK + /* Some dma direct funcs must be visible for use in other dma_ops */ extern void *dma_direct_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag); @@ -69,6 +71,7 @@ static inline unsigned long device_to_mask(struct device *dev) */ #ifdef CONFIG_PPC64 extern struct dma_map_ops dma_iommu_ops; +extern u64 dma_iommu_get_required_mask(struct device *dev); #endif extern struct dma_map_ops dma_direct_ops; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 47cacddb14cf..58fc21623014 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -85,8 +85,9 @@ struct machdep_calls { void (*pci_dma_dev_setup)(struct pci_dev *dev); void (*pci_dma_bus_setup)(struct pci_bus *bus); - /* Platform set_dma_mask override */ + /* Platform set_dma_mask and dma_get_required_mask overrides */ int (*dma_set_mask)(struct device *dev, u64 dma_mask); + u64 (*dma_get_required_mask)(struct device *dev); int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index e7554154a6de..1f2a711a261e 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -90,6 +90,19 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } +u64 dma_iommu_get_required_mask(struct device *dev) +{ + struct iommu_table *tbl = get_iommu_table_base(dev); + u64 mask; + if (!tbl) + return 0; + + mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); + mask += mask - 1; + + return mask; +} + struct dma_map_ops dma_iommu_ops = { .alloc_coherent = dma_iommu_alloc_coherent, .free_coherent = dma_iommu_free_coherent, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 4f0959fbfbee..503093efa202 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -170,6 +170,45 @@ int dma_set_mask(struct device *dev, u64 dma_mask) } EXPORT_SYMBOL(dma_set_mask); +u64 dma_get_required_mask(struct device *dev) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + u64 mask, end = 0; + + if (ppc_md.dma_get_required_mask) + return ppc_md.dma_get_required_mask(dev); + + if (unlikely(dma_ops == NULL)) + return 0; + +#ifdef CONFIG_PPC64 + else if (dma_ops == &dma_iommu_ops) + return dma_iommu_get_required_mask(dev); +#endif +#ifdef CONFIG_SWIOTLB + else if (dma_ops == &swiotlb_dma_ops) { + u64 max_direct_dma_addr = dev->archdata.max_direct_dma_addr; + + end = memblock_end_of_DRAM(); + if (max_direct_dma_addr && end > max_direct_dma_addr) + end = max_direct_dma_addr; + end += get_dma_offset(dev); + } +#endif + else if (dma_ops == &dma_direct_ops) + end = memblock_end_of_DRAM() + get_dma_offset(dev); + else { + WARN_ONCE(1, "%s: unknown ops %p\n", __func__, dma_ops); + end = memblock_end_of_DRAM(); + } + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} +EXPORT_SYMBOL_GPL(dma_get_required_mask); + static int __init dma_init(void) { dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 26a067122a54..5ef55f3b0987 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -1159,6 +1159,17 @@ static int __init setup_iommu_fixed(char *str) } __setup("iommu_fixed=", setup_iommu_fixed); +static u64 cell_dma_get_required_mask(struct device *dev) +{ + if (!dev->dma_mask) + return 0; + + if (iommu_fixed_disabled && get_dma_ops(dev) == &dma_iommu_ops) + return dma_iommu_get_required_mask(dev); + + return DMA_BIT_MASK(64); +} + static int __init cell_iommu_init(void) { struct device_node *np; @@ -1175,6 +1186,7 @@ static int __init cell_iommu_init(void) /* Setup various ppc_md. callbacks */ ppc_md.pci_dma_dev_setup = cell_pci_dma_dev_setup; + ppc_md.dma_get_required_mask = cell_dma_get_required_mask; ppc_md.tce_build = tce_build_cell; ppc_md.tce_free = tce_free_cell; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 01faab9456ca..fe5ededf0d60 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1077,12 +1077,38 @@ check_mask: return 0; } +static u64 dma_get_required_mask_pSeriesLP(struct device *dev) +{ + if (!dev->dma_mask) + return 0; + + if (!disable_ddw && dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + struct device_node *dn; + + dn = pci_device_to_OF_node(pdev); + + /* search upwards for ibm,dma-window */ + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table; + dn = dn->parent) + if (of_get_property(dn, "ibm,dma-window", NULL)) + break; + /* if there is a ibm,ddw-applicable property require 64 bits */ + if (dn && PCI_DN(dn) && + of_get_property(dn, "ibm,ddw-applicable", NULL)) + return DMA_BIT_MASK(64); + } + + return dma_iommu_get_required_mask(dev); +} + #else /* CONFIG_PCI */ #define pci_dma_bus_setup_pSeries NULL #define pci_dma_dev_setup_pSeries NULL #define pci_dma_bus_setup_pSeriesLP NULL #define pci_dma_dev_setup_pSeriesLP NULL #define dma_set_mask_pSeriesLP NULL +#define dma_get_required_mask_pSeriesLP NULL #endif /* !CONFIG_PCI */ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, @@ -1186,6 +1212,7 @@ void iommu_init_early_pSeries(void) ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; + ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; } else { ppc_md.tce_build = tce_build_pSeries; ppc_md.tce_free = tce_free_pSeries; -- cgit v1.2.3 From d24f9c6999eacd3a7bc2b289e49fcb2bf2fafef2 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Fri, 24 Jun 2011 09:05:24 +0000 Subject: powerpc: Use the newly added get_required_mask dma_map_ops hook Now that the generic code has dma_map_ops set, instead of having a messy ifdef & if block in the base dma_get_required_mask hook push the computation into the dma ops. If the ops fails to set the get_required_mask hook default to the width of dma_addr_t. This also corrects ibmbus ibmebus_dma_supported to require a 64 bit mask. I doubt anything is checking or setting the dma mask on that bus. Signed-off-by: Milton Miller Signed-off-by: Nishanth Aravamudan Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Cc: benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/device.h | 2 ++ arch/powerpc/include/asm/dma-mapping.h | 3 --- arch/powerpc/kernel/dma-iommu.c | 3 ++- arch/powerpc/kernel/dma-swiotlb.c | 16 +++++++++++++ arch/powerpc/kernel/dma.c | 41 +++++++++++++-------------------- arch/powerpc/kernel/ibmebus.c | 8 ++++++- arch/powerpc/kernel/vio.c | 7 +++++- arch/powerpc/platforms/cell/iommu.c | 13 +++++++++-- arch/powerpc/platforms/ps3/system-bus.c | 7 ++++++ arch/powerpc/platforms/pseries/iommu.c | 2 +- 10 files changed, 68 insertions(+), 34 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 16d25c0974be..d57c08acedfc 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -37,4 +37,6 @@ struct pdev_archdata { u64 dma_mask; }; +#define ARCH_HAS_DMA_GET_REQUIRED_MASK + #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 8135e66a4bb9..dd70fac57ec8 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -20,8 +20,6 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0x0) -#define ARCH_HAS_DMA_GET_REQUIRED_MASK - /* Some dma direct funcs must be visible for use in other dma_ops */ extern void *dma_direct_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag); @@ -71,7 +69,6 @@ static inline unsigned long device_to_mask(struct device *dev) */ #ifdef CONFIG_PPC64 extern struct dma_map_ops dma_iommu_ops; -extern u64 dma_iommu_get_required_mask(struct device *dev); #endif extern struct dma_map_ops dma_direct_ops; diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 1f2a711a261e..c1ad9db934f6 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -90,7 +90,7 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } -u64 dma_iommu_get_required_mask(struct device *dev) +static u64 dma_iommu_get_required_mask(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; @@ -111,5 +111,6 @@ struct dma_map_ops dma_iommu_ops = { .dma_supported = dma_iommu_dma_supported, .map_page = dma_iommu_map_page, .unmap_page = dma_iommu_unmap_page, + .get_required_mask = dma_iommu_get_required_mask, }; EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 4295e0b94b2d..1ebc9189aada 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,6 +24,21 @@ unsigned int ppc_swiotlb_enable; +static u64 swiotlb_powerpc_get_required(struct device *dev) +{ + u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr; + + end = memblock_end_of_DRAM(); + if (max_direct_dma_addr && end > max_direct_dma_addr) + end = max_direct_dma_addr; + end += get_dma_offset(dev); + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + /* * At the moment, all platforms that use this code only require * swiotlb to be used if we're operating on HIGHMEM. Since @@ -44,6 +59,7 @@ struct dma_map_ops swiotlb_dma_ops = { .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .mapping_error = swiotlb_dma_mapping_error, + .get_required_mask = swiotlb_powerpc_get_required, }; void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 503093efa202..10b136afbf50 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -96,6 +96,18 @@ static int dma_direct_dma_supported(struct device *dev, u64 mask) #endif } +static u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 end, mask; + + end = memblock_end_of_DRAM() + get_dma_offset(dev); + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + static inline dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, @@ -144,6 +156,7 @@ struct dma_map_ops dma_direct_ops = { .dma_supported = dma_direct_dma_supported, .map_page = dma_direct_map_page, .unmap_page = dma_direct_unmap_page, + .get_required_mask = dma_direct_get_required_mask, #ifdef CONFIG_NOT_COHERENT_CACHE .sync_single_for_cpu = dma_direct_sync_single, .sync_single_for_device = dma_direct_sync_single, @@ -173,7 +186,6 @@ EXPORT_SYMBOL(dma_set_mask); u64 dma_get_required_mask(struct device *dev) { struct dma_map_ops *dma_ops = get_dma_ops(dev); - u64 mask, end = 0; if (ppc_md.dma_get_required_mask) return ppc_md.dma_get_required_mask(dev); @@ -181,31 +193,10 @@ u64 dma_get_required_mask(struct device *dev) if (unlikely(dma_ops == NULL)) return 0; -#ifdef CONFIG_PPC64 - else if (dma_ops == &dma_iommu_ops) - return dma_iommu_get_required_mask(dev); -#endif -#ifdef CONFIG_SWIOTLB - else if (dma_ops == &swiotlb_dma_ops) { - u64 max_direct_dma_addr = dev->archdata.max_direct_dma_addr; - - end = memblock_end_of_DRAM(); - if (max_direct_dma_addr && end > max_direct_dma_addr) - end = max_direct_dma_addr; - end += get_dma_offset(dev); - } -#endif - else if (dma_ops == &dma_direct_ops) - end = memblock_end_of_DRAM() + get_dma_offset(dev); - else { - WARN_ONCE(1, "%s: unknown ops %p\n", __func__, dma_ops); - end = memblock_end_of_DRAM(); - } + if (dma_ops->get_required_mask) + return dma_ops->get_required_mask(dev); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; + return DMA_BIT_MASK(8 * sizeof(dma_addr_t)); } EXPORT_SYMBOL_GPL(dma_get_required_mask); diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index 28581f1ad2c0..90ef2a44613b 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -125,7 +125,12 @@ static void ibmebus_unmap_sg(struct device *dev, static int ibmebus_dma_supported(struct device *dev, u64 mask) { - return 1; + return mask == DMA_BIT_MASK(64); +} + +static u64 ibmebus_dma_get_required_mask(struct device *dev) +{ + return DMA_BIT_MASK(64); } static struct dma_map_ops ibmebus_dma_ops = { @@ -134,6 +139,7 @@ static struct dma_map_ops ibmebus_dma_ops = { .map_sg = ibmebus_map_sg, .unmap_sg = ibmebus_unmap_sg, .dma_supported = ibmebus_dma_supported, + .get_required_mask = ibmebus_dma_get_required_mask, .map_page = ibmebus_map_page, .unmap_page = ibmebus_unmap_page, }; diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 1b695fdc362b..c0493259d133 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -605,6 +605,11 @@ static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) return dma_iommu_ops.dma_supported(dev, mask); } +static u64 vio_dma_get_required_mask(struct device *dev) +{ + return dma_iommu_ops.get_required_mask(dev); +} + struct dma_map_ops vio_dma_mapping_ops = { .alloc_coherent = vio_dma_iommu_alloc_coherent, .free_coherent = vio_dma_iommu_free_coherent, @@ -613,7 +618,7 @@ struct dma_map_ops vio_dma_mapping_ops = { .map_page = vio_dma_iommu_map_page, .unmap_page = vio_dma_iommu_unmap_page, .dma_supported = vio_dma_iommu_dma_supported, - + .get_required_mask = vio_dma_get_required_mask, }; /** diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 5ef55f3b0987..fc46fcac3921 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -1161,11 +1161,20 @@ __setup("iommu_fixed=", setup_iommu_fixed); static u64 cell_dma_get_required_mask(struct device *dev) { + struct dma_map_ops *dma_ops; + if (!dev->dma_mask) return 0; - if (iommu_fixed_disabled && get_dma_ops(dev) == &dma_iommu_ops) - return dma_iommu_get_required_mask(dev); + if (!iommu_fixed_disabled && + cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) + return DMA_BIT_MASK(64); + + dma_ops = get_dma_ops(dev); + if (dma_ops->get_required_mask) + return dma_ops->get_required_mask(dev); + + WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops); return DMA_BIT_MASK(64); } diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 23083c397528..688141c76e03 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -695,12 +695,18 @@ static int ps3_dma_supported(struct device *_dev, u64 mask) return mask >= DMA_BIT_MASK(32); } +static u64 ps3_dma_get_required_mask(struct device *_dev) +{ + return DMA_BIT_MASK(32); +} + static struct dma_map_ops ps3_sb_dma_ops = { .alloc_coherent = ps3_alloc_coherent, .free_coherent = ps3_free_coherent, .map_sg = ps3_sb_map_sg, .unmap_sg = ps3_sb_unmap_sg, .dma_supported = ps3_dma_supported, + .get_required_mask = ps3_dma_get_required_mask, .map_page = ps3_sb_map_page, .unmap_page = ps3_unmap_page, }; @@ -711,6 +717,7 @@ static struct dma_map_ops ps3_ioc0_dma_ops = { .map_sg = ps3_ioc0_map_sg, .unmap_sg = ps3_ioc0_unmap_sg, .dma_supported = ps3_dma_supported, + .get_required_mask = ps3_dma_get_required_mask, .map_page = ps3_ioc0_map_page, .unmap_page = ps3_unmap_page, }; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index fe5ededf0d60..9f121a37eb51 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1099,7 +1099,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev) return DMA_BIT_MASK(64); } - return dma_iommu_get_required_mask(dev); + return dma_iommu_ops.get_required_mask(dev); } #else /* CONFIG_PCI */ -- cgit v1.2.3 From 2eccacd0974dca73e2151d3fd4c2dacf1a5c7cc2 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Fri, 24 Jun 2011 09:05:25 +0000 Subject: powerpc: Tidy up dma_map_ops after adding new hook The new get_required_mask hook name is longer than many of but not all of the prior ops. Tidy the struct initializers to align the equal signs using the local whitespace. Signed-off-by: Milton Miller Signed-off-by: Nishanth Aravamudan Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Cc: benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/dma-iommu.c | 14 +++++++------- arch/powerpc/kernel/dma.c | 16 ++++++++-------- arch/powerpc/kernel/ibmebus.c | 14 +++++++------- arch/powerpc/kernel/vio.c | 14 +++++++------- 4 files changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index c1ad9db934f6..6f04b9c383a7 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -104,13 +104,13 @@ static u64 dma_iommu_get_required_mask(struct device *dev) } struct dma_map_ops dma_iommu_ops = { - .alloc_coherent = dma_iommu_alloc_coherent, - .free_coherent = dma_iommu_free_coherent, - .map_sg = dma_iommu_map_sg, - .unmap_sg = dma_iommu_unmap_sg, - .dma_supported = dma_iommu_dma_supported, - .map_page = dma_iommu_map_page, - .unmap_page = dma_iommu_unmap_page, + .alloc_coherent = dma_iommu_alloc_coherent, + .free_coherent = dma_iommu_free_coherent, + .map_sg = dma_iommu_map_sg, + .unmap_sg = dma_iommu_unmap_sg, + .dma_supported = dma_iommu_dma_supported, + .map_page = dma_iommu_map_page, + .unmap_page = dma_iommu_unmap_page, .get_required_mask = dma_iommu_get_required_mask, }; EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 10b136afbf50..8593f53c4f6c 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -149,14 +149,14 @@ static inline void dma_direct_sync_single(struct device *dev, #endif struct dma_map_ops dma_direct_ops = { - .alloc_coherent = dma_direct_alloc_coherent, - .free_coherent = dma_direct_free_coherent, - .map_sg = dma_direct_map_sg, - .unmap_sg = dma_direct_unmap_sg, - .dma_supported = dma_direct_dma_supported, - .map_page = dma_direct_map_page, - .unmap_page = dma_direct_unmap_page, - .get_required_mask = dma_direct_get_required_mask, + .alloc_coherent = dma_direct_alloc_coherent, + .free_coherent = dma_direct_free_coherent, + .map_sg = dma_direct_map_sg, + .unmap_sg = dma_direct_unmap_sg, + .dma_supported = dma_direct_dma_supported, + .map_page = dma_direct_map_page, + .unmap_page = dma_direct_unmap_page, + .get_required_mask = dma_direct_get_required_mask, #ifdef CONFIG_NOT_COHERENT_CACHE .sync_single_for_cpu = dma_direct_sync_single, .sync_single_for_device = dma_direct_sync_single, diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index 90ef2a44613b..73110fb6bb6c 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -134,14 +134,14 @@ static u64 ibmebus_dma_get_required_mask(struct device *dev) } static struct dma_map_ops ibmebus_dma_ops = { - .alloc_coherent = ibmebus_alloc_coherent, - .free_coherent = ibmebus_free_coherent, - .map_sg = ibmebus_map_sg, - .unmap_sg = ibmebus_unmap_sg, - .dma_supported = ibmebus_dma_supported, + .alloc_coherent = ibmebus_alloc_coherent, + .free_coherent = ibmebus_free_coherent, + .map_sg = ibmebus_map_sg, + .unmap_sg = ibmebus_unmap_sg, + .dma_supported = ibmebus_dma_supported, .get_required_mask = ibmebus_dma_get_required_mask, - .map_page = ibmebus_map_page, - .unmap_page = ibmebus_unmap_page, + .map_page = ibmebus_map_page, + .unmap_page = ibmebus_unmap_page, }; static int ibmebus_match_path(struct device *dev, void *data) diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index c0493259d133..34d291d83ec8 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -611,13 +611,13 @@ static u64 vio_dma_get_required_mask(struct device *dev) } struct dma_map_ops vio_dma_mapping_ops = { - .alloc_coherent = vio_dma_iommu_alloc_coherent, - .free_coherent = vio_dma_iommu_free_coherent, - .map_sg = vio_dma_iommu_map_sg, - .unmap_sg = vio_dma_iommu_unmap_sg, - .map_page = vio_dma_iommu_map_page, - .unmap_page = vio_dma_iommu_unmap_page, - .dma_supported = vio_dma_iommu_dma_supported, + .alloc_coherent = vio_dma_iommu_alloc_coherent, + .free_coherent = vio_dma_iommu_free_coherent, + .map_sg = vio_dma_iommu_map_sg, + .unmap_sg = vio_dma_iommu_unmap_sg, + .map_page = vio_dma_iommu_map_page, + .unmap_page = vio_dma_iommu_unmap_page, + .dma_supported = vio_dma_iommu_dma_supported, .get_required_mask = vio_dma_get_required_mask, }; -- cgit v1.2.3 From 7df5659eefad9b6d457ccdee016bd78bd064cfc0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Jun 2011 11:45:16 +0000 Subject: serial/8250: Move UPIO_TSI to powerpc This iotype is only used by the legacy_serial code in powerpc, so the code should live there, rather than be compiled in for every 8250 driver. Signed-off-by: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: linuxppc-dev@lists.ozlabs.org Cc: Greg Kroah-Hartman Cc: linux-serial@vger.kernel.org Acked-by: David Daney Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/legacy_serial.c | 25 +++++++++++++++++++++++++ drivers/tty/serial/8250.c | 23 ----------------------- 2 files changed, 25 insertions(+), 23 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 2b97b80d6d7d..c7b5afeecaf2 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,24 @@ static struct __initdata of_device_id legacy_serial_parents[] = { static unsigned int legacy_serial_count; static int legacy_serial_console = -1; +static unsigned int tsi_serial_in(struct uart_port *p, int offset) +{ + unsigned int tmp; + offset = offset << p->regshift; + if (offset == UART_IIR) { + tmp = readl(p->membase + (UART_IIR & ~3)); + return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */ + } else + return readb(p->membase + offset); +} + +static void tsi_serial_out(struct uart_port *p, int offset, int value) +{ + offset = offset << p->regshift; + if (!((offset == UART_IER) && (value & UART_IER_UUE))) + writeb(value, p->membase + offset); +} + static int __init add_legacy_port(struct device_node *np, int want_index, int iotype, phys_addr_t base, phys_addr_t taddr, unsigned long irq, @@ -102,6 +121,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_ports[index].iobase = base; else legacy_serial_ports[index].mapbase = base; + legacy_serial_ports[index].iotype = iotype; legacy_serial_ports[index].uartclk = clock; legacy_serial_ports[index].irq = irq; @@ -112,6 +132,11 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_infos[index].speed = spd ? be32_to_cpup(spd) : 0; legacy_serial_infos[index].irq_check_parent = irq_check_parent; + if (iotype == UPIO_TSI) { + legacy_serial_ports[index].serial_in = tsi_serial_in; + legacy_serial_ports[index].serial_out = tsi_serial_out; + } + printk(KERN_DEBUG "Found legacy serial port %d for %s\n", index, np->full_name); printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", diff --git a/drivers/tty/serial/8250.c b/drivers/tty/serial/8250.c index 7f50999eebc2..610b8e63710d 100644 --- a/drivers/tty/serial/8250.c +++ b/drivers/tty/serial/8250.c @@ -443,24 +443,6 @@ static void au_serial_out(struct uart_port *p, int offset, int value) __raw_writel(value, p->membase + offset); } -static unsigned int tsi_serial_in(struct uart_port *p, int offset) -{ - unsigned int tmp; - offset = map_8250_in_reg(p, offset) << p->regshift; - if (offset == UART_IIR) { - tmp = readl(p->membase + (UART_IIR & ~3)); - return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */ - } else - return readb(p->membase + offset); -} - -static void tsi_serial_out(struct uart_port *p, int offset, int value) -{ - offset = map_8250_out_reg(p, offset) << p->regshift; - if (!((offset == UART_IER) && (value & UART_IER_UUE))) - writeb(value, p->membase + offset); -} - /* Save the LCR value so it can be re-written when a Busy Detect IRQ occurs. */ static inline void dwapb_save_out_value(struct uart_port *p, int offset, int value) @@ -535,11 +517,6 @@ static void set_io_from_upio(struct uart_port *p) p->serial_out = au_serial_out; break; - case UPIO_TSI: - p->serial_in = tsi_serial_in; - p->serial_out = tsi_serial_out; - break; - case UPIO_DWAPB: p->serial_in = mem_serial_in; p->serial_out = dwapb_serial_out; -- cgit v1.2.3 From 41151e77a4d96ea138cede6d84c955aa4769ce74 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Tue, 28 Jun 2011 09:54:48 +0000 Subject: powerpc: Hugetlb for BookE Enable hugepages on Freescale BookE processors. This allows the kernel to use huge TLB entries to map pages, which can greatly reduce the number of TLB misses and the amount of TLB thrashing experienced by applications with large memory footprints. Care should be taken when using this on FSL processors, as the number of large TLB entries supported by the core is low (16-64) on current processors. The supported set of hugepage sizes include 4m, 16m, 64m, 256m, and 1g. Page sizes larger than the max zone size are called "gigantic" pages and must be allocated on the command line (and cannot be deallocated). This is currently only fully implemented for Freescale 32-bit BookE processors, but there is some infrastructure in the code for 64-bit BooKE. Signed-off-by: Becky Bruce Signed-off-by: David Gibson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 3 +- arch/powerpc/include/asm/hugetlb.h | 63 +++++- arch/powerpc/include/asm/mmu-book3e.h | 7 + arch/powerpc/include/asm/mmu-hash64.h | 3 +- arch/powerpc/include/asm/mmu.h | 18 +- arch/powerpc/include/asm/page.h | 31 ++- arch/powerpc/include/asm/page_64.h | 11 - arch/powerpc/include/asm/pte-book3e.h | 3 + arch/powerpc/kernel/head_fsl_booke.S | 133 ++++++++++-- arch/powerpc/mm/Makefile | 1 + arch/powerpc/mm/hash_utils_64.c | 3 - arch/powerpc/mm/hugetlbpage-book3e.c | 121 +++++++++++ arch/powerpc/mm/hugetlbpage.c | 379 +++++++++++++++++++++++++++++---- arch/powerpc/mm/init_32.c | 9 + arch/powerpc/mm/mem.c | 5 + arch/powerpc/mm/mmu_context_nohash.c | 5 + arch/powerpc/mm/pgtable.c | 3 +- arch/powerpc/mm/tlb_low_64e.S | 24 +-- arch/powerpc/mm/tlb_nohash.c | 46 +++- arch/powerpc/platforms/Kconfig.cputype | 4 +- 20 files changed, 766 insertions(+), 106 deletions(-) create mode 100644 arch/powerpc/mm/hugetlbpage-book3e.c (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0a3d5560c9be..62711421cd64 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -429,8 +429,7 @@ config ARCH_POPULATES_NODE_MAP def_bool y config SYS_SUPPORTS_HUGETLBFS - def_bool y - depends on PPC_BOOK3S_64 + bool source "mm/Kconfig" diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 5856a66ab404..86004930a78e 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -1,15 +1,60 @@ #ifndef _ASM_POWERPC_HUGETLB_H #define _ASM_POWERPC_HUGETLB_H +#ifdef CONFIG_HUGETLB_PAGE #include +extern struct kmem_cache *hugepte_cache; +extern void __init reserve_hugetlb_gpages(void); + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!hugepd_ok(hpd)); + return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | PD_HUGE); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return hpd.pd & HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, + unsigned pdshift) +{ + /* + * On 32-bit, we have multiple higher-level table entries that point to + * the same hugepte. Just use the first one since they're all + * identical. So for that case, idx=0. + */ + unsigned long idx = 0; + + pte_t *dir = hugepd_page(*hpdp); +#ifdef CONFIG_PPC64 + idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); +#endif + + return dir + idx; +} + pte_t *huge_pte_offset_and_shift(struct mm_struct *mm, unsigned long addr, unsigned *shift); void flush_dcache_icache_hugepage(struct page *page); +#if defined(CONFIG_PPC_MM_SLICES) || defined(CONFIG_PPC_SUBPAGE_PROT) int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); +#else +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + return 0; +} +#endif + +void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte); +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, @@ -50,8 +95,11 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); - return __pte(old); +#ifdef CONFIG_PPC64 + return __pte(pte_update(mm, addr, ptep, ~0UL, 1)); +#else + return __pte(pte_update(ptep, ~0UL, 0)); +#endif } static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -93,4 +141,15 @@ static inline void arch_release_hugepage(struct page *page) { } +#else /* ! CONFIG_HUGETLB_PAGE */ +static inline void reserve_hugetlb_gpages(void) +{ + pr_err("Cannot reserve gpages without hugetlb enabled\n"); +} +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ +} +#endif + #endif /* _ASM_POWERPC_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 3ea0f9a259d8..0260ea5ec3c2 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -66,6 +66,7 @@ #define MAS2_M 0x00000004 #define MAS2_G 0x00000002 #define MAS2_E 0x00000001 +#define MAS2_WIMGE_MASK 0x0000001f #define MAS2_EPN_MASK(size) (~0 << (size + 10)) #define MAS2_VAL(addr, size, flags) ((addr) & MAS2_EPN_MASK(size) | (flags)) @@ -80,6 +81,7 @@ #define MAS3_SW 0x00000004 #define MAS3_UR 0x00000002 #define MAS3_SR 0x00000001 +#define MAS3_BAP_MASK 0x0000003f #define MAS3_SPSIZE 0x0000003e #define MAS3_SPSIZE_SHIFT 1 @@ -212,6 +214,11 @@ typedef struct { unsigned int id; unsigned int active; unsigned long vdso_base; +#ifdef CONFIG_PPC_MM_SLICES + u64 low_slices_psize; /* SLB page size encodings */ + u64 high_slices_psize; /* 4 bits per slice for now */ + u16 user_psize; /* page size index */ +#endif } mm_context_t; /* Page size definitions, common between 32 and 64-bit diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index b445e0af4c2b..db645ec842bd 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -262,8 +262,7 @@ extern void hash_failure_debug(unsigned long ea, unsigned long access, extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long pstart, unsigned long prot, int psize, int ssize); -extern void add_gpage(unsigned long addr, unsigned long page_size, - unsigned long number_of_pages); +extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); extern void hpte_init_native(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 698b30638681..f0145522cfba 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -175,14 +175,16 @@ extern u64 ppc64_rma_size; #define MMU_PAGE_64K_AP 3 /* "Admixed pages" (hash64 only) */ #define MMU_PAGE_256K 4 #define MMU_PAGE_1M 5 -#define MMU_PAGE_8M 6 -#define MMU_PAGE_16M 7 -#define MMU_PAGE_256M 8 -#define MMU_PAGE_1G 9 -#define MMU_PAGE_16G 10 -#define MMU_PAGE_64G 11 -#define MMU_PAGE_COUNT 12 - +#define MMU_PAGE_4M 6 +#define MMU_PAGE_8M 7 +#define MMU_PAGE_16M 8 +#define MMU_PAGE_64M 9 +#define MMU_PAGE_256M 10 +#define MMU_PAGE_1G 11 +#define MMU_PAGE_16G 12 +#define MMU_PAGE_64G 13 + +#define MMU_PAGE_COUNT 14 #if defined(CONFIG_PPC_STD_MMU_64) /* 64-bit classic hash table MMU */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 2cd664ef0a5e..dd9c4fd038e0 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -36,6 +36,18 @@ #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +extern unsigned int HPAGE_SHIFT; +#else +#define HPAGE_SHIFT PAGE_SHIFT +#endif +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1) +#endif + /* We do define AT_SYSINFO_EHDR but don't use the gate mechanism */ #define __HAVE_ARCH_GATE_AREA 1 @@ -158,6 +170,24 @@ extern phys_addr_t kernstart_addr; #define is_kernel_addr(x) ((x) >= PAGE_OFFSET) #endif +/* + * Use the top bit of the higher-level page table entries to indicate whether + * the entries we point to contain hugepages. This works because we know that + * the page tables live in kernel space. If we ever decide to support having + * page tables at arbitrary addresses, this breaks and will have to change. + */ +#ifdef CONFIG_PPC64 +#define PD_HUGE 0x8000000000000000 +#else +#define PD_HUGE 0x80000000 +#endif + +/* + * Some number of bits at the level of the page table that points to + * a hugepte are used to encode the size. This masks those bits. + */ +#define HUGEPD_SHIFT_MASK 0x3f + #ifndef __ASSEMBLY__ #undef STRICT_MM_TYPECHECKS @@ -243,7 +273,6 @@ typedef unsigned long pgprot_t; #endif typedef struct { signed long pd; } hugepd_t; -#define HUGEPD_SHIFT_MASK 0x3f #ifdef CONFIG_HUGETLB_PAGE static inline int hugepd_ok(hugepd_t hpd) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 9356262fd3cc..fb40ede6bc0d 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -64,17 +64,6 @@ extern void copy_page(void *to, void *from); /* Log 2 of page table size */ extern u64 ppc64_pft_size; -/* Large pages size */ -#ifdef CONFIG_HUGETLB_PAGE -extern unsigned int HPAGE_SHIFT; -#else -#define HPAGE_SHIFT PAGE_SHIFT -#endif -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) -#define HPAGE_MASK (~(HPAGE_SIZE - 1)) -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1) - #endif /* __ASSEMBLY__ */ #ifdef CONFIG_PPC_MM_SLICES diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 082d515930a2..0156702ba24e 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -72,6 +72,9 @@ #define PTE_RPN_SHIFT (24) #endif +#define PTE_WIMGE_SHIFT (19) +#define PTE_BAP_SHIFT (2) + /* On 32-bit, we never clear the top part of the PTE */ #ifdef CONFIG_PPC32 #define _PTE_NONE_MASK 0xffffffff00000000ULL diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 50845924b7d9..4ea9bfbf67e9 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -236,8 +236,24 @@ _ENTRY(__early_start) * if we find the pte (fall through): * r11 is low pte word * r12 is pointer to the pte + * r10 is the pshift from the PGD, if we're a hugepage */ #ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_HUGETLB_PAGE +#define FIND_PTE \ + rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ + lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ + blt 1000f; /* Normal non-huge page */ \ + beq 2f; /* Bail if no table */ \ + oris r11, r11, PD_HUGE@h; /* Put back address bit */ \ + andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \ + xor r12, r10, r11; /* drop size bits from pointer */ \ + b 1001f; \ +1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + li r10, 0; /* clear r10 */ \ +1001: lwz r11, 4(r12); /* Get pte entry */ +#else #define FIND_PTE \ rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ @@ -245,7 +261,8 @@ _ENTRY(__early_start) beq 2f; /* Bail if no table */ \ rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ lwz r11, 4(r12); /* Get pte entry */ -#else +#endif /* HUGEPAGE */ +#else /* !PTE_64BIT */ #define FIND_PTE \ rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ lwz r11, 0(r11); /* Get L1 entry */ \ @@ -402,8 +419,8 @@ interrupt_base: #ifdef CONFIG_PTE_64BIT #ifdef CONFIG_SMP - subf r10,r11,r12 /* create false data dep */ - lwzx r13,r11,r10 /* Get upper pte bits */ + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ #else lwz r13,0(r12) /* Get upper pte bits */ #endif @@ -483,8 +500,8 @@ interrupt_base: #ifdef CONFIG_PTE_64BIT #ifdef CONFIG_SMP - subf r10,r11,r12 /* create false data dep */ - lwzx r13,r11,r10 /* Get upper pte bits */ + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ #else lwz r13,0(r12) /* Get upper pte bits */ #endif @@ -548,7 +565,7 @@ interrupt_base: /* * Both the instruction and data TLB miss get to this * point to load the TLB. - * r10 - available to use + * r10 - tsize encoding (if HUGETLB_PAGE) or available to use * r11 - TLB (info from Linux PTE) * r12 - available to use * r13 - upper bits of PTE (if PTE_64BIT) or available to use @@ -558,21 +575,73 @@ interrupt_base: * Upon exit, we reload everything and RFI. */ finish_tlb_load: +#ifdef CONFIG_HUGETLB_PAGE + cmpwi 6, r10, 0 /* check for huge page */ + beq 6, finish_tlb_load_cont /* !huge */ + + /* Alas, we need more scratch registers for hugepages */ + mfspr r12, SPRN_SPRG_THREAD + stw r14, THREAD_NORMSAVE(4)(r12) + stw r15, THREAD_NORMSAVE(5)(r12) + stw r16, THREAD_NORMSAVE(6)(r12) + stw r17, THREAD_NORMSAVE(7)(r12) + + /* Get the next_tlbcam_idx percpu var */ +#ifdef CONFIG_SMP + lwz r12, THREAD_INFO-THREAD(r12) + lwz r15, TI_CPU(r12) + lis r14, __per_cpu_offset@h + ori r14, r14, __per_cpu_offset@l + rlwinm r15, r15, 2, 0, 29 + lwzx r16, r14, r15 +#else + li r16, 0 +#endif + lis r17, next_tlbcam_idx@h + ori r17, r17, next_tlbcam_idx@l + add r17, r17, r16 /* r17 = *next_tlbcam_idx */ + lwz r15, 0(r17) /* r15 = next_tlbcam_idx */ + + lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */ + rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */ + mtspr SPRN_MAS0, r14 + + /* Extract TLB1CFG(NENTRY) */ + mfspr r16, SPRN_TLB1CFG + andi. r16, r16, 0xfff + + /* Update next_tlbcam_idx, wrapping when necessary */ + addi r15, r15, 1 + cmpw r15, r16 + blt 100f + lis r14, tlbcam_index@h + ori r14, r14, tlbcam_index@l + lwz r15, 0(r14) +100: stw r15, 0(r17) + + /* + * Calc MAS1_TSIZE from r10 (which has pshift encoded) + * tlb_enc = (pshift - 10). + */ + subi r15, r10, 10 + mfspr r16, SPRN_MAS1 + rlwimi r16, r15, 7, 20, 24 + mtspr SPRN_MAS1, r16 + + /* copy the pshift for use later */ + mr r14, r10 + + /* fall through */ + +#endif /* CONFIG_HUGETLB_PAGE */ + /* * We set execute, because we don't have the granularity to * properly set this at the page level (Linux problem). * Many of these bits are software only. Bits we don't set * here we (properly should) assume have the appropriate value. */ - - mfspr r12, SPRN_MAS2 -#ifdef CONFIG_PTE_64BIT - rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ -#else - rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ -#endif - mtspr SPRN_MAS2, r12 - +finish_tlb_load_cont: #ifdef CONFIG_PTE_64BIT rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */ andi. r10, r11, _PAGE_DIRTY @@ -581,22 +650,40 @@ finish_tlb_load: andc r12, r12, r10 1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */ rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */ - mtspr SPRN_MAS3, r12 +2: mtspr SPRN_MAS3, r12 BEGIN_MMU_FTR_SECTION srwi r10, r13, 12 /* grab RPN[12:31] */ mtspr SPRN_MAS7, r10 END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) #else li r10, (_PAGE_EXEC | _PAGE_PRESENT) + mr r13, r11 rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ slwi r10, r12, 1 or r10, r10, r12 iseleq r12, r12, r10 - rlwimi r11, r12, 0, 20, 31 /* Extract RPN from PTE and merge with perms */ - mtspr SPRN_MAS3, r11 + rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ + mtspr SPRN_MAS3, r13 #endif + + mfspr r12, SPRN_MAS2 +#ifdef CONFIG_PTE_64BIT + rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ +#else + rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ +#endif +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 3f /* don't mask if page isn't huge */ + li r13, 1 + slw r13, r13, r14 + subi r13, r13, 1 + rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */ + andc r12, r12, r13 /* mask off ea bits within the page */ +#endif +3: mtspr SPRN_MAS2, r12 + #ifdef CONFIG_E200 /* Round robin TLB1 entries assignment */ mfspr r12, SPRN_MAS0 @@ -622,11 +709,19 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) mtspr SPRN_MAS0,r12 #endif /* CONFIG_E200 */ +tlb_write_entry: tlbwe /* Done...restore registers and get out of here. */ mfspr r10, SPRN_SPRG_THREAD - lwz r11, THREAD_NORMSAVE(3)(r10) +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 8f /* skip restore for 4k page faults */ + lwz r14, THREAD_NORMSAVE(4)(r10) + lwz r15, THREAD_NORMSAVE(5)(r10) + lwz r16, THREAD_NORMSAVE(6)(r10) + lwz r17, THREAD_NORMSAVE(7)(r10) +#endif +8: lwz r11, THREAD_NORMSAVE(3)(r10) mtcr r11 lwz r13, THREAD_NORMSAVE(2)(r10) lwz r12, THREAD_NORMSAVE(1)(r10) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index bdca46e08382..991ee813d2a8 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-y += hugetlbpage.o obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 26b2872b3d00..1f8b2a05e3d0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -105,9 +105,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_HUGETLB_PAGE -unsigned int HPAGE_SHIFT; -#endif #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c new file mode 100644 index 000000000000..1295b7c1cdac --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -0,0 +1,121 @@ +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include +#include + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { + asm volatile( + "li %0,0\n" + "tlbsx. 0,%1\n" + "bne 1f\n" + "li %0,1\n" + "1:\n" + : "=&r"(found) : "r"(ea)); + } else { + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + } + + return found; +} + +void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int index, lz, ncams; + struct vm_area_struct *vma; +#endif + + if (unlikely(is_kernel_addr(ea))) + return; + +#ifdef CONFIG_MM_SLICES + psize = mmu_get_tsize(get_slice_psize(mm, ea)); + tsize = mmu_get_psize(psize); + shift = mmu_psize_defs[psize].shift; +#else + vma = find_vma(mm, ea); + psize = vma_mmu_pagesize(vma); /* returns actual size in bytes */ + asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (psize)); + shift = 31 - lz; + tsize = 21 - lz; +#endif + + /* + * We can't be interrupted while we're setting up the MAS + * regusters or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = __get_cpu_var(next_tlbcam_idx); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __get_cpu_var(next_tlbcam_idx) = tlbcam_index; + else + __get_cpu_var(next_tlbcam_idx)++; +#endif + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0); + +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0b9a5c1901b9..3a5f59dcbb33 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1,7 +1,8 @@ /* - * PPC64 (POWER4) Huge TLB Page Support for Kernel. + * PPC Huge TLB Page Support for Kernel. * * Copyright (C) 2003 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor * * Based on the IA-32 version: * Copyright (C) 2002, Rohit Seth @@ -11,24 +12,39 @@ #include #include #include +#include +#include +#include #include #include #include +#include #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 #define PAGE_SHIFT_16G 34 -#define MAX_NUMBER_GPAGES 1024 +unsigned int HPAGE_SHIFT; -/* Tracks the 16G pages after the device tree is scanned and before the - * huge_boot_pages list is ready. */ -static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; +/* + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready. On 64-bit implementations, this is + * just used to track 16G pages and so is a single array. 32-bit + * implementations may have more than one gpage size due to limitations + * of the memory allocators, so we need multiple arrays + */ +#ifdef CONFIG_PPC64 +#define MAX_NUMBER_GPAGES 1024 +static u64 gpage_freearray[MAX_NUMBER_GPAGES]; static unsigned nr_gpages; - -/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() - * will choke on pointers to hugepte tables, which is handy for - * catching screwups early. */ +#else +#define MAX_NUMBER_GPAGES 128 +struct psize_gpages { + u64 gpage_list[MAX_NUMBER_GPAGES]; + unsigned int nr_gpages; +}; +static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; +#endif static inline int shift_to_mmu_psize(unsigned int shift) { @@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) #define hugepd_none(hpd) ((hpd).pd == 0) -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return hpd.pd & HUGEPD_SHIFT_MASK; -} - -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) -{ - unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); - pte_t *dir = hugepd_page(*hpdp); - - return dir + idx; -} - pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) { pgd_t *pg; @@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift if (is_hugepd(pm)) hpdp = (hugepd_t *)pm; else if (!pmd_none(*pm)) { - return pte_offset_map(pm, ea); + return pte_offset_kernel(pm, ea); } } } @@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, unsigned long address, unsigned pdshift, unsigned pshift) { - pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), - GFP_KERNEL|__GFP_REPEAT); + struct kmem_cache *cachep; + pte_t *new; + +#ifdef CONFIG_PPC64 + cachep = PGT_CACHE(pdshift - pshift); +#else + int i; + int num_hugepd = 1 << (pshift - pdshift); + cachep = hugepte_cache; +#endif + + new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, return -ENOMEM; spin_lock(&mm->page_table_lock); +#ifdef CONFIG_PPC64 if (!hugepd_none(*hpdp)) - kmem_cache_free(PGT_CACHE(pdshift - pshift), new); + kmem_cache_free(cachep, new); else - hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; +#else + /* + * We have multiple higher-level entries that point to the same + * actual pte location. Fill in each as we go and backtrack on error. + * We need all of these so the DTLB pgtable walk code can find the + * right higher-level entry without knowing if it's a hugepage or not. + */ + for (i = 0; i < num_hugepd; i++, hpdp++) { + if (unlikely(!hugepd_none(*hpdp))) + break; + else + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; + } + /* If we bailed from the for loop early, an error occurred, clean up */ + if (i < num_hugepd) { + for (i = i - 1 ; i >= 0; i--, hpdp--) + hpdp->pd = 0; + kmem_cache_free(cachep, new); + } +#endif spin_unlock(&mm->page_table_lock); return 0; } @@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz return hugepte_offset(hpdp, addr, pdshift); } +#ifdef CONFIG_PPC32 /* Build list of addresses of gigantic pages. This function is used in early * boot before the buddy or bootmem allocator is setup. */ -void add_gpage(unsigned long addr, unsigned long page_size, - unsigned long number_of_pages) +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); + int i; + + if (addr == 0) + return; + + gpage_freearray[idx].nr_gpages = number_of_pages; + + for (i = 0; i < number_of_pages; i++) { + gpage_freearray[idx].gpage_list[i] = addr; + addr += page_size; + } +} + +/* + * Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT); + int nr_gpages = gpage_freearray[idx].nr_gpages; + + if (nr_gpages == 0) + return 0; + +#ifdef CONFIG_HIGHMEM + /* + * If gpages can be in highmem we can't use the trick of storing the + * data structure in the page; allocate space for this + */ + m = alloc_bootmem(sizeof(struct huge_bootmem_page)); + m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; +#else + m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); +#endif + + list_add(&m->list, &huge_boot_pages); + gpage_freearray[idx].nr_gpages = nr_gpages; + gpage_freearray[idx].gpage_list[nr_gpages] = 0; + m->hstate = hstate; + + return 1; +} +/* + * Scan the command line hugepagesz= options for gigantic pages; store those in + * a list that we use to allocate the memory once all options are parsed. + */ + +unsigned long gpage_npages[MMU_PAGE_COUNT]; + +static int __init do_gpage_early_setup(char *param, char *val) +{ + static phys_addr_t size; + unsigned long npages; + + /* + * The hugepagesz and hugepages cmdline options are interleaved. We + * use the size variable to keep track of whether or not this was done + * properly and skip over instances where it is incorrect. Other + * command-line parsing code will issue warnings, so we don't need to. + * + */ + if ((strcmp(param, "default_hugepagesz") == 0) || + (strcmp(param, "hugepagesz") == 0)) { + size = memparse(val, NULL); + } else if (strcmp(param, "hugepages") == 0) { + if (size != 0) { + if (sscanf(val, "%lu", &npages) <= 0) + npages = 0; + gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; + size = 0; + } + } + return 0; +} + + +/* + * This function allocates physical space for pages that are larger than the + * buddy allocator can handle. We want to allocate these in highmem because + * the amount of lowmem is limited. This means that this function MUST be + * called before lowmem_end_addr is set up in MMU_init() in order for the lmb + * allocate to grab highmem. + */ +void __init reserve_hugetlb_gpages(void) +{ + static __initdata char cmdline[COMMAND_LINE_SIZE]; + phys_addr_t size, base; + int i; + + strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup); + + /* + * Walk gpage list in reverse, allocating larger page sizes first. + * Skip over unsupported sizes, or sizes that have 0 gpages allocated. + * When we reach the point in the list where pages are no longer + * considered gpages, we're done. + */ + for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { + if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) + continue; + else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) + break; + + size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); + base = memblock_alloc_base(size * gpage_npages[i], size, + MEMBLOCK_ALLOC_ANYWHERE); + add_gpage(base, size, gpage_npages[i]); + } +} + +#else /* PPC64 */ + +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate) m->hstate = hstate; return 1; } +#endif int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } +#ifdef CONFIG_PPC32 +#define HUGEPD_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) + +struct hugepd_freelist { + struct rcu_head rcu; + unsigned int index; + void *ptes[0]; +}; + +static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); + +static void hugepd_free_rcu_callback(struct rcu_head *head) +{ + struct hugepd_freelist *batch = + container_of(head, struct hugepd_freelist, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + kmem_cache_free(hugepte_cache, batch->ptes[i]); + + free_page((unsigned long)batch); +} + +static void hugepd_free(struct mmu_gather *tlb, void *hugepte) +{ + struct hugepd_freelist **batchp; + + batchp = &__get_cpu_var(hugepd_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpumask_equal(mm_cpumask(tlb->mm), + cpumask_of(smp_processor_id()))) { + kmem_cache_free(hugepte_cache, hugepte); + return; + } + + if (*batchp == NULL) { + *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); + (*batchp)->index = 0; + } + + (*batchp)->ptes[(*batchp)->index++] = hugepte; + if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { + call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); + *batchp = NULL; + } +} +#endif + static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, unsigned long start, unsigned long end, unsigned long floor, unsigned long ceiling) { pte_t *hugepte = hugepd_page(*hpdp); - unsigned shift = hugepd_shift(*hpdp); + int i; + unsigned long pdmask = ~((1UL << pdshift) - 1); + unsigned int num_hugepd = 1; + +#ifdef CONFIG_PPC64 + unsigned int shift = hugepd_shift(*hpdp); +#else + /* Note: On 32-bit the hpdp may be the first of several */ + num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift)); +#endif start &= pdmask; if (start < floor) @@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (end - 1 > ceiling - 1) return; - hpdp->pd = 0; + for (i = 0; i < num_hugepd; i++, hpdp++) + hpdp->pd = 0; + tlb->need_flush = 1; +#ifdef CONFIG_PPC64 pgtable_free_tlb(tlb, hugepte, pdshift - shift); +#else + hugepd_free(tlb, hugepte); +#endif } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, * too. */ - pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); + pgd = pgd_offset(tlb->mm, addr); if (!is_hugepd(pgd)) { if (pgd_none_or_clear_bad(pgd)) continue; hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } else { +#ifdef CONFIG_PPC32 + /* + * Increment next by the size of the huge mapping since + * on 32-bit there may be more than one entry at the pgd + * level for a single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); +#endif free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, addr, next, floor, ceiling); } - } while (pgd++, addr = next, addr != end); + } while (addr = next, addr != end); } struct page * @@ -466,17 +690,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { +#ifdef CONFIG_MM_SLICES struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); +#else + return get_unmapped_area(file, addr, len, pgoff, flags); +#endif } unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { +#ifdef CONFIG_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); +#else + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + return huge_page_size(hstate_vma(vma)); +#endif +} + +static inline bool is_power_of_4(unsigned long x) +{ + if (is_power_of_2(x)) + return (__ilog2(x) % 2) ? false : true; + return false; } static int __init add_huge_page_size(unsigned long long size) @@ -486,9 +728,14 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if ((size < PAGE_SIZE) || !is_power_of_4(size)) + return -EINVAL; +#else if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) return -EINVAL; +#endif if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) return -EINVAL; @@ -525,6 +772,46 @@ static int __init hugepage_setup_sz(char *str) } __setup("hugepagesz=", hugepage_setup_sz); +#ifdef CONFIG_FSL_BOOKE +struct kmem_cache *hugepte_cache; +static int __init hugetlbpage_init(void) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + + if (!mmu_psize_defs[psize].shift) + continue; + + shift = mmu_psize_to_shift(psize); + + /* Don't treat normal page sizes as huge... */ + if (shift != PAGE_SHIFT) + if (add_huge_page_size(1ULL << shift) < 0) + continue; + } + + /* + * Create a kmem cache for hugeptes. The bottom bits in the pte have + * size information encoded in them, so align them to allow this + */ + hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t), + HUGEPD_SHIFT_MASK + 1, 0, NULL); + if (hugepte_cache == NULL) + panic("%s: Unable to create kmem cache for hugeptes\n", + __func__); + + /* Default hpage size = 4M */ + if (mmu_psize_defs[MMU_PAGE_4M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; + else + panic("%s: Unable to set default huge page size\n", __func__); + + + return 0; +} +#else static int __init hugetlbpage_init(void) { int psize; @@ -567,15 +854,23 @@ static int __init hugetlbpage_init(void) return 0; } - +#endif module_init(hugetlbpage_init); void flush_dcache_icache_hugepage(struct page *page) { int i; + void *start; BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) - __flush_dcache_icache(page_address(page+i)); + for (i = 0; i < (1UL << compound_order(page)); i++) { + if (!PageHighMem(page)) { + __flush_dcache_icache(page_address(page+i)); + } else { + start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE); + __flush_dcache_icache(start); + kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + } + } } diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index c77fef56dad6..161cefde5c15 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -123,6 +126,12 @@ void __init MMU_init(void) /* parse args from command line */ MMU_setup(); + /* + * Reserve gigantic pages for hugetlb. This MUST occur before + * lowmem_end_addr is initialized below. + */ + reserve_hugetlb_gpages(); + if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock.memory.cnt = 1; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index c781bbcf7338..ad9cf49dfb89 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -548,4 +548,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ +#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ + && defined(CONFIG_HUGETLB_PAGE) + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma->vm_mm, address, *ptep); +#endif } diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 336807de550e..5b63bd3da4a9 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -292,6 +292,11 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; +#ifdef CONFIG_PPC_MM_SLICES + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); +#endif + return 0; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index af40c8768a78..214130a4edc6 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { - if (!(vma->vm_flags & VM_HUGETLB)) + if (!is_vm_hugetlb_page(vma)) assert_pte_locked(vma->vm_mm, address); __ptep_set_access_flags(ptep, entry); flush_tlb_page_nohash(vma, address); diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 4ebb34bc01d6..dc4a5f385e41 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -553,24 +553,24 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault #ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault #endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq virt_page_table_tlb_miss_fault + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault /* Ok, we're all right, we can now create a kernel translation for * a 4K or 64K page from r16 -> r15. @@ -802,24 +802,24 @@ htw_tlb_miss: rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault #ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault #endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 - cmpldi cr0,r15,0 - beq htw_tlb_miss_fault + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault /* Ok, we're all right, we can now create an indirect entry for * a 1M or 256M page. diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d32ec643c231..afc95c7304ae 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -36,14 +36,49 @@ #include #include #include +#include #include #include #include +#include #include "mmu_decl.h" -#ifdef CONFIG_PPC_BOOK3E +/* + * This struct lists the sw-supported page sizes. The hardawre MMU may support + * other sizes not listed here. The .ind field is only used on MMUs that have + * indirect page table entries. + */ +#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_FSL_BOOKE +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_4M] = { + .shift = 22, + .enc = BOOK3E_PAGESZ_4M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_64M] = { + .shift = 26, + .enc = BOOK3E_PAGESZ_64M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#else struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -77,6 +112,8 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .enc = BOOK3E_PAGESZ_1GB, }, }; +#endif /* CONFIG_FSL_BOOKE */ + static inline int mmu_get_tsize(int psize) { return mmu_psize_defs[psize].enc; @@ -87,7 +124,7 @@ static inline int mmu_get_tsize(int psize) /* This isn't used on !Book3E for now */ return 0; } -#endif +#endif /* CONFIG_PPC_BOOK3E_MMU */ /* The variables below are currently only used on 64-bit Book3E * though this will probably be made common with other nohash @@ -266,6 +303,11 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, mmu_get_tsize(mmu_virtual_psize), 0); } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index e06e39589a09..a85990c886e9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -69,6 +69,7 @@ config PPC_BOOK3S_64 bool "Server processors" select PPC_FPU select PPC_HAVE_PMU_SUPPORT + select SYS_SUPPORTS_HUGETLBFS config PPC_BOOK3E_64 bool "Embedded processors" @@ -173,6 +174,7 @@ config BOOKE config FSL_BOOKE bool depends on (E200 || E500) && PPC32 + select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT default y # this is for common code between PPC32 & PPC64 FSL BOOKE @@ -296,7 +298,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES) + default y if (PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) default n config VIRT_CPU_ACCOUNTING -- cgit v1.2.3 From 6dece0eb69b2a28e18d104bc5d707f1cb673f5e0 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 25 Jul 2011 11:29:33 +0000 Subject: powerpc/32: Pass device tree address as u64 to machine_init u64 is used rather than phys_addr_t to keep things simple, as this is called from assembly code. Update callers to pass a 64-bit address in r3/r4. Other unused register assignments that were once parameters to machine_init are dropped. For FSL BookE, look up the physical address of the device tree from the effective address passed in r3 by the loader. This is required for situations where memory does not start at zero (due to AMP or IOMMU-less virtualization), and thus the IMA doesn't start at zero, and thus the device tree effective address does not equal the physical address. Signed-off-by: Scott Wood Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/head_32.S | 7 +++--- arch/powerpc/kernel/head_40x.S | 15 +++---------- arch/powerpc/kernel/head_44x.S | 16 +++----------- arch/powerpc/kernel/head_8xx.S | 13 +++-------- arch/powerpc/kernel/head_fsl_booke.S | 42 ++++++++++++++++++++++-------------- arch/powerpc/kernel/setup_32.c | 2 +- 6 files changed, 39 insertions(+), 56 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index ba250d505e07..0654dba2c1f1 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -139,8 +139,7 @@ __start: trap #endif /* CONFIG_PPC_PMAC */ -1: mr r31,r3 /* save parameters */ - mr r30,r4 +1: mr r31,r3 /* save device tree ptr */ li r24,0 /* cpu # */ /* @@ -964,8 +963,8 @@ start_here: * Do early platform-specific initialization, * and set up the MMU. */ - mr r3,r31 - mr r4,r30 + li r3,0 + mr r4,r31 bl machine_init bl __save_cpu_setup bl MMU_init diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index a91626d87fc9..872a6af83bad 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -58,13 +58,7 @@ _ENTRY(_stext); _ENTRY(_start); - /* Save parameters we are passed. - */ - mr r31,r3 - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 + mr r31,r3 /* save device tree ptr */ /* We have to turn on the MMU right away so we get cache modes * set correctly. @@ -849,11 +843,8 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. */ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + li r3,0 + mr r4,r31 bl machine_init bl MMU_init diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index f8e971ba94f5..b725dab0f88a 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -61,14 +61,7 @@ _ENTRY(_start); * of abatron_pteptrs */ nop -/* - * Save parameters we are passed - */ - mr r31,r3 - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 + mr r31,r3 /* save device tree ptr */ li r24,0 /* CPU number */ bl init_cpu_state @@ -120,11 +113,8 @@ _ENTRY(_start); /* * Decide what sort of machine this is and initialize the MMU. */ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + li r3,0 + mr r4,r31 bl machine_init bl MMU_init diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 1cbf64e6b416..b68cb173ba2c 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -76,11 +76,7 @@ _ENTRY(_start); */ .globl __start __start: - mr r31,r3 /* save parameters */ - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 + mr r31,r3 /* save device tree ptr */ /* We have to turn on the MMU right away so we get cache modes * set correctly. @@ -723,11 +719,8 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. */ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + li r3,0 + mr r4,r31 bl machine_init bl MMU_init diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 4ea9bfbf67e9..e1c699f3b7a7 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -63,17 +63,30 @@ _ENTRY(_start); * of abatron_pteptrs */ nop -/* - * Save parameters we are passed - */ - mr r31,r3 - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 - li r25,0 /* phys kernel start (low) */ - li r24,0 /* CPU number */ - li r23,0 /* phys kernel start (high) */ + + /* Translate device tree address to physical, save in r30/r31 */ + mfmsr r16 + mfspr r17,SPRN_PID + rlwinm r17,r17,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + rlwimi r17,r16,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */ + mtspr SPRN_MAS6,r17 + + tlbsx 0,r3 /* must succeed */ + + mfspr r16,SPRN_MAS1 + mfspr r20,SPRN_MAS3 + rlwinm r17,r16,25,0x1f /* r17 = log2(page size) */ + li r18,1024 + slw r18,r18,r17 /* r18 = page size */ + addi r18,r18,-1 + and r19,r3,r18 /* r19 = page offset */ + andc r31,r20,r18 /* r3 = page base */ + or r31,r31,r19 /* r3 = devtree phys addr */ + mfspr r30,SPRN_MAS7 + + li r25,0 /* phys kernel start (low) */ + li r24,0 /* CPU number */ + li r23,0 /* phys kernel start (high) */ /* We try to not make any assumptions about how the boot loader * setup or used the TLBs. We invalidate all mappings from the @@ -198,11 +211,8 @@ _ENTRY(__early_start) /* * Decide what sort of machine this is and initialize the MMU. */ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + mr r3,r30 + mr r4,r31 bl machine_init bl MMU_init diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 209135af0a40..c1ce86357ecb 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -117,7 +117,7 @@ notrace unsigned long __init early_init(unsigned long dt_ptr) * This is called very early on the boot process, after a minimal * MMU environment has been set up but before MMU_init is called. */ -notrace void __init machine_init(unsigned long dt_ptr) +notrace void __init machine_init(u64 dt_ptr) { lockdep_init(); -- cgit v1.2.3 From 0330581ab3b9002d55ee66f377ccbbb742175c01 Mon Sep 17 00:00:00 2001 From: Tang Yuantian Date: Tue, 16 Aug 2011 19:51:33 +0000 Subject: powerpc/mm: Fix the call trace when resumed from hibernation In SMP mode, the kernel would produce call trace when resumed from hibernation. The reason is when the function destroy_context is called to drop the resuming mm context, the mm->context.active is 1 which is wrong and should be zero. We pass the current->active_mm as previous mm context to function switch_mmu_context to decrease the context.active by 1. In UP mode, there is no effect. Signed-off-by: Tang Yuantian Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/swsusp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index aa17b76dd427..641f9adc6205 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -33,6 +33,6 @@ void save_processor_state(void) void restore_processor_state(void) { #ifdef CONFIG_PPC32 - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(current->active_mm, current->active_mm); #endif } -- cgit v1.2.3 From c26afe9e8591f306d79aab8071f1d34e4f60b700 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 31 Aug 2011 06:32:26 +0000 Subject: powerpc/ps3: Add gelic udbg driver Add a new udbg driver for the PS3 gelic Ehthernet device. This driver shares only a few stucture and constant definitions with the gelic Ethernet device driver, so is implemented as a stand-alone driver with no dependencies on the gelic Ethernet device driver. Signed-off-by: Hector Martin Signed-off-by: Andre Heider Signed-off-by: Geoff Levand Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig.debug | 8 + arch/powerpc/include/asm/udbg.h | 1 + arch/powerpc/kernel/udbg.c | 2 + arch/powerpc/platforms/ps3/Kconfig | 12 ++ arch/powerpc/platforms/ps3/Makefile | 1 + arch/powerpc/platforms/ps3/gelic_udbg.c | 273 ++++++++++++++++++++++++++++++++ drivers/net/ps3_gelic_net.c | 3 + drivers/net/ps3_gelic_net.h | 6 + 8 files changed, 306 insertions(+) create mode 100644 arch/powerpc/platforms/ps3/gelic_udbg.c (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 067cb8480747..ab2335f160a0 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -258,6 +258,14 @@ config PPC_EARLY_DEBUG_WSP depends on PPC_WSP select PPC_UDBG_16550 +config PPC_EARLY_DEBUG_PS3GELIC + bool "Early debugging through the PS3 Ethernet port" + depends on PPC_PS3 + select PS3GELIC_UDBG + help + Select this to enable early debugging for the PlayStation3 via + UDP broadcasts sent out through the Ethernet port. + endchoice config PPC_EARLY_DEBUG_HVSI_VTERMNO diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 93e05d1b34b2..7cf796fa03f2 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -54,6 +54,7 @@ extern void __init udbg_init_40x_realmode(void); extern void __init udbg_init_cpm(void); extern void __init udbg_init_usbgecko(void); extern void __init udbg_init_wsp(void); +extern void __init udbg_init_ps3gelic(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_UDBG_H */ diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index faa82c1f3f68..5b3e98e03158 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -67,6 +67,8 @@ void __init udbg_early_init(void) udbg_init_usbgecko(); #elif defined(CONFIG_PPC_EARLY_DEBUG_WSP) udbg_init_wsp(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) + udbg_init_ps3gelic(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index dfe316b161a9..476d9d9b2405 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -148,4 +148,16 @@ config PS3_LPM profiling support of the Cell processor with programs like oprofile and perfmon2, then say Y or M, otherwise say N. +config PS3GELIC_UDBG + bool "PS3 udbg output via UDP broadcasts on Ethernet" + depends on PPC_PS3 + help + Enables udbg early debugging output by sending broadcast UDP + via the Ethernet port (UDP port number 18194). + + This driver uses a trivial implementation and is independent + from the main network driver. + + If in doubt, say N here. + endmenu diff --git a/arch/powerpc/platforms/ps3/Makefile b/arch/powerpc/platforms/ps3/Makefile index ac1bdf844eca..02b9e636dab7 100644 --- a/arch/powerpc/platforms/ps3/Makefile +++ b/arch/powerpc/platforms/ps3/Makefile @@ -2,6 +2,7 @@ obj-y += setup.o mm.o time.o hvcall.o htab.o repository.o obj-y += interrupt.o exports.o os-area.o obj-y += system-bus.o +obj-$(CONFIG_PS3GELIC_UDBG) += gelic_udbg.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SPU_BASE) += spu.o obj-y += device-init.o diff --git a/arch/powerpc/platforms/ps3/gelic_udbg.c b/arch/powerpc/platforms/ps3/gelic_udbg.c new file mode 100644 index 000000000000..20b46a19a48f --- /dev/null +++ b/arch/powerpc/platforms/ps3/gelic_udbg.c @@ -0,0 +1,273 @@ +/* + * udbg debug output routine via GELIC UDP broadcasts + * + * Copyright (C) 2007 Sony Computer Entertainment Inc. + * Copyright 2006, 2007 Sony Corporation + * Copyright (C) 2010 Hector Martin + * Copyright (C) 2011 Andre Heider + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + */ + +#include +#include +#include + +#define GELIC_BUS_ID 1 +#define GELIC_DEVICE_ID 0 +#define GELIC_DEBUG_PORT 18194 +#define GELIC_MAX_MESSAGE_SIZE 1000 + +#define GELIC_LV1_GET_MAC_ADDRESS 1 +#define GELIC_LV1_GET_VLAN_ID 4 +#define GELIC_LV1_VLAN_TX_ETHERNET_0 2 + +#define GELIC_DESCR_DMA_STAT_MASK 0xf0000000 +#define GELIC_DESCR_DMA_CARDOWNED 0xa0000000 + +#define GELIC_DESCR_TX_DMA_IKE 0x00080000 +#define GELIC_DESCR_TX_DMA_NO_CHKSUM 0x00000000 +#define GELIC_DESCR_TX_DMA_FRAME_TAIL 0x00040000 + +#define GELIC_DESCR_DMA_CMD_NO_CHKSUM (GELIC_DESCR_DMA_CARDOWNED | \ + GELIC_DESCR_TX_DMA_IKE | \ + GELIC_DESCR_TX_DMA_NO_CHKSUM) + +static u64 bus_addr; + +struct gelic_descr { + /* as defined by the hardware */ + __be32 buf_addr; + __be32 buf_size; + __be32 next_descr_addr; + __be32 dmac_cmd_status; + __be32 result_size; + __be32 valid_size; /* all zeroes for tx */ + __be32 data_status; + __be32 data_error; /* all zeroes for tx */ +} __attribute__((aligned(32))); + +struct debug_block { + struct gelic_descr descr; + u8 pkt[1520]; +} __packed; + +struct ethhdr { + u8 dest[6]; + u8 src[6]; + u16 type; +} __packed; + +struct vlantag { + u16 vlan; + u16 subtype; +} __packed; + +struct iphdr { + u8 ver_len; + u8 dscp_ecn; + u16 total_length; + u16 ident; + u16 frag_off_flags; + u8 ttl; + u8 proto; + u16 checksum; + u32 src; + u32 dest; +} __packed; + +struct udphdr { + u16 src; + u16 dest; + u16 len; + u16 checksum; +} __packed; + +static __iomem struct ethhdr *h_eth; +static __iomem struct vlantag *h_vlan; +static __iomem struct iphdr *h_ip; +static __iomem struct udphdr *h_udp; + +static __iomem char *pmsg; +static __iomem char *pmsgc; + +static __iomem struct debug_block dbg __attribute__((aligned(32))); + +static int header_size; + +static void map_dma_mem(int bus_id, int dev_id, void *start, size_t len, + u64 *real_bus_addr) +{ + s64 result; + u64 real_addr = ((u64)start) & 0x0fffffffffffffffUL; + u64 real_end = real_addr + len; + u64 map_start = real_addr & ~0xfff; + u64 map_end = (real_end + 0xfff) & ~0xfff; + u64 bus_addr = 0; + + u64 flags = 0xf800000000000000UL; + + result = lv1_allocate_device_dma_region(bus_id, dev_id, + map_end - map_start, 12, 0, + &bus_addr); + if (result) + lv1_panic(0); + + result = lv1_map_device_dma_region(bus_id, dev_id, map_start, + bus_addr, map_end - map_start, + flags); + if (result) + lv1_panic(0); + + *real_bus_addr = bus_addr + real_addr - map_start; +} + +static int unmap_dma_mem(int bus_id, int dev_id, u64 bus_addr, size_t len) +{ + s64 result; + u64 real_bus_addr; + + real_bus_addr = bus_addr & ~0xfff; + len += bus_addr - real_bus_addr; + len = (len + 0xfff) & ~0xfff; + + result = lv1_unmap_device_dma_region(bus_id, dev_id, real_bus_addr, + len); + if (result) + return result; + + return lv1_free_device_dma_region(bus_id, dev_id, real_bus_addr); +} + +static void gelic_debug_init(void) +{ + s64 result; + u64 v2; + u64 mac; + u64 vlan_id; + + result = lv1_open_device(GELIC_BUS_ID, GELIC_DEVICE_ID, 0); + if (result) + lv1_panic(0); + + map_dma_mem(GELIC_BUS_ID, GELIC_DEVICE_ID, &dbg, sizeof(dbg), + &bus_addr); + + memset(&dbg, 0, sizeof(dbg)); + + dbg.descr.buf_addr = bus_addr + offsetof(struct debug_block, pkt); + + wmb(); + + result = lv1_net_control(GELIC_BUS_ID, GELIC_DEVICE_ID, + GELIC_LV1_GET_MAC_ADDRESS, 0, 0, 0, + &mac, &v2); + if (result) + lv1_panic(0); + + mac <<= 16; + + h_eth = (struct ethhdr *)dbg.pkt; + + memset(&h_eth->dest, 0xff, 6); + memcpy(&h_eth->src, &mac, 6); + + header_size = sizeof(struct ethhdr); + + result = lv1_net_control(GELIC_BUS_ID, GELIC_DEVICE_ID, + GELIC_LV1_GET_VLAN_ID, + GELIC_LV1_VLAN_TX_ETHERNET_0, 0, 0, + &vlan_id, &v2); + if (!result) { + h_eth->type = 0x8100; + + header_size += sizeof(struct vlantag); + h_vlan = (struct vlantag *)(h_eth + 1); + h_vlan->vlan = vlan_id; + h_vlan->subtype = 0x0800; + h_ip = (struct iphdr *)(h_vlan + 1); + } else { + h_eth->type = 0x0800; + h_ip = (struct iphdr *)(h_eth + 1); + } + + header_size += sizeof(struct iphdr); + h_ip->ver_len = 0x45; + h_ip->ttl = 10; + h_ip->proto = 0x11; + h_ip->src = 0x00000000; + h_ip->dest = 0xffffffff; + + header_size += sizeof(struct udphdr); + h_udp = (struct udphdr *)(h_ip + 1); + h_udp->src = GELIC_DEBUG_PORT; + h_udp->dest = GELIC_DEBUG_PORT; + + pmsgc = pmsg = (char *)(h_udp + 1); +} + +static void gelic_debug_shutdown(void) +{ + if (bus_addr) + unmap_dma_mem(GELIC_BUS_ID, GELIC_DEVICE_ID, + bus_addr, sizeof(dbg)); + lv1_close_device(GELIC_BUS_ID, GELIC_DEVICE_ID); +} + +static void gelic_sendbuf(int msgsize) +{ + u16 *p; + u32 sum; + int i; + + dbg.descr.buf_size = header_size + msgsize; + h_ip->total_length = msgsize + sizeof(struct udphdr) + + sizeof(struct iphdr); + h_udp->len = msgsize + sizeof(struct udphdr); + + h_ip->checksum = 0; + sum = 0; + p = (u16 *)h_ip; + for (i = 0; i < 5; i++) + sum += *p++; + h_ip->checksum = ~(sum + (sum >> 16)); + + dbg.descr.dmac_cmd_status = GELIC_DESCR_DMA_CMD_NO_CHKSUM | + GELIC_DESCR_TX_DMA_FRAME_TAIL; + dbg.descr.result_size = 0; + dbg.descr.data_status = 0; + + wmb(); + + lv1_net_start_tx_dma(GELIC_BUS_ID, GELIC_DEVICE_ID, bus_addr, 0); + + while ((dbg.descr.dmac_cmd_status & GELIC_DESCR_DMA_STAT_MASK) == + GELIC_DESCR_DMA_CARDOWNED) + cpu_relax(); +} + +static void ps3gelic_udbg_putc(char ch) +{ + *pmsgc++ = ch; + if (ch == '\n' || (pmsgc-pmsg) >= GELIC_MAX_MESSAGE_SIZE) { + gelic_sendbuf(pmsgc-pmsg); + pmsgc = pmsg; + } +} + +void __init udbg_init_ps3gelic(void) +{ + gelic_debug_init(); + udbg_putc = ps3gelic_udbg_putc; +} + +void udbg_shutdown_ps3gelic(void) +{ + udbg_putc = NULL; + gelic_debug_shutdown(); +} +EXPORT_SYMBOL(udbg_shutdown_ps3gelic); diff --git a/drivers/net/ps3_gelic_net.c b/drivers/net/ps3_gelic_net.c index d82a82d9870c..e743c9418ac9 100644 --- a/drivers/net/ps3_gelic_net.c +++ b/drivers/net/ps3_gelic_net.c @@ -1674,6 +1674,9 @@ static int __devinit ps3_gelic_driver_probe(struct ps3_system_bus_device *dev) int result; pr_debug("%s: called\n", __func__); + + udbg_shutdown_ps3gelic(); + result = ps3_open_hv_device(dev); if (result) { diff --git a/drivers/net/ps3_gelic_net.h b/drivers/net/ps3_gelic_net.h index d3fadfbc3bcc..a93df6ac1909 100644 --- a/drivers/net/ps3_gelic_net.h +++ b/drivers/net/ps3_gelic_net.h @@ -359,6 +359,12 @@ static inline void *port_priv(struct gelic_port *port) return port->priv; } +#ifdef CONFIG_PPC_EARLY_DEBUG_PS3GELIC +extern void udbg_shutdown_ps3gelic(void); +#else +static inline void udbg_shutdown_ps3gelic(void) {} +#endif + extern int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask); /* shared netdev ops */ extern void gelic_card_up(struct gelic_card *card); -- cgit v1.2.3 From 94db7c5e14f44b943febe54e089d077cd983d284 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 10 Aug 2011 20:44:22 +0000 Subject: powerpc: Use for_each_node_by_type instead of open coding it Use for_each_node_by_type instead of open coding it. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/machine_kexec_64.c | 3 +-- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/mm/numa.c | 10 +++++----- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 583af70c4b14..26ccbf77dd41 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -74,8 +74,7 @@ int default_machine_kexec_prepare(struct kimage *image) } /* We also should not overwrite the tce tables */ - for (node = of_find_node_by_type(NULL, "pci"); node != NULL; - node = of_find_node_by_type(node, "pci")) { + for_each_node_by_type(node, "pci") { basep = of_get_property(node, "linux,tce-base", NULL); sizep = of_get_property(node, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index aebef1320ed7..eade1fd8ee2e 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -278,7 +278,7 @@ static void __init initialize_cache_info(void) DBG(" -> initialize_cache_info()\n"); - for (np = NULL; (np = of_find_node_by_type(np, "cpu"));) { + for_each_node_by_type(np, "cpu") { num_cpus += 1; /* We're assuming *all* of the CPUs have the same diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 2c1ae7a5fb53..00cc090cd1dc 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -710,7 +710,7 @@ static void __init parse_drconf_memory(struct device_node *memory) static int __init parse_numa_properties(void) { struct device_node *cpu = NULL; - struct device_node *memory = NULL; + struct device_node *memory; int default_nid = 0; unsigned long i; @@ -750,8 +750,8 @@ static int __init parse_numa_properties(void) } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + + for_each_node_by_type(memory, "memory") { unsigned long start; unsigned long size; int nid; @@ -1187,10 +1187,10 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory, */ int hot_add_node_scn_to_nid(unsigned long scn_addr) { - struct device_node *memory = NULL; + struct device_node *memory; int nid = -1; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + for_each_node_by_type(memory, "memory") { unsigned long start, size; int ranges; const unsigned int *memcell_buf; -- cgit v1.2.3 From dfbe93a222e74b6f96ad84eff2b04a0f864fac65 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 10 Aug 2011 20:44:23 +0000 Subject: powerpc: Coding style cleanups While converting code to use for_each_node_by_type I noticed a number of coding style issues. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/setup_64.c | 20 +++++++++++++------- arch/powerpc/mm/numa.c | 7 ++++--- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index eade1fd8ee2e..d4168c93098f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -281,11 +281,11 @@ static void __init initialize_cache_info(void) for_each_node_by_type(np, "cpu") { num_cpus += 1; - /* We're assuming *all* of the CPUs have the same + /* + * We're assuming *all* of the CPUs have the same * d-cache and i-cache sizes... -Peter */ - - if ( num_cpus == 1 ) { + if (num_cpus == 1) { const u32 *sizep, *lsizep; u32 size, lsize; @@ -294,10 +294,13 @@ static void __init initialize_cache_info(void) sizep = of_get_property(np, "d-cache-size", NULL); if (sizep != NULL) size = *sizep; - lsizep = of_get_property(np, "d-cache-block-size", NULL); + lsizep = of_get_property(np, "d-cache-block-size", + NULL); /* fallback if block size missing */ if (lsizep == NULL) - lsizep = of_get_property(np, "d-cache-line-size", NULL); + lsizep = of_get_property(np, + "d-cache-line-size", + NULL); if (lsizep != NULL) lsize = *lsizep; if (sizep == 0 || lsizep == 0) @@ -314,9 +317,12 @@ static void __init initialize_cache_info(void) sizep = of_get_property(np, "i-cache-size", NULL); if (sizep != NULL) size = *sizep; - lsizep = of_get_property(np, "i-cache-block-size", NULL); + lsizep = of_get_property(np, "i-cache-block-size", + NULL); if (lsizep == NULL) - lsizep = of_get_property(np, "i-cache-line-size", NULL); + lsizep = of_get_property(np, + "i-cache-line-size", + NULL); if (lsizep != NULL) lsize = *lsizep; if (sizep == 0 || lsizep == 0) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 00cc090cd1dc..0bfb90c50571 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -709,7 +709,6 @@ static void __init parse_drconf_memory(struct device_node *memory) static int __init parse_numa_properties(void) { - struct device_node *cpu = NULL; struct device_node *memory; int default_nid = 0; unsigned long i; @@ -732,6 +731,7 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + struct device_node *cpu; int nid; cpu = of_get_cpu_node(i, NULL); @@ -800,8 +800,9 @@ new_range: } /* - * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory - * property in the ibm,dynamic-reconfiguration-memory node. + * Now do the same thing for each MEMBLOCK listed in the + * ibm,dynamic-memory property in the + * ibm,dynamic-reconfiguration-memory node. */ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (memory) -- cgit v1.2.3 From fb82b83970a32263698e54a8779d2ce88cd3b060 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:49 +0000 Subject: powerpc/smp: More generic support for "soft hotplug" This adds more generic support for doing CPU hotplug with a simple idle loop and no actual reset of the processors. The generic smp_generic_kick_cpu() does the hotplug bringup trick if the PACA shows that the CPU has already been started at boot and we provide an accessor for the CPU state. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/smp.h | 1 + arch/powerpc/kernel/smp.c | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 15a70b7f638b..adba970ce918 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -65,6 +65,7 @@ int generic_cpu_disable(void); void generic_cpu_die(unsigned int cpu); void generic_mach_cpu_die(void); void generic_set_cpu_dead(unsigned int cpu); +int generic_check_cpu_restart(unsigned int cpu); #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 7bf2187dfd99..af7e7722ecae 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -70,6 +70,10 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) + +/* State of each CPU during hotplug phases */ +static DEFINE_PER_CPU(int, cpu_state) = { 0 }; + #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) @@ -104,12 +108,25 @@ int __devinit smp_generic_kick_cpu(int nr) * cpu_start field to become non-zero After we set cpu_start, * the processor will continue on to secondary_start */ - paca[nr].cpu_start = 1; - smp_mb(); + if (!paca[nr].cpu_start) { + paca[nr].cpu_start = 1; + smp_mb(); + return 0; + } + +#ifdef CONFIG_HOTPLUG_CPU + /* + * Ok it's not there, so it might be soft-unplugged, let's + * try to bring it back + */ + per_cpu(cpu_state, nr) = CPU_UP_PREPARE; + smp_wmb(); + smp_send_reschedule(nr); +#endif /* CONFIG_HOTPLUG_CPU */ return 0; } -#endif +#endif /* CONFIG_PPC64 */ static irqreturn_t call_function_action(int irq, void *data) { @@ -357,8 +374,6 @@ void __devinit smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -/* State of each CPU during hotplug phases */ -static DEFINE_PER_CPU(int, cpu_state) = { 0 }; int generic_cpu_disable(void) { @@ -406,6 +421,11 @@ void generic_set_cpu_dead(unsigned int cpu) { per_cpu(cpu_state, cpu) = CPU_DEAD; } + +int generic_check_cpu_restart(unsigned int cpu) +{ + return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; +} #endif struct create_idle { -- cgit v1.2.3 From 781fb7a3e4cdca28236ae23e2c77070ed3ae531f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:50 +0000 Subject: powerpc/pci: Call pcie_bus_configure_settings() This new function is used to properly setup the PCI Express Max Payload Size (and in some circumstances Max Read Request Size). Some systems will not operate properly if these aren't set correctly and the firmware doesn't always do it. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/pci-common.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 32656f105250..1bd47f36b25f 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1730,6 +1730,17 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose) if (mode == PCI_PROBE_NORMAL) hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); + + /* Configure PCI Express settings */ + if (bus) { + struct pci_bus *child; + list_for_each_entry(child, &bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + pcie_bus_configure_settings(child, self->pcie_mpss); + } + } } static void fixup_hide_host_resource_fsl(struct pci_dev *dev) -- cgit v1.2.3 From e550592e689cf8d682937f356497f989f3d88292 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:51 +0000 Subject: powerpc/powernv: Don't clobber r9 in relative_toc() With OPAL, r8 and r9 will be used to pass the OPAL base and entry for debugging purposes (those informations are also in the device-tree). We don't want to clobber those registers that early. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/head_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 3564c49c683e..e708abe576d3 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -674,9 +674,9 @@ _GLOBAL(enable_64b_mode) _GLOBAL(relative_toc) mflr r0 bcl 20,31,$+4 -0: mflr r9 - ld r2,(p_toc - 0b)(r9) - add r2,r2,r9 +0: mflr r11 + ld r2,(p_toc - 0b)(r11) + add r2,r2,r11 mtlr r0 blr -- cgit v1.2.3 From 27f4488872d9ef2a4b9aa2be58fb0789d6c0ba84 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 18:27:58 +0000 Subject: powerpc/powernv: Add OPAL takeover from PowerVM On machines supporting the OPAL firmware version 1, the system is initially booted under pHyp. We then use a special hypercall to verify if OPAL is available and if it is, we then trigger a "takeover" which disables pHyp and loads the OPAL runtime firmware, giving control to the kernel in hypervisor mode. This patch add the necessary code to detect that the OPAL takeover capability is present when running under PowerVM (aka pHyp) and perform said takeover to get hypervisor control of the processor. To perform the takeover, we must first use RTAS (within Open Firmware runtime environment) to start all processors & threads, in order to give control to OPAL on all of them. We then call the takeover hypercall on everybody, OPAL will re-enter the kernel main entry point passing it a flat device-tree. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 50 ++++++ arch/powerpc/kernel/head_64.S | 4 + arch/powerpc/kernel/prom_init.c | 239 +++++++++++++++++++++++-- arch/powerpc/kernel/prom_init_check.sh | 3 +- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-takeover.S | 140 +++++++++++++++ 6 files changed, 419 insertions(+), 19 deletions(-) create mode 100644 arch/powerpc/include/asm/opal.h create mode 100644 arch/powerpc/platforms/powernv/opal-takeover.S (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h new file mode 100644 index 000000000000..ecdb283f8b7c --- /dev/null +++ b/arch/powerpc/include/asm/opal.h @@ -0,0 +1,50 @@ +/* + * PowerNV OPAL definitions. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef __OPAL_H +#define __OPAL_H + +/****** Takeover interface ********/ + +/* PAPR H-Call used to querty the HAL existence and/or instanciate + * it from within pHyp (tech preview only). + * + * This is exclusively used in prom_init.c + */ + +#ifndef __ASSEMBLY__ + +struct opal_takeover_args { + u64 k_image; /* r4 */ + u64 k_size; /* r5 */ + u64 k_entry; /* r6 */ + u64 k_entry2; /* r7 */ + u64 hal_addr; /* r8 */ + u64 rd_image; /* r9 */ + u64 rd_size; /* r10 */ + u64 rd_loc; /* r11 */ +}; + +extern long opal_query_takeover(u64 *hal_size, u64 *hal_align); + +extern long opal_do_takeover(struct opal_takeover_args *args); + +extern int opal_enter_rtas(struct rtas_args *args, + unsigned long data, + unsigned long entry); + + +#endif /* __ASSEMBLY__ */ + +/****** OPAL APIs ******/ + + +#endif /* __OPAL_H */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index e708abe576d3..dea8191253d2 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -51,6 +51,10 @@ * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. * 2. The kernel is entered at __start + * -or- For OPAL entry: + * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 + * with device-tree in gpr3 + * 2. Secondary processors enter at 0x60 with PIR in gpr3 * * For iSeries: * 1. The MMU is on (as it always is for iSeries) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a909f4e9343b..9369287aa8c2 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -185,6 +186,7 @@ static unsigned long __initdata prom_tce_alloc_end; #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_GENERIC 0x0500 +#define PLATFORM_OPAL 0x0600 static int __initdata of_platform; @@ -644,7 +646,7 @@ static void __init early_cmdline_parse(void) } } -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * There are two methods for telling firmware what our capabilities are. * Newer machines have an "ibm,client-architecture-support" method on the @@ -1274,6 +1276,195 @@ static void __init prom_init_mem(void) prom_printf(" ram_top : %x\n", RELOC(ram_top)); } +static void __init prom_close_stdin(void) +{ + struct prom_t *_prom = &RELOC(prom); + ihandle val; + + if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0) + call_prom("close", 1, 0, val); +} + +#ifdef CONFIG_PPC_POWERNV + +static u64 __initdata prom_opal_size; +static u64 __initdata prom_opal_align; +static int __initdata prom_rtas_start_cpu; +static u64 __initdata prom_rtas_data; +static u64 __initdata prom_rtas_entry; + +/* XXX Don't change this structure without updating opal-takeover.S */ +static struct opal_secondary_data { + s64 ack; /* 0 */ + u64 go; /* 8 */ + struct opal_takeover_args args; /* 16 */ +} opal_secondary_data; + +extern char opal_secondary_entry; + +static void prom_query_opal(void) +{ + long rc; + + prom_printf("Querying for OPAL presence... "); + rc = opal_query_takeover(&RELOC(prom_opal_size), + &RELOC(prom_opal_align)); + prom_debug("(rc = %ld) ", rc); + if (rc != 0) { + prom_printf("not there.\n"); + return; + } + RELOC(of_platform) = PLATFORM_OPAL; + prom_printf(" there !\n"); + prom_debug(" opal_size = 0x%lx\n", RELOC(prom_opal_size)); + prom_debug(" opal_align = 0x%lx\n", RELOC(prom_opal_align)); + if (RELOC(prom_opal_align) < 0x10000) + RELOC(prom_opal_align) = 0x10000; +} + +static int prom_rtas_call(int token, int nargs, int nret, int *outputs, ...) +{ + struct rtas_args rtas_args; + va_list list; + int i; + + rtas_args.token = token; + rtas_args.nargs = nargs; + rtas_args.nret = nret; + rtas_args.rets = (rtas_arg_t *)&(rtas_args.args[nargs]); + va_start(list, outputs); + for (i = 0; i < nargs; ++i) + rtas_args.args[i] = va_arg(list, rtas_arg_t); + va_end(list); + + for (i = 0; i < nret; ++i) + rtas_args.rets[i] = 0; + + opal_enter_rtas(&rtas_args, RELOC(prom_rtas_data), + RELOC(prom_rtas_entry)); + + if (nret > 1 && outputs != NULL) + for (i = 0; i < nret-1; ++i) + outputs[i] = rtas_args.rets[i+1]; + return (nret > 0)? rtas_args.rets[0]: 0; +} + +static void __init prom_opal_hold_cpus(void) +{ + int i, cnt, cpu, rc; + long j; + phandle node; + char type[64]; + u32 servers[8]; + struct prom_t *_prom = &RELOC(prom); + void *entry = (unsigned long *)&RELOC(opal_secondary_entry); + struct opal_secondary_data *data = &RELOC(opal_secondary_data); + + prom_debug("prom_opal_hold_cpus: start...\n"); + prom_debug(" - entry = 0x%x\n", entry); + prom_debug(" - data = 0x%x\n", data); + + data->ack = -1; + data->go = 0; + + /* look for cpus */ + for (node = 0; prom_next_node(&node); ) { + type[0] = 0; + prom_getprop(node, "device_type", type, sizeof(type)); + if (strcmp(type, RELOC("cpu")) != 0) + continue; + + /* Skip non-configured cpus. */ + if (prom_getprop(node, "status", type, sizeof(type)) > 0) + if (strcmp(type, RELOC("okay")) != 0) + continue; + + cnt = prom_getprop(node, "ibm,ppc-interrupt-server#s", servers, + sizeof(servers)); + if (cnt == PROM_ERROR) + break; + cnt >>= 2; + for (i = 0; i < cnt; i++) { + cpu = servers[i]; + prom_debug("CPU %d ... ", cpu); + if (cpu == _prom->cpu) { + prom_debug("booted !\n"); + continue; + } + prom_debug("starting ... "); + + /* Init the acknowledge var which will be reset by + * the secondary cpu when it awakens from its OF + * spinloop. + */ + data->ack = -1; + rc = prom_rtas_call(RELOC(prom_rtas_start_cpu), 3, 1, + NULL, cpu, entry, data); + prom_debug("rtas rc=%d ...", rc); + + for (j = 0; j < 100000000 && data->ack == -1; j++) { + HMT_low(); + mb(); + } + HMT_medium(); + if (data->ack != -1) + prom_debug("done, PIR=0x%x\n", data->ack); + else + prom_debug("timeout !\n"); + } + } + prom_debug("prom_opal_hold_cpus: end...\n"); +} + +static void prom_opal_takeover(void) +{ + struct opal_secondary_data *data = &RELOC(opal_secondary_data); + struct opal_takeover_args *args = &data->args; + u64 align = RELOC(prom_opal_align); + u64 top_addr, opal_addr; + + args->k_image = (u64)RELOC(_stext); + args->k_size = _end - _stext; + args->k_entry = 0; + args->k_entry2 = 0x60; + + top_addr = _ALIGN_UP(args->k_size, align); + + if (RELOC(prom_initrd_start) != 0) { + args->rd_image = RELOC(prom_initrd_start); + args->rd_size = RELOC(prom_initrd_end) - args->rd_image; + args->rd_loc = top_addr; + top_addr = _ALIGN_UP(args->rd_loc + args->rd_size, align); + } + + /* Pickup an address for the HAL. We want to go really high + * up to avoid problem with future kexecs. On the other hand + * we don't want to be all over the TCEs on P5IOC2 machines + * which are going to be up there too. We assume the machine + * has plenty of memory, and we ask for the HAL for now to + * be just below the 1G point, or above the initrd + */ + opal_addr = _ALIGN_DOWN(0x40000000 - RELOC(prom_opal_size), align); + if (opal_addr < top_addr) + opal_addr = top_addr; + args->hal_addr = opal_addr; + + prom_debug(" k_image = 0x%lx\n", args->k_image); + prom_debug(" k_size = 0x%lx\n", args->k_size); + prom_debug(" k_entry = 0x%lx\n", args->k_entry); + prom_debug(" k_entry2 = 0x%lx\n", args->k_entry2); + prom_debug(" hal_addr = 0x%lx\n", args->hal_addr); + prom_debug(" rd_image = 0x%lx\n", args->rd_image); + prom_debug(" rd_size = 0x%lx\n", args->rd_size); + prom_debug(" rd_loc = 0x%lx\n", args->rd_loc); + prom_printf("Performing OPAL takeover,this can take a few minutes..\n"); + prom_close_stdin(); + mb(); + data->go = 1; + for (;;) + opal_do_takeover(args); +} +#endif /* CONFIG_PPC_POWERNV */ /* * Allocate room for and instantiate RTAS @@ -1326,6 +1517,12 @@ static void __init prom_instantiate_rtas(void) prom_setprop(rtas_node, "/rtas", "linux,rtas-entry", &entry, sizeof(entry)); +#ifdef CONFIG_PPC_POWERNV + /* PowerVN takeover hack */ + RELOC(prom_rtas_data) = base; + RELOC(prom_rtas_entry) = entry; + prom_getprop(rtas_node, "start-cpu", &RELOC(prom_rtas_start_cpu), 4); +#endif prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); prom_debug("rtas size = 0x%x\n", (long)size); @@ -1543,7 +1740,7 @@ static void __init prom_hold_cpus(void) *acknowledge = (unsigned long)-1; if (reg != _prom->cpu) { - /* Primary Thread of non-boot cpu */ + /* Primary Thread of non-boot cpu or any thread */ prom_printf("starting cpu hw idx %lu... ", reg); call_prom("start-cpu", 3, 0, node, secondary_hold, reg); @@ -1652,15 +1849,6 @@ static void __init prom_init_stdout(void) prom_setprop(val, path, "linux,boot-display", NULL, 0); } -static void __init prom_close_stdin(void) -{ - struct prom_t *_prom = &RELOC(prom); - ihandle val; - - if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0) - call_prom("close", 1, 0, val); -} - static int __init prom_find_machine_type(void) { struct prom_t *_prom = &RELOC(prom); @@ -2504,6 +2692,7 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) #endif /* CONFIG_BLK_DEV_INITRD */ } + /* * We enter here early on, when the Open Firmware prom is still * handling exceptions and the MMU hash table for us. @@ -2565,7 +2754,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ prom_check_initrd(r3, r4); -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * On pSeries, inform the firmware about our capabilities */ @@ -2611,14 +2800,30 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, #endif /* - * On non-powermacs, try to instantiate RTAS and puts all CPUs - * in spin-loops. PowerMacs don't have a working RTAS and use - * a different way to spin CPUs + * On non-powermacs, try to instantiate RTAS. PowerMacs don't + * have a usable RTAS implementation. */ - if (RELOC(of_platform) != PLATFORM_POWERMAC) { + if (RELOC(of_platform) != PLATFORM_POWERMAC) prom_instantiate_rtas(); - prom_hold_cpus(); + +#ifdef CONFIG_PPC_POWERNV + /* Detect HAL and try instanciating it & doing takeover */ + if (RELOC(of_platform) == PLATFORM_PSERIES_LPAR) { + prom_query_opal(); + if (RELOC(of_platform) == PLATFORM_OPAL) { + prom_opal_hold_cpus(); + prom_opal_takeover(); + } } +#endif + + /* + * On non-powermacs, put all CPUs in spin-loops. + * + * PowerMacs use a different mechanism to spin CPUs + */ + if (RELOC(of_platform) != PLATFORM_POWERMAC) + prom_hold_cpus(); /* * Fill in some infos for use by the kernel later on diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 9f82f4937892..20af6aada517 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -20,7 +20,8 @@ WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush _end enter_prom memcpy memset reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224 -reloc_got2 kernstart_addr memstart_addr linux_banner" +reloc_got2 kernstart_addr memstart_addr linux_banner _stext +opal_query_takeover opal_do_takeover opal_enter_rtas opal_secondary_entry" NM="$1" OBJ="$2" diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 1c4325056b85..497133027ae5 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,2 +1,2 @@ -obj-y += setup.o +obj-y += setup.o opal-takeover.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S new file mode 100644 index 000000000000..77b48b2b9309 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-takeover.S @@ -0,0 +1,140 @@ +/* + * PowerNV OPAL takeover assembly code, for use by prom_init.c + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#define STK_PARAM(i) (48 + ((i)-3)*8) + +#define H_HAL_TAKEOVER 0x5124 +#define H_HAL_TAKEOVER_QUERY_MAGIC -1 + + .text +_GLOBAL(opal_query_takeover) + mfcr r0 + stw r0,8(r1) + std r3,STK_PARAM(r3)(r1) + std r4,STK_PARAM(r4)(r1) + li r3,H_HAL_TAKEOVER + li r4,H_HAL_TAKEOVER_QUERY_MAGIC + HVSC + ld r10,STK_PARAM(r3)(r1) + std r4,0(r10) + ld r10,STK_PARAM(r4)(r1) + std r5,0(r10) + lwz r0,8(r1) + mtcrf 0xff,r0 + blr + +_GLOBAL(opal_do_takeover) + mfcr r0 + stw r0,8(r1) + mflr r0 + std r0,16(r1) + bl __opal_do_takeover + ld r0,16(r1) + mtlr r0 + lwz r0,8(r1) + mtcrf 0xff,r0 + blr + +__opal_do_takeover: + ld r4,0(r3) + ld r5,0x8(r3) + ld r6,0x10(r3) + ld r7,0x18(r3) + ld r8,0x20(r3) + ld r9,0x28(r3) + ld r10,0x30(r3) + ld r11,0x38(r3) + li r3,H_HAL_TAKEOVER + HVSC + blr + + .globl opal_secondary_entry +opal_secondary_entry: + mr r31,r3 + mfmsr r11 + li r12,(MSR_SF | MSR_ISF)@highest + sldi r12,r12,48 + or r11,r11,r12 + mtmsrd r11 + isync + mfspr r4,SPRN_PIR + std r4,0(r3) +1: HMT_LOW + ld r4,8(r3) + cmpli cr0,r4,0 + beq 1b + HMT_MEDIUM +1: addi r3,r31,16 + bl __opal_do_takeover + b 1b + +_GLOBAL(opal_enter_rtas) + mflr r0 + std r0,16(r1) + stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ + + /* Because PROM is running in 32b mode, it clobbers the high order half + * of all registers that it saves. We therefore save those registers + * PROM might touch to the stack. (r0, r3-r13 are caller saved) + */ + SAVE_GPR(2, r1) + SAVE_GPR(13, r1) + SAVE_8GPRS(14, r1) + SAVE_10GPRS(22, r1) + mfcr r10 + mfmsr r11 + std r10,_CCR(r1) + std r11,_MSR(r1) + + /* Get the PROM entrypoint */ + mtlr r5 + + /* Switch MSR to 32 bits mode + */ + li r12,1 + rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) + andc r11,r11,r12 + li r12,1 + rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) + andc r11,r11,r12 + mtmsrd r11 + isync + + /* Enter RTAS here... */ + blrl + + /* Just make sure that r1 top 32 bits didn't get + * corrupt by OF + */ + rldicl r1,r1,0,32 + + /* Restore the MSR (back to 64 bits) */ + ld r0,_MSR(r1) + MTMSRD(r0) + isync + + /* Restore other registers */ + REST_GPR(2, r1) + REST_GPR(13, r1) + REST_8GPRS(14, r1) + REST_10GPRS(22, r1) + ld r4,_CCR(r1) + mtcr r4 + + addi r1,r1,PROM_FRAME_SIZE + ld r0,16(r1) + mtlr r0 + blr -- cgit v1.2.3 From 817c21ad9a1f00926f080265493923ada3458c63 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:56 +0000 Subject: powerpc/powernv: Get kernel command line accross OPAL takeover We stash it in boot_command_line which isn't in BSS and so won't be overwritten. We then use that as a default cmd_line before we walk the device-tree. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom.c | 7 +++++++ arch/powerpc/kernel/prom_init.c | 4 ++++ arch/powerpc/kernel/prom_init_check.sh | 3 ++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 174e1e96175e..7b90c564e932 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -712,6 +712,13 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); #endif + /* Pre-initialize the cmd_line with the content of boot_commmand_line, + * which will be empty except when the content of the variable has + * been overriden by a bootloading mechanism. This happens typically + * with HAL takeover + */ + strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE); + /* Retrieve various informations from the /chosen node of the * device-tree, including the platform type, initrd location and * size, TCE reserve, and more ... diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 9369287aa8c2..e3f390427216 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1449,6 +1449,10 @@ static void prom_opal_takeover(void) opal_addr = top_addr; args->hal_addr = opal_addr; + /* Copy the command line to the kernel image */ + strlcpy(RELOC(boot_command_line), RELOC(prom_cmd_line), + COMMAND_LINE_SIZE); + prom_debug(" k_image = 0x%lx\n", args->k_image); prom_debug(" k_size = 0x%lx\n", args->k_size); prom_debug(" k_entry = 0x%lx\n", args->k_entry); diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 20af6aada517..70f4286eaa7a 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -21,7 +21,8 @@ _end enter_prom memcpy memset reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext -opal_query_takeover opal_do_takeover opal_enter_rtas opal_secondary_entry" +opal_query_takeover opal_do_takeover opal_enter_rtas opal_secondary_entry +boot_command_line" NM="$1" OBJ="$2" -- cgit v1.2.3 From 14a43e69ed257a1fadadf9fea2c05adb1686419f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:57 +0000 Subject: powerpc/powernv: Basic support for OPAL Add definition of OPAL interfaces along with the wrappers to call into OPAL runtime and the early device-tree parsing hook to locate the OPAL runtime firmware. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/firmware.h | 10 + arch/powerpc/include/asm/opal.h | 382 ++++++++++++++++++++++++- arch/powerpc/kernel/prom.c | 7 + arch/powerpc/platforms/powernv/Kconfig | 9 +- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-wrappers.S | 101 +++++++ arch/powerpc/platforms/powernv/opal.c | 154 ++++++++++ arch/powerpc/platforms/powernv/setup.c | 6 + arch/powerpc/platforms/powernv/smp.c | 25 +- 9 files changed, 691 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-wrappers.S create mode 100644 arch/powerpc/platforms/powernv/opal.c (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 3a6c586c4e40..14db29b18d0e 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -48,6 +48,8 @@ #define FW_FEATURE_CMO ASM_CONST(0x0000000002000000) #define FW_FEATURE_VPHN ASM_CONST(0x0000000004000000) #define FW_FEATURE_XCMO ASM_CONST(0x0000000008000000) +#define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) +#define FW_FEATURE_OPALv2 ASM_CONST(0x0000000020000000) #ifndef __ASSEMBLY__ @@ -65,6 +67,8 @@ enum { FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, + FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv2, + FW_FEATURE_POWERNV_ALWAYS = 0, FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_CELLEB_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_BEAT, @@ -78,6 +82,9 @@ enum { #ifdef CONFIG_PPC_ISERIES FW_FEATURE_ISERIES_POSSIBLE | #endif +#ifdef CONFIG_PPC_POWERNV + FW_FEATURE_POWERNV_POSSIBLE | +#endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_POSSIBLE | #endif @@ -95,6 +102,9 @@ enum { #ifdef CONFIG_PPC_ISERIES FW_FEATURE_ISERIES_ALWAYS & #endif +#ifdef CONFIG_PPC_POWERNV + FW_FEATURE_POWERNV_ALWAYS & +#endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_ALWAYS & #endif diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ecdb283f8b7c..c7a3202d10a0 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -37,14 +37,394 @@ extern long opal_query_takeover(u64 *hal_size, u64 *hal_align); extern long opal_do_takeover(struct opal_takeover_args *args); +struct rtas_args; extern int opal_enter_rtas(struct rtas_args *args, unsigned long data, unsigned long entry); - #endif /* __ASSEMBLY__ */ /****** OPAL APIs ******/ +/* Return codes */ +#define OPAL_SUCCESS 0 +#define OPAL_PARAMETER -1 +#define OPAL_BUSY -2 +#define OPAL_PARTIAL -3 +#define OPAL_CONSTRAINED -4 +#define OPAL_CLOSED -5 +#define OPAL_HARDWARE -6 +#define OPAL_UNSUPPORTED -7 +#define OPAL_PERMISSION -8 +#define OPAL_NO_MEM -9 +#define OPAL_RESOURCE -10 +#define OPAL_INTERNAL_ERROR -11 +#define OPAL_BUSY_EVENT -12 +#define OPAL_HARDWARE_FROZEN -13 + +/* API Tokens (in r0) */ +#define OPAL_CONSOLE_WRITE 1 +#define OPAL_CONSOLE_READ 2 +#define OPAL_RTC_READ 3 +#define OPAL_RTC_WRITE 4 +#define OPAL_CEC_POWER_DOWN 5 +#define OPAL_CEC_REBOOT 6 +#define OPAL_READ_NVRAM 7 +#define OPAL_WRITE_NVRAM 8 +#define OPAL_HANDLE_INTERRUPT 9 +#define OPAL_POLL_EVENTS 10 +#define OPAL_PCI_SET_HUB_TCE_MEMORY 11 +#define OPAL_PCI_SET_PHB_TCE_MEMORY 12 +#define OPAL_PCI_CONFIG_READ_BYTE 13 +#define OPAL_PCI_CONFIG_READ_HALF_WORD 14 +#define OPAL_PCI_CONFIG_READ_WORD 15 +#define OPAL_PCI_CONFIG_WRITE_BYTE 16 +#define OPAL_PCI_CONFIG_WRITE_HALF_WORD 17 +#define OPAL_PCI_CONFIG_WRITE_WORD 18 +#define OPAL_SET_XIVE 19 +#define OPAL_GET_XIVE 20 +#define OPAL_GET_COMPLETION_TOKEN_STATUS 21 /* obsolete */ +#define OPAL_REGISTER_OPAL_EXCEPTION_HANDLER 22 +#define OPAL_PCI_EEH_FREEZE_STATUS 23 +#define OPAL_PCI_SHPC 24 +#define OPAL_CONSOLE_WRITE_BUFFER_SPACE 25 +#define OPAL_PCI_EEH_FREEZE_CLEAR 26 +#define OPAL_PCI_PHB_MMIO_ENABLE 27 +#define OPAL_PCI_SET_PHB_MEM_WINDOW 28 +#define OPAL_PCI_MAP_PE_MMIO_WINDOW 29 +#define OPAL_PCI_SET_PHB_TABLE_MEMORY 30 +#define OPAL_PCI_SET_PE 31 +#define OPAL_PCI_SET_PELTV 32 +#define OPAL_PCI_SET_MVE 33 +#define OPAL_PCI_SET_MVE_ENABLE 34 +#define OPAL_PCI_GET_XIVE_REISSUE 35 +#define OPAL_PCI_SET_XIVE_REISSUE 36 +#define OPAL_PCI_SET_XIVE_PE 37 +#define OPAL_GET_XIVE_SOURCE 38 +#define OPAL_GET_MSI_32 39 +#define OPAL_GET_MSI_64 40 +#define OPAL_START_CPU 41 +#define OPAL_QUERY_CPU_STATUS 42 +#define OPAL_WRITE_OPPANEL 43 +#define OPAL_PCI_MAP_PE_DMA_WINDOW 44 +#define OPAL_PCI_MAP_PE_DMA_WINDOW_REAL 45 +#define OPAL_PCI_RESET 49 + +#ifndef __ASSEMBLY__ + +/* Other enums */ +enum OpalVendorApiTokens { + OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999 +}; +enum OpalFreezeState { + OPAL_EEH_STOPPED_NOT_FROZEN = 0, + OPAL_EEH_STOPPED_MMIO_FREEZE = 1, + OPAL_EEH_STOPPED_DMA_FREEZE = 2, + OPAL_EEH_STOPPED_MMIO_DMA_FREEZE = 3, + OPAL_EEH_STOPPED_RESET = 4, + OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5, + OPAL_EEH_STOPPED_PERM_UNAVAIL = 6 +}; +enum OpalEehFreezeActionToken { + OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1, + OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3 +}; +enum OpalPciStatusToken { + OPAL_EEH_PHB_NO_ERROR = 0, + OPAL_EEH_PHB_FATAL = 1, + OPAL_EEH_PHB_RECOVERABLE = 2, + OPAL_EEH_PHB_BUS_ERROR = 3, + OPAL_EEH_PCI_NO_DEVSEL = 4, + OPAL_EEH_PCI_TA = 5, + OPAL_EEH_PCIEX_UR = 6, + OPAL_EEH_PCIEX_CA = 7, + OPAL_EEH_PCI_MMIO_ERROR = 8, + OPAL_EEH_PCI_DMA_ERROR = 9 +}; +enum OpalShpcAction { + OPAL_SHPC_GET_LINK_STATE = 0, + OPAL_SHPC_GET_SLOT_STATE = 1 +}; +enum OpalShpcLinkState { + OPAL_SHPC_LINK_DOWN = 0, + OPAL_SHPC_LINK_UP = 1 +}; +enum OpalMmioWindowType { + OPAL_M32_WINDOW_TYPE = 1, + OPAL_M64_WINDOW_TYPE = 2, + OPAL_IO_WINDOW_TYPE = 3 +}; +enum OpalShpcSlotState { + OPAL_SHPC_DEV_NOT_PRESENT = 0, + OPAL_SHPC_DEV_PRESENT = 1 +}; +enum OpalExceptionHandler { + OPAL_MACHINE_CHECK_HANDLER = 1, + OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2, + OPAL_SOFTPATCH_HANDLER = 3 +}; +enum OpalPendingState { + OPAL_EVENT_OPAL_INTERNAL = 0x1, + OPAL_EVENT_NVRAM = 0x2, + OPAL_EVENT_RTC = 0x4, + OPAL_EVENT_CONSOLE_OUTPUT = 0x8, + OPAL_EVENT_CONSOLE_INPUT = 0x10 +}; + +/* Machine check related definitions */ +enum OpalMCE_Version { + OpalMCE_V1 = 1, +}; + +enum OpalMCE_Severity { + OpalMCE_SEV_NO_ERROR = 0, + OpalMCE_SEV_WARNING = 1, + OpalMCE_SEV_ERROR_SYNC = 2, + OpalMCE_SEV_FATAL = 3, +}; + +enum OpalMCE_Disposition { + OpalMCE_DISPOSITION_RECOVERED = 0, + OpalMCE_DISPOSITION_NOT_RECOVERED = 1, +}; + +enum OpalMCE_Initiator { + OpalMCE_INITIATOR_UNKNOWN = 0, + OpalMCE_INITIATOR_CPU = 1, +}; + +enum OpalMCE_ErrorType { + OpalMCE_ERROR_TYPE_UNKNOWN = 0, + OpalMCE_ERROR_TYPE_UE = 1, + OpalMCE_ERROR_TYPE_SLB = 2, + OpalMCE_ERROR_TYPE_ERAT = 3, + OpalMCE_ERROR_TYPE_TLB = 4, +}; + +enum OpalMCE_UeErrorType { + OpalMCE_UE_ERROR_INDETERMINATE = 0, + OpalMCE_UE_ERROR_IFETCH = 1, + OpalMCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH = 2, + OpalMCE_UE_ERROR_LOAD_STORE = 3, + OpalMCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 4, +}; + +enum OpalMCE_SlbErrorType { + OpalMCE_SLB_ERROR_INDETERMINATE = 0, + OpalMCE_SLB_ERROR_PARITY = 1, + OpalMCE_SLB_ERROR_MULTIHIT = 2, +}; + +enum OpalMCE_EratErrorType { + OpalMCE_ERAT_ERROR_INDETERMINATE = 0, + OpalMCE_ERAT_ERROR_PARITY = 1, + OpalMCE_ERAT_ERROR_MULTIHIT = 2, +}; + +enum OpalMCE_TlbErrorType { + OpalMCE_TLB_ERROR_INDETERMINATE = 0, + OpalMCE_TLB_ERROR_PARITY = 1, + OpalMCE_TLB_ERROR_MULTIHIT = 2, +}; + +enum OpalThreadStatus { + OPAL_THREAD_INACTIVE = 0x0, + OPAL_THREAD_STARTED = 0x1 +}; + +enum OpalPciBusCompare { + OpalPciBusAny = 0, /* Any bus number match */ + OpalPciBus3Bits = 2, /* Match top 3 bits of bus number */ + OpalPciBus4Bits = 3, /* Match top 4 bits of bus number */ + OpalPciBus5Bits = 4, /* Match top 5 bits of bus number */ + OpalPciBus6Bits = 5, /* Match top 6 bits of bus number */ + OpalPciBus7Bits = 6, /* Match top 7 bits of bus number */ + OpalPciBusAll = 7, /* Match bus number exactly */ +}; + +enum OpalDeviceCompare { + OPAL_IGNORE_RID_DEVICE_NUMBER = 0, + OPAL_COMPARE_RID_DEVICE_NUMBER = 1 +}; + +enum OpalFuncCompare { + OPAL_IGNORE_RID_FUNCTION_NUMBER = 0, + OPAL_COMPARE_RID_FUNCTION_NUMBER = 1 +}; + +enum OpalPeAction { + OPAL_UNMAP_PE = 0, + OPAL_MAP_PE = 1 +}; + +enum OpalPciResetAndReinitScope { + OPAL_PHB_COMPLETE = 1, OPAL_PCI_LINK = 2, OPAL_PHB_ERROR = 3, + OPAL_PCI_HOT_RESET = 4, OPAL_PCI_FUNDAMENTAL_RESET = 5, + OPAL_PCI_IODA_RESET = 6, +}; + +enum OpalPciResetState { OPAL_DEASSERT_RESET = 0, OPAL_ASSERT_RESET = 1 }; + +struct opal_machine_check_event { + enum OpalMCE_Version version:8; /* 0x00 */ + uint8_t in_use; /* 0x01 */ + enum OpalMCE_Severity severity:8; /* 0x02 */ + enum OpalMCE_Initiator initiator:8; /* 0x03 */ + enum OpalMCE_ErrorType error_type:8; /* 0x04 */ + enum OpalMCE_Disposition disposition:8; /* 0x05 */ + uint8_t reserved_1[2]; /* 0x06 */ + uint64_t gpr3; /* 0x08 */ + uint64_t srr0; /* 0x10 */ + uint64_t srr1; /* 0x18 */ + union { /* 0x20 */ + struct { + enum OpalMCE_UeErrorType ue_error_type:8; + uint8_t effective_address_provided; + uint8_t physical_address_provided; + uint8_t reserved_1[5]; + uint64_t effective_address; + uint64_t physical_address; + uint8_t reserved_2[8]; + } ue_error; + + struct { + enum OpalMCE_SlbErrorType slb_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } slb_error; + + struct { + enum OpalMCE_EratErrorType erat_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } erat_error; + + struct { + enum OpalMCE_TlbErrorType tlb_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } tlb_error; + } u; +}; + +typedef struct oppanel_line { + /* XXX */ +} oppanel_line_t; + +/* API functions */ +int64_t opal_console_write(int64_t term_number, int64_t *length, + const uint8_t *buffer); +int64_t opal_console_read(int64_t term_number, int64_t *length, + uint8_t *buffer); +int64_t opal_console_write_buffer_space(int64_t term_number, + int64_t *length); +int64_t opal_rtc_read(uint32_t *year_month_day, + uint64_t *hour_minute_second_millisecond); +int64_t opal_rtc_write(uint32_t year_month_day, + uint64_t hour_minute_second_millisecond); +int64_t opal_cec_power_down(uint64_t request); +int64_t opal_cec_reboot(void); +int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); +int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); +int64_t opal_handle_interrupt(uint64_t isn, uint64_t *outstanding_event_mask); +int64_t opal_poll_events(uint64_t *outstanding_event_mask); +int64_t opal_pci_set_hub_tce_memory(uint64_t hub_id, uint64_t tce_mem_addr, + uint64_t tce_mem_size); +int64_t opal_pci_set_phb_tce_memory(uint64_t phb_id, uint64_t tce_mem_addr, + uint64_t tce_mem_size); +int64_t opal_pci_config_read_byte(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint8_t *data); +int64_t opal_pci_config_read_half_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint16_t *data); +int64_t opal_pci_config_read_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint32_t *data); +int64_t opal_pci_config_write_byte(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint8_t data); +int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint16_t data); +int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func, + uint64_t offset, uint32_t data); +int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority); +int64_t opal_get_xive(uint32_t isn, uint16_t *server, uint8_t *priority); +int64_t opal_register_exception_handler(uint64_t opal_exception, + uint64_t handler_address, + uint64_t glue_cache_line); +int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint64_t *phb_status); +int64_t opal_pci_eeh_freeze_clear(uint64_t phb_id, uint64_t pe_number, + uint64_t eeh_action_token); +int64_t opal_pci_shpc(uint64_t phb_id, uint64_t shpc_action, uint8_t *state); + + + +int64_t opal_pci_phb_mmio_enable(uint64_t phb_id, uint16_t window_type, + uint16_t window_num, uint16_t enable); +int64_t opal_pci_set_phb_mem_window(uint64_t phb_id, uint16_t window_type, + uint16_t window_num, + uint64_t starting_real_address, + uint64_t starting_pci_address, + uint16_t segment_size); +int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint16_t pe_number, + uint16_t window_type, uint16_t window_num, + uint16_t segment_num); +int64_t opal_pci_set_phb_table_memory(uint64_t phb_id, uint64_t rtt_addr, + uint64_t ivt_addr, uint64_t ivt_len, + uint64_t reject_array_addr, + uint64_t peltv_addr); +int64_t opal_pci_set_pe(uint64_t phb_id, uint64_t pe_number, uint64_t bus_dev_func, + uint8_t bus_compare, uint8_t dev_compare, uint8_t func_compare, + uint8_t pe_action); +int64_t opal_pci_set_peltv(uint64_t phb_id, uint32_t parent_pe, uint32_t child_pe, + uint8_t state); +int64_t opal_pci_set_mve(uint64_t phb_id, uint32_t mve_number, uint32_t pe_number); +int64_t opal_pci_set_mve_enable(uint64_t phb_id, uint32_t mve_number, + uint32_t state); +int64_t opal_pci_get_xive_reissue(uint64_t phb_id, uint32_t xive_number, + uint8_t *p_bit, uint8_t *q_bit); +int64_t opal_pci_set_xive_reissue(uint64_t phb_id, uint32_t xive_number, + uint8_t p_bit, uint8_t q_bit); +int64_t opal_pci_set_xive_pe(uint64_t phb_id, uint32_t pe_number, + uint32_t xive_num); +int64_t opal_get_xive_source(uint64_t phb_id, uint32_t xive_num, + int32_t *interrupt_source_number); +int64_t opal_get_msi_32(uint64_t phb_id, uint32_t mve_number, uint32_t xive_num, + uint8_t msi_range, uint32_t *msi_address, + uint32_t *message_data); +int64_t opal_get_msi_64(uint64_t phb_id, uint32_t mve_number, + uint32_t xive_num, uint8_t msi_range, + uint64_t *msi_address, uint32_t *message_data); +int64_t opal_start_cpu(uint64_t thread_number, uint64_t start_address); +int64_t opal_query_cpu_status(uint64_t thread_number, uint8_t *thread_status); +int64_t opal_write_oppanel(oppanel_line_t *lines, uint64_t num_lines); +int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, uint16_t pe_number, uint16_t window_id, + uint16_t tce_levels, uint64_t tce_table_addr, + uint64_t tce_table_size, uint64_t tce_page_size); +int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number, + uint16_t dma_window_number, uint64_t pci_start_addr, + uint64_t pci_mem_size); +int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state); + +/* Internal functions */ +extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); + +extern int opal_get_chars(uint32_t vtermno, char *buf, int count); +extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); + +extern void hvc_opal_init_early(void); + +/* Internal functions */ +extern int early_init_dt_scan_opal(unsigned long node, const char *uname, + int depth, void *data); + +#endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7b90c564e932..831a201e03d2 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -54,6 +54,8 @@ #include #include #include +#include + #include #ifdef DEBUG @@ -707,6 +709,11 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_rtas, NULL); #endif +#ifdef CONFIG_PPC_POWERNV + /* Some machines might need OPAL info for debugging, grab it now. */ + of_scan_flat_dt(early_init_dt_scan_opal, NULL); +#endif + #ifdef CONFIG_PHYP_DUMP /* scan tree to see if dump occurred during last boot */ of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 24700e8df19c..74fea5c21839 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -1,11 +1,16 @@ config PPC_POWERNV depends on PPC64 && PPC_BOOK3S bool "IBM PowerNV (Non-Virtualized) platform support" - select PPC_RTAS select PPC_NATIVE select PPC_XICS select PPC_ICP_NATIVE - select PPC_ICS_RTAS select PPC_P7_NAP select PPC_PCI_CHOICE if EMBEDDED default y + +config PPC_POWERNV_RTAS + depends on PPC_POWERNV + bool "Support for RTAS based PowerNV platforms such as BML" + default y + select PPC_ICS_RTAS + select PPC_RTAS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 497133027ae5..8f69c0db612c 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,2 +1,2 @@ -obj-y += setup.o opal-takeover.o +obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S new file mode 100644 index 000000000000..4a3f46d8533e --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -0,0 +1,101 @@ +/* + * PowerNV OPAL API wrappers + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +/* TODO: + * + * - Trace irqs in/off (needs saving/restoring all args, argh...) + * - Get r11 feed up by Dave so I can have better register usage + */ +#define OPAL_CALL(name, token) \ + _GLOBAL(name); \ + mflr r0; \ + mfcr r12; \ + std r0,16(r1); \ + std r12,8(r1); \ + std r1,PACAR1(r13); \ + li r0,0; \ + mfmsr r12; \ + ori r0,r0,MSR_EE; \ + std r12,PACASAVEDMSR(r13); \ + andc r12,r12,r0; \ + mtmsrd r12,1; \ + LOAD_REG_ADDR(r0,.opal_return); \ + mtlr r0; \ + li r0,MSR_DR|MSR_IR; \ + andc r12,r12,r0; \ + li r0,token; \ + mtspr SPRN_HSRR1,r12; \ + LOAD_REG_ADDR(r11,opal); \ + ld r12,8(r11); \ + ld r2,0(r11); \ + mtspr SPRN_HSRR0,r12; \ + hrfid + +_STATIC(opal_return) + ld r2,PACATOC(r13); + ld r4,8(r1); + ld r5,16(r1); + ld r6,PACASAVEDMSR(r13); + mtspr SPRN_SRR0,r5; + mtspr SPRN_SRR1,r6; + mtcr r4; + rfid + +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c new file mode 100644 index 000000000000..8d5510784cc2 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal.c @@ -0,0 +1,154 @@ +/* + * PowerNV OPAL high level interfaces + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include + +#include "powernv.h" + +struct opal { + u64 base; + u64 entry; +} opal; + +static struct device_node *opal_node; +static DEFINE_SPINLOCK(opal_write_lock); + +int __init early_init_dt_scan_opal(unsigned long node, + const char *uname, int depth, void *data) +{ + const void *basep, *entryp; + unsigned long basesz, entrysz; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz); + entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); + + if (!basep || !entryp) + return 1; + + opal.base = of_read_number(basep, basesz/4); + opal.entry = of_read_number(entryp, entrysz/4); + + pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n", + opal.base, basep, basesz); + pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n", + opal.entry, entryp, entrysz); + + powerpc_firmware_features |= FW_FEATURE_OPAL; + if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { + powerpc_firmware_features |= FW_FEATURE_OPALv2; + printk("OPAL V2 detected !\n"); + } else { + printk("OPAL V1 detected !\n"); + } + + return 1; +} + +int opal_get_chars(uint32_t vtermno, char *buf, int count) +{ + s64 len, rc; + u64 evt; + + if (!opal.entry) + return 0; + opal_poll_events(&evt); + if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0) + return 0; + len = count; + rc = opal_console_read(vtermno, &len, buf); + if (rc == OPAL_SUCCESS) + return len; + return 0; +} + +int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +{ + int written = 0; + s64 len, rc = OPAL_BUSY; + unsigned long flags; + u64 evt; + + if (!opal.entry) + return 0; + + /* We want put_chars to be atomic to avoid mangling of hvsi + * packets. To do that, we first test for room and return + * -EAGAIN if there isn't enough + */ + spin_lock_irqsave(&opal_write_lock, flags); + rc = opal_console_write_buffer_space(vtermno, &len); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(&evt); + return -EAGAIN; + } + + /* We still try to handle partial completions, though they + * should no longer happen. + */ + while(total_len > 0 && (rc == OPAL_BUSY || + rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { + len = total_len; + rc = opal_console_write(vtermno, &len, data); + if (rc == OPAL_SUCCESS) { + total_len -= len; + data += len; + written += len; + } + /* This is a bit nasty but we need that for the console to + * flush when there aren't any interrupts. We will clean + * things a bit later to limit that to synchronous path + * such as the kernel console and xmon/udbg + */ + do + opal_poll_events(&evt); + while(rc == OPAL_SUCCESS && (evt & OPAL_EVENT_CONSOLE_OUTPUT)); + } + spin_unlock_irqrestore(&opal_write_lock, flags); + return written; +} + +static int __init opal_init(void) +{ + struct device_node *np, *consoles; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_warn("opal: Node not found\n"); + return -ENODEV; + } + if (firmware_has_feature(FW_FEATURE_OPALv2)) + consoles = of_find_node_by_path("/ibm,opal/consoles"); + else + consoles = of_node_get(opal_node); + + /* Register serial ports */ + for_each_child_of_node(consoles, np) { + if (strcmp(np->name, "serial")) + continue; + of_platform_device_create(np, NULL, NULL); + } + of_node_put(consoles); + return 0; +} +subsys_initcall(opal_init); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 569f9cc4eb04..b6e5ff85cc6f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -74,6 +74,12 @@ static void pnv_show_cpuinfo(struct seq_file *m) if (root) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: PowerNV %s\n", model); + if (firmware_has_feature(FW_FEATURE_OPALv2)) + seq_printf(m, "firmware\t: OPAL v2\n"); + else if (firmware_has_feature(FW_FEATURE_OPAL)) + seq_printf(m, "firmware\t: OPAL v1\n"); + else + seq_printf(m, "firmware\t: BML\n"); of_node_put(root); } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 4f4ec3797eb6..e87736685243 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "powernv.h" @@ -62,6 +63,28 @@ static int pnv_smp_cpu_bootable(unsigned int nr) return 1; } +int __devinit pnv_smp_kick_cpu(int nr) +{ + unsigned int pcpu = get_hard_smp_processor_id(nr); + unsigned long start_here = __pa(*((unsigned long *) + generic_secondary_smp_init)); + long rc; + + BUG_ON(nr < 0 || nr >= NR_CPUS); + + /* On OPAL v2 the CPU are still spinning inside OPAL itself, + * get them back now + */ + if (firmware_has_feature(FW_FEATURE_OPALv2)) { + pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu); + rc = opal_start_cpu(pcpu, start_here); + if (rc != OPAL_SUCCESS) + pr_warn("OPAL Error %ld starting CPU %d\n", + rc, nr); + } + return smp_generic_kick_cpu(nr); +} + #ifdef CONFIG_HOTPLUG_CPU static int pnv_smp_cpu_disable(void) @@ -127,7 +150,7 @@ static struct smp_ops_t pnv_smp_ops = { .message_pass = smp_muxed_ipi_message_pass, .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ .probe = xics_smp_probe, - .kick_cpu = smp_generic_kick_cpu, + .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, .cpu_bootable = pnv_smp_cpu_bootable, #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 6e35d5dac0c83ebb616ff3b9c2d6155c9a9ccb86 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 18:28:01 +0000 Subject: powerpc/powernv: Add support for instanciating OPAL v2 from Open Firmware OPAL v2 is instantiated in a way similar to RTAS using Open Firmware client interface calls, and the resulting address and entry point are put in the device-tree Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom_init.c | 136 ++++++++++++++++++++++++++++++++++------ 1 file changed, 117 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index e3f390427216..e96f5d0d2c78 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -140,7 +140,9 @@ struct mem_map_entry { typedef u32 cell_t; -extern void __start(unsigned long r3, unsigned long r4, unsigned long r5); +extern void __start(unsigned long r3, unsigned long r4, unsigned long r5, + unsigned long r6, unsigned long r7, unsigned long r8, + unsigned long r9); #ifdef CONFIG_PPC64 extern int enter_prom(struct prom_args *args, unsigned long entry); @@ -1293,6 +1295,11 @@ static int __initdata prom_rtas_start_cpu; static u64 __initdata prom_rtas_data; static u64 __initdata prom_rtas_entry; +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL +static u64 __initdata prom_opal_base; +static u64 __initdata prom_opal_entry; +#endif + /* XXX Don't change this structure without updating opal-takeover.S */ static struct opal_secondary_data { s64 ack; /* 0 */ @@ -1468,6 +1475,76 @@ static void prom_opal_takeover(void) for (;;) opal_do_takeover(args); } + +/* + * Allocate room for and instantiate OPAL + */ +static void __init prom_instantiate_opal(void) +{ + phandle opal_node; + ihandle opal_inst; + u64 base, entry; + u64 size = 0, align = 0x10000; + u32 rets[2]; + + prom_debug("prom_instantiate_opal: start...\n"); + + opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal")); + prom_debug("opal_node: %x\n", opal_node); + if (!PHANDLE_VALID(opal_node)) + return; + + prom_getprop(opal_node, "opal-runtime-size", &size, sizeof(size)); + if (size == 0) + return; + prom_getprop(opal_node, "opal-runtime-alignment", &align, + sizeof(align)); + + base = alloc_down(size, align, 0); + if (base == 0) { + prom_printf("OPAL allocation failed !\n"); + return; + } + + opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal")); + if (!IHANDLE_VALID(opal_inst)) { + prom_printf("opening opal package failed (%x)\n", opal_inst); + return; + } + + prom_printf("instantiating opal at 0x%x...", base); + + if (call_prom_ret("call-method", 4, 3, rets, + ADDR("load-opal-runtime"), + opal_inst, + base >> 32, base & 0xffffffff) != 0 + || (rets[0] == 0 && rets[1] == 0)) { + prom_printf(" failed\n"); + return; + } + entry = (((u64)rets[0]) << 32) | rets[1]; + + prom_printf(" done\n"); + + reserve_mem(base, size); + + prom_debug("opal base = 0x%x\n", base); + prom_debug("opal align = 0x%x\n", align); + prom_debug("opal entry = 0x%x\n", entry); + prom_debug("opal size = 0x%x\n", (long)size); + + prom_setprop(opal_node, "/ibm,opal", "opal-base-address", + &base, sizeof(base)); + prom_setprop(opal_node, "/ibm,opal", "opal-entry-address", + &entry, sizeof(entry)); + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + RELOC(prom_opal_base) = base; + RELOC(prom_opal_entry) = entry; +#endif + prom_debug("prom_instantiate_opal: end...\n"); +} + #endif /* CONFIG_PPC_POWERNV */ /* @@ -1863,7 +1940,7 @@ static int __init prom_find_machine_type(void) int x; #endif - /* Look for a PowerMac */ + /* Look for a PowerMac or a Cell */ len = prom_getprop(_prom->root, "compatible", compat, sizeof(compat)-1); if (len > 0) { @@ -1889,7 +1966,11 @@ static int __init prom_find_machine_type(void) } } #ifdef CONFIG_PPC64 - /* If not a mac, try to figure out if it's an IBM pSeries or any other + /* Try to detect OPAL */ + if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal")))) + return PLATFORM_OPAL; + + /* Try to figure out if it's an IBM pSeries or any other * PAPR compliant platform. We assume it is if : * - /device_type is "chrp" (please, do NOT use that for future * non-IBM designs ! @@ -2116,7 +2197,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, unsigned long soff; unsigned char *valp; static char pname[MAX_PROPERTY_NAME]; - int l, room; + int l, room, has_phandle = 0; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); @@ -2200,19 +2281,26 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, valp = make_room(mem_start, mem_end, l, 4); call_prom("getprop", 4, 1, node, RELOC(pname), valp, l); *mem_start = _ALIGN(*mem_start, 4); + + if (!strcmp(RELOC(pname), RELOC("phandle"))) + has_phandle = 1; } - /* Add a "linux,phandle" property. */ - soff = dt_find_string(RELOC("linux,phandle")); - if (soff == 0) - prom_printf("WARNING: Can't find string index for" - " node %s\n", path); - else { - dt_push_token(OF_DT_PROP, mem_start, mem_end); - dt_push_token(4, mem_start, mem_end); - dt_push_token(soff, mem_start, mem_end); - valp = make_room(mem_start, mem_end, 4, 4); - *(u32 *)valp = node; + /* Add a "linux,phandle" property if no "phandle" property already + * existed (can happen with OPAL) + */ + if (!has_phandle) { + soff = dt_find_string(RELOC("linux,phandle")); + if (soff == 0) + prom_printf("WARNING: Can't find string index for" + " node %s\n", path); + else { + dt_push_token(OF_DT_PROP, mem_start, mem_end); + dt_push_token(4, mem_start, mem_end); + dt_push_token(soff, mem_start, mem_end); + valp = make_room(mem_start, mem_end, 4, 4); + *(u32 *)valp = node; + } } /* do all our children */ @@ -2746,6 +2834,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * between pSeries SMP and pSeries LPAR */ RELOC(of_platform) = prom_find_machine_type(); + prom_printf("Detected machine type: %x\n", RELOC(of_platform)); #ifndef CONFIG_RELOCATABLE /* Bail if this is a kdump kernel. */ @@ -2807,7 +2896,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * On non-powermacs, try to instantiate RTAS. PowerMacs don't * have a usable RTAS implementation. */ - if (RELOC(of_platform) != PLATFORM_POWERMAC) + if (RELOC(of_platform) != PLATFORM_POWERMAC && + RELOC(of_platform) != PLATFORM_OPAL) prom_instantiate_rtas(); #ifdef CONFIG_PPC_POWERNV @@ -2818,7 +2908,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, prom_opal_hold_cpus(); prom_opal_takeover(); } - } + } else if (RELOC(of_platform) == PLATFORM_OPAL) + prom_instantiate_opal(); #endif /* @@ -2826,7 +2917,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * * PowerMacs use a different mechanism to spin CPUs */ - if (RELOC(of_platform) != PLATFORM_POWERMAC) + if (RELOC(of_platform) != PLATFORM_POWERMAC && + RELOC(of_platform) != PLATFORM_OPAL) prom_hold_cpus(); /* @@ -2894,7 +2986,13 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, reloc_got2(-offset); #endif - __start(hdr, kbase, 0); +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* OPAL early debug gets the OPAL base & entry in r8 and r9 */ + __start(hdr, kbase, 0, 0, 0, + RELOC(prom_opal_base), RELOC(prom_opal_entry)); +#else + __start(hdr, kbase, 0, 0, 0, 0, 0); +#endif return 0; } -- cgit v1.2.3 From daea1175a9f0f70eab5b33e2827d57ba8c686816 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:44:59 +0000 Subject: powerpc/powernv: Support for OPAL console This adds a udbg and an hvc console backend for supporting a console using the OPAL console interfaces. On OPAL v1 we have hvc0 mapped to whatever console the system was configured for (network or hvsi serial port) via the service processor. On OPAL v2 we have hvcN mapped to the Nth console provided by OPAL which generally corresponds to: hvc0 : network console (raw protocol) hvc1 : serial port S1 (hvsi) hvc2 : serial port S2 (hvsi) Note: At this point, early debug console only works with OPAL v1 and shouldn't be enabled in a normal kernel. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig.debug | 31 +++ arch/powerpc/include/asm/opal.h | 5 + arch/powerpc/include/asm/udbg.h | 2 + arch/powerpc/kernel/head_64.S | 14 +- arch/powerpc/kernel/udbg.c | 4 + arch/powerpc/platforms/powernv/opal.c | 31 ++- arch/powerpc/platforms/powernv/setup.c | 14 +- drivers/tty/hvc/Kconfig | 9 + drivers/tty/hvc/Makefile | 1 + drivers/tty/hvc/hvc_opal.c | 424 +++++++++++++++++++++++++++++++++ drivers/tty/hvc/hvsi_lib.c | 4 +- 11 files changed, 517 insertions(+), 22 deletions(-) create mode 100644 drivers/tty/hvc/hvc_opal.c (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 06eb62b0fd90..1b8a9c905cf7 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -265,8 +265,27 @@ config PPC_EARLY_DEBUG_PS3GELIC Select this to enable early debugging for the PlayStation3 via UDP broadcasts sent out through the Ethernet port. +config PPC_EARLY_DEBUG_OPAL_RAW + bool "OPAL raw console" + depends on HVC_OPAL + help + Select this to enable early debugging for the PowerNV platform + using a "raw" console + +config PPC_EARLY_DEBUG_OPAL_HVSI + bool "OPAL hvsi console" + depends on HVC_OPAL + help + Select this to enable early debugging for the PowerNV platform + using an "hvsi" console + endchoice +config PPC_EARLY_DEBUG_OPAL + def_bool y + depends on PPC_EARLY_DEBUG_OPAL_RAW || PPC_EARLY_DEBUG_OPAL_HVSI + + config PPC_EARLY_DEBUG_HVSI_VTERMNO hex "vterm number to use with early debug HVSI" depends on PPC_EARLY_DEBUG_LPAR_HVSI @@ -275,6 +294,18 @@ config PPC_EARLY_DEBUG_HVSI_VTERMNO You probably want 0x30000000 for your first serial port and 0x30000001 for your second one +config PPC_EARLY_DEBUG_OPAL_VTERMNO + hex "vterm number to use with OPAL early debug" + depends on PPC_EARLY_DEBUG_OPAL + default "0" + help + This correspond to which /dev/hvcN you want to use for early + debug. + + On OPAL v1 (takeover) this should always be 0 + On OPAL v2, this will be 0 for network console and 1 or 2 for + the machine built-in serial ports. + config PPC_EARLY_DEBUG_44x_PHYSLOW hex "Low 32 bits of early debug UART physical address" depends on PPC_EARLY_DEBUG_44x diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index c7a3202d10a0..749de00a02d5 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -425,6 +425,11 @@ extern void hvc_opal_init_early(void); extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); +extern int opal_get_chars(uint32_t vtermno, char *buf, int count); +extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); + +extern void hvc_opal_init_early(void); + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 7cf796fa03f2..6587ec7bc6ec 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -55,6 +55,8 @@ extern void __init udbg_init_cpm(void); extern void __init udbg_init_usbgecko(void); extern void __init udbg_init_wsp(void); extern void __init udbg_init_ps3gelic(void); +extern void __init udbg_init_debug_opal_raw(void); +extern void __init udbg_init_debug_opal_hvsi(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_UDBG_H */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index dea8191253d2..06c7251c1bf7 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -53,7 +53,8 @@ * 2. The kernel is entered at __start * -or- For OPAL entry: * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 - * with device-tree in gpr3 + * with device-tree in gpr3. We also get OPAL base in r8 and + * entry in r9 for debugging purposes * 2. Secondary processors enter at 0x60 with PIR in gpr3 * * For iSeries: @@ -335,6 +336,11 @@ _GLOBAL(__start_initialization_multiplatform) /* Save parameters */ mr r31,r3 mr r30,r4 +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* Save OPAL entry */ + mr r28,r8 + mr r29,r9 +#endif #ifdef CONFIG_PPC_BOOK3E bl .start_initialization_book3e @@ -711,6 +717,12 @@ _INIT_STATIC(start_here_multiplatform) bdnz 3b 4: +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* Setup OPAL entry */ + std r28,0(r11); + std r29,8(r11); +#endif + #ifndef CONFIG_PPC_BOOK3E mfmsr r6 ori r6,r6,MSR_RI diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 5b3e98e03158..35f948203ec5 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -69,6 +69,10 @@ void __init udbg_early_init(void) udbg_init_wsp(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) udbg_init_ps3gelic(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_RAW) + udbg_init_debug_opal_raw(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI) + udbg_init_debug_opal_hvsi(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 8d5510784cc2..7887733b9b31 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -67,7 +67,7 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) u64 evt; if (!opal.entry) - return 0; + return -ENODEV; opal_poll_events(&evt); if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0) return 0; @@ -81,31 +81,38 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) int opal_put_chars(uint32_t vtermno, const char *data, int total_len) { int written = 0; - s64 len, rc = OPAL_BUSY; + s64 len, rc; unsigned long flags; u64 evt; if (!opal.entry) - return 0; + return -ENODEV; /* We want put_chars to be atomic to avoid mangling of hvsi * packets. To do that, we first test for room and return - * -EAGAIN if there isn't enough + * -EAGAIN if there isn't enough. + * + * Unfortunately, opal_console_write_buffer_space() doesn't + * appear to work on opal v1, so we just assume there is + * enough room and be done with it */ spin_lock_irqsave(&opal_write_lock, flags); - rc = opal_console_write_buffer_space(vtermno, &len); - if (rc || len < total_len) { - spin_unlock_irqrestore(&opal_write_lock, flags); - /* Closed -> drop characters */ - if (rc) - return total_len; - opal_poll_events(&evt); - return -EAGAIN; + if (firmware_has_feature(FW_FEATURE_OPALv2)) { + rc = opal_console_write_buffer_space(vtermno, &len); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(&evt); + return -EAGAIN; + } } /* We still try to handle partial completions, though they * should no longer happen. */ + rc = OPAL_BUSY; while(total_len > 0 && (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { len = total_len; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index b6e5ff85cc6f..07ba1ecd1807 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -29,17 +29,12 @@ #include #include #include +#include #include "powernv.h" static void __init pnv_setup_arch(void) { - /* Force console to hvc for now until we have sorted out the - * real console situation for the platform. This will make - * hvc_udbg work at least. - */ - add_preferred_console("hvc", 0, NULL); - /* Initialize SMP */ pnv_smp_init(); @@ -55,7 +50,12 @@ static void __init pnv_setup_arch(void) static void __init pnv_init_early(void) { - /* XXX IOMMU */ +#ifdef CONFIG_HVC_OPAL + if (firmware_has_feature(FW_FEATURE_OPAL)) + hvc_opal_init_early(); + else +#endif + add_preferred_console("hvc", 0, NULL); } static void __init pnv_init_IRQ(void) diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index e371753ba921..4222035acfb7 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -34,6 +34,15 @@ config HVC_ISERIES help iSeries machines support a hypervisor virtual console. +config HVC_OPAL + bool "OPAL Console support" + depends on PPC_POWERNV + select HVC_DRIVER + select HVC_IRQ + default y + help + PowerNV machines running under OPAL need that driver to get a console + config HVC_RTAS bool "IBM RTAS Console support" depends on PPC_RTAS diff --git a/drivers/tty/hvc/Makefile b/drivers/tty/hvc/Makefile index e29205316376..89abf40bc73d 100644 --- a/drivers/tty/hvc/Makefile +++ b/drivers/tty/hvc/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi_lib.o +obj-$(CONFIG_HVC_OPAL) += hvc_opal.o hvsi_lib.o obj-$(CONFIG_HVC_OLD_HVSI) += hvsi.o obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c new file mode 100644 index 000000000000..7b38512d6c41 --- /dev/null +++ b/drivers/tty/hvc/hvc_opal.c @@ -0,0 +1,424 @@ +/* + * opal driver interface to hvc_console.c + * + * Copyright 2011 Benjamin Herrenschmidt , IBM Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hvc_console.h" + +static const char hvc_opal_name[] = "hvc_opal"; + +static struct of_device_id hvc_opal_match[] __devinitdata = { + { .name = "serial", .compatible = "ibm,opal-console-raw" }, + { .name = "serial", .compatible = "ibm,opal-console-hvsi" }, + { }, +}; + +typedef enum hv_protocol { + HV_PROTOCOL_RAW, + HV_PROTOCOL_HVSI +} hv_protocol_t; + +struct hvc_opal_priv { + hv_protocol_t proto; /* Raw data or HVSI packets */ + struct hvsi_priv hvsi; /* HVSI specific data */ +}; +static struct hvc_opal_priv *hvc_opal_privs[MAX_NR_HVC_CONSOLES]; + +/* For early boot console */ +static struct hvc_opal_priv hvc_opal_boot_priv; +static u32 hvc_opal_boot_termno; + +static const struct hv_ops hvc_opal_raw_ops = { + .get_chars = opal_get_chars, + .put_chars = opal_put_chars, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, + .notifier_hangup = notifier_hangup_irq, +}; + +static int hvc_opal_hvsi_get_chars(uint32_t vtermno, char *buf, int count) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[vtermno]; + + if (WARN_ON(!pv)) + return -ENODEV; + + return hvsilib_get_chars(&pv->hvsi, buf, count); +} + +static int hvc_opal_hvsi_put_chars(uint32_t vtermno, const char *buf, int count) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[vtermno]; + + if (WARN_ON(!pv)) + return -ENODEV; + + return hvsilib_put_chars(&pv->hvsi, buf, count); +} + +static int hvc_opal_hvsi_open(struct hvc_struct *hp, int data) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + int rc; + + pr_devel("HVSI@%x: do open !\n", hp->vtermno); + + rc = notifier_add_irq(hp, data); + if (rc) + return rc; + + return hvsilib_open(&pv->hvsi, hp); +} + +static void hvc_opal_hvsi_close(struct hvc_struct *hp, int data) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + pr_devel("HVSI@%x: do close !\n", hp->vtermno); + + hvsilib_close(&pv->hvsi, hp); + + notifier_del_irq(hp, data); +} + +void hvc_opal_hvsi_hangup(struct hvc_struct *hp, int data) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + pr_devel("HVSI@%x: do hangup !\n", hp->vtermno); + + hvsilib_close(&pv->hvsi, hp); + + notifier_hangup_irq(hp, data); +} + +static int hvc_opal_hvsi_tiocmget(struct hvc_struct *hp) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + if (!pv) + return -EINVAL; + return pv->hvsi.mctrl; +} + +static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set, + unsigned int clear) +{ + struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno]; + + pr_devel("HVSI@%x: Set modem control, set=%x,clr=%x\n", + hp->vtermno, set, clear); + + if (set & TIOCM_DTR) + hvsilib_write_mctrl(&pv->hvsi, 1); + else if (clear & TIOCM_DTR) + hvsilib_write_mctrl(&pv->hvsi, 0); + + return 0; +} + +static const struct hv_ops hvc_opal_hvsi_ops = { + .get_chars = hvc_opal_hvsi_get_chars, + .put_chars = hvc_opal_hvsi_put_chars, + .notifier_add = hvc_opal_hvsi_open, + .notifier_del = hvc_opal_hvsi_close, + .notifier_hangup = hvc_opal_hvsi_hangup, + .tiocmget = hvc_opal_hvsi_tiocmget, + .tiocmset = hvc_opal_hvsi_tiocmset, +}; + +static int __devinit hvc_opal_probe(struct platform_device *dev) +{ + const struct hv_ops *ops; + struct hvc_struct *hp; + struct hvc_opal_priv *pv; + hv_protocol_t proto; + unsigned int termno, boot = 0; + const __be32 *reg; + + if (of_device_is_compatible(dev->dev.of_node, "ibm,opal-console-raw")) { + proto = HV_PROTOCOL_RAW; + ops = &hvc_opal_raw_ops; + } else if (of_device_is_compatible(dev->dev.of_node, + "ibm,opal-console-hvsi")) { + proto = HV_PROTOCOL_HVSI; + ops = &hvc_opal_hvsi_ops; + } else { + pr_err("hvc_opal: Unkown protocol for %s\n", + dev->dev.of_node->full_name); + return -ENXIO; + } + + reg = of_get_property(dev->dev.of_node, "reg", NULL); + termno = reg ? be32_to_cpup(reg) : 0; + + /* Is it our boot one ? */ + if (hvc_opal_privs[termno] == &hvc_opal_boot_priv) { + pv = hvc_opal_privs[termno]; + boot = 1; + } else if (hvc_opal_privs[termno] == NULL) { + pv = kzalloc(sizeof(struct hvc_opal_priv), GFP_KERNEL); + if (!pv) + return -ENOMEM; + pv->proto = proto; + hvc_opal_privs[termno] = pv; + if (proto == HV_PROTOCOL_HVSI) + hvsilib_init(&pv->hvsi, opal_get_chars, opal_put_chars, + termno, 0); + + /* Instanciate now to establish a mapping index==vtermno */ + hvc_instantiate(termno, termno, ops); + } else { + pr_err("hvc_opal: Device %s has duplicate terminal number #%d\n", + dev->dev.of_node->full_name, termno); + return -ENXIO; + } + + pr_info("hvc%d: %s protocol on %s%s\n", termno, + proto == HV_PROTOCOL_RAW ? "raw" : "hvsi", + dev->dev.of_node->full_name, + boot ? " (boot console)" : ""); + + /* We don't do IRQ yet */ + hp = hvc_alloc(termno, 0, ops, MAX_VIO_PUT_CHARS); + if (IS_ERR(hp)) + return PTR_ERR(hp); + dev_set_drvdata(&dev->dev, hp); + + return 0; +} + +static int __devexit hvc_opal_remove(struct platform_device *dev) +{ + struct hvc_struct *hp = dev_get_drvdata(&dev->dev); + int rc, termno; + + termno = hp->vtermno; + rc = hvc_remove(hp); + if (rc == 0) { + if (hvc_opal_privs[termno] != &hvc_opal_boot_priv) + kfree(hvc_opal_privs[termno]); + hvc_opal_privs[termno] = NULL; + } + return rc; +} + +static struct platform_driver hvc_opal_driver = { + .probe = hvc_opal_probe, + .remove = __devexit_p(hvc_opal_remove), + .driver = { + .name = hvc_opal_name, + .owner = THIS_MODULE, + .of_match_table = hvc_opal_match, + } +}; + +static int __init hvc_opal_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + /* Register as a vio device to receive callbacks */ + return platform_driver_register(&hvc_opal_driver); +} +module_init(hvc_opal_init); + +static void __exit hvc_opal_exit(void) +{ + platform_driver_unregister(&hvc_opal_driver); +} +module_exit(hvc_opal_exit); + +static void udbg_opal_putc(char c) +{ + unsigned int termno = hvc_opal_boot_termno; + int count = -1; + + if (c == '\n') + udbg_opal_putc('\r'); + + do { + switch(hvc_opal_boot_priv.proto) { + case HV_PROTOCOL_RAW: + count = opal_put_chars(termno, &c, 1); + break; + case HV_PROTOCOL_HVSI: + count = hvc_opal_hvsi_put_chars(termno, &c, 1); + break; + } + } while(count == 0 || count == -EAGAIN); +} + +static int udbg_opal_getc_poll(void) +{ + unsigned int termno = hvc_opal_boot_termno; + int rc = 0; + char c; + + switch(hvc_opal_boot_priv.proto) { + case HV_PROTOCOL_RAW: + rc = opal_get_chars(termno, &c, 1); + break; + case HV_PROTOCOL_HVSI: + rc = hvc_opal_hvsi_get_chars(termno, &c, 1); + break; + } + if (!rc) + return -1; + return c; +} + +static int udbg_opal_getc(void) +{ + int ch; + for (;;) { + ch = udbg_opal_getc_poll(); + if (ch == -1) { + /* This shouldn't be needed...but... */ + volatile unsigned long delay; + for (delay=0; delay < 2000000; delay++) + ; + } else { + return ch; + } + } +} + +static void udbg_init_opal_common(void) +{ + udbg_putc = udbg_opal_putc; + udbg_getc = udbg_opal_getc; + udbg_getc_poll = udbg_opal_getc_poll; + tb_ticks_per_usec = 0x200; /* Make udelay not suck */ +} + +void __init hvc_opal_init_early(void) +{ + struct device_node *stdout_node = NULL; + const u32 *termno; + const char *name = NULL; + const struct hv_ops *ops; + u32 index; + + /* find the boot console from /chosen/stdout */ + if (of_chosen) + name = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (name) { + stdout_node = of_find_node_by_path(name); + if (!stdout_node) { + pr_err("hvc_opal: Failed to locate default console!\n"); + return; + } + } else { + struct device_node *opal, *np; + + /* Current OPAL takeover doesn't provide the stdout + * path, so we hard wire it + */ + opal = of_find_node_by_path("/ibm,opal/consoles"); + if (opal) + pr_devel("hvc_opal: Found consoles in new location\n"); + if (!opal) { + opal = of_find_node_by_path("/ibm,opal"); + if (opal) + pr_devel("hvc_opal: " + "Found consoles in old location\n"); + } + if (!opal) + return; + for_each_child_of_node(opal, np) { + if (!strcmp(np->name, "serial")) { + stdout_node = np; + break; + } + } + of_node_put(opal); + } + if (!stdout_node) + return; + termno = of_get_property(stdout_node, "reg", NULL); + index = termno ? *termno : 0; + if (index >= MAX_NR_HVC_CONSOLES) + return; + hvc_opal_privs[index] = &hvc_opal_boot_priv; + + /* Check the protocol */ + if (of_device_is_compatible(stdout_node, "ibm,opal-console-raw")) { + hvc_opal_boot_priv.proto = HV_PROTOCOL_RAW; + ops = &hvc_opal_raw_ops; + pr_devel("hvc_opal: Found RAW console\n"); + } + else if (of_device_is_compatible(stdout_node,"ibm,opal-console-hvsi")) { + hvc_opal_boot_priv.proto = HV_PROTOCOL_HVSI; + ops = &hvc_opal_hvsi_ops; + hvsilib_init(&hvc_opal_boot_priv.hvsi, opal_get_chars, + opal_put_chars, index, 1); + /* HVSI, perform the handshake now */ + hvsilib_establish(&hvc_opal_boot_priv.hvsi); + pr_devel("hvc_opal: Found HVSI console\n"); + } else + goto out; + hvc_opal_boot_termno = index; + udbg_init_opal_common(); + add_preferred_console("hvc", index, NULL); + hvc_instantiate(index, index, ops); +out: + of_node_put(stdout_node); +} + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL_RAW +void __init udbg_init_debug_opal(void) +{ + u32 index = CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO; + hvc_opal_privs[index] = &hvc_opal_boot_priv; + hvc_opal_boot_priv.proto = HV_PROTOCOL_RAW; + hvc_opal_boot_termno = index; + udbg_init_opal_common(); +} +#endif /* CONFIG_PPC_EARLY_DEBUG_OPAL_RAW */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI +void __init udbg_init_debug_opal_hvsi(void) +{ + u32 index = CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO; + hvc_opal_privs[index] = &hvc_opal_boot_priv; + hvc_opal_boot_termno = index; + udbg_init_opal_common(); + hvsilib_init(&hvc_opal_boot_priv.hvsi, opal_get_chars, opal_put_chars, + index, 1); + hvsilib_establish(&hvc_opal_boot_priv.hvsi); +} +#endif /* CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI */ diff --git a/drivers/tty/hvc/hvsi_lib.c b/drivers/tty/hvc/hvsi_lib.c index bd9b09827b24..6f4dd83d8695 100644 --- a/drivers/tty/hvc/hvsi_lib.c +++ b/drivers/tty/hvc/hvsi_lib.c @@ -183,7 +183,7 @@ int hvsilib_get_chars(struct hvsi_priv *pv, char *buf, int count) unsigned int tries, read = 0; if (WARN_ON(!pv)) - return 0; + return -ENXIO; /* If we aren't open, don't do anything in order to avoid races * with connection establishment. The hvc core will call this @@ -234,7 +234,7 @@ int hvsilib_put_chars(struct hvsi_priv *pv, const char *buf, int count) int rc, adjcount = min(count, HVSI_MAX_OUTGOING_DATA); if (WARN_ON(!pv)) - return 0; + return -ENODEV; dp.hdr.type = VS_DATA_PACKET_HEADER; dp.hdr.len = adjcount + sizeof(struct hvsi_header); -- cgit v1.2.3 From ed79ba9e15f84cef05aba5cbfe6e93f9b43c31f4 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Sep 2011 17:45:04 +0000 Subject: powerpc/powernv: Machine check and other system interrupts OPAL can handle various interrupt for us such as Machine Checks (it performs all sorts of recovery tasks and passes back control to us with informations about the error), Hardware Management Interrupts and Softpatch interrupts. This wires up the mechanisms and prints out specific informations returned by HAL when a machine check occurs. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 2 + arch/powerpc/include/asm/paca.h | 8 ++ arch/powerpc/kernel/asm-offsets.c | 10 +++ arch/powerpc/kernel/exceptions-64s.S | 27 ++++++- arch/powerpc/platforms/powernv/opal.c | 130 +++++++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/setup.c | 1 + 6 files changed, 174 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 77ebe50020a2..2893e8f5406d 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -436,6 +436,8 @@ extern void opal_get_rtc_time(struct rtc_time *tm); extern unsigned long opal_get_boot_time(void); extern void opal_nvram_init(void); +extern int opal_machine_check(struct pt_regs *regs); + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 516bfb3f47d9..17722c73ba2e 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -43,6 +43,7 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */ #define get_slb_shadow() (get_paca()->slb_shadow_ptr) struct task_struct; +struct opal_machine_check_event; /* * Defines the layout of the paca. @@ -135,6 +136,13 @@ struct paca_struct { u8 io_sync; /* writel() needs spin_unlock sync */ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ +#ifdef CONFIG_PPC_POWERNV + /* Pointer to OPAL machine check event structure set by the + * early exception handler for use by high level C handler + */ + struct opal_machine_check_event *opal_mc_evt; +#endif + /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ u64 system_time; /* accumulated system TB ticks */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 5f078bc2063e..536ffa897c6c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -48,6 +48,9 @@ #ifdef CONFIG_PPC_ISERIES #include #endif +#ifdef CONFIG_PPC_POWERNV +#include +#endif #if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST) #include #endif @@ -609,5 +612,12 @@ int main(void) arch.timing_last_enter.tv32.tbl)); #endif +#ifdef CONFIG_PPC_POWERNV + DEFINE(OPAL_MC_GPR3, offsetof(struct opal_machine_check_event, gpr3)); + DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0)); + DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1)); + DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt)); +#endif + return 0; } diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 41b02c792aa3..d51458fa8dee 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1143,7 +1143,7 @@ _GLOBAL(do_stab_bolted) rfid b . /* prevent speculative execution */ -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. * This address (0x7000) is fixed by the RPA. @@ -1151,7 +1151,7 @@ _GLOBAL(do_stab_bolted) .= 0x7000 .globl fwnmi_data_area fwnmi_data_area: -#endif /* CONFIG_PPC_PSERIES */ +#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ /* iSeries does not use the FWNMI stuff, so it is safe to put * this here, even if we later allow kernels that will boot on @@ -1176,9 +1176,12 @@ xLparMap: #endif /* CONFIG_PPC_ISERIES */ -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) + /* pseries and powernv need to keep the whole page from + * 0x7000 to 0x8000 free for use by the firmware + */ . = 0x8000 -#endif /* CONFIG_PPC_PSERIES */ +#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ /* * Space for CPU0's segment table. @@ -1193,3 +1196,19 @@ xLparMap: .globl initial_stab initial_stab: .space 4096 +#ifdef CONFIG_PPC_POWERNV +_GLOBAL(opal_mc_secondary_handler) + HMT_MEDIUM + SET_SCRATCH0(r13) + GET_PACA(r13) + clrldi r3,r3,2 + tovirt(r3,r3) + std r3,PACA_OPAL_MC_EVT(r13) + ld r13,OPAL_MC_SRR0(r3) + mtspr SPRN_SRR0,r13 + ld r13,OPAL_MC_SRR1(r3) + mtspr SPRN_SRR1,r13 + ld r3,OPAL_MC_GPR3(r3) + GET_SCRATCH0(r13) + b machine_check_pSeries +#endif /* CONFIG_PPC_POWERNV */ diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 5a598caefcba..aaa0dba49471 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -27,12 +27,14 @@ struct opal { static struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); +extern u64 opal_mc_secondary_handler[]; int __init early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data) { const void *basep, *entryp; unsigned long basesz, entrysz; + u64 glue; if (depth != 1 || strcmp(uname, "ibm,opal") != 0) return 0; @@ -59,6 +61,19 @@ int __init early_init_dt_scan_opal(unsigned long node, printk("OPAL V1 detected !\n"); } + /* Hookup some exception handlers. We use the fwnmi area at 0x7000 + * to provide the glue space to OPAL + */ + glue = 0x7000; + opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER, + __pa(opal_mc_secondary_handler[0]), + glue); + glue += 128; + opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER, + 0, glue); + glue += 128; + opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); + return 1; } @@ -136,6 +151,121 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) return written; } +int opal_machine_check(struct pt_regs *regs) +{ + struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt; + struct opal_machine_check_event evt; + const char *level, *sevstr, *subtype; + static const char *opal_mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + static const char *opal_mc_slb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *opal_mc_erat_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *opal_mc_tlb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + + /* Copy the event structure and release the original */ + evt = *opal_evt; + opal_evt->in_use = 0; + + /* Print things out */ + if (evt.version != OpalMCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; + } + switch(evt.severity) { + case OpalMCE_SEV_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case OpalMCE_SEV_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case OpalMCE_SEV_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case OpalMCE_SEV_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + evt.disposition == OpalMCE_DISPOSITION_RECOVERED ? + "Recovered" : "[Not recovered"); + printk("%s Initiator: %s\n", level, + evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown"); + switch(evt.error_type) { + case OpalMCE_ERROR_TYPE_UE: + subtype = evt.u.ue_error.ue_error_type < + ARRAY_SIZE(opal_mc_ue_types) ? + opal_mc_ue_types[evt.u.ue_error.ue_error_type] + : "Unknown"; + printk("%s Error type: UE [%s]\n", level, subtype); + if (evt.u.ue_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.ue_error.effective_address); + if (evt.u.ue_error.physical_address_provided) + printk("%s Physial address: %016llx\n", + level, evt.u.ue_error.physical_address); + break; + case OpalMCE_ERROR_TYPE_SLB: + subtype = evt.u.slb_error.slb_error_type < + ARRAY_SIZE(opal_mc_slb_types) ? + opal_mc_slb_types[evt.u.slb_error.slb_error_type] + : "Unknown"; + printk("%s Error type: SLB [%s]\n", level, subtype); + if (evt.u.slb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.slb_error.effective_address); + break; + case OpalMCE_ERROR_TYPE_ERAT: + subtype = evt.u.erat_error.erat_error_type < + ARRAY_SIZE(opal_mc_erat_types) ? + opal_mc_erat_types[evt.u.erat_error.erat_error_type] + : "Unknown"; + printk("%s Error type: ERAT [%s]\n", level, subtype); + if (evt.u.erat_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.erat_error.effective_address); + break; + case OpalMCE_ERROR_TYPE_TLB: + subtype = evt.u.tlb_error.tlb_error_type < + ARRAY_SIZE(opal_mc_tlb_types) ? + opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type] + : "Unknown"; + printk("%s Error type: TLB [%s]\n", level, subtype); + if (evt.u.tlb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt.u.tlb_error.effective_address); + break; + default: + case OpalMCE_ERROR_TYPE_UNKNOWN: + printk("%s Error type: Unknown\n", level); + break; + } + return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1; +} + static irqreturn_t opal_interrupt(int irq, void *data) { uint64_t events; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 4a2b2e279593..f0242f3fd3e6 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -141,6 +141,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; ppc_md.power_off = pnv_power_off; ppc_md.halt = pnv_halt; + ppc_md.machine_check_exception = opal_machine_check; } #ifdef CONFIG_PPC_POWERNV_RTAS -- cgit v1.2.3 From a120db06c3f435c37d028b6e5a1968dad06b7df0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 8 Sep 2011 21:12:06 +0000 Subject: perf events, powerpc: Add POWER7 stalled-cycles-frontend/backend events perf events, powerpc: Add POWER7 stalled-cycles-frontend/backend events Extent the POWER7 PMU driver with definitions for generic front-end and back-end stall events. As explained in Ingo's original comment(8f62242246351b5a4bc0c1f00c0c7003edea128a ), the exact definitions of the stall events are very much processor specific as different things mean different in their respective instruction pipeline. These two Power7 raw events are the closest approximation to the concept detailed in Ingo's comment. [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x100f8, /* GCT_NOSLOT_CYC */ It means cycles when the Global Completion Table has no slots from this thread [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x4000a, /* CMPLU_STALL */ It means no groups completed and GCT not empty for this thread Signed-off-by: Anshuman Khandual Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/power7-pmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index de83d6060dda..1251e4d7e262 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -297,6 +297,8 @@ static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[]) static int power7_generic_events[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x1e, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x100f8, /* GCT_NOSLOT_CYC */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x4000a, /* CMPLU_STALL */ [PERF_COUNT_HW_INSTRUCTIONS] = 2, [PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU*/ [PERF_COUNT_HW_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */ -- cgit v1.2.3 From d12b524f8b2f4e45cabe8bc1501e8b967d543111 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 20 Sep 2011 03:07:24 +0000 Subject: powerpc: Reserve iommu page 0 Some devices have a dma-window that starts at the address 0. This allows DMA addresses to be mapped to this address and returned to drivers as a valid DMA address. Some drivers may not behave well in this case, since the address 0 is considered an error or not allocated. The solution to avoid this kind of error from happening is reserve the page addressed as 0 so it cannot be allocated for a DMA mapping. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/iommu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 961bb03413f3..0cfcf98aafca 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -501,6 +501,14 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) tbl->it_map = page_address(page); memset(tbl->it_map, 0, sz); + /* + * Reserve page 0 so it will not be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. + */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + tbl->it_hint = 0; tbl->it_largehint = tbl->it_halfpoint; spin_lock_init(&tbl->it_lock); -- cgit v1.2.3 From bb36c44557a4fcbaa17c0f2776e12a05a691b432 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 26 Sep 2011 14:22:39 +1000 Subject: powerpc/pci: Don't configure PCIe settings when PCI_PROBE_ONLY is set We don't want to configure PCI Express Max Payload Size or Max Read Request Size on systems that set that flag. The firmware will have done it for us, and under hypervisors such as pHyp we don't even see the parent switches and bridges and thus can make no assumption on what values are safe to use. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/pci-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 1bd47f36b25f..677ecccbe10d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1732,7 +1732,7 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose) hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); /* Configure PCI Express settings */ - if (bus) { + if (bus && !pci_has_flag(PCI_PROBE_ONLY)) { struct pci_bus *child; list_for_each_entry(child, &bus->children, node) { struct pci_dev *self = child->self; -- cgit v1.2.3 From e69b742a6793dc5bf16f6eedca534d4bc10d68b2 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 26 Sep 2011 19:37:57 +0000 Subject: powerpc/ptrace: Fix build with gcc 4.6 gcc (rightfully) complains that we are accessing beyond the end of the fpr array (we do, to access the fpscr). The only sane thing to do (whether anything in that code can be called remotely sane is debatable) is to special case fpscr and handle it as a separate statement. I initially tried to do it it by making the array access conditional to index < PT_FPSCR and using a 3rd else leg but for some reason gcc was unable to understand it and still spewed the warning. So I ended up with something a tad more intricated but it seems to build on 32-bit and on 64-bit with and without VSX. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/ptrace.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 05b7dd217f60..18447c4fbad3 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1497,9 +1497,14 @@ long arch_ptrace(struct task_struct *child, long request, if (index < PT_FPR0) { tmp = ptrace_get_reg(child, (int) index); } else { + unsigned int fpidx = index - PT_FPR0; + flush_fp_to_thread(child); - tmp = ((unsigned long *)child->thread.fpr) - [TS_FPRWIDTH * (index - PT_FPR0)]; + if (fpidx < (PT_FPSCR - PT_FPR0)) + tmp = ((unsigned long *)child->thread.fpr) + [fpidx * TS_FPRWIDTH]; + else + tmp = child->thread.fpscr.val; } ret = put_user(tmp, datalp); break; @@ -1525,9 +1530,14 @@ long arch_ptrace(struct task_struct *child, long request, if (index < PT_FPR0) { ret = ptrace_put_reg(child, index, data); } else { + unsigned int fpidx = index - PT_FPR0; + flush_fp_to_thread(child); - ((unsigned long *)child->thread.fpr) - [TS_FPRWIDTH * (index - PT_FPR0)] = data; + if (fpidx < (PT_FPSCR - PT_FPR0)) + ((unsigned long *)child->thread.fpr) + [fpidx * TS_FPRWIDTH] = data; + else + child->thread.fpscr.val = data; ret = 0; } break; -- cgit v1.2.3 From d15f02eb4e8992cfacfca2ff306e5585bcf721d1 Mon Sep 17 00:00:00 2001 From: "Carl E. Love" Date: Wed, 28 Sep 2011 11:23:33 +0000 Subject: powerpc/perf_event: Fix Power6 L1 cache read & write event codes] The current L1 cache read event code 0x80082 only counts for thread 0. The event code 0x280030 should be used to count events on thread 0 and 1. The patch fixes the event code for the L1 cache read. The current L1 cache write event code 0x80086 only counts for thread 0. The event code 0x180032 should be used to count events on thread 0 and 1. The patch fixes the event code for the L1 cache write. FYI, the documentation lists three event codes for the L1 cache read event and three event codes for the L1 cache write event. The event description for the event codes is as follows: L1 cache read requests 0x80082 LSU 0 only L1 cache read requests 0x8008A LSU 1 only L1 cache read requests 0x80030 LSU 1 or LSU 0, counter 2 only. L1 cache store requests 0x80086 LSU 0 only L1 cache store requests 0x8008E LSU 1 only L1 cache store requests 0x80032 LSU 0 or LSU 1, counter 1 only. There can only be one request from either LSU 0 or 1 active at a time. Signed-off-by: Carl Love Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/power6-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 03b95e2c6d65..0bbc901e7efc 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -487,8 +487,8 @@ static int power6_generic_events[] = { */ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ - [C(OP_READ)] = { 0x80082, 0x80080 }, - [C(OP_WRITE)] = { 0x80086, 0x80088 }, + [C(OP_READ)] = { 0x280030, 0x80080 }, + [C(OP_WRITE)] = { 0x180032, 0x80088 }, [C(OP_PREFETCH)] = { 0x810a4, 0 }, }, [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ -- cgit v1.2.3 From 7680057cc4c7d9caada12767831bfd9738dd7b43 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 28 Sep 2011 20:51:46 +0000 Subject: powerpc: Don't try OPAL takeover on old 970 blades The firmware on old 970 blades supports some kind of takeover called "TNK takeover" which will crash if we try to probe for OPAL takeover, so don't do it. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom_init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index e96f5d0d2c78..b4fa66127495 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1313,6 +1313,16 @@ static void prom_query_opal(void) { long rc; + /* We must not query for OPAL presence on a machine that + * supports TNK takeover (970 blades), as this uses the same + * h-call with different arguments and will crash + */ + if (PHANDLE_VALID(call_prom("finddevice", 1, 1, + ADDR("/tnk-memory-map")))) { + prom_printf("TNK takeover detected, skipping OPAL check\n"); + return; + } + prom_printf("Querying for OPAL presence... "); rc = opal_query_takeover(&RELOC(prom_opal_size), &RELOC(prom_opal_align)); -- cgit v1.2.3 From 37caf9f2a1b99d11ba71e17168d221da9ca13f24 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Sat, 27 Aug 2011 06:14:23 -0500 Subject: powerpc/fsl-booke: Handle L1 D-cache parity error correctly on e500mc If the L1 D-Cache is in write shadow mode the HW will auto-recover the error. However we might still log the error and cause a machine check (if L1CSR0[CPE] - Cache error checking enable). We should only treat the non-write shadow case as non-recoverable. Signed-off-by: Kumar Gala --- arch/powerpc/include/asm/reg_booke.h | 3 +++ arch/powerpc/kernel/traps.c | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 9ec0b39f9ddc..28cdbd9f399c 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -548,6 +548,9 @@ #define L1CSR1_ICFI 0x00000002 /* Instr Cache Flash Invalidate */ #define L1CSR1_ICE 0x00000001 /* Instr Cache Enable */ +/* Bit definitions for L1CSR2. */ +#define L1CSR2_DCWS 0x40000000 /* Data Cache write shadow */ + /* Bit definitions for L2CSR0. */ #define L2CSR0_L2E 0x80000000 /* L2 Cache Enable */ #define L2CSR0_L2PE 0x40000000 /* L2 Cache Parity/ECC Enable */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f19d9777d3c1..4e5908264d1a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -457,7 +457,14 @@ int machine_check_e500mc(struct pt_regs *regs) if (reason & MCSR_DCPERR_MC) { printk("Data Cache Parity Error\n"); - recoverable = 0; + + /* + * In write shadow mode we auto-recover from the error, but it + * may still get logged and cause a machine check. We should + * only treat the non-write shadow case as non-recoverable. + */ + if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) + recoverable = 0; } if (reason & MCSR_L2MMU_MHIT) { -- cgit v1.2.3 From e33ee8b6f473b33a41c281f5329d9d7b3bed8ff5 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Tue, 11 Oct 2011 11:26:08 +0530 Subject: powerpc: e500mc: Fix: use CONFIG_PPC_E500MC in idle_e500.S It is wrongly using undefined CONFIG_E500MC. Signed-off-by: Bharat Bhushan Signed-off-by: Kumar Gala --- arch/powerpc/kernel/idle_e500.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S index 3e2b95c6ae67..4f0ab85f3788 100644 --- a/arch/powerpc/kernel/idle_e500.S +++ b/arch/powerpc/kernel/idle_e500.S @@ -26,7 +26,7 @@ _GLOBAL(e500_idle) ori r4,r4,_TLF_NAPPING /* so when we take an exception */ stw r4,TI_LOCAL_FLAGS(r3) /* it will return to our caller */ -#ifdef CONFIG_E500MC +#ifdef CONFIG_PPC_E500MC wrteei 1 1: wait -- cgit v1.2.3 From ba14f6491768acad5cf50a3c7dc8927b7614d692 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 16 Sep 2011 10:39:58 -0500 Subject: powerpc: respect mem= setting for early memory limit setup For those MMUs that have some form of bolt'd linear mapping (TLB) required its rare that one ever sets mem= smaller than the size of that mapping. However, on Book-E 64 parts the initial linear mapping is quite large (1G) so its quite reasonable that mem= is set smaller than that. We need to parse the command line for mem= limit and constrain the amount of memory we map initially by it if need be. Signed-off-by: Kumar Gala --- arch/powerpc/kernel/prom.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 831a201e03d2..8ad825c063c4 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -737,12 +737,15 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); - setup_initial_memory_limit(memstart_addr, first_memblock_size); /* Save command line for /proc/cmdline and then parse parameters */ strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE); parse_early_param(); + /* make sure we've parsed cmdline for mem= before this */ + if (memory_limit) + first_memblock_size = min(first_memblock_size, memory_limit); + setup_initial_memory_limit(memstart_addr, first_memblock_size); /* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... */ memblock_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START); /* If relocatable, reserve first 32k for interrupt vectors etc. */ -- cgit v1.2.3 From 7d0d3ad5e3cfbf96aa4a2d6ee26a20c98a29d4a1 Mon Sep 17 00:00:00 2001 From: Matthew McClintock Date: Tue, 25 Oct 2011 17:54:03 -0500 Subject: powerpc/fsl_booke: Fix comment in head_fsl_booke.S Fix typo in comments introduced by: commit 6dece0eb69b2a28e18d104bc5d707f1cb673f5e0 Author: Scott Wood Date: Mon Jul 25 11:29:33 2011 +0000 powerpc/32: Pass device tree address as u64 to machine_init Signed-off-by: Matthew McClintock cc: Scott Wood Signed-off-by: Kumar Gala --- arch/powerpc/kernel/head_fsl_booke.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index e1c699f3b7a7..9f5d210ddf3f 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -80,8 +80,8 @@ _ENTRY(_start); slw r18,r18,r17 /* r18 = page size */ addi r18,r18,-1 and r19,r3,r18 /* r19 = page offset */ - andc r31,r20,r18 /* r3 = page base */ - or r31,r31,r19 /* r3 = devtree phys addr */ + andc r31,r20,r18 /* r31 = page base */ + or r31,r31,r19 /* r31 = devtree phys addr */ mfspr r30,SPRN_MAS7 li r25,0 /* phys kernel start (low) */ -- cgit v1.2.3