Diffstat (limited to 'arch/powerpc')
52 files changed, 1418 insertions, 1000 deletions
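Much of this series converts the hash MMU code from passing a full virtual address (va) to passing a shifted virtual page number (vpn = va >> VPN_SHIFT), so that a 78-bit VA fits in a 64-bit variable. The following standalone C sketch is not part of the patch; it simply mirrors the new hpt_vpn() helper added to mmu-hash64.h below. The segment shift values (28 for 256MB segments, 40 for 1TB segments) are assumptions taken from the old hpt_va() code this patch removes, and the example ea/vsid values are arbitrary.

/*
 * Standalone sketch (not part of the patch) of the vpn encoding used
 * throughout this series: vpn = va >> VPN_SHIFT, built from vsid + ea.
 */
#include <stdio.h>

#define VPN_SHIFT	12
#define SID_SHIFT	28	/* assumed: 256MB segments, as in the removed hpt_va() */
#define SID_SHIFT_1T	40	/* assumed: 1TB segments, as in the removed hpt_va() */

static unsigned long hpt_vpn(unsigned long ea, unsigned long vsid, int s_shift)
{
	/* low (s_shift - VPN_SHIFT) bits come from the effective address */
	unsigned long mask = (1ul << (s_shift - VPN_SHIFT)) - 1;

	return (vsid << (s_shift - VPN_SHIFT)) | ((ea >> VPN_SHIFT) & mask);
}

int main(void)
{
	/* arbitrary example values, for illustration only */
	unsigned long ea = 0xc000000001234000ul;
	unsigned long vsid = 0x123456ul;

	printf("256M segment vpn: %#lx\n", hpt_vpn(ea, vsid, SID_SHIFT));
	printf("1T   segment vpn: %#lx\n", hpt_vpn(ea, vsid, SID_SHIFT_1T));
	return 0;
}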
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b8bab10bd0f1..4ce0be32d153 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -562,6 +562,14 @@ config SCHED_SMT when dealing with POWER5 cpus at a cost of slightly increased overhead in some places. If unsure say N here. +config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 + default "n" + ---help--- + Add support for handling denormalisation of single precision + values. Useful for bare metal only. If unsure say Y here. + config CMDLINE_BOOL bool "Default bootloader kernel arguments" diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index db27c82e0542..e263e6a5aca1 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -51,6 +51,7 @@ CONFIG_KEXEC=y CONFIG_IRQ_ALL_CPUS=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_SCHED_SMT=y +CONFIG_PPC_DENORMALISATION=y CONFIG_PCCARD=y CONFIG_ELECTRA_CF=y CONFIG_HOTPLUG_PCI=m diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 1f65b3c9b59a..c169dfb3e42d 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -48,6 +48,7 @@ CONFIG_MEMORY_HOTREMOVE=y CONFIG_PPC_64K_PAGES=y CONFIG_PPC_SUBPAGE_PROT=y CONFIG_SCHED_SMT=y +CONFIG_PPC_DENORMALISATION=y CONFIG_HOTPLUG_PCI=m CONFIG_HOTPLUG_PCI_RPA=m CONFIG_HOTPLUG_PCI_RPA_DLPAR=m diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 58c5ee61e700..b0ef73882b38 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -45,9 +45,10 @@ struct device_node; * in the corresponding PHB. Therefore, the root PEs should be created * against existing PHBs in on-to-one fashion. */ -#define EEH_PE_PHB 1 /* PHB PE */ -#define EEH_PE_DEVICE 2 /* Device PE */ -#define EEH_PE_BUS 3 /* Bus PE */ +#define EEH_PE_INVALID (1 << 0) /* Invalid */ +#define EEH_PE_PHB (1 << 1) /* PHB PE */ +#define EEH_PE_DEVICE (1 << 2) /* Device PE */ +#define EEH_PE_BUS (1 << 3) /* Bus PE */ #define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */ #define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */ @@ -184,7 +185,7 @@ static inline void eeh_unlock(void) typedef void *(*eeh_traverse_func)(void *data, void *flag); int __devinit eeh_phb_pe_create(struct pci_controller *phb); int eeh_add_to_parent_pe(struct eeh_dev *edev); -int eeh_rmv_from_parent_pe(struct eeh_dev *edev); +int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe); void *eeh_pe_dev_traverse(struct eeh_pe *root, eeh_traverse_func fn, void *flag); void eeh_pe_restore_bars(struct eeh_pe *pe); @@ -200,7 +201,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev); void __init eeh_addr_cache_build(void); void eeh_add_device_tree_early(struct device_node *); void eeh_add_device_tree_late(struct pci_bus *); -void eeh_remove_bus_device(struct pci_dev *); +void eeh_remove_bus_device(struct pci_dev *, int); /** * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure. 
@@ -239,7 +240,7 @@ static inline void eeh_add_device_tree_early(struct device_node *dn) { } static inline void eeh_add_device_tree_late(struct pci_bus *bus) { } -static inline void eeh_remove_bus_device(struct pci_dev *dev) { } +static inline void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { } static inline void eeh_lock(void) { } static inline void eeh_unlock(void) { } diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index f0e0c6a66d97..7aefdb3e1ce4 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -59,7 +59,7 @@ struct hpte_cache { struct hlist_node list_vpte; struct hlist_node list_vpte_long; struct rcu_head rcu_head; - u64 host_va; + u64 host_vpn; u64 pfn; ulong slot; struct kvmppc_pte pte; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 236b4779ec4f..c4231973edd3 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -34,19 +34,19 @@ struct machdep_calls { char *name; #ifdef CONFIG_PPC64 void (*hpte_invalidate)(unsigned long slot, - unsigned long va, + unsigned long vpn, int psize, int ssize, int local); long (*hpte_updatepp)(unsigned long slot, unsigned long newpp, - unsigned long va, + unsigned long vpn, int psize, int ssize, int local); void (*hpte_updateboltedpp)(unsigned long newpp, unsigned long ea, int psize, int ssize); long (*hpte_insert)(unsigned long hpte_group, - unsigned long va, + unsigned long vpn, unsigned long prpn, unsigned long rflags, unsigned long vflags, @@ -215,6 +215,9 @@ struct machdep_calls { /* Called after scan and before resource survey */ void (*pcibios_fixup_phb)(struct pci_controller *hose); + /* Called during PCI resource reassignment */ + resource_size_t (*pcibios_window_alignment)(struct pci_bus *, unsigned long type); + /* Called to shutdown machine specific hardware not already controlled * by other drivers. */ diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 1c65a59881ea..9673f73eb8db 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -16,6 +16,13 @@ #include <asm/page.h> /* + * This is necessary to get the definition of PGTABLE_RANGE which we + * need for various slices related matters. Note that this isn't the + * complete pgtable.h but only a portion of it. + */ +#include <asm/pgtable-ppc64.h> + +/* * Segment table */ @@ -154,9 +161,25 @@ struct mmu_psize_def #define MMU_SEGSIZE_256M 0 #define MMU_SEGSIZE_1T 1 +/* + * encode page number shift. + * in order to fit the 78 bit va in a 64 bit variable we shift the va by + * 12 bits. This enable us to address upto 76 bit va. + * For hpt hash from a va we can ignore the page size bits of va and for + * hpte encoding we ignore up to 23 bits of va. So ignoring lower 12 bits ensure + * we work in all cases including 4k page size. + */ +#define VPN_SHIFT 12 #ifndef __ASSEMBLY__ +static inline int segment_shift(int ssize) +{ + if (ssize == MMU_SEGSIZE_256M) + return SID_SHIFT; + return SID_SHIFT_1T; +} + /* * The current system page and segment sizes */ @@ -180,18 +203,39 @@ extern unsigned long tce_alloc_start, tce_alloc_end; extern int mmu_ci_restrictions; /* + * This computes the AVPN and B fields of the first dword of a HPTE, + * for use when we want to match an existing PTE. The bottom 7 bits + * of the returned value are zero. 
+ */ +static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize, + int ssize) +{ + unsigned long v; + /* + * The AVA field omits the low-order 23 bits of the 78 bits VA. + * These bits are not needed in the PTE, because the + * low-order b of these bits are part of the byte offset + * into the virtual page and, if b < 23, the high-order + * 23-b of these bits are always used in selecting the + * PTEGs to be searched + */ + v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm); + v <<= HPTE_V_AVPN_SHIFT; + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + return v; +} + +/* * This function sets the AVPN and L fields of the HPTE appropriately * for the page size */ -static inline unsigned long hpte_encode_v(unsigned long va, int psize, - int ssize) +static inline unsigned long hpte_encode_v(unsigned long vpn, + int psize, int ssize) { unsigned long v; - v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm); - v <<= HPTE_V_AVPN_SHIFT; + v = hpte_encode_avpn(vpn, psize, ssize); if (psize != MMU_PAGE_4K) v |= HPTE_V_LARGE; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; return v; } @@ -216,30 +260,37 @@ static inline unsigned long hpte_encode_r(unsigned long pa, int psize) } /* - * Build a VA given VSID, EA and segment size + * Build a VPN_SHIFT bit shifted va given VSID, EA and segment size. */ -static inline unsigned long hpt_va(unsigned long ea, unsigned long vsid, - int ssize) +static inline unsigned long hpt_vpn(unsigned long ea, + unsigned long vsid, int ssize) { - if (ssize == MMU_SEGSIZE_256M) - return (vsid << 28) | (ea & 0xfffffffUL); - return (vsid << 40) | (ea & 0xffffffffffUL); + unsigned long mask; + int s_shift = segment_shift(ssize); + + mask = (1ul << (s_shift - VPN_SHIFT)) - 1; + return (vsid << (s_shift - VPN_SHIFT)) | ((ea >> VPN_SHIFT) & mask); } /* * This hashes a virtual address */ - -static inline unsigned long hpt_hash(unsigned long va, unsigned int shift, - int ssize) +static inline unsigned long hpt_hash(unsigned long vpn, + unsigned int shift, int ssize) { + int mask; unsigned long hash, vsid; + /* VPN_SHIFT can be atmost 12 */ if (ssize == MMU_SEGSIZE_256M) { - hash = (va >> 28) ^ ((va & 0x0fffffffUL) >> shift); + mask = (1ul << (SID_SHIFT - VPN_SHIFT)) - 1; + hash = (vpn >> (SID_SHIFT - VPN_SHIFT)) ^ + ((vpn & mask) >> (shift - VPN_SHIFT)); } else { - vsid = va >> 40; - hash = vsid ^ (vsid << 25) ^ ((va & 0xffffffffffUL) >> shift); + mask = (1ul << (SID_SHIFT_1T - VPN_SHIFT)) - 1; + vsid = vpn >> (SID_SHIFT_1T - VPN_SHIFT); + hash = vsid ^ (vsid << 25) ^ + ((vpn & mask) >> (shift - VPN_SHIFT)) ; } return hash & 0x7fffffffffUL; } @@ -280,63 +331,61 @@ extern void slb_set_size(u16 size); #endif /* __ASSEMBLY__ */ /* - * VSID allocation + * VSID allocation (256MB segment) + * + * We first generate a 38-bit "proto-VSID". For kernel addresses this + * is equal to the ESID | 1 << 37, for user addresses it is: + * (context << USER_ESID_BITS) | (esid & ((1U << USER_ESID_BITS) - 1) * - * We first generate a 36-bit "proto-VSID". For kernel addresses this - * is equal to the ESID, for user addresses it is: - * (context << 15) | (esid & 0x7fff) + * This splits the proto-VSID into the below range + * 0 - (2^(CONTEXT_BITS + USER_ESID_BITS) - 1) : User proto-VSID range + * 2^(CONTEXT_BITS + USER_ESID_BITS) - 2^(VSID_BITS) : Kernel proto-VSID range * - * The two forms are distinguishable because the top bit is 0 for user - * addresses, whereas the top two bits are 1 for kernel addresses. 
- * Proto-VSIDs with the top two bits equal to 0b10 are reserved for - * now. + * We also have CONTEXT_BITS + USER_ESID_BITS = VSID_BITS - 1 + * That is, we assign half of the space to user processes and half + * to the kernel. * * The proto-VSIDs are then scrambled into real VSIDs with the * multiplicative hash: * * VSID = (proto-VSID * VSID_MULTIPLIER) % VSID_MODULUS - * where VSID_MULTIPLIER = 268435399 = 0xFFFFFC7 - * VSID_MODULUS = 2^36-1 = 0xFFFFFFFFF * - * This scramble is only well defined for proto-VSIDs below - * 0xFFFFFFFFF, so both proto-VSID and actual VSID 0xFFFFFFFFF are - * reserved. VSID_MULTIPLIER is prime, so in particular it is + * VSID_MULTIPLIER is prime, so in particular it is * co-prime to VSID_MODULUS, making this a 1:1 scrambling function. * Because the modulus is 2^n-1 we can compute it efficiently without * a divide or extra multiply (see below). * * This scheme has several advantages over older methods: * - * - We have VSIDs allocated for every kernel address + * - We have VSIDs allocated for every kernel address * (i.e. everything above 0xC000000000000000), except the very top * segment, which simplifies several things. * - * - We allow for 16 significant bits of ESID and 19 bits of - * context for user addresses. i.e. 16T (44 bits) of address space for - * up to half a million contexts. + * - We allow for USER_ESID_BITS significant bits of ESID and + * CONTEXT_BITS bits of context for user addresses. + * i.e. 64T (46 bits) of address space for up to half a million contexts. * - * - The scramble function gives robust scattering in the hash + * - The scramble function gives robust scattering in the hash * table (at least based on some initial results). The previous * method was more susceptible to pathological cases giving excessive * hash collisions. */ + /* - * WARNING - If you change these you must make sure the asm - * implementations in slb_allocate (slb_low.S), do_stab_bolted - * (head.S) and ASM_VSID_SCRAMBLE (below) are changed accordingly. + * This should be computed such that protovosid * vsid_mulitplier + * doesn't overflow 64 bits. 
It should also be co-prime to vsid_modulus */ - -#define VSID_MULTIPLIER_256M ASM_CONST(200730139) /* 28-bit prime */ -#define VSID_BITS_256M 36 +#define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ +#define VSID_BITS_256M 38 #define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1) #define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_1T 24 +#define VSID_BITS_1T 26 #define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1) #define CONTEXT_BITS 19 -#define USER_ESID_BITS 16 -#define USER_ESID_BITS_1T 4 +#define USER_ESID_BITS 18 +#define USER_ESID_BITS_1T 6 #define USER_VSID_RANGE (1UL << (USER_ESID_BITS + SID_SHIFT)) @@ -372,6 +421,8 @@ extern void slb_set_size(u16 size); srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ add rt,rt,rx +/* 4 bits per slice and we have one slice per 1TB */ +#define SLICE_ARRAY_SIZE (PGTABLE_RANGE >> 41) #ifndef __ASSEMBLY__ @@ -416,7 +467,7 @@ typedef struct { #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ - u64 high_slices_psize; /* 4 bits per slice for now */ + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; #else u16 sllp; /* SLB page size encoding */ #endif @@ -452,12 +503,32 @@ typedef struct { }) #endif /* 1 */ -/* This is only valid for addresses >= PAGE_OFFSET */ +/* + * This is only valid for addresses >= PAGE_OFFSET + * The proto-VSID space is divided into two class + * User: 0 to 2^(CONTEXT_BITS + USER_ESID_BITS) -1 + * kernel: 2^(CONTEXT_BITS + USER_ESID_BITS) to 2^(VSID_BITS) - 1 + * + * With KERNEL_START at 0xc000000000000000, the proto vsid for + * the kernel ends up with 0xc00000000 (36 bits). With 64TB + * support we need to have kernel proto-VSID in the + * [2^37 to 2^38 - 1] range due to the increased USER_ESID_BITS. + */ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) { - if (ssize == MMU_SEGSIZE_256M) - return vsid_scramble(ea >> SID_SHIFT, 256M); - return vsid_scramble(ea >> SID_SHIFT_1T, 1T); + unsigned long proto_vsid; + /* + * We need to make sure proto_vsid for the kernel is + * >= 2^(CONTEXT_BITS + USER_ESID_BITS[_1T]) + */ + if (ssize == MMU_SEGSIZE_256M) { + proto_vsid = ea >> SID_SHIFT; + proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS)); + return vsid_scramble(proto_vsid, 256M); + } + proto_vsid = ea >> SID_SHIFT_1T; + proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS_1T)); + return vsid_scramble(proto_vsid, 1T); } /* Returns the segment size indicator for a user address */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e8a26db2e8f3..5e38eedea218 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -146,6 +146,15 @@ extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, extern u64 ppc64_rma_size; #endif /* CONFIG_PPC64 */ +struct mm_struct; +#ifdef CONFIG_DEBUG_VM +extern void assert_pte_locked(struct mm_struct *mm, unsigned long addr); +#else /* CONFIG_DEBUG_VM */ +static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) +{ +} +#endif /* !CONFIG_DEBUG_VM */ + #endif /* !__ASSEMBLY__ */ /* The kernel use the constants below to index in the page sizes array. 
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 7796519fd238..e9e7a6999bb8 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -100,7 +100,7 @@ struct paca_struct { /* SLB related definitions */ u16 vmalloc_sllp; u16 slb_cache_ptr; - u16 slb_cache[SLB_CACHE_ENTRIES]; + u32 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_PPC_BOOK3E diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index fed85e6290e1..cd915d6b093d 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -78,11 +78,19 @@ extern u64 ppc64_pft_size; #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) +/* + * 1 bit per slice and we have one slice per 1TB + * Right now we support only 64TB. + * IF we change this we will have to change the type + * of high_slices + */ +#define SLICE_MASK_SIZE 8 + #ifndef __ASSEMBLY__ struct slice_mask { u16 low_slices; - u16 high_slices; + u64 high_slices; }; struct mm_struct; diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 973df4d9d366..025a130729bc 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -182,6 +182,14 @@ static inline int pci_device_from_OF_node(struct device_node *np, #if defined(CONFIG_EEH) static inline struct eeh_dev *of_node_to_eeh_dev(struct device_node *dn) { + /* + * For those OF nodes whose parent isn't PCI bridge, they + * don't have PCI_DN actually. So we have to skip them for + * any EEH operations. + */ + if (!dn || !PCI_DN(dn)) + return NULL; + return PCI_DN(dn)->edev; } #else @@ -192,6 +200,7 @@ static inline struct eeh_dev *of_node_to_eeh_dev(struct device_node *dn) extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); /** Remove all of the PCI devices under this bus */ +extern void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe); extern void pcibios_remove_pci_devices(struct pci_bus *bus); /** Discover new pci devices under this bus, and add them */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64-4k.h b/arch/powerpc/include/asm/pgtable-ppc64-4k.h index 6eefdcffa359..12798c9d4b4b 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64-4k.h +++ b/arch/powerpc/include/asm/pgtable-ppc64-4k.h @@ -7,7 +7,7 @@ */ #define PTE_INDEX_SIZE 9 #define PMD_INDEX_SIZE 7 -#define PUD_INDEX_SIZE 7 +#define PUD_INDEX_SIZE 9 #define PGD_INDEX_SIZE 9 #ifndef __ASSEMBLY__ @@ -19,7 +19,7 @@ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) /* PMD_SHIFT determines what a second-level page table entry can map */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h index 90533ddcd703..be4e2878fbc0 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h +++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h @@ -7,7 +7,7 @@ #define PTE_INDEX_SIZE 12 #define PMD_INDEX_SIZE 12 #define PUD_INDEX_SIZE 0 -#define PGD_INDEX_SIZE 4 +#define PGD_INDEX_SIZE 6 #ifndef __ASSEMBLY__ #define PTE_TABLE_SIZE (sizeof(real_pte_t) << PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index c4205616dfb5..0182c203e411 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h 
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -21,17 +21,6 @@ #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) -/* Some sanity checking */ -#if TASK_SIZE_USER64 > PGTABLE_RANGE -#error TASK_SIZE_USER64 exceeds pagetable range -#endif - -#ifdef CONFIG_PPC_STD_MMU_64 -#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) -#error TASK_SIZE_USER64 exceeds user VSID range -#endif -#endif - /* * Define the address range of the kernel non-linear virtual area */ @@ -41,7 +30,7 @@ #else #define KERN_VIRT_START ASM_CONST(0xD000000000000000) #endif -#define KERN_VIRT_SIZE PGTABLE_RANGE +#define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) /* * The vmalloc space starts at the beginning of that region, and @@ -117,9 +106,6 @@ #ifndef __ASSEMBLY__ -#include <linux/stddef.h> -#include <asm/tlbflush.h> - /* * This is the default implementation of various PTE accessors, it's * used in all cases except Book3S with 64K pages where we have a @@ -198,7 +184,8 @@ /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) - +extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long pte, int huge); /* Atomic PTE updates */ static inline unsigned long pte_update(struct mm_struct *mm, diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 2e0e4110f7ae..a9cbd3ba5c33 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -9,14 +9,6 @@ struct mm_struct; -#ifdef CONFIG_DEBUG_VM -extern void assert_pte_locked(struct mm_struct *mm, unsigned long addr); -#else /* CONFIG_DEBUG_VM */ -static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) -{ -} -#endif /* !CONFIG_DEBUG_VM */ - #endif /* !__ASSEMBLY__ */ #if defined(CONFIG_PPC64) @@ -27,6 +19,8 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #ifndef __ASSEMBLY__ +#include <asm/tlbflush.h> + /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 4c25319f2fbc..5f73ce63fcae 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -126,6 +126,7 @@ #define PPC_INST_TLBIVAX 0x7c000624 #define PPC_INST_TLBSRX_DOT 0x7c0006a5 #define PPC_INST_XXLOR 0xf0000510 +#define PPC_INST_XVCPSGNDP 0xf0000780 #define PPC_INST_NAP 0x4c000364 #define PPC_INST_SLEEP 0x4c0003a4 @@ -277,6 +278,8 @@ VSX_XX1((s), a, b)) #define XXLOR(t, a, b) stringify_in_c(.long PPC_INST_XXLOR | \ VSX_XX3((t), a, b)) +#define XVCPSGNDP(t, a, b) stringify_in_c(.long (PPC_INST_XVCPSGNDP | \ + VSX_XX3((t), (a), (b)))) #define PPC_NAP stringify_in_c(.long PPC_INST_NAP) #define PPC_SLEEP stringify_in_c(.long PPC_INST_SLEEP) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 83efc6e81543..9dc5cd1fde1a 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -97,8 +97,8 @@ extern struct task_struct *last_task_used_spe; #endif #ifdef CONFIG_PPC64 -/* 64-bit user address space is 44-bits (16TB user VM) */ -#define TASK_SIZE_USER64 (0x0000100000000000UL) +/* 64-bit user address space is 46-bits (64TB user VM) */ +#define TASK_SIZE_USER64 (0x0000400000000000UL) /* * 32-bit user address 
space is 4GB - 1 page diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h index 59247e816ac5..eedf427c9124 100644 --- a/arch/powerpc/include/asm/pte-hash64-64k.h +++ b/arch/powerpc/include/asm/pte-hash64-64k.h @@ -58,14 +58,16 @@ /* Trick: we set __end to va + 64k, which happens works for * a 16M page as well as we want only one iteration */ -#define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ - do { \ - unsigned long __end = va + PAGE_SIZE; \ - unsigned __split = (psize == MMU_PAGE_4K || \ - psize == MMU_PAGE_64K_AP); \ - shift = mmu_psize_defs[psize].shift; \ - for (index = 0; va < __end; index++, va += (1L << shift)) { \ - if (!__split || __rpte_sub_valid(rpte, index)) do { \ +#define pte_iterate_hashed_subpages(rpte, psize, vpn, index, shift) \ + do { \ + unsigned long __end = vpn + (1UL << (PAGE_SHIFT - VPN_SHIFT)); \ + unsigned __split = (psize == MMU_PAGE_4K || \ + psize == MMU_PAGE_64K_AP); \ + shift = mmu_psize_defs[psize].shift; \ + for (index = 0; vpn < __end; index++, \ + vpn += (1L << (shift - VPN_SHIFT))) { \ + if (!__split || __rpte_sub_valid(rpte, index)) \ + do { #define pte_iterate_hashed_end() } while(0); } } while(0) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 121a90bbf778..a1096fb62816 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -524,6 +524,7 @@ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ +#define HSRR1_DENORM 0x00100000 /* Denorm exception */ #define SPRN_TBCTL 0x35f /* PA6T Timebase control register */ #define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */ diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 0c5fa3145615..f6fc0ee813d7 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -10,8 +10,8 @@ */ #define SECTION_SIZE_BITS 24 -#define MAX_PHYSADDR_BITS 44 -#define MAX_PHYSMEM_BITS 44 +#define MAX_PHYSADDR_BITS 46 +#define MAX_PHYSMEM_BITS 46 #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index e942203cd4a8..8ceea14d6fe4 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -104,6 +104,8 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_UPROBE 14 /* breakpointed or single-stepping */ #define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ +#define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation + for stack store? 
*/ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -121,6 +123,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_UPROBE (1<<TIF_UPROBE) #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) +#define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT) diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 81143fcbd113..61a59271665b 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -95,7 +95,7 @@ struct ppc64_tlb_batch { unsigned long index; struct mm_struct *mm; real_pte_t pte[PPC64_TLB_BATCH_NR]; - unsigned long vaddr[PPC64_TLB_BATCH_NR]; + unsigned long vpn[PPC64_TLB_BATCH_NR]; unsigned int psize; int ssize; }; @@ -103,9 +103,6 @@ DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); -extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned long pte, int huge); - #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) @@ -127,7 +124,7 @@ static inline void arch_leave_lazy_mmu_mode(void) #define arch_flush_lazy_mmu_mode() do {} while (0) -extern void flush_hash_page(unsigned long va, real_pte_t pte, int psize, +extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, int local); extern void flush_hash_range(unsigned long number, int local); diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 17bb40cad5bf..4db49590acf5 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -98,11 +98,6 @@ struct exception_table_entry { * PowerPC, we can just do these as direct assignments. (Of course, the * exception handling means that it's no longer "just"...) * - * The "user64" versions of the user access functions are versions that - * allow access of 64-bit data. The "get_user" functions do not - * properly handle 64-bit data because the value gets down cast to a long. - * The "put_user" functions already handle 64-bit data properly but we add - * "user64" versions for completeness */ #define get_user(x, ptr) \ __get_user_check((x), (ptr), sizeof(*(ptr))) @@ -114,12 +109,6 @@ struct exception_table_entry { #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#ifndef __powerpc64__ -#define __get_user64(x, ptr) \ - __get_user64_nocheck((x), (ptr), sizeof(*(ptr))) -#define __put_user64(x, ptr) __put_user(x, ptr) -#endif - #define __get_user_inatomic(x, ptr) \ __get_user_nosleep((x), (ptr), sizeof(*(ptr))) #define __put_user_inatomic(x, ptr) \ diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index ead5016b02d0..af37528da49f 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -831,19 +831,56 @@ restore_user: bnel- load_dbcr0 #endif -#ifdef CONFIG_PREEMPT b restore /* N.B. the only way to get here is from the beq following ret_from_except. */ resume_kernel: - /* check current_thread_info->preempt_count */ + /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ CURRENT_THREAD_INFO(r9, r1) + lwz r8,TI_FLAGS(r9) + andis. 
r8,r8,_TIF_EMULATE_STACK_STORE@h + beq+ 1f + + addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + + lwz r3,GPR1(r1) + subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ + mr r4,r1 /* src: current exception frame */ + mr r1,r3 /* Reroute the trampoline frame to r1 */ + + /* Copy from the original to the trampoline. */ + li r5,INT_FRAME_SIZE/4 /* size: INT_FRAME_SIZE */ + li r6,0 /* start offset: 0 */ + mtctr r5 +2: lwzx r0,r6,r4 + stwx r0,r6,r3 + addi r6,r6,4 + bdnz 2b + + /* Do real store operation to complete stwu */ + lwz r5,GPR1(r1) + stw r8,0(r5) + + /* Clear _TIF_EMULATE_STACK_STORE flag */ + lis r11,_TIF_EMULATE_STACK_STORE@h + addi r5,r9,TI_FLAGS +0: lwarx r8,0,r5 + andc r8,r8,r11 +#ifdef CONFIG_IBM405_ERR77 + dcbt 0,r5 +#endif + stwcx. r8,0,r5 + bne- 0b +1: + +#ifdef CONFIG_PREEMPT + /* check current_thread_info->preempt_count */ lwz r0,TI_PREEMPT(r9) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore - lwz r0,TI_FLAGS(r9) - andi. r0,r0,_TIF_NEED_RESCHED + andi. r8,r8,_TIF_NEED_RESCHED beq+ restore + lwz r3,_MSR(r1) andi. r0,r3,MSR_EE /* interrupts off? */ beq restore /* don't schedule if so */ #ifdef CONFIG_TRACE_IRQFLAGS @@ -864,8 +901,6 @@ resume_kernel: */ bl trace_hardirqs_on #endif -#else -resume_kernel: #endif /* CONFIG_PREEMPT */ /* interrupts are hard-disabled at this point */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index b40e0b4815b3..0e931aaffca2 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -593,6 +593,41 @@ _GLOBAL(ret_from_except_lite) b .ret_from_except resume_kernel: + /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ + CURRENT_THREAD_INFO(r9, r1) + ld r8,TI_FLAGS(r9) + andis. r8,r8,_TIF_EMULATE_STACK_STORE@h + beq+ 1f + + addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + + lwz r3,GPR1(r1) + subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ + mr r4,r1 /* src: current exception frame */ + mr r1,r3 /* Reroute the trampoline frame to r1 */ + + /* Copy from the original to the trampoline. */ + li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */ + li r6,0 /* start offset: 0 */ + mtctr r5 +2: ldx r0,r6,r4 + stdx r0,r6,r3 + addi r6,r6,8 + bdnz 2b + + /* Do real store operation to complete stwu */ + lwz r5,GPR1(r1) + std r8,0(r5) + + /* Clear _TIF_EMULATE_STACK_STORE flag */ + lis r11,_TIF_EMULATE_STACK_STORE@h + addi r5,r9,TI_FLAGS + ldarx r4,0,r5 + andc r4,r4,r11 + stdcx. r4,0,r5 + bne- 0b +1: + #ifdef CONFIG_PREEMPT /* Check if we need to preempt */ andi. r0,r4,_TIF_NEED_RESCHED diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 39aa97d3ff88..10b658ad65e1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -275,6 +275,31 @@ vsx_unavailable_pSeries_1: STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint) KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300) + . = 0x1500 + .global denorm_Hypervisor +denorm_exception_hv: + HMT_MEDIUM + mtspr SPRN_SPRG_HSCRATCH0,r13 + mfspr r13,SPRN_SPRG_HPACA + std r9,PACA_EXGEN+EX_R9(r13) + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r9,SPRN_SPRG_HSCRATCH0 + std r9,PACA_EXGEN+EX_R13(r13) + mfcr r9 + +#ifdef CONFIG_PPC_DENORMALISATION + mfspr r10,SPRN_HSRR1 + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + andis. r10,r10,(HSRR1_DENORM)@h /* denorm? 
*/ + addi r11,r11,-4 /* HSRR0 is next instruction */ + bne+ denorm_assist +#endif + + EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1500) + #ifdef CONFIG_CBE_RAS STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance) KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602) @@ -336,6 +361,103 @@ do_stab_bolted_pSeries: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982) +#ifdef CONFIG_PPC_DENORMALISATION +denorm_assist: +BEGIN_FTR_SECTION +/* + * To denormalise we need to move a copy of the register to itself. + * For POWER6 do that here for all FP regs. + */ + mfmsr r10 + ori r10,r10,(MSR_FP|MSR_FE0|MSR_FE1) + xori r10,r10,(MSR_FE0|MSR_FE1) + mtmsrd r10 + sync + fmr 0,0 + fmr 1,1 + fmr 2,2 + fmr 3,3 + fmr 4,4 + fmr 5,5 + fmr 6,6 + fmr 7,7 + fmr 8,8 + fmr 9,9 + fmr 10,10 + fmr 11,11 + fmr 12,12 + fmr 13,13 + fmr 14,14 + fmr 15,15 + fmr 16,16 + fmr 17,17 + fmr 18,18 + fmr 19,19 + fmr 20,20 + fmr 21,21 + fmr 22,22 + fmr 23,23 + fmr 24,24 + fmr 25,25 + fmr 26,26 + fmr 27,27 + fmr 28,28 + fmr 29,29 + fmr 30,30 + fmr 31,31 +FTR_SECTION_ELSE +/* + * To denormalise we need to move a copy of the register to itself. + * For POWER7 do that here for the first 32 VSX registers only. + */ + mfmsr r10 + oris r10,r10,MSR_VSX@h + mtmsrd r10 + sync + XVCPSGNDP(0,0,0) + XVCPSGNDP(1,1,1) + XVCPSGNDP(2,2,2) + XVCPSGNDP(3,3,3) + XVCPSGNDP(4,4,4) + XVCPSGNDP(5,5,5) + XVCPSGNDP(6,6,6) + XVCPSGNDP(7,7,7) + XVCPSGNDP(8,8,8) + XVCPSGNDP(9,9,9) + XVCPSGNDP(10,10,10) + XVCPSGNDP(11,11,11) + XVCPSGNDP(12,12,12) + XVCPSGNDP(13,13,13) + XVCPSGNDP(14,14,14) + XVCPSGNDP(15,15,15) + XVCPSGNDP(16,16,16) + XVCPSGNDP(17,17,17) + XVCPSGNDP(18,18,18) + XVCPSGNDP(19,19,19) + XVCPSGNDP(20,20,20) + XVCPSGNDP(21,21,21) + XVCPSGNDP(22,22,22) + XVCPSGNDP(23,23,23) + XVCPSGNDP(24,24,24) + XVCPSGNDP(25,25,25) + XVCPSGNDP(26,26,26) + XVCPSGNDP(27,27,27) + XVCPSGNDP(28,28,28) + XVCPSGNDP(29,29,29) + XVCPSGNDP(30,30,30) + XVCPSGNDP(31,31,31) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206) + mtspr SPRN_HSRR0,r11 + mtcrf 0x80,r9 + ld r9,PACA_EXGEN+EX_R9(r13) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + HRFID + b . 
+#endif + .align 7 /* moved from 0xe00 */ STD_EXCEPTION_HV(., 0xe02, h_data_storage) @@ -495,6 +617,7 @@ machine_check_common: STD_EXCEPTION_COMMON(0xe60, hmi_exception, .unknown_exception) STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, .performance_monitor_exception) STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception) + STD_EXCEPTION_COMMON(0x1502, denorm, .unknown_exception) #ifdef CONFIG_ALTIVEC STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception) #else @@ -960,7 +1083,9 @@ _GLOBAL(do_stab_bolted) rldimi r10,r11,7,52 /* r10 = first ste of the group */ /* Calculate VSID */ - /* This is a kernel address, so protovsid = ESID */ + /* This is a kernel address, so protovsid = ESID | 1 << 37 */ + li r9,0x1 + rldimi r11,r9,(CONTEXT_BITS + USER_ESID_BITS),0 ASM_VSID_SCRAMBLE(r11, r9, 256M) rldic r9,r11,12,16 /* r9 = vsid << 12 */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 1f017bb7a7ce..71413f41278f 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -489,10 +489,10 @@ void do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); unsigned int irq; - trace_irq_entry(regs); - irq_enter(); + trace_irq_entry(regs); + check_stack_overflow(); /* @@ -511,10 +511,10 @@ void do_IRQ(struct pt_regs *regs) else __get_cpu_var(irq_stat).spurious_irqs++; + trace_irq_exit(regs); + irq_exit(); set_irq_regs(old_regs); - - trace_irq_exit(regs); } void __init init_IRQ(void) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 4cb714792bea..7f94f760dd0c 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -99,6 +99,26 @@ void pcibios_free_controller(struct pci_controller *phb) kfree(phb); } +/* + * The function is used to return the minimal alignment + * for memory or I/O windows of the associated P2P bridge. + * By default, 4KiB alignment for I/O windows and 1MiB for + * memory windows. + */ +resource_size_t pcibios_window_alignment(struct pci_bus *bus, + unsigned long type) +{ + if (ppc_md.pcibios_window_alignment) + return ppc_md.pcibios_window_alignment(bus, type); + + /* + * PCI core will figure out the default + * alignment: 4KiB for I/O and 1MiB for + * memory window. 
+ */ + return 1; +} + static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e49e93191b69..bd693a11d86e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -493,8 +493,6 @@ void timer_interrupt(struct pt_regs * regs) */ may_hard_irq_enable(); - trace_timer_interrupt_entry(regs); - __get_cpu_var(irq_stat).timer_irqs++; #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC) @@ -505,6 +503,8 @@ void timer_interrupt(struct pt_regs * regs) old_regs = set_irq_regs(regs); irq_enter(); + trace_timer_interrupt_entry(regs); + if (test_irq_work_pending()) { clear_irq_work_pending(); irq_work_run(); @@ -529,10 +529,10 @@ void timer_interrupt(struct pt_regs * regs) } #endif + trace_timer_interrupt_exit(regs); + irq_exit(); set_irq_regs(old_regs); - - trace_timer_interrupt_exit(regs); } /* diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 837f13e7b6bf..00aa61268e0d 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -141,7 +141,7 @@ extern char etext[]; int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) { pfn_t hpaddr; - u64 va; + u64 vpn; u64 vsid; struct kvmppc_sid_map *map; volatile u32 *pteg; @@ -173,7 +173,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) BUG_ON(!map); vsid = map->host_vsid; - va = (vsid << SID_SHIFT) | (eaddr & ~ESID_MASK); + vpn = (vsid << (SID_SHIFT - VPN_SHIFT)) | ((eaddr & ~ESID_MASK) >> VPN_SHIFT) next_pteg: if (rr == 16) { @@ -244,11 +244,11 @@ next_pteg: dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n", orig_pte->may_write ? 'w' : '-', orig_pte->may_execute ? 
'x' : '-', - orig_pte->eaddr, (ulong)pteg, va, + orig_pte->eaddr, (ulong)pteg, vpn, orig_pte->vpage, hpaddr); pte->slot = (ulong)&pteg[rr]; - pte->host_va = va; + pte->host_vpn = vpn; pte->pte = *orig_pte; pte->pfn = hpaddr >> PAGE_SHIFT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 0688b6b39585..4d72f9ebc554 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -33,7 +33,7 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { - ppc_md.hpte_invalidate(pte->slot, pte->host_va, + ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, MMU_PAGE_4K, MMU_SEGSIZE_256M, false); } @@ -80,8 +80,9 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) { + unsigned long vpn; pfn_t hpaddr; - ulong hash, hpteg, va; + ulong hash, hpteg; u64 vsid; int ret; int rflags = 0x192; @@ -117,7 +118,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) } vsid = map->host_vsid; - va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); + vpn = hpt_vpn(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); if (!orig_pte->may_write) rflags |= HPTE_R_PP; @@ -129,7 +130,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) else kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); - hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M); + hash = hpt_hash(vpn, PTE_SIZE, MMU_SEGSIZE_256M); map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); @@ -141,7 +142,8 @@ map_again: goto out; } - ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); + ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, + MMU_PAGE_4K, MMU_SEGSIZE_256M); if (ret < 0) { /* If we couldn't map a primary PTE, try a secondary */ @@ -152,7 +154,8 @@ map_again: } else { struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte); + trace_kvm_book3s_64_mmu_map(rflags, hpteg, + vpn, hpaddr, orig_pte); /* The ppc_md code may give us a secondary entry even though we asked for a primary. Fix up. */ @@ -162,7 +165,7 @@ map_again: } pte->slot = hpteg + (ret & 7); - pte->host_va = va; + pte->host_vpn = vpn; pte->pte = *orig_pte; pte->pfn = hpaddr >> PAGE_SHIFT; diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 877186b7b1c3..ddb6a2149d44 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -189,7 +189,7 @@ TRACE_EVENT(kvm_book3s_mmu_map, TP_ARGS(pte), TP_STRUCT__entry( - __field( u64, host_va ) + __field( u64, host_vpn ) __field( u64, pfn ) __field( ulong, eaddr ) __field( u64, vpage ) @@ -198,7 +198,7 @@ TRACE_EVENT(kvm_book3s_mmu_map, ), TP_fast_assign( - __entry->host_va = pte->host_va; + __entry->host_vpn = pte->host_vpn; __entry->pfn = pte->pfn; __entry->eaddr = pte->pte.eaddr; __entry->vpage = pte->pte.vpage; @@ -208,8 +208,8 @@ TRACE_EVENT(kvm_book3s_mmu_map, (pte->pte.may_execute ? 
0x1 : 0); ), - TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_va, __entry->pfn, __entry->eaddr, + TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, __entry->vpage, __entry->raddr, __entry->flags) ); @@ -218,7 +218,7 @@ TRACE_EVENT(kvm_book3s_mmu_invalidate, TP_ARGS(pte), TP_STRUCT__entry( - __field( u64, host_va ) + __field( u64, host_vpn ) __field( u64, pfn ) __field( ulong, eaddr ) __field( u64, vpage ) @@ -227,7 +227,7 @@ TRACE_EVENT(kvm_book3s_mmu_invalidate, ), TP_fast_assign( - __entry->host_va = pte->host_va; + __entry->host_vpn = pte->host_vpn; __entry->pfn = pte->pfn; __entry->eaddr = pte->pte.eaddr; __entry->vpage = pte->pte.vpage; @@ -238,7 +238,7 @@ TRACE_EVENT(kvm_book3s_mmu_invalidate, ), TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_va, __entry->pfn, __entry->eaddr, + __entry->host_vpn, __entry->pfn, __entry->eaddr, __entry->vpage, __entry->raddr, __entry->flags) ); diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9a52349874ee..e15c521846ca 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -566,7 +566,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) unsigned long int ea; unsigned int cr, mb, me, sh; int err; - unsigned long old_ra; + unsigned long old_ra, val3; long ival; opcode = instr >> 26; @@ -1486,11 +1486,43 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) goto ldst_done; case 36: /* stw */ - case 37: /* stwu */ val = regs->gpr[rd]; err = write_mem(val, dform_ea(instr, regs), 4, regs); goto ldst_done; + case 37: /* stwu */ + val = regs->gpr[rd]; + val3 = dform_ea(instr, regs); + /* + * For PPC32 we always use stwu to change stack point with r1. So + * this emulated store may corrupt the exception frame, now we + * have to provide the exception frame trampoline, which is pushed + * below the kprobed function stack. So we only update gpr[1] but + * don't emulate the real store operation. We will do real store + * operation safely in exception return code by checking this flag. + */ + if ((ra == 1) && !(regs->msr & MSR_PR) \ + && (val3 >= (regs->gpr[1] - STACK_INT_FRAME_SIZE))) { + /* + * Check if we will touch kernel sack overflow + */ + if (val3 - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) { + printk(KERN_CRIT "Can't kprobe this since Kernel stack overflow.\n"); + err = -EINVAL; + break; + } + + /* + * Check if we already set since that means we'll + * lose the previous value. + */ + WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE)); + set_thread_flag(TIF_EMULATE_STACK_STORE); + err = 0; + } else + err = write_mem(val, val3, 4, regs); + goto ldst_done; + case 38: /* stb */ case 39: /* stbu */ val = regs->gpr[rd]; diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index 602aeb06d298..56585086413a 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -63,7 +63,7 @@ _GLOBAL(__hash_page_4K) /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" - * r29 is "va" + * r29 is vpn * r28 is a hash value * r27 is hashtab mask (maybe dynamic patched instead ?) 
*/ @@ -111,10 +111,10 @@ BEGIN_FTR_SECTION cmpdi r9,0 /* check segment size */ bne 3f END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 - rldicl r3,r3,0,36 - or r29,r3,r29 + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 /* Calculate hash value for primary slot and store it in r28 */ rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ @@ -122,14 +122,19 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) xor r28,r5,r0 b 4f -3: /* Calc VA and hash in r29 and r28 for 1T segment */ - sldi r29,r5,40 /* vsid << 40 */ - clrldi r3,r3,24 /* ea & 0xffffffffff */ +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * calculate hash value for primary slot and + * store it in r28 for 1T segment + */ rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ clrldi r5,r5,40 /* vsid & 0xffffff */ rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */ xor r28,r28,r5 - or r29,r3,r29 /* VA */ xor r28,r28,r0 /* hash */ /* Convert linux PTE bits into HW equivalents */ @@ -185,7 +190,7 @@ htab_insert_pte: /* Call ppc_md.hpte_insert */ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve va */ + mr r4,r29 /* Retrieve vpn */ li r7,0 /* !bolted, !secondary */ li r8,MMU_PAGE_4K /* page size */ ld r9,STK_PARAM(R9)(r1) /* segment size */ @@ -208,7 +213,7 @@ _GLOBAL(htab_call_hpte_insert1) /* Call ppc_md.hpte_insert */ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve va */ + mr r4,r29 /* Retrieve vpn */ li r7,HPTE_V_SECONDARY /* !bolted, secondary */ li r8,MMU_PAGE_4K /* page size */ ld r9,STK_PARAM(R9)(r1) /* segment size */ @@ -278,7 +283,7 @@ htab_modify_pte: add r3,r0,r3 /* add slot idx */ /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ + mr r5,r29 /* vpn */ li r6,MMU_PAGE_4K /* page size */ ld r7,STK_PARAM(R9)(r1) /* segment size */ ld r8,STK_PARAM(R8)(r1) /* get "local" param */ @@ -339,7 +344,7 @@ _GLOBAL(__hash_page_4K) /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" - * r29 is "va" + * r29 is vpn * r28 is a hash value * r27 is hashtab mask (maybe dynamic patched instead ?) 
* r26 is the hidx mask @@ -394,10 +399,14 @@ BEGIN_FTR_SECTION cmpdi r9,0 /* check segment size */ bne 3f END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 /* r29 = (vsid << 28) */ - rldicl r3,r3,0,36 /* r3 = (ea & 0x0fffffff) */ - or r29,r3,r29 /* r29 = va */ + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + /* + * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff + * srdi r28,r3,VPN_SHIFT + */ + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 /* Calculate hash value for primary slot and store it in r28 */ rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ @@ -405,14 +414,23 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) xor r28,r5,r0 b 4f -3: /* Calc VA and hash in r29 and r28 for 1T segment */ - sldi r29,r5,40 /* vsid << 40 */ - clrldi r3,r3,24 /* ea & 0xffffffffff */ +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + /* + * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff + * srdi r28,r3,VPN_SHIFT + */ + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * Calculate hash value for primary slot and + * store it in r28 for 1T segment + */ rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ clrldi r5,r5,40 /* vsid & 0xffffff */ rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */ xor r28,r28,r5 - or r29,r3,r29 /* VA */ xor r28,r28,r0 /* hash */ /* Convert linux PTE bits into HW equivalents */ @@ -488,7 +506,7 @@ htab_special_pfn: /* Call ppc_md.hpte_insert */ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve va */ + mr r4,r29 /* Retrieve vpn */ li r7,0 /* !bolted, !secondary */ li r8,MMU_PAGE_4K /* page size */ ld r9,STK_PARAM(R9)(r1) /* segment size */ @@ -515,7 +533,7 @@ _GLOBAL(htab_call_hpte_insert1) /* Call ppc_md.hpte_insert */ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve va */ + mr r4,r29 /* Retrieve vpn */ li r7,HPTE_V_SECONDARY /* !bolted, secondary */ li r8,MMU_PAGE_4K /* page size */ ld r9,STK_PARAM(R9)(r1) /* segment size */ @@ -547,7 +565,7 @@ _GLOBAL(htab_call_hpte_remove) * useless now that the segment has been switched to 4k pages. */ htab_inval_old_hpte: - mr r3,r29 /* virtual addr */ + mr r3,r29 /* vpn */ mr r4,r31 /* PTE.pte */ li r5,0 /* PTE.hidx */ li r6,MMU_PAGE_64K /* psize */ @@ -620,7 +638,7 @@ htab_modify_pte: add r3,r0,r3 /* add slot idx */ /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ + mr r5,r29 /* vpn */ li r6,MMU_PAGE_4K /* page size */ ld r7,STK_PARAM(R9)(r1) /* segment size */ ld r8,STK_PARAM(R8)(r1) /* get "local" param */ @@ -676,7 +694,7 @@ _GLOBAL(__hash_page_64K) /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" - * r29 is "va" + * r29 is vpn * r28 is a hash value * r27 is hashtab mask (maybe dynamic patched instead ?) 
*/ @@ -729,10 +747,10 @@ BEGIN_FTR_SECTION cmpdi r9,0 /* check segment size */ bne 3f END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 - rldicl r3,r3,0,36 - or r29,r3,r29 + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 /* Calculate hash value for primary slot and store it in r28 */ rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ @@ -740,14 +758,19 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) xor r28,r5,r0 b 4f -3: /* Calc VA and hash in r29 and r28 for 1T segment */ - sldi r29,r5,40 /* vsid << 40 */ - clrldi r3,r3,24 /* ea & 0xffffffffff */ +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * calculate hash value for primary slot and + * store it in r28 for 1T segment + */ rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ clrldi r5,r5,40 /* vsid & 0xffffff */ rldicl r0,r3,64-16,40 /* (ea >> 16) & 0xffffff */ xor r28,r28,r5 - or r29,r3,r29 /* VA */ xor r28,r28,r0 /* hash */ /* Convert linux PTE bits into HW equivalents */ @@ -806,7 +829,7 @@ ht64_insert_pte: /* Call ppc_md.hpte_insert */ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve va */ + mr r4,r29 /* Retrieve vpn */ li r7,0 /* !bolted, !secondary */ li r8,MMU_PAGE_64K ld r9,STK_PARAM(R9)(r1) /* segment size */ @@ -829,7 +852,7 @@ _GLOBAL(ht64_call_hpte_insert1) /* Call ppc_md.hpte_insert */ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve va */ + mr r4,r29 /* Retrieve vpn */ li r7,HPTE_V_SECONDARY /* !bolted, secondary */ li r8,MMU_PAGE_64K ld r9,STK_PARAM(R9)(r1) /* segment size */ @@ -899,7 +922,7 @@ ht64_modify_pte: add r3,r0,r3 /* add slot idx */ /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ + mr r5,r29 /* vpn */ li r6,MMU_PAGE_64K ld r7,STK_PARAM(R9)(r1) /* segment size */ ld r8,STK_PARAM(R8)(r1) /* get "local" param */ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index f21e8ce8db33..a4a1c728f269 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -39,22 +39,35 @@ DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void __tlbie(unsigned long va, int psize, int ssize) +static inline void __tlbie(unsigned long vpn, int psize, int ssize) { + unsigned long va; unsigned int penc; - /* clear top 16 bits, non SLS segment */ + /* + * We need 14 to 65 bits of va for a tlibe of 4K page + * With vpn we ignore the lower VPN_SHIFT bits already. + * And top two bits are already ignored because we can + * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT + * of 12. + */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. 
+ */ va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: - va &= ~0xffful; va |= ssize << 8; asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) : "memory"); break; default: + /* We need 14 to 14 + i bits of va */ penc = mmu_psize_defs[psize].penc; va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); va |= penc << 12; @@ -67,21 +80,28 @@ static inline void __tlbie(unsigned long va, int psize, int ssize) } } -static inline void __tlbiel(unsigned long va, int psize, int ssize) +static inline void __tlbiel(unsigned long vpn, int psize, int ssize) { + unsigned long va; unsigned int penc; - /* clear top 16 bits, non SLS segment */ + /* VPN_SHIFT can be atmost 12 */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64 bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. + */ va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: - va &= ~0xffful; va |= ssize << 8; asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" : : "r"(va) : "memory"); break; default: + /* We need 14 to 14 + i bits of va */ penc = mmu_psize_defs[psize].penc; va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); va |= penc << 12; @@ -94,7 +114,7 @@ static inline void __tlbiel(unsigned long va, int psize, int ssize) } -static inline void tlbie(unsigned long va, int psize, int ssize, int local) +static inline void tlbie(unsigned long vpn, int psize, int ssize, int local) { unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); @@ -105,10 +125,10 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local) raw_spin_lock(&native_tlbie_lock); asm volatile("ptesync": : :"memory"); if (use_local) { - __tlbiel(va, psize, ssize); + __tlbiel(vpn, psize, ssize); asm volatile("ptesync": : :"memory"); } else { - __tlbie(va, psize, ssize); + __tlbie(vpn, psize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) @@ -134,7 +154,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) clear_bit_unlock(HPTE_LOCK_BIT, word); } -static long native_hpte_insert(unsigned long hpte_group, unsigned long va, +static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, int psize, int ssize) { @@ -143,9 +163,9 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long va, int i; if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" insert(group=%lx, va=%016lx, pa=%016lx," + DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + hpte_group, vpn, pa, rflags, vflags, psize); } for (i = 0; i < HPTES_PER_GROUP; i++) { @@ -163,7 +183,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long va, if (i == HPTES_PER_GROUP) return -1; - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; + hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { @@ -225,17 +245,17 @@ static long native_hpte_remove(unsigned long hpte_group) } static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int psize, int ssize, + unsigned long vpn, int psize, int ssize, int local) { struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0; - want_v = 
hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_v(vpn, psize, ssize); - DBG_LOW(" update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)", - va, want_v & HPTE_V_AVPN, slot, newpp); + DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", + vpn, want_v & HPTE_V_AVPN, slot, newpp); native_lock_hpte(hptep); @@ -254,12 +274,12 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, native_unlock_hpte(hptep); /* Ensure it is out of the tlb too. */ - tlbie(va, psize, ssize, local); + tlbie(vpn, psize, ssize, local); return ret; } -static long native_hpte_find(unsigned long va, int psize, int ssize) +static long native_hpte_find(unsigned long vpn, int psize, int ssize) { struct hash_pte *hptep; unsigned long hash; @@ -267,8 +287,8 @@ static long native_hpte_find(unsigned long va, int psize, int ssize) long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_v(va, psize, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_v(vpn, psize, ssize); /* Bolted mappings are only ever in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -295,14 +315,15 @@ static long native_hpte_find(unsigned long va, int psize, int ssize) static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long vsid, va; + unsigned long vpn; + unsigned long vsid; long slot; struct hash_pte *hptep; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = native_hpte_find(va, psize, ssize); + slot = native_hpte_find(vpn, psize, ssize); if (slot == -1) panic("could not find page to bolt\n"); hptep = htab_address + slot; @@ -312,10 +333,10 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, (newpp & (HPTE_R_PP | HPTE_R_N)); /* Ensure it is out of the tlb too. 
*/ - tlbie(va, psize, ssize, 0); + tlbie(vpn, psize, ssize, 0); } -static void native_hpte_invalidate(unsigned long slot, unsigned long va, +static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, int psize, int ssize, int local) { struct hash_pte *hptep = htab_address + slot; @@ -325,9 +346,9 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va, local_irq_save(flags); - DBG_LOW(" invalidate(va=%016lx, hash: %x)\n", va, slot); + DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); - want_v = hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_v(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = hptep->v; @@ -339,7 +360,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va, hptep->v = 0; /* Invalidate the TLB */ - tlbie(va, psize, ssize, local); + tlbie(vpn, psize, ssize, local); local_irq_restore(flags); } @@ -349,11 +370,12 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va, #define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT) static void hpte_decode(struct hash_pte *hpte, unsigned long slot, - int *psize, int *ssize, unsigned long *va) + int *psize, int *ssize, unsigned long *vpn) { + unsigned long avpn, pteg, vpi; unsigned long hpte_r = hpte->r; unsigned long hpte_v = hpte->v; - unsigned long avpn; + unsigned long vsid, seg_off; int i, size, shift, penc; if (!(hpte_v & HPTE_V_LARGE)) @@ -380,32 +402,38 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, } /* This works for all page sizes, and for 256M and 1T segments */ + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; shift = mmu_psize_defs[size].shift; - avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23; - - if (shift < 23) { - unsigned long vpi, vsid, pteg; - pteg = slot / HPTES_PER_GROUP; - if (hpte_v & HPTE_V_SECONDARY) - pteg = ~pteg; - switch (hpte_v >> HPTE_V_SSIZE_SHIFT) { - case MMU_SEGSIZE_256M: - vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask; - break; - case MMU_SEGSIZE_1T: - vsid = avpn >> 40; + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + switch (*ssize) { + case MMU_SEGSIZE_256M: + /* We only have 28 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (shift < 23) { + vpi = (vsid ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + case MMU_SEGSIZE_1T: + /* We only have 40 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (shift < 23) { vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; - break; - default: - avpn = vpi = size = 0; + seg_off |= vpi << shift; } - avpn |= (vpi << mmu_psize_defs[size].shift); + *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + default: + *vpn = size = 0; } - - *va = avpn; *psize = size; - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; } /* @@ -418,9 +446,10 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, */ static void native_hpte_clear(void) { + unsigned long vpn = 0; unsigned long slot, slots, flags; struct hash_pte *hptep = htab_address; - unsigned long hpte_v, va; + unsigned long hpte_v; unsigned long pteg_count; int psize, ssize; @@ -448,9 +477,9 @@ static void native_hpte_clear(void) * already hold the native_tlbie_lock. 
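The hpte_decode() rework above recovers a VPN rather than the old truncated VA. A worked sketch of the 256M-segment path only (illustrative; the helper is invented, the field handling follows the patch):

static unsigned long decode_vpn_256M(unsigned long avpn, unsigned long pteg, int shift)
{
	/* the AVPN carries the VSID plus bits 23 and above of the segment offset */
	unsigned long vsid = avpn >> 5;
	unsigned long seg_off = (avpn & 0x1f) << 23;

	if (shift < 23) {
		/*
		 * hash = vsid ^ page_index for 256M segments, so the missing
		 * low offset bits fall out of the PTE group number.
		 */
		unsigned long vpi = (vsid ^ pteg) & htab_hash_mask;
		seg_off |= vpi << shift;
	}
	return (vsid << (SID_SHIFT - VPN_SHIFT)) | (seg_off >> VPN_SHIFT);
}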
*/ if (hpte_v & HPTE_V_VALID) { - hpte_decode(hptep, slot, &psize, &ssize, &va); + hpte_decode(hptep, slot, &psize, &ssize, &vpn); hptep->v = 0; - __tlbie(va, psize, ssize); + __tlbie(vpn, psize, ssize); } } @@ -465,7 +494,8 @@ static void native_hpte_clear(void) */ static void native_flush_hash_range(unsigned long number, int local) { - unsigned long va, hash, index, hidx, shift, slot; + unsigned long vpn; + unsigned long hash, index, hidx, shift, slot; struct hash_pte *hptep; unsigned long hpte_v; unsigned long want_v; @@ -479,18 +509,18 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_save(flags); for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; hptep = htab_address + slot; - want_v = hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_v(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = hptep->v; if (!HPTE_V_COMPARE(hpte_v, want_v) || @@ -505,12 +535,12 @@ static void native_flush_hash_range(unsigned long number, int local) mmu_psize_defs[psize].tlbiel && local) { asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, - shift) { - __tlbiel(va, psize, ssize); + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbiel(vpn, psize, ssize); } pte_iterate_hashed_end(); } asm volatile("ptesync":::"memory"); @@ -522,12 +552,12 @@ static void native_flush_hash_range(unsigned long number, int local) asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, - shift) { - __tlbie(va, psize, ssize); + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbie(vpn, psize, ssize); } pte_iterate_hashed_end(); } asm volatile("eieio; tlbsync; ptesync":::"memory"); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index ba45739bdfe8..3a292be2e079 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -191,18 +191,18 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, vaddr += step, paddr += step) { unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, ssize); - unsigned long va = hpt_va(vaddr, vsid, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); unsigned long tprot = prot; /* Make kernel text executable */ if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; - hash = hpt_hash(va, shift, ssize); + hash = hpt_hash(vpn, shift, ssize); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); BUG_ON(!ppc_md.hpte_insert); - ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot, + ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot, HPTE_V_BOLTED, psize, ssize); if (ret < 0) @@ -803,16 +803,19 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) #ifdef CONFIG_PPC_MM_SLICES unsigned int get_paca_psize(unsigned long addr) { - unsigned long index, slices; + u64 lpsizes; + unsigned char *hpsizes; + unsigned long index, mask_index; if (addr < 
SLICE_LOW_TOP) { - slices = get_paca()->context.low_slices_psize; + lpsizes = get_paca()->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); - } else { - slices = get_paca()->context.high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); + return (lpsizes >> (index * 4)) & 0xF; } - return (slices >> (index * 4)) & 0xF; + hpsizes = get_paca()->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + mask_index = index & 0x1; + return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; } #else @@ -1152,21 +1155,21 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* WARNING: This is called from hash_low_64.S, if you change this prototype, * do not forget to update the assembly call site ! */ -void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize, +void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, int local) { unsigned long hash, index, shift, hidx, slot; - DBG_LOW("flush_hash_page(va=%016lx)\n", va); - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); - ppc_md.hpte_invalidate(slot, va, psize, ssize, local); + ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local); } pte_iterate_hashed_end(); } @@ -1180,7 +1183,7 @@ void flush_hash_range(unsigned long number, int local) &__get_cpu_var(ppc64_tlb_batch); for (i = 0; i < number; i++) - flush_hash_page(batch->vaddr[i], batch->pte[i], + flush_hash_page(batch->vpn[i], batch->pte[i], batch->psize, batch->ssize, local); } } @@ -1207,14 +1210,14 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); int ret; - hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr), + ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr), mode, HPTE_V_BOLTED, mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); @@ -1228,9 +1231,9 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) { unsigned long hash, hidx, slot; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); spin_lock(&linear_map_hash_lock); BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); hidx = linear_map_hash_slots[lmi] & 0x7f; @@ -1240,7 +1243,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0); + ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0); } void kernel_map_pages(struct page *page, int 
numpages, int enable) diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index cc5c273086cf..cecad348f604 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -18,14 +18,15 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, int local, int ssize, unsigned int shift, unsigned int mmu_psize) { + unsigned long vpn; unsigned long old_pte, new_pte; - unsigned long va, rflags, pa, sz; + unsigned long rflags, pa, sz; long slot; BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); /* Search the Linux page table for a match with va */ - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); /* At this point, we have a pte (old_pte) which can be used to build * or update an HPTE. There are 2 cases: @@ -69,19 +70,19 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; - hash = hpt_hash(va, shift, ssize); + hash = hpt_hash(vpn, shift, ssize); if (old_pte & _PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += (old_pte & _PAGE_F_GIX) >> 12; - if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, + if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, ssize, local) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } if (likely(!(old_pte & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(va, shift, ssize); + unsigned long hash = hpt_hash(vpn, shift, ssize); unsigned long hpte_group; pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; @@ -101,14 +102,14 @@ repeat: _PAGE_COHERENT | _PAGE_GUARDED)); /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, mmu_psize, ssize); /* Primary is full, try the secondary */ if (unlikely(slot == -1)) { hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, HPTE_V_SECONDARY, mmu_psize, ssize); if (slot == -1) { diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index 40677aa0190e..40bc5b0ace54 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -30,11 +30,13 @@ static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); /* - * The proto-VSID space has 2^35 - 1 segments available for user mappings. - * Each segment contains 2^28 bytes. Each context maps 2^44 bytes, - * so we can support 2^19-1 contexts (19 == 35 + 28 - 44). + * 256MB segment + * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments + * available for user mappings. Each segment contains 2^28 bytes. Each + * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts + * (19 == 37 + 28 - 46). 
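Spelling out the arithmetic in the comment above: the 37 is CONTEXT_BITS + USER_ESID_BITS, so each context owns 2^(37-19) = 2^18 segments; 2^18 segments of 2^28 bytes give the 2^46 bytes (64TB) per context, and 19 == 37 + 28 - 46 is the same identity rearranged. MAX_CONTEXT accordingly becomes (1UL << CONTEXT_BITS) - 1 instead of the hard-coded (1UL << 19) - 1.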
*/ -#define MAX_CONTEXT ((1UL << 19) - 1) +#define MAX_CONTEXT ((1UL << CONTEXT_BITS) - 1) int __init_new_context(void) { diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 297d49547ea8..e212a271c7a4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -55,8 +55,18 @@ #include "mmu_decl.h" -unsigned long ioremap_bot = IOREMAP_BASE; +/* Some sanity checking */ +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#ifdef CONFIG_PPC_STD_MMU_64 +#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif +#endif +unsigned long ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PPC_MMU_NOHASH static void *early_alloc_pgtable(unsigned long size) diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index b9ee79ce2200..1a16ca227757 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -56,6 +56,12 @@ _GLOBAL(slb_allocate_realmode) */ _GLOBAL(slb_miss_kernel_load_linear) li r11,0 + li r9,0x1 + /* + * for 1T we shift 12 bits more. slb_finish_load_1T will do + * the necessary adjustment + */ + rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0 BEGIN_FTR_SECTION b slb_finish_load END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) @@ -85,6 +91,12 @@ _GLOBAL(slb_miss_kernel_load_vmemmap) _GLOBAL(slb_miss_kernel_load_io) li r11,0 6: + li r9,0x1 + /* + * for 1T we shift 12 bits more. slb_finish_load_1T will do + * the necessary adjustment + */ + rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0 BEGIN_FTR_SECTION b slb_finish_load END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) @@ -108,17 +120,31 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) * between 4k and 64k standard page size */ #ifdef CONFIG_PPC_MM_SLICES + /* r10 have esid */ cmpldi r10,16 - - /* Get the slice index * 4 in r11 and matching slice size mask in r9 */ - ld r9,PACALOWSLICESPSIZE(r13) - sldi r11,r10,2 + /* below SLICE_LOW_TOP */ blt 5f - ld r9,PACAHIGHSLICEPSIZE(r13) - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2) - andi. r11,r11,0x3c + /* + * Handle hpsizes, + * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index + */ + srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ + addi r9,r11,PACAHIGHSLICEPSIZE + lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ + /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ + rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 + b 6f -5: /* Extract the psize and multiply to get an array offset */ +5: + /* + * Handle lpsizes + * r9 is get_paca()->context.low_slices_psize, r11 is index + */ + ld r9,PACALOWSLICESPSIZE(r13) + mr r11,r10 +6: + sldi r11,r11,2 /* index * 4 */ + /* Extract the psize and multiply to get an array offset */ srd r9,r9,r11 andi. 
r9,r9,0xf mulli r9,r9,MMUPSIZEDEFSIZE @@ -209,7 +235,11 @@ _GLOBAL(slb_allocate_user) */ slb_finish_load: ASM_VSID_SCRAMBLE(r10,r9,256M) - rldimi r11,r10,SLB_VSID_SHIFT,16 /* combine VSID and flags */ + /* + * bits above VSID_BITS_256M need to be ignored from r10 + * also combine VSID and flags + */ + rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M)) /* r3 = EA, r11 = VSID data */ /* @@ -252,10 +282,10 @@ _GLOBAL(slb_compare_rr_to_size) bge 1f /* still room in the slb cache */ - sldi r11,r3,1 /* r11 = offset * sizeof(u16) */ - rldicl r10,r10,36,28 /* get low 16 bits of the ESID */ - add r11,r11,r13 /* r11 = (u16 *)paca + offset */ - sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ + sldi r11,r3,2 /* r11 = offset * sizeof(u32) */ + srdi r10,r10,28 /* get the 36 bits of the ESID */ + add r11,r11,r13 /* r11 = (u32 *)paca + offset */ + stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ addi r3,r3,1 /* offset++ */ b 2f 1: /* offset >= SLB_CACHE_ENTRIES */ @@ -273,7 +303,11 @@ _GLOBAL(slb_compare_rr_to_size) slb_finish_load_1T: srdi r10,r10,40-28 /* get 1T ESID */ ASM_VSID_SCRAMBLE(r10,r9,1T) - rldimi r11,r10,SLB_VSID_SHIFT_1T,16 /* combine VSID and flags */ + /* + * bits above VSID_BITS_1T need to be ignored from r10 + * also combine VSID and flags + */ + rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 73709f7ce92c..5829d2a950d4 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -34,6 +34,11 @@ #include <asm/mmu.h> #include <asm/spu.h> +/* some sanity checks */ +#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error PGTABLE_RANGE exceeds slice_mask high_slices size +#endif + static DEFINE_SPINLOCK(slice_convert_lock); @@ -42,7 +47,7 @@ int _slice_debug = 1; static void slice_print_mask(const char *label, struct slice_mask mask) { - char *p, buf[16 + 3 + 16 + 1]; + char *p, buf[16 + 3 + 64 + 1]; int i; if (!_slice_debug) @@ -54,7 +59,7 @@ static void slice_print_mask(const char *label, struct slice_mask mask) *(p++) = '-'; *(p++) = ' '; for (i = 0; i < SLICE_NUM_HIGH; i++) - *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0'; + *(p++) = (mask.high_slices & (1ul << i)) ? 
'1' : '0'; *(p++) = 0; printk(KERN_DEBUG "%s:%s\n", label, buf); @@ -84,8 +89,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start, } if ((start + len) > SLICE_LOW_TOP) - ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1)) - - (1u << GET_HIGH_SLICE_INDEX(start)); + ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) + - (1ul << GET_HIGH_SLICE_INDEX(start)); return ret; } @@ -135,26 +140,31 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm) for (i = 0; i < SLICE_NUM_HIGH; i++) if (!slice_high_has_vma(mm, i)) - ret.high_slices |= 1u << i; + ret.high_slices |= 1ul << i; return ret; } static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) { + unsigned char *hpsizes; + int index, mask_index; struct slice_mask ret = { 0, 0 }; unsigned long i; - u64 psizes; + u64 lpsizes; - psizes = mm->context.low_slices_psize; + lpsizes = mm->context.low_slices_psize; for (i = 0; i < SLICE_NUM_LOW; i++) - if (((psizes >> (i * 4)) & 0xf) == psize) + if (((lpsizes >> (i * 4)) & 0xf) == psize) ret.low_slices |= 1u << i; - psizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (((psizes >> (i * 4)) & 0xf) == psize) - ret.high_slices |= 1u << i; + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) + ret.high_slices |= 1ul << i; + } return ret; } @@ -183,8 +193,10 @@ static void slice_flush_segments(void *parm) static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize) { + int index, mask_index; /* Write the new slice psize bits */ - u64 lpsizes, hpsizes; + unsigned char *hpsizes; + u64 lpsizes; unsigned long i, flags; slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); @@ -201,14 +213,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz lpsizes = (lpsizes & ~(0xful << (i * 4))) | (((unsigned long)psize) << (i * 4)); - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (mask.high_slices & (1u << i)) - hpsizes = (hpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); - + /* Assign the value back */ mm->context.low_slices_psize = lpsizes; - mm->context.high_slices_psize = hpsizes; + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (mask.high_slices & (1ul << i)) + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } slice_dbg(" lsps=%lx, hsps=%lx\n", mm->context.low_slices_psize, @@ -587,18 +603,19 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) { - u64 psizes; - int index; + unsigned char *hpsizes; + int index, mask_index; if (addr < SLICE_LOW_TOP) { - psizes = mm->context.low_slices_psize; + u64 lpsizes; + lpsizes = mm->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); - } else { - psizes = mm->context.high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); + return (lpsizes >> (index * 4)) & 0xf; } - - return (psizes >> (index * 4)) & 0xf; + hpsizes = mm->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + mask_index = index & 0x1; + return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf; } EXPORT_SYMBOL_GPL(get_slice_psize); @@ -618,7 +635,9 @@ EXPORT_SYMBOL_GPL(get_slice_psize); */ void 
slice_set_user_psize(struct mm_struct *mm, unsigned int psize) { - unsigned long flags, lpsizes, hpsizes; + int index, mask_index; + unsigned char *hpsizes; + unsigned long flags, lpsizes; unsigned int old_psize; int i; @@ -639,15 +658,21 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) if (((lpsizes >> (i * 4)) & 0xf) == old_psize) lpsizes = (lpsizes & ~(0xful << (i * 4))) | (((unsigned long)psize) << (i * 4)); + /* Assign the value back */ + mm->context.low_slices_psize = lpsizes; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (((hpsizes >> (i * 4)) & 0xf) == old_psize) - hpsizes = (hpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize) + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + - mm->context.low_slices_psize = lpsizes; - mm->context.high_slices_psize = hpsizes; slice_dbg(" lsps=%lx, hsps=%lx\n", mm->context.low_slices_psize, @@ -660,18 +685,27 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) void slice_set_psize(struct mm_struct *mm, unsigned long address, unsigned int psize) { + unsigned char *hpsizes; unsigned long i, flags; - u64 *p; + u64 *lpsizes; spin_lock_irqsave(&slice_convert_lock, flags); if (address < SLICE_LOW_TOP) { i = GET_LOW_SLICE_INDEX(address); - p = &mm->context.low_slices_psize; + lpsizes = &mm->context.low_slices_psize; + *lpsizes = (*lpsizes & ~(0xful << (i * 4))) | + ((unsigned long) psize << (i * 4)); } else { + int index, mask_index; i = GET_HIGH_SLICE_INDEX(address); - p = &mm->context.high_slices_psize; + hpsizes = mm->context.high_slices_psize; + mask_index = i & 0x1; + index = i >> 1; + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); } - *p = (*p & ~(0xful << (i * 4))) | ((unsigned long) psize << (i * 4)); + spin_unlock_irqrestore(&slice_convert_lock, flags); #ifdef CONFIG_SPU_BASE diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 31f18207970b..ae758b3ff72c 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -42,8 +42,9 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge) { + unsigned long vpn; struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); - unsigned long vsid, vaddr; + unsigned long vsid; unsigned int psize; int ssize; real_pte_t rpte; @@ -86,7 +87,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, vsid = get_kernel_vsid(addr, mmu_kernel_ssize); ssize = mmu_kernel_ssize; } - vaddr = hpt_va(addr, vsid, ssize); + vpn = hpt_vpn(addr, vsid, ssize); rpte = __real_pte(__pte(pte), ptep); /* @@ -96,7 +97,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, * and decide to use local invalidates instead... 
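The slice_convert(), slice_set_user_psize() and slice_set_psize() hunks above all repeat the same packing: one 4-bit page-size index per slice, with the high-slice map now a byte array holding two slices per byte. A minimal sketch of that packing (illustrative; the helpers are invented):

static unsigned int high_slice_psize(unsigned char *hpsizes, unsigned long i)
{
	unsigned int shift = (i & 0x1) * 4;	/* even slice in the low nibble, odd in the high */

	return (hpsizes[i >> 1] >> shift) & 0xf;
}

static void set_high_slice_psize(unsigned char *hpsizes, unsigned long i,
				 unsigned int psize)
{
	unsigned int shift = (i & 0x1) * 4;

	hpsizes[i >> 1] = (hpsizes[i >> 1] & ~(0xf << shift)) | (psize << shift);
}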
*/ if (!batch->active) { - flush_hash_page(vaddr, rpte, psize, ssize, 0); + flush_hash_page(vpn, rpte, psize, ssize, 0); put_cpu_var(ppc64_tlb_batch); return; } @@ -122,7 +123,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, batch->ssize = ssize; } batch->pte[i] = rpte; - batch->vaddr[i] = vaddr; + batch->vpn[i] = vpn; batch->index = ++i; if (i >= PPC64_TLB_BATCH_NR) __flush_tlb_pending(batch); @@ -146,7 +147,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) if (cpumask_equal(mm_cpumask(batch->mm), tmp)) local = 1; if (i == 1) - flush_hash_page(batch->vaddr[0], batch->pte[0], + flush_hash_page(batch->vpn[0], batch->pte[0], batch->psize, batch->ssize, local); else flush_hash_range(i, local); diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index 943c9d39aa16..0f6f83988b3d 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -88,7 +88,7 @@ static inline unsigned int beat_read_mask(unsigned hpte_group) } static long beat_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long pa, + unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, int psize, int ssize) { @@ -103,7 +103,7 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group, "rflags=%lx, vflags=%lx, psize=%d)\n", hpte_group, va, pa, rflags, vflags, psize); - hpte_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M) | + hpte_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize) | rflags; @@ -184,14 +184,14 @@ static void beat_lpar_hptab_clear(void) */ static long beat_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, + unsigned long vpn, int psize, int ssize, int local) { unsigned long lpar_rc; u64 dummy0, dummy1; unsigned long want_v; - want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); + want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M); DBG_LOW(" update: " "avpnv=%016lx, slot=%016lx, psize: %d, newpp %016lx ... 
", @@ -220,15 +220,15 @@ static long beat_lpar_hpte_updatepp(unsigned long slot, return 0; } -static long beat_lpar_hpte_find(unsigned long va, int psize) +static long beat_lpar_hpte_find(unsigned long vpn, int psize) { unsigned long hash; unsigned long i, j; long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, MMU_SEGSIZE_256M); - want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, MMU_SEGSIZE_256M); + want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M); for (j = 0; j < 2; j++) { slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -255,14 +255,15 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long lpar_rc, slot, vsid, va; + unsigned long vpn; + unsigned long lpar_rc, slot, vsid; u64 dummy0, dummy1; vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M); - va = (vsid << 28) | (ea & 0x0fffffff); + vpn = hpt_vpn(ea, vsid, MMU_SEGSIZE_256M); raw_spin_lock(&beat_htab_lock); - slot = beat_lpar_hpte_find(va, psize); + slot = beat_lpar_hpte_find(vpn, psize); BUG_ON(slot == -1); lpar_rc = beat_write_htab_entry(0, slot, 0, newpp, 0, 7, @@ -272,7 +273,7 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(lpar_rc != 0); } -static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long va, +static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, int psize, int ssize, int local) { unsigned long want_v; @@ -282,7 +283,7 @@ static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long va, DBG_LOW(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", slot, va, psize, local); - want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); + want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M); raw_spin_lock_irqsave(&beat_htab_lock, flags); dummy1 = beat_lpar_hpte_getword0(slot); @@ -311,7 +312,7 @@ void __init hpte_init_beat(void) } static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, - unsigned long va, unsigned long pa, + unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, int psize, int ssize) { @@ -322,11 +323,11 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, return -1; if (!(vflags & HPTE_V_BOLTED)) - DBG_LOW("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " + DBG_LOW("hpte_insert(group=%lx, vpn=%016lx, pa=%016lx, " "rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + hpte_group, vpn, pa, rflags, vflags, psize); - hpte_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M) | + hpte_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize) | rflags; @@ -364,14 +365,14 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, */ static long beat_lpar_hpte_updatepp_v3(unsigned long slot, unsigned long newpp, - unsigned long va, + unsigned long vpn, int psize, int ssize, int local) { unsigned long lpar_rc; unsigned long want_v; unsigned long pss; - want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); + want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M); pss = (psize == MMU_PAGE_4K) ? 
-1UL : mmu_psize_defs[psize].penc; DBG_LOW(" update: " @@ -392,16 +393,16 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot, return 0; } -static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long va, +static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn, int psize, int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; unsigned long pss; - DBG_LOW(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", - slot, va, psize, local); - want_v = hpte_encode_v(va, psize, MMU_SEGSIZE_256M); + DBG_LOW(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n", + slot, vpn, psize, local); + want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M); pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc; lpar_rc = beat_invalidate_htab_entry3(0, slot, want_v, pss); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5e75dcfe51b9..471aa3ccd9fd 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -34,14 +34,6 @@ #include "powernv.h" #include "pci.h" -struct resource_wrap { - struct list_head link; - resource_size_t size; - resource_size_t align; - struct pci_dev *dev; /* Set if it's a device */ - struct pci_bus *bus; /* Set if it's a bridge */ -}; - static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe, struct va_format *vaf) { @@ -77,273 +69,6 @@ define_pe_printk_level(pe_err, KERN_ERR); define_pe_printk_level(pe_warn, KERN_WARNING); define_pe_printk_level(pe_info, KERN_INFO); - -/* Calculate resource usage & alignment requirement of a single - * device. This will also assign all resources within the device - * for a given type starting at 0 for the biggest one and then - * assigning in decreasing order of size. 
- */ -static void __devinit pnv_ioda_calc_dev(struct pci_dev *dev, unsigned int flags, - resource_size_t *size, - resource_size_t *align) -{ - resource_size_t start; - struct resource *r; - int i; - - pr_devel(" -> CDR %s\n", pci_name(dev)); - - *size = *align = 0; - - /* Clear the resources out and mark them all unset */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - r = &dev->resource[i]; - if (!(r->flags & flags)) - continue; - if (r->start) { - r->end -= r->start; - r->start = 0; - } - r->flags |= IORESOURCE_UNSET; - } - - /* We currently keep all memory resources together, we - * will handle prefetch & 64-bit separately in the future - * but for now we stick everybody in M32 - */ - start = 0; - for (;;) { - resource_size_t max_size = 0; - int max_no = -1; - - /* Find next biggest resource */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - r = &dev->resource[i]; - if (!(r->flags & IORESOURCE_UNSET) || - !(r->flags & flags)) - continue; - if (resource_size(r) > max_size) { - max_size = resource_size(r); - max_no = i; - } - } - if (max_no < 0) - break; - r = &dev->resource[max_no]; - if (max_size > *align) - *align = max_size; - *size += max_size; - r->start = start; - start += max_size; - r->end = r->start + max_size - 1; - r->flags &= ~IORESOURCE_UNSET; - pr_devel(" -> R%d %016llx..%016llx\n", - max_no, r->start, r->end); - } - pr_devel(" <- CDR %s size=%llx align=%llx\n", - pci_name(dev), *size, *align); -} - -/* Allocate a resource "wrap" for a given device or bridge and - * insert it at the right position in the sorted list - */ -static void __devinit pnv_ioda_add_wrap(struct list_head *list, - struct pci_bus *bus, - struct pci_dev *dev, - resource_size_t size, - resource_size_t align) -{ - struct resource_wrap *w1, *w = kzalloc(sizeof(*w), GFP_KERNEL); - - w->size = size; - w->align = align; - w->dev = dev; - w->bus = bus; - - list_for_each_entry(w1, list, link) { - if (w1->align < align) { - list_add_tail(&w->link, &w1->link); - return; - } - } - list_add_tail(&w->link, list); -} - -/* Offset device resources of a given type */ -static void __devinit pnv_ioda_offset_dev(struct pci_dev *dev, - unsigned int flags, - resource_size_t offset) -{ - struct resource *r; - int i; - - pr_devel(" -> ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); - - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - r = &dev->resource[i]; - if (r->flags & flags) { - dev->resource[i].start += offset; - dev->resource[i].end += offset; - } - } - - pr_devel(" <- ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset); -} - -/* Offset bus resources (& all children) of a given type */ -static void __devinit pnv_ioda_offset_bus(struct pci_bus *bus, - unsigned int flags, - resource_size_t offset) -{ - struct resource *r; - struct pci_dev *dev; - struct pci_bus *cbus; - int i; - - pr_devel(" -> OBR %s [%x] +%016llx\n", - bus->self ? pci_name(bus->self) : "root", flags, offset); - - pci_bus_for_each_resource(bus, r, i) { - if (r && (r->flags & flags)) { - r->start += offset; - r->end += offset; - } - } - list_for_each_entry(dev, &bus->devices, bus_list) - pnv_ioda_offset_dev(dev, flags, offset); - list_for_each_entry(cbus, &bus->children, node) - pnv_ioda_offset_bus(cbus, flags, offset); - - pr_devel(" <- OBR %s [%x]\n", - bus->self ? pci_name(bus->self) : "root", flags); -} - -/* This is the guts of our IODA resource allocation. This is called - * recursively for each bus in the system. 
It calculates all the - * necessary size and requirements for children and assign them - * resources such that: - * - * - Each function fits in it's own contiguous set of IO/M32 - * segment - * - * - All segments behind a P2P bridge are contiguous and obey - * alignment constraints of those bridges - */ -static void __devinit pnv_ioda_calc_bus(struct pci_bus *bus, unsigned int flags, - resource_size_t *size, - resource_size_t *align) -{ - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; - resource_size_t dev_size, dev_align, start; - resource_size_t min_align, min_balign; - struct pci_dev *cdev; - struct pci_bus *cbus; - struct list_head head; - struct resource_wrap *w; - unsigned int bres; - - *size = *align = 0; - - pr_devel("-> CBR %s [%x]\n", - bus->self ? pci_name(bus->self) : "root", flags); - - /* Calculate alignment requirements based on the type - * of resource we are working on - */ - if (flags & IORESOURCE_IO) { - bres = 0; - min_align = phb->ioda.io_segsize; - min_balign = 0x1000; - } else { - bres = 1; - min_align = phb->ioda.m32_segsize; - min_balign = 0x100000; - } - - /* Gather all our children resources ordered by alignment */ - INIT_LIST_HEAD(&head); - - /* - Busses */ - list_for_each_entry(cbus, &bus->children, node) { - pnv_ioda_calc_bus(cbus, flags, &dev_size, &dev_align); - pnv_ioda_add_wrap(&head, cbus, NULL, dev_size, dev_align); - } - - /* - Devices */ - list_for_each_entry(cdev, &bus->devices, bus_list) { - pnv_ioda_calc_dev(cdev, flags, &dev_size, &dev_align); - /* Align them to segment size */ - if (dev_align < min_align) - dev_align = min_align; - pnv_ioda_add_wrap(&head, NULL, cdev, dev_size, dev_align); - } - if (list_empty(&head)) - goto empty; - - /* Now we can do two things: assign offsets to them within that - * level and get our total alignment & size requirements. The - * assignment algorithm is going to be uber-trivial for now, we - * can try to be smarter later at filling out holes. - */ - if (bus->self) { - /* No offset for downstream bridges */ - start = 0; - } else { - /* Offset from the root */ - if (flags & IORESOURCE_IO) - /* Don't hand out IO 0 */ - start = hose->io_resource.start + 0x1000; - else - start = hose->mem_resources[0].start; - } - while(!list_empty(&head)) { - w = list_first_entry(&head, struct resource_wrap, link); - list_del(&w->link); - if (w->size) { - if (start) { - start = ALIGN(start, w->align); - if (w->dev) - pnv_ioda_offset_dev(w->dev,flags,start); - else if (w->bus) - pnv_ioda_offset_bus(w->bus,flags,start); - } - if (w->align > *align) - *align = w->align; - } - start += w->size; - kfree(w); - } - *size = start; - - /* Align and setup bridge resources */ - *align = max_t(resource_size_t, *align, - max_t(resource_size_t, min_align, min_balign)); - *size = ALIGN(*size, - max_t(resource_size_t, min_align, min_balign)); - empty: - /* Only setup P2P's, not the PHB itself */ - if (bus->self) { - struct resource *res = bus->resource[bres]; - - if (WARN_ON(res == NULL)) - return; - - /* - * FIXME: We should probably export and call - * pci_bridge_check_ranges() to properly re-initialize - * the PCI portion of the flags here, and to detect - * what the bridge actually supports. - */ - res->start = 0; - res->flags = (*size) ? flags : 0; - res->end = (*size) ? (*size - 1) : 0; - } - - pr_devel("<- CBR %s [%x] *size=%016llx *align=%016llx\n", - bus->self ? 
pci_name(bus->self) : "root", flags,*size,*align); -} - static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) { struct device_node *np; @@ -354,172 +79,6 @@ static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) return PCI_DN(np); } -static void __devinit pnv_ioda_setup_pe_segments(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pnv_ioda_get_pdn(dev); - unsigned int pe, i; - resource_size_t pos; - struct resource io_res; - struct resource m32_res; - struct pci_bus_region region; - int rc; - - /* Anything not referenced in the device-tree gets PE#0 */ - pe = pdn ? pdn->pe_number : 0; - - /* Calculate the device min/max */ - io_res.start = m32_res.start = (resource_size_t)-1; - io_res.end = m32_res.end = 0; - io_res.flags = IORESOURCE_IO; - m32_res.flags = IORESOURCE_MEM; - - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - struct resource *r = NULL; - if (dev->resource[i].flags & IORESOURCE_IO) - r = &io_res; - if (dev->resource[i].flags & IORESOURCE_MEM) - r = &m32_res; - if (!r) - continue; - if (dev->resource[i].start < r->start) - r->start = dev->resource[i].start; - if (dev->resource[i].end > r->end) - r->end = dev->resource[i].end; - } - - /* Setup IO segments */ - if (io_res.start < io_res.end) { - pcibios_resource_to_bus(dev, ®ion, &io_res); - pos = region.start; - i = pos / phb->ioda.io_segsize; - while(i < phb->ioda.total_pe && pos <= region.end) { - if (phb->ioda.io_segmap[i]) { - pr_err("%s: Trying to use IO seg #%d which is" - " already used by PE# %d\n", - pci_name(dev), i, - phb->ioda.io_segmap[i]); - /* XXX DO SOMETHING TO DISABLE DEVICE ? */ - break; - } - phb->ioda.io_segmap[i] = pe; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, - OPAL_IO_WINDOW_TYPE, - 0, i); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d setting up mapping" - " for IO seg# %d\n", - pci_name(dev), rc, i); - /* XXX DO SOMETHING TO DISABLE DEVICE ? */ - break; - } - pos += phb->ioda.io_segsize; - i++; - }; - } - - /* Setup M32 segments */ - if (m32_res.start < m32_res.end) { - pcibios_resource_to_bus(dev, ®ion, &m32_res); - pos = region.start; - i = pos / phb->ioda.m32_segsize; - while(i < phb->ioda.total_pe && pos <= region.end) { - if (phb->ioda.m32_segmap[i]) { - pr_err("%s: Trying to use M32 seg #%d which is" - " already used by PE# %d\n", - pci_name(dev), i, - phb->ioda.m32_segmap[i]); - /* XXX DO SOMETHING TO DISABLE DEVICE ? */ - break; - } - phb->ioda.m32_segmap[i] = pe; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe, - OPAL_M32_WINDOW_TYPE, - 0, i); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d setting up mapping" - " for M32 seg# %d\n", - pci_name(dev), rc, i); - /* XXX DO SOMETHING TO DISABLE DEVICE ? 
*/ - break; - } - pos += phb->ioda.m32_segsize; - i++; - } - } -} - -/* Check if a resource still fits in the total IO or M32 range - * for a given PHB - */ -static int __devinit pnv_ioda_resource_fit(struct pci_controller *hose, - struct resource *r) -{ - struct resource *bounds; - - if (r->flags & IORESOURCE_IO) - bounds = &hose->io_resource; - else if (r->flags & IORESOURCE_MEM) - bounds = &hose->mem_resources[0]; - else - return 1; - - if (r->start >= bounds->start && r->end <= bounds->end) - return 1; - r->flags = 0; - return 0; -} - -static void __devinit pnv_ioda_update_resources(struct pci_bus *bus) -{ - struct pci_controller *hose = pci_bus_to_host(bus); - struct pci_bus *cbus; - struct pci_dev *cdev; - unsigned int i; - - /* We used to clear all device enables here. However it looks like - * clearing MEM enable causes Obsidian (IPR SCS) to go bonkers, - * and shoot fatal errors to the PHB which in turns fences itself - * and we can't recover from that ... yet. So for now, let's leave - * the enables as-is and hope for the best. - */ - - /* Check if bus resources fit in our IO or M32 range */ - for (i = 0; bus->self && (i < 2); i++) { - struct resource *r = bus->resource[i]; - if (r && !pnv_ioda_resource_fit(hose, r)) - pr_err("%s: Bus %d resource %d disabled, no room\n", - pci_name(bus->self), bus->number, i); - } - - /* Update self if it's not a PHB */ - if (bus->self) - pci_setup_bridge(bus); - - /* Update child devices */ - list_for_each_entry(cdev, &bus->devices, bus_list) { - /* Check if resource fits, if not, disabled it */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - struct resource *r = &cdev->resource[i]; - if (!pnv_ioda_resource_fit(hose, r)) - pr_err("%s: Resource %d disabled, no room\n", - pci_name(cdev), i); - } - - /* Assign segments */ - pnv_ioda_setup_pe_segments(cdev); - - /* Update HW BARs */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) - pci_update_resource(cdev, i); - } - - /* Update child busses */ - list_for_each_entry(cbus, &bus->children, node) - pnv_ioda_update_resources(cbus); -} - static int __devinit pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; @@ -547,7 +106,7 @@ static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe) * but in the meantime, we need to protect them to avoid warnings */ #ifdef CONFIG_PCI_MSI -static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev) +static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); struct pnv_phb *phb = hose->private_data; @@ -559,19 +118,6 @@ static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev) return NULL; return &phb->ioda.pe_array[pdn->pe_number]; } - -static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev) -{ - struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev); - - while (!pe && dev->bus->self) { - dev = dev->bus->self; - pe = __pnv_ioda_get_one_pe(dev); - if (pe) - pe = pe->bus_pe; - } - return pe; -} #endif /* CONFIG_PCI_MSI */ static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb, @@ -588,7 +134,11 @@ static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb, dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; parent = pe->pbus->self; - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + if (pe->flags & PNV_IODA_PE_BUS_ALL) + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + else + count = 1; + switch(count) { case 1: bcomp = OpalPciBusAll; break; case 2: bcomp = 
OpalPciBus7Bits; break; @@ -665,13 +215,13 @@ static void __devinit pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, { struct pnv_ioda_pe *lpe; - list_for_each_entry(lpe, &phb->ioda.pe_list, link) { + list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { if (lpe->dma_weight < pe->dma_weight) { - list_add_tail(&pe->link, &lpe->link); + list_add_tail(&pe->dma_link, &lpe->dma_link); return; } } - list_add_tail(&pe->link, &phb->ioda.pe_list); + list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); } static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) @@ -698,6 +248,7 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) return 10; } +#if 0 static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); @@ -766,6 +317,7 @@ static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev) return pe; } +#endif /* Useful for SRIOV case */ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) { @@ -783,34 +335,33 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) pdn->pcidev = dev; pdn->pe_number = pe->pe_number; pe->dma_weight += pnv_ioda_dma_weight(dev); - if (dev->subordinate) + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_same_PE(dev->subordinate, pe); } } -static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, - struct pnv_ioda_pe *ppe) +/* + * There're 2 types of PCI bus sensitive PEs: One that is compromised of + * single PCI bus. Another one that contains the primary PCI bus and its + * subordinate PCI devices and buses. The second type of PE is normally + * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. + */ +static void __devinit pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; - struct pci_bus *bus = dev->subordinate; struct pnv_ioda_pe *pe; int pe_num; - if (!bus) { - pr_warning("%s: Bridge without a subordinate bus !\n", - pci_name(dev)); - return; - } pe_num = pnv_ioda_alloc_pe(phb); if (pe_num == IODA_INVALID_PE) { - pr_warning("%s: Not enough PE# available, disabling bus\n", - pci_name(dev)); + pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", + __func__, pci_domain_nr(bus), bus->number); return; } pe = &phb->ioda.pe_array[pe_num]; - ppe->bus_pe = pe; + pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); pe->pbus = bus; pe->pdev = NULL; pe->tce32_seg = -1; @@ -818,8 +369,12 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, pe->rid = bus->busn_res.start << 8; pe->dma_weight = 0; - pe_info(pe, "Secondary busses %pR associated with PE\n", - &bus->busn_res); + if (all) + pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", + bus->busn_res.start, bus->busn_res.end, pe_num); + else + pe_info(pe, "Secondary bus %d associated with PE#%d\n", + bus->busn_res.start, pe_num); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? 
*/ @@ -832,6 +387,9 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, /* Associate it with all child devices */ pnv_ioda_setup_same_PE(bus, pe); + /* Put PE to the list */ + list_add_tail(&pe->list, &phb->ioda.pe_list); + /* Account for one DMA PE if at least one DMA capable device exist * below the bridge */ @@ -847,17 +405,33 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev, static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus) { struct pci_dev *dev; - struct pnv_ioda_pe *pe; + + pnv_ioda_setup_bus_PE(bus, 0); list_for_each_entry(dev, &bus->devices, bus_list) { - pe = pnv_ioda_setup_dev_PE(dev); - if (pe == NULL) - continue; - /* Leaving the PCIe domain ... single PE# */ - if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) - pnv_ioda_setup_bus_PE(dev, pe); - else if (dev->subordinate) - pnv_ioda_setup_PEs(dev->subordinate); + if (dev->subordinate) { + if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) + pnv_ioda_setup_bus_PE(dev->subordinate, 1); + else + pnv_ioda_setup_PEs(dev->subordinate); + } + } +} + +/* + * Configure PEs so that the downstream PCI buses and devices + * could have their associated PE#. Unfortunately, we didn't + * figure out the way to identify the PLX bridge yet. So we + * simply put the PCI bus and the subordinate behind the root + * port to PE# here. The game rule here is expected to be changed + * as soon as we can detected PLX bridge correctly. + */ +static void __devinit pnv_pci_ioda_setup_PEs(void) +{ + struct pci_controller *hose, *tmp; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pnv_ioda_setup_PEs(hose->bus); } } @@ -999,7 +573,7 @@ static void __devinit pnv_ioda_setup_dma(struct pnv_phb *phb) remaining = phb->ioda.tce32_count; tw = phb->ioda.dma_weight; base = 0; - list_for_each_entry(pe, &phb->ioda.pe_list, link) { + list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { if (!pe->dma_weight) continue; if (!remaining) { @@ -1108,34 +682,151 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } #endif /* CONFIG_PCI_MSI */ -/* This is the starting point of our IODA specific resource - * allocation process +/* + * This function is supposed to be called on basis of PE from top + * to bottom style. So the the I/O or MMIO segment assigned to + * parent PE could be overrided by its child PEs if necessary. */ -static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose) +static void __devinit pnv_ioda_setup_pe_seg(struct pci_controller *hose, + struct pnv_ioda_pe *pe) { - resource_size_t size, align; - struct pci_bus *child; + struct pnv_phb *phb = hose->private_data; + struct pci_bus_region region; + struct resource *res; + int i, index; + int rc; - /* Associate PEs per functions */ - pnv_ioda_setup_PEs(hose->bus); + /* + * NOTE: We only care PCI bus based PE for now. For PCI + * device based PE, for example SRIOV sensitive VF should + * be figured out later. 
+ */ + BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); + + pci_bus_for_each_resource(pe->pbus, res, i) { + if (!res || !res->flags || + res->start > res->end) + continue; + + if (res->flags & IORESOURCE_IO) { + region.start = res->start - phb->ioda.io_pci_base; + region.end = res->end - phb->ioda.io_pci_base; + index = region.start / phb->ioda.io_segsize; + + while (index < phb->ioda.total_pe && + region.start <= region.end) { + phb->ioda.io_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d when mapping IO " + "segment #%d to PE#%d\n", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.io_segsize; + index++; + } + } else if (res->flags & IORESOURCE_MEM) { + region.start = res->start - + hose->pci_mem_offset - + phb->ioda.m32_pci_base; + region.end = res->end - + hose->pci_mem_offset - + phb->ioda.m32_pci_base; + index = region.start / phb->ioda.m32_segsize; + + while (index < phb->ioda.total_pe && + region.start <= region.end) { + phb->ioda.m32_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d when mapping M32 " + "segment#%d to PE#%d", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.m32_segsize; + index++; + } + } + } +} - /* Calculate all resources */ - pnv_ioda_calc_bus(hose->bus, IORESOURCE_IO, &size, &align); - pnv_ioda_calc_bus(hose->bus, IORESOURCE_MEM, &size, &align); +static void __devinit pnv_pci_ioda_setup_seg(void) +{ + struct pci_controller *tmp, *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; - /* Apply then to HW */ - pnv_ioda_update_resources(hose->bus); + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + pnv_ioda_setup_pe_seg(hose, pe); + } + } +} - /* Setup DMA */ - pnv_ioda_setup_dma(hose->private_data); +static void __devinit pnv_pci_ioda_setup_DMA(void) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; - /* Configure PCI Express settings */ - list_for_each_entry(child, &hose->bus->children, node) { - struct pci_dev *self = child->self; - if (!self) - continue; - pcie_bus_configure_settings(child, self->pcie_mpss); + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pnv_ioda_setup_dma(hose->private_data); + + /* Mark the PHB initialization done */ + phb = hose->private_data; + phb->initialized = 1; + } +} + +static void __devinit pnv_pci_ioda_fixup(void) +{ + pnv_pci_ioda_setup_PEs(); + pnv_pci_ioda_setup_seg(); + pnv_pci_ioda_setup_DMA(); +} + +/* + * Returns the alignment for I/O or memory windows for P2P + * bridges. That actually depends on how PEs are segmented. + * For now, we return I/O or M32 segment size for PE sensitive + * P2P bridges. Otherwise, the default values (4KiB for I/O, + * 1MiB for memory) will be returned. + * + * The current PCI bus might be put into one PE, which was + * create against the parent PCI bridge. For that case, we + * needn't enlarge the alignment so that we can save some + * resources. 
+ */ +static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, + unsigned long type) +{ + struct pci_dev *bridge; + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + int num_pci_bridges = 0; + + bridge = bus->self; + while (bridge) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { + num_pci_bridges++; + if (num_pci_bridges >= 2) + return 1; + } + + bridge = bridge->bus->self; } + + /* We need support prefetchable memory window later */ + if (type & IORESOURCE_MEM) + return phb->ioda.m32_segsize; + + return phb->ioda.io_segsize; } /* Prevent enabling devices for which we couldn't properly @@ -1143,10 +834,22 @@ static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose) */ static int __devinit pnv_pci_enable_device_hook(struct pci_dev *dev) { - struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn; + + /* The function is probably called while the PEs have + * not be created yet. For example, resource reassignment + * during PCI probe period. We just skip the check if + * PEs isn't ready. + */ + if (!phb->initialized) + return 0; + pdn = pnv_ioda_get_pdn(dev); if (!pdn || pdn->pe_number == IODA_INVALID_PE) return -EINVAL; + return 0; } @@ -1237,9 +940,9 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) /* Allocate aux data & arrays */ size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); m32map_off = size; - size += phb->ioda.total_pe; + size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); iomap_off = size; - size += phb->ioda.total_pe; + size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); pemap_off = size; size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); aux = alloc_bootmem(size); @@ -1250,6 +953,7 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) phb->ioda.pe_array = aux + pemap_off; set_bit(0, phb->ioda.pe_alloc); + INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); /* Calculate how many 32-bit TCE segments we have */ @@ -1298,14 +1002,17 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); - /* We set both PCI_PROBE_ONLY and PCI_REASSIGN_ALL_RSRC. This is an - * odd combination which essentially means that we skip all resource - * fixups and assignments in the generic code, and do it all - * ourselves here + /* + * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here + * to let the PCI core do resource assignment. It's supposed + * that the PCI core will do correct I/O and MMIO alignment + * for the P2P bridge bars so that each PCI bus (excluding + * the child P2P bridges) can form individual PE. 
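For context on how the alignment above is meant to be consumed, a sketch under assumptions (the helper is invented; only the ppc_md.pcibios_window_alignment hook wired up below is from this series): resource sizing can round every P2P bridge window up to the platform's answer so each bus covers whole PE segments.

static resource_size_t pe_window_size(struct pci_bus *bus, unsigned long type,
				      resource_size_t size)
{
	resource_size_t align = 1;

	if (ppc_md.pcibios_window_alignment)
		align = ppc_md.pcibios_window_alignment(bus, type);

	return ALIGN(size, align);
}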
*/ - ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb; + ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; - pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC); + ppc_md.pcibios_window_alignment = pnv_pci_window_alignment; + pci_add_flags(PCI_REASSIGN_ALL_RSRC); /* Reset IODA tables to a clean state */ rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 8bc479634643..7cfb7c883deb 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -17,9 +17,14 @@ enum pnv_phb_model { }; #define PNV_PCI_DIAG_BUF_SIZE 4096 +#define PNV_IODA_PE_DEV (1 << 0) /* PE has single PCI device */ +#define PNV_IODA_PE_BUS (1 << 1) /* PE has primary PCI bus */ +#define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_ioda_pe { + unsigned long flags; + /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. @@ -40,11 +45,6 @@ struct pnv_ioda_pe { */ unsigned int dma_weight; - /* This is a PCI-E -> PCI-X bridge, this points to the - * corresponding bus PE - */ - struct pnv_ioda_pe *bus_pe; - /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ int tce32_seg; int tce32_segcount; @@ -59,7 +59,8 @@ struct pnv_ioda_pe { int mve_number; /* Link in list of PE#s */ - struct list_head link; + struct list_head dma_link; + struct list_head list; }; struct pnv_phb { @@ -68,6 +69,7 @@ struct pnv_phb { enum pnv_phb_model model; u64 opal_id; void __iomem *regs; + int initialized; spinlock_t lock; #ifdef CONFIG_PCI_MSI @@ -107,6 +109,11 @@ struct pnv_phb { unsigned int *io_segmap; struct pnv_ioda_pe *pe_array; + /* Sorted list of used PE's based + * on the sequence of creation + */ + struct list_head pe_list; + /* Reverse map of PEs, will have to extend if * we are to support more than 256 PEs, indexed * bus { bus, devfn } @@ -125,7 +132,7 @@ struct pnv_phb { /* Sorted list of used PE's, sorted at * boot for resource allocation purposes */ - struct list_head pe_list; + struct list_head pe_dma_list; } ioda; }; diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 3124cf791ebb..d00d7b0a3bda 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -43,7 +43,7 @@ enum ps3_lpar_vas_id { static DEFINE_SPINLOCK(ps3_htab_lock); -static long ps3_hpte_insert(unsigned long hpte_group, unsigned long va, +static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, int psize, int ssize) { @@ -61,7 +61,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long va, */ vflags &= ~HPTE_V_SECONDARY; - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; + hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize) | rflags; spin_lock_irqsave(&ps3_htab_lock, flags); @@ -75,8 +75,8 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long va, if (result) { /* all entries bolted !*/ - pr_info("%s:result=%d va=%lx pa=%lx ix=%lx v=%llx r=%llx\n", - __func__, result, va, pa, hpte_group, hpte_v, hpte_r); + pr_info("%s:result=%d vpn=%lx pa=%lx ix=%lx v=%llx r=%llx\n", + __func__, result, vpn, pa, hpte_group, hpte_v, hpte_r); BUG(); 
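+		/*
+		 * Note that the value logged above is now a virtual page
+		 * number rather than a raw virtual address: callers shift
+		 * the VA right by the page-offset bits before passing it
+		 * in, and hpte_encode_v() expects the same vpn form.
+		 */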
} @@ -107,7 +107,7 @@ static long ps3_hpte_remove(unsigned long hpte_group) } static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int psize, int ssize, int local) + unsigned long vpn, int psize, int ssize, int local) { int result; u64 hpte_v, want_v, hpte_rs; @@ -115,7 +115,7 @@ static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp, unsigned long flags; long ret; - want_v = hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_v(vpn, psize, ssize); spin_lock_irqsave(&ps3_htab_lock, flags); @@ -125,8 +125,8 @@ static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp, &hpte_rs); if (result) { - pr_info("%s: res=%d read va=%lx slot=%lx psize=%d\n", - __func__, result, va, slot, psize); + pr_info("%s: res=%d read vpn=%lx slot=%lx psize=%d\n", + __func__, result, vpn, slot, psize); BUG(); } @@ -159,7 +159,7 @@ static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, panic("ps3_hpte_updateboltedpp() not implemented"); } -static void ps3_hpte_invalidate(unsigned long slot, unsigned long va, +static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, int psize, int ssize, int local) { unsigned long flags; @@ -170,8 +170,8 @@ static void ps3_hpte_invalidate(unsigned long slot, unsigned long va, result = lv1_write_htab_entry(PS3_LPAR_VAS_ID_CURRENT, slot, 0, 0); if (result) { - pr_info("%s: res=%d va=%lx slot=%lx psize=%d\n", - __func__, result, va, slot, psize); + pr_info("%s: res=%d vpn=%lx slot=%lx psize=%d\n", + __func__, result, vpn, slot, psize); BUG(); } diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 18c168b752da..9a04322b1736 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -728,7 +728,7 @@ static void eeh_add_device_early(struct device_node *dn) { struct pci_controller *phb; - if (!dn || !of_node_to_eeh_dev(dn)) + if (!of_node_to_eeh_dev(dn)) return; phb = of_node_to_eeh_dev(dn)->phb; @@ -817,6 +817,7 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); /** * eeh_remove_device - Undo EEH setup for the indicated pci device * @dev: pci device to be removed + * @purge_pe: remove the PE or not * * This routine should be called when a device is removed from * a running system (e.g. by hotplug or dlpar). It unregisters @@ -824,7 +825,7 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); * this device will no longer be detected after this call; thus, * i/o errors affecting this slot may leave this device unusable. */ -static void eeh_remove_device(struct pci_dev *dev) +static void eeh_remove_device(struct pci_dev *dev, int purge_pe) { struct eeh_dev *edev; @@ -843,7 +844,7 @@ static void eeh_remove_device(struct pci_dev *dev) dev->dev.archdata.edev = NULL; pci_dev_put(dev); - eeh_rmv_from_parent_pe(edev); + eeh_rmv_from_parent_pe(edev, purge_pe); eeh_addr_cache_rmv_dev(dev); eeh_sysfs_remove_device(dev); } @@ -851,21 +852,22 @@ static void eeh_remove_device(struct pci_dev *dev) /** * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device * @dev: PCI device + * @purge_pe: remove the corresponding PE or not * * This routine must be called when a device is removed from the * running system through hotplug or dlpar. The corresponding * PCI address cache will be removed. 
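+ * When @purge_pe is non-zero, PEs that become empty as a result are
+ * removed as well (the normal hotplug/DLPAR path).  When it is zero,
+ * the PEs are kept and only marked EEH_PE_INVALID, so that devices
+ * re-added during EEH recovery land back in the same PEs.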
*/ -void eeh_remove_bus_device(struct pci_dev *dev) +void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { struct pci_bus *bus = dev->subordinate; struct pci_dev *child, *tmp; - eeh_remove_device(dev); + eeh_remove_device(dev, purge_pe); if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) - eeh_remove_bus_device(child); + eeh_remove_bus_device(child, purge_pe); } } EXPORT_SYMBOL_GPL(eeh_remove_bus_device); diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 8370ce7d5931..a3fefb61097c 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -25,6 +25,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/module.h> #include <linux/pci.h> #include <asm/eeh.h> #include <asm/eeh_event.h> @@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name(struct pci_dev *pdev) return ""; } +/** + * eeh_pcid_get - Get the PCI device driver + * @pdev: PCI device + * + * The function is used to retrieve the PCI device driver for + * the indicated PCI device. Besides, we will increase the reference + * of the PCI device driver to prevent that being unloaded on + * the fly. Otherwise, kernel crash would be seen. + */ +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return NULL; + + if (!try_module_get(pdev->driver->driver.owner)) + return NULL; + + return pdev->driver; +} + +/** + * eeh_pcid_put - Dereference on the PCI device driver + * @pdev: PCI device + * + * The function is called to do dereference on the PCI device + * driver of the indicated PCI device. + */ +static inline void eeh_pcid_put(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return; + + module_put(pdev->driver->driver.owner); +} + #if 0 static void print_device_node_tree(struct pci_dn *pdn, int dent) { @@ -128,23 +164,24 @@ static void *eeh_report_error(void *data, void *userdata) struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; + struct pci_driver *driver; /* We might not have the associated PCI device, * then we should continue for next one. 
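+ * Each of the eeh_report_*() callbacks follows the same pattern: pin
+ * the driver with eeh_pcid_get() so it cannot be unloaded while its
+ * error handler runs, call the handler if one is provided, and drop
+ * the reference with eeh_pcid_put() on every exit path.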
*/ if (!dev) return NULL; - dev->error_state = pci_channel_io_frozen; - if (!driver) - return NULL; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; eeh_disable_irq(dev); if (!driver->err_handler || - !driver->err_handler->error_detected) + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); return NULL; + } rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); @@ -152,6 +189,7 @@ static void *eeh_report_error(void *data, void *userdata) if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; + eeh_pcid_put(dev); return NULL; } @@ -171,12 +209,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata) enum pci_ers_result rc, *res = userdata; struct pci_driver *driver; - if (!dev) return NULL; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; - if (!(driver = dev->driver) || - !driver->err_handler || - !driver->err_handler->mmio_enabled) + if (!driver->err_handler || + !driver->err_handler->mmio_enabled) { + eeh_pcid_put(dev); return NULL; + } rc = driver->err_handler->mmio_enabled(dev); @@ -184,6 +224,7 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata) if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; + eeh_pcid_put(dev); return NULL; } @@ -204,16 +245,19 @@ static void *eeh_report_reset(void *data, void *userdata) enum pci_ers_result rc, *res = userdata; struct pci_driver *driver; - if (!dev || !(driver = dev->driver)) - return NULL; - + if (!dev) return NULL; dev->error_state = pci_channel_io_normal; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + eeh_enable_irq(dev); if (!driver->err_handler || - !driver->err_handler->slot_reset) + !driver->err_handler->slot_reset) { + eeh_pcid_put(dev); return NULL; + } rc = driver->err_handler->slot_reset(dev); if ((*res == PCI_ERS_RESULT_NONE) || @@ -221,6 +265,7 @@ static void *eeh_report_reset(void *data, void *userdata) if (*res == PCI_ERS_RESULT_DISCONNECT && rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + eeh_pcid_put(dev); return NULL; } @@ -240,20 +285,22 @@ static void *eeh_report_resume(void *data, void *userdata) struct pci_driver *driver; if (!dev) return NULL; - dev->error_state = pci_channel_io_normal; - if (!(driver = dev->driver)) - return NULL; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; eeh_enable_irq(dev); if (!driver->err_handler || - !driver->err_handler->resume) + !driver->err_handler->resume) { + eeh_pcid_put(dev); return NULL; + } driver->err_handler->resume(dev); + eeh_pcid_put(dev); return NULL; } @@ -272,20 +319,22 @@ static void *eeh_report_failure(void *data, void *userdata) struct pci_driver *driver; if (!dev) return NULL; - dev->error_state = pci_channel_io_perm_failure; - if (!(driver = dev->driver)) - return NULL; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; eeh_disable_irq(dev); if (!driver->err_handler || - !driver->err_handler->error_detected) + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); return NULL; + } driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + eeh_pcid_put(dev); return NULL; } @@ -305,8 +354,14 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; + /* + * We don't remove the corresponding PE instances because + * we need the information afterwords. The attached EEH + * devices are expected to be attached soon when calling + * into pcibios_add_pci_devices(). 
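+	 * Passing purge_pe=0 below keeps those PEs in the tree (they are
+	 * only marked EEH_PE_INVALID), which is what allows the re-probed
+	 * devices to be matched back to them.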
+ */ if (bus) - pcibios_remove_pci_devices(bus); + __pcibios_remove_pci_devices(bus, 0); /* Reset the pci controller. (Asserts RST#; resets config space). * Reconfigure bridges and devices. Don't try to bring the system diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c index 904123c7657b..9d35543736ed 100644 --- a/arch/powerpc/platforms/pseries/eeh_pe.c +++ b/arch/powerpc/platforms/pseries/eeh_pe.c @@ -99,23 +99,19 @@ static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) { struct eeh_pe *pe; - eeh_lock(); - list_for_each_entry(pe, &eeh_phb_pe, child) { /* * Actually, we needn't check the type since * the PE for PHB has been determined when that * was created. */ - if (pe->type == EEH_PE_PHB && + if ((pe->type & EEH_PE_PHB) && pe->phb == phb) { eeh_unlock(); return pe; } } - eeh_unlock(); - return NULL; } @@ -192,14 +188,21 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root, return NULL; } + eeh_lock(); + /* Traverse root PE */ for (pe = root; pe; pe = eeh_pe_next(pe, root)) { eeh_pe_for_each_dev(pe, edev) { ret = fn(edev, flag); - if (ret) return ret; + if (ret) { + eeh_unlock(); + return ret; + } } } + eeh_unlock(); + return NULL; } @@ -219,7 +222,7 @@ static void *__eeh_pe_get(void *data, void *flag) struct eeh_dev *edev = (struct eeh_dev *)flag; /* Unexpected PHB PE */ - if (pe->type == EEH_PE_PHB) + if (pe->type & EEH_PE_PHB) return NULL; /* We prefer PE address */ @@ -251,9 +254,7 @@ static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) struct eeh_pe *root = eeh_phb_pe_get(edev->phb); struct eeh_pe *pe; - eeh_lock(); pe = eeh_pe_traverse(root, __eeh_pe_get, edev); - eeh_unlock(); return pe; } @@ -307,6 +308,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent; + eeh_lock(); + /* * Search the PE has been existing or not according * to the PE address. If that has been existing, the @@ -314,8 +317,9 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * components. */ pe = eeh_pe_get(edev); - if (pe) { + if (pe && !(pe->type & EEH_PE_INVALID)) { if (!edev->pe_config_addr) { + eeh_unlock(); pr_err("%s: PE with addr 0x%x already exists\n", __func__, edev->config_addr); return -EEXIST; @@ -327,15 +331,36 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Put the edev to PE */ list_add_tail(&edev->list, &pe->edevs); + eeh_unlock(); pr_debug("EEH: Add %s to Bus PE#%x\n", edev->dn->full_name, pe->addr); return 0; + } else if (pe && (pe->type & EEH_PE_INVALID)) { + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. We need clear EEH_PE_INVALID until the top. 
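+ * That is, walk up the parent chain clearing EEH_PE_INVALID until we
+ * hit the first ancestor that is still valid (or the PHB PE), so the
+ * whole path from this device back to the PHB becomes usable again.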
+ */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~EEH_PE_INVALID; + parent = parent->parent; + } + eeh_unlock(); + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; } /* Create a new EEH PE */ pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); if (!pe) { + eeh_unlock(); pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } @@ -352,6 +377,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) if (!parent) { parent = eeh_phb_pe_get(edev->phb); if (!parent) { + eeh_unlock(); pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", __func__, edev->phb->global_number); edev->pe = NULL; @@ -368,6 +394,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) list_add_tail(&pe->child, &parent->child_list); list_add_tail(&edev->list, &pe->edevs); edev->pe = pe; + eeh_unlock(); pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", edev->dn->full_name, pe->addr, pe->parent->addr); @@ -377,15 +404,17 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /** * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE * @edev: EEH device + * @purge_pe: remove PE or not * * The PE hierarchy tree might be changed when doing PCI hotplug. * Also, the PCI devices or buses could be removed from the system * during EEH recovery. So we have to call the function remove the * corresponding PE accordingly if necessary. */ -int eeh_rmv_from_parent_pe(struct eeh_dev *edev) +int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) { - struct eeh_pe *pe, *parent; + struct eeh_pe *pe, *parent, *child; + int cnt; if (!edev->pe) { pr_warning("%s: No PE found for EEH device %s\n", @@ -393,6 +422,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) return -EEXIST; } + eeh_lock(); + /* Remove the EEH device */ pe = edev->pe; edev->pe = NULL; @@ -406,18 +437,39 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ while (1) { parent = pe->parent; - if (pe->type == EEH_PE_PHB) + if (pe->type & EEH_PE_PHB) break; - if (list_empty(&pe->edevs) && - list_empty(&pe->child_list)) { - list_del(&pe->child); - kfree(pe); + if (purge_pe) { + if (list_empty(&pe->edevs) && + list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } else { + break; + } + } else { + if (list_empty(&pe->edevs)) { + cnt = 0; + list_for_each_entry(child, &pe->child_list, child) { + if (!(pe->type & EEH_PE_INVALID)) { + cnt++; + break; + } + } + + if (!cnt) + pe->type |= EEH_PE_INVALID; + else + break; + } } pe = parent; } + eeh_unlock(); + return 0; } @@ -463,7 +515,9 @@ static void *__eeh_pe_state_mark(void *data, void *flag) */ void eeh_pe_state_mark(struct eeh_pe *pe, int state) { + eeh_lock(); eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); + eeh_unlock(); } /** @@ -497,7 +551,9 @@ static void *__eeh_pe_state_clear(void *data, void *flag) */ void eeh_pe_state_clear(struct eeh_pe *pe, int state) { + eeh_lock(); eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); + eeh_unlock(); } /** @@ -559,6 +615,10 @@ static void *eeh_restore_one_device_bars(void *data, void *flag) */ void eeh_pe_restore_bars(struct eeh_pe *pe) { + /* + * We needn't take the EEH lock since eeh_pe_dev_traverse() + * will take that. 
+ */ eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL); } @@ -578,14 +638,18 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) struct eeh_dev *edev; struct pci_dev *pdev; - if (pe->type == EEH_PE_PHB) { + eeh_lock(); + + if (pe->type & EEH_PE_PHB) { bus = pe->phb->bus; - } else if (pe->type == EEH_PE_BUS) { + } else if (pe->type & EEH_PE_BUS) { edev = list_first_entry(&pe->edevs, struct eeh_dev, list); pdev = eeh_dev_to_pci_dev(edev); if (pdev) bus = pdev->bus; } + eeh_unlock(); + return bus; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 177055d0186b..0da39fed355a 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -107,9 +107,9 @@ void vpa_init(int cpu) } static long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long pa, - unsigned long rflags, unsigned long vflags, - int psize, int ssize) + unsigned long vpn, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize, int ssize) { unsigned long lpar_rc; unsigned long flags; @@ -117,11 +117,11 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long hpte_v, hpte_r; if (!(vflags & HPTE_V_BOLTED)) - pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " - "rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + pr_devel("hpte_insert(group=%lx, vpn=%016lx, " + "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; + hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize) | rflags; if (!(vflags & HPTE_V_BOLTED)) @@ -226,22 +226,6 @@ static void pSeries_lpar_hptab_clear(void) } /* - * This computes the AVPN and B fields of the first dword of a HPTE, - * for use when we want to match an existing PTE. The bottom 7 bits - * of the returned value are zero. - */ -static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, - int ssize) -{ - unsigned long v; - - v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm); - v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; - return v; -} - -/* * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and * the low 3 bits of flags happen to line up. So no transform is needed. 
* We can probably optimize here and assume the high bits of newpp are @@ -249,14 +233,14 @@ static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, */ static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, + unsigned long vpn, int psize, int ssize, int local) { unsigned long lpar_rc; unsigned long flags = (newpp & 7) | H_AVPN; unsigned long want_v; - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); @@ -294,15 +278,15 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) return dword0; } -static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize) +static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) { unsigned long hash; unsigned long i; long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(va, psize, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); /* Bolted entries are always in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -322,12 +306,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long lpar_rc, slot, vsid, va, flags; + unsigned long vpn; + unsigned long lpar_rc, slot, vsid, flags; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); flags = newpp & 7; @@ -336,17 +321,17 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(lpar_rc != H_SUCCESS); } -static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, +static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, int psize, int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; unsigned long dummy1, dummy2; - pr_devel(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", - slot, va, psize, local); + pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n", + slot, vpn, psize, local); - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2); if (lpar_rc == H_NOT_FOUND) return; @@ -357,15 +342,16 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, static void pSeries_lpar_hpte_removebolted(unsigned long ea, int psize, int ssize) { - unsigned long slot, vsid, va; + unsigned long vpn; + unsigned long slot, vsid; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0); + pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0); } /* Flag bits for H_BULK_REMOVE */ @@ -381,12 +367,12 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea, */ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) { + unsigned long vpn; unsigned long i, pix, rc; unsigned long flags = 0; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long 
param[9]; - unsigned long va; unsigned long hash, index, shift, hidx, slot; real_pte_t pte; int psize, ssize; @@ -398,21 +384,21 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) ssize = batch->ssize; pix = 0; for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { - pSeries_lpar_hpte_invalidate(slot, va, psize, + pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, local); } else { param[pix] = HBR_REQUEST | HBR_AVPN | slot; - param[pix+1] = hpte_encode_avpn(va, psize, + param[pix+1] = hpte_encode_avpn(vpn, psize, ssize); pix += 2; if (pix == 8) { diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 3ccebc83dc02..261a577a3dd2 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -65,27 +65,43 @@ pcibios_find_pci_bus(struct device_node *dn) EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); /** - * pcibios_remove_pci_devices - remove all devices under this bus + * __pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * @purge_pe: destroy the PE on removal of PCI devices * * Remove all of the PCI devices under this bus both from the * linux pci device tree, and from the powerpc EEH address cache. + * By default, the corresponding PE will be destroied during the + * normal PCI hotplug path. For PCI hotplug during EEH recovery, + * the corresponding PE won't be destroied and deallocated. */ -void pcibios_remove_pci_devices(struct pci_bus *bus) +void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe) { - struct pci_dev *dev, *tmp; + struct pci_dev *dev, *tmp; struct pci_bus *child_bus; /* First go down child busses */ list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); + __pcibios_remove_pci_devices(child_bus, purge_pe); pr_debug("PCI: Removing devices on bus %04x:%02x\n", - pci_domain_nr(bus), bus->number); + pci_domain_nr(bus), bus->number); list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { pr_debug(" * Removing %s...\n", pci_name(dev)); - eeh_remove_bus_device(dev); - pci_stop_and_remove_bus_device(dev); - } + eeh_remove_bus_device(dev, purge_pe); + pci_stop_and_remove_bus_device(dev); + } +} + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. 
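+ * This is the normal hotplug/DLPAR entry point: it simply calls
+ * __pcibios_remove_pci_devices() with purge_pe set, so PEs that become
+ * empty are destroyed as well.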
+ */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + __pcibios_remove_pci_devices(bus, 1); } EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 987f441525cb..3a56a639a92e 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -60,6 +60,8 @@ static cpumask_t cpus_in_xmon = CPU_MASK_NONE; static unsigned long xmon_taken = 1; static int xmon_owner; static int xmon_gate; +#else +#define xmon_owner 0 #endif /* CONFIG_SMP */ static unsigned long in_xmon __read_mostly = 0; @@ -202,7 +204,13 @@ Commands:\n\ di dump instructions\n\ df dump float values\n\ dd dump double values\n\ - dl dump the kernel log buffer\n\ + dl dump the kernel log buffer\n" +#ifdef CONFIG_PPC64 + "\ + dp[#] dump paca for current cpu, or cpu #\n\ + dpa dump paca for all possible cpus\n" +#endif + "\ dr dump stream of raw bytes\n\ e print exception information\n\ f flush cache\n\ @@ -2009,6 +2017,95 @@ static void xmon_rawdump (unsigned long adrs, long ndump) printf("\n"); } +#ifdef CONFIG_PPC64 +static void dump_one_paca(int cpu) +{ + struct paca_struct *p; + + if (setjmp(bus_error_jmp) != 0) { + printf("*** Error dumping paca for cpu 0x%x!\n", cpu); + return; + } + + catch_memory_errors = 1; + sync(); + + p = &paca[cpu]; + + printf("paca for cpu 0x%x @ %p:\n", cpu, p); + + printf(" %-*s = %s\n", 16, "possible", cpu_possible(cpu) ? "yes" : "no"); + printf(" %-*s = %s\n", 16, "present", cpu_present(cpu) ? "yes" : "no"); + printf(" %-*s = %s\n", 16, "online", cpu_online(cpu) ? "yes" : "no"); + +#define DUMP(paca, name, format) \ + printf(" %-*s = %#-*"format"\t(0x%lx)\n", 16, #name, 18, paca->name, \ + offsetof(struct paca_struct, name)); + + DUMP(p, lock_token, "x"); + DUMP(p, paca_index, "x"); + DUMP(p, kernel_toc, "lx"); + DUMP(p, kernelbase, "lx"); + DUMP(p, kernel_msr, "lx"); +#ifdef CONFIG_PPC_STD_MMU_64 + DUMP(p, stab_real, "lx"); + DUMP(p, stab_addr, "lx"); +#endif + DUMP(p, emergency_sp, "p"); + DUMP(p, data_offset, "lx"); + DUMP(p, hw_cpu_id, "x"); + DUMP(p, cpu_start, "x"); + DUMP(p, kexec_state, "x"); + DUMP(p, __current, "p"); + DUMP(p, kstack, "lx"); + DUMP(p, stab_rr, "lx"); + DUMP(p, saved_r1, "lx"); + DUMP(p, trap_save, "x"); + DUMP(p, soft_enabled, "x"); + DUMP(p, irq_happened, "x"); + DUMP(p, io_sync, "x"); + DUMP(p, irq_work_pending, "x"); + DUMP(p, nap_state_lost, "x"); + +#undef DUMP + + catch_memory_errors = 0; + sync(); +} + +static void dump_all_pacas(void) +{ + int cpu; + + if (num_possible_cpus() == 0) { + printf("No possible cpus, use 'dp #' to dump individual cpus\n"); + return; + } + + for_each_possible_cpu(cpu) + dump_one_paca(cpu); +} + +static void dump_pacas(void) +{ + unsigned long num; + int c; + + c = inchar(); + if (c == 'a') { + dump_all_pacas(); + return; + } + + termch = c; /* Put c back, it wasn't 'a' */ + + if (scanhex(&num)) + dump_one_paca(num); + else + dump_one_paca(xmon_owner); +} +#endif + #define isxdigit(c) (('0' <= (c) && (c) <= '9') \ || ('a' <= (c) && (c) <= 'f') \ || ('A' <= (c) && (c) <= 'F')) @@ -2018,6 +2115,14 @@ dump(void) int c; c = inchar(); + +#ifdef CONFIG_PPC64 + if (c == 'p') { + dump_pacas(); + return; + } +#endif + if ((isxdigit(c) && c != 'f' && c != 'd') || c == '\n') termch = c; scanhex((void *)&adrs); |
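The new xmon commands added above behave as follows: dp dumps the paca of the cpu that currently owns xmon (xmon_owner is defined to 0 on non-SMP builds, so cpu 0 is used there), dp <n> dumps the paca of cpu <n> (the argument is parsed as hex by scanhex()), and dpa walks every possible cpu. An illustrative session might look like the following; the paca address and field values are made up for the example:

0:mon> dp 1
paca for cpu 0x1 @ c00000000ff60480:
 possible         = yes
 present          = yes
 online           = yes
 ...
0:mon> dpa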